summaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
authorRalf Baechle <ralf@linux-mips.org>1999-12-04 03:58:56 +0000
committerRalf Baechle <ralf@linux-mips.org>1999-12-04 03:58:56 +0000
commit1d67e90f19a7acfd9a05dc59678e7d0c5090bd0d (patch)
tree357efc7b93f8f5102110d20d293f41360ec212fc /arch/i386/kernel
parentaea27b2e18d69af87e673972246e66657b4fa274 (diff)
Merge with Linux 2.3.21.
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile9
-rw-r--r--arch/i386/kernel/apm.c35
-rw-r--r--arch/i386/kernel/entry.S9
-rw-r--r--arch/i386/kernel/head.S9
-rw-r--r--arch/i386/kernel/i386_ksyms.c12
-rw-r--r--arch/i386/kernel/i8259.c244
-rw-r--r--arch/i386/kernel/io_apic.c513
-rw-r--r--arch/i386/kernel/irq.c84
-rw-r--r--arch/i386/kernel/mtrr.c30
-rw-r--r--arch/i386/kernel/pci-i386.c312
-rw-r--r--arch/i386/kernel/pci-i386.h29
-rw-r--r--arch/i386/kernel/pci-pc.c (renamed from arch/i386/kernel/bios32.c)665
-rw-r--r--arch/i386/kernel/pci-visws.c131
-rw-r--r--arch/i386/kernel/smp.c2091
-rw-r--r--arch/i386/kernel/smpboot.c1650
-rw-r--r--arch/i386/kernel/time.c2
-rw-r--r--arch/i386/kernel/traps.c126
17 files changed, 3626 insertions, 2325 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 875f52d5a..29afabd7a 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -19,7 +19,12 @@ OX_OBJS := i386_ksyms.o
MX_OBJS :=
ifdef CONFIG_PCI
-O_OBJS += bios32.o
+O_OBJS += pci-i386.o
+ifdef CONFIG_VISWS
+O_OBJS += pci-visws.o
+else
+O_OBJS += pci-pc.o
+endif
endif
ifdef CONFIG_MCA
@@ -43,7 +48,7 @@ else
endif
ifdef CONFIG_SMP
-O_OBJS += smp.o trampoline.o
+O_OBJS += smp.o smpboot.o trampoline.o
endif
ifdef CONFIG_X86_IO_APIC
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 3bafdfcfc..a54994667 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -643,33 +643,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
return APM_SUCCESS;
}
-static int apm_get_battery_status(u_short which, u_short *status,
- u_short *bat, u_short *life, u_short *nbat)
-{
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
-
- if (apm_bios_info.version < 0x0102) {
- /* pretend we only have one battery. */
- if (which != 1)
- return APM_BAD_DEVICE;
- *nbat = 1;
- return apm_get_power_status(status, bat, life);
- }
-
- if (apm_bios_call(0x530a, (0x8000 | (which)), 0, &eax,
- &ebx, &ecx, &edx, &esi))
- return (eax >> 8) & 0xff;
- *status = ebx;
- *bat = ecx;
- *life = edx;
- *nbat = esi;
- return APM_SUCCESS;
-}
-
static int __init apm_engage_power_management(u_short device)
{
u32 eax;
@@ -1263,7 +1236,6 @@ int apm_get_info(char *buf, char **start, off_t fpos, int length, int dummy)
unsigned short bx;
unsigned short cx;
unsigned short dx;
- unsigned short nbat;
unsigned short error;
unsigned short ac_line_status = 0xff;
unsigned short battery_status = 0xff;
@@ -1473,7 +1445,7 @@ static int __init apm_init(void)
if (apm_bios_info.version == 0) {
printk(KERN_INFO "apm: BIOS not found.\n");
- return;
+ return -1;
}
printk(KERN_INFO
"apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
@@ -1483,7 +1455,7 @@ static int __init apm_init(void)
driver_version);
if ((apm_bios_info.flags & APM_32_BIT_SUPPORT) == 0) {
printk(KERN_INFO "apm: no 32 bit BIOS support\n");
- return;
+ return -1;
}
/*
@@ -1512,7 +1484,7 @@ static int __init apm_init(void)
if (apm_disabled) {
printk(KERN_NOTICE "apm: disabled on user request.\n");
- return;
+ return -1;
}
#ifdef CONFIG_SMP
@@ -1571,6 +1543,7 @@ static int __init apm_init(void)
misc_register(&apm_device);
kernel_thread(apm, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD);
+ return 0;
}
module_init(apm_init)
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 47f23b6b6..4b88dda89 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -323,9 +323,14 @@ ENTRY(debug)
jmp error_code
ENTRY(nmi)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $ SYMBOL_NAME(do_nmi)
- jmp error_code
+ pushl %edx
+ call SYMBOL_NAME(do_nmi)
+ addl $8,%esp
+ RESTORE_ALL
ENTRY(int3)
pushl $0
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ac854e721..f1aa50586 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -243,6 +243,15 @@ is386: pushl %ecx # restore original EFLAGS
xorl %eax,%eax
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
+#ifdef __SMP__
+ movb ready, %cl
+ cmpb $1,%cl
+ je 1f # the first CPU calls start_kernel
+ # all other CPUs call initialize_secondary
+ call SYMBOL_NAME(initialize_secondary)
+ jmp L6
+1:
+#endif
call SYMBOL_NAME(start_kernel)
L6:
jmp L6 # main should never return here, but
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 61422f372..043132b8e 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -8,6 +8,7 @@
#include <linux/in6.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
+#include <linux/acpi.h>
#include <asm/semaphore.h>
#include <asm/processor.h>
@@ -17,6 +18,7 @@
#include <asm/hardirq.h>
#include <asm/delay.h>
#include <asm/irq.h>
+#include <asm/mmx.h>
extern void dump_thread(struct pt_regs *, struct user *);
extern int dump_fpu(elf_fpregset_t *);
@@ -41,6 +43,7 @@ EXPORT_SYMBOL(enable_irq);
EXPORT_SYMBOL(disable_irq);
EXPORT_SYMBOL(disable_irq_nosync);
EXPORT_SYMBOL(kernel_thread);
+EXPORT_SYMBOL(acpi_idle);
EXPORT_SYMBOL_NOVERS(__down_failed);
EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
@@ -71,7 +74,13 @@ EXPORT_SYMBOL(clear_user);
EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__generic_copy_from_user);
EXPORT_SYMBOL(__generic_copy_to_user);
-EXPORT_SYMBOL(strlen_user);
+EXPORT_SYMBOL(strnlen_user);
+
+#ifdef CONFIG_X86_USE_3DNOW
+EXPORT_SYMBOL(_mmx_memcpy);
+EXPORT_SYMBOL(mmx_clear_page);
+EXPORT_SYMBOL(mmx_copy_page);
+#endif
#ifdef __SMP__
EXPORT_SYMBOL(cpu_data);
@@ -117,3 +126,4 @@ EXPORT_SYMBOL(mca_is_adapter_used);
#ifdef CONFIG_VT
EXPORT_SYMBOL(screen_info);
#endif
+
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index ce4082848..3e9097f06 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -1,7 +1,6 @@
#include <linux/config.h>
#include <linux/ptrace.h>
#include <linux/errno.h>
-#include <linux/kernel_stat.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
@@ -9,68 +8,23 @@
#include <linux/timex.h>
#include <linux/malloc.h>
#include <linux/random.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
+#include <linux/kernel_stat.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/bitops.h>
-#include <asm/smp.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
#include <linux/irq.h>
-
-/*
- * Intel specific no controller code
- * odd that no-controller should be architecture dependent
- * but see the ifdef __SMP__
- */
-
-static void enable_none(unsigned int irq) { }
-static unsigned int startup_none(unsigned int irq) { return 0; }
-static void disable_none(unsigned int irq) { }
-static void ack_none(unsigned int irq)
-{
-#ifdef __SMP__
- /*
- * [currently unexpected vectors happen only on SMP and APIC.
- * if we want to have non-APIC and non-8259A controllers
- * in the future with unexpected vectors, this ack should
- * probably be made controller-specific.]
- */
- ack_APIC_irq();
-#endif
-}
-
-/* startup is the same as "enable", shutdown is same as "disable" */
-#define shutdown_none disable_none
-#define end_none enable_none
-
-struct hw_interrupt_type no_irq_type = {
- "none",
- startup_none,
- shutdown_none,
- enable_none,
- disable_none,
- ack_none,
- end_none
-};
-
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
/*
+ * Common place to define all x86 IRQ vectors
+ *
* This builds up the IRQ handler stubs using some ugly macros in irq.h
*
* These macros create the low-level assembly IRQ routines that save
@@ -79,7 +33,6 @@ struct hw_interrupt_type no_irq_type = {
* interrupt-controller happy.
*/
-
BUILD_COMMON_IRQ()
#define BI(x,y) \
@@ -93,7 +46,7 @@ BUILD_COMMON_IRQ()
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x20-0x30)
+ * (these are usually mapped to vectors 0x20-0x2f)
*/
BUILD_16_IRQS(0x0)
@@ -126,9 +79,9 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
*/
BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
-BUILD_SMP_INTERRUPT(stop_cpu_interrupt,STOP_CPU_VECTOR)
BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_SMP_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_SMP_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
/*
* every pentium local APIC has two 'local interrupts', with a
@@ -150,7 +103,7 @@ BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-static void (*interrupt[NR_IRQS])(void) = {
+void (*interrupt[NR_IRQS])(void) = {
IRQLIST_16(0x0),
#ifdef CONFIG_X86_IO_APIC
@@ -164,17 +117,23 @@ static void (*interrupt[NR_IRQS])(void) = {
#undef IRQ
#undef IRQLIST_16
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ * this file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch independent land
+ */
-
-
-static void enable_8259A_irq(unsigned int irq);
+void enable_8259A_irq(unsigned int irq);
void disable_8259A_irq(unsigned int irq);
/* shutdown is same as "disable" */
#define end_8259A_irq enable_8259A_irq
#define shutdown_8259A_irq disable_8259A_irq
-static void mask_and_ack_8259A(unsigned int);
+void mask_and_ack_8259A(unsigned int);
static unsigned int startup_8259A_irq(unsigned int irq)
{
@@ -207,8 +166,8 @@ static unsigned int cached_irq_mask = 0xffff;
/*
* Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not connected to any IO-APIC pin, it's
- * fed to the CPU IRQ line directly.
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
*
* Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
* this 'mixed mode' IRQ handling costs nothing because it's only used
@@ -224,22 +183,20 @@ void disable_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
cached_irq_mask |= mask;
- if (irq & 8) {
+ if (irq & 8)
outb(cached_A1,0xA1);
- } else {
+ else
outb(cached_21,0x21);
- }
}
-static void enable_8259A_irq(unsigned int irq)
+void enable_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
cached_irq_mask &= mask;
- if (irq & 8) {
+ if (irq & 8)
outb(cached_A1,0xA1);
- } else {
+ else
outb(cached_21,0x21);
- }
}
int i8259A_irq_pending(unsigned int irq)
@@ -260,24 +217,139 @@ void make_8259A_irq(unsigned int irq)
}
/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+ int value;
+ int irqmask = 1<<irq;
+
+ if (irq < 8) {
+ outb(0x0B,0x20); /* ISR register */
+ value = inb(0x20) & irqmask;
+ outb(0x0A,0x20); /* back to the IRR register */
+ return value;
+ }
+ outb(0x0B,0xA0); /* ISR register */
+ value = inb(0xA0) & (irqmask >> 8);
+ outb(0x0A,0xA0); /* back to the IRR register */
+ return value;
+}
+
+/*
* Careful! The 8259A is a fragile beast, it pretty
* much _has_ to be done exactly like this (mask it
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!
*/
-static void mask_and_ack_8259A(unsigned int irq)
+void mask_and_ack_8259A(unsigned int irq)
{
- cached_irq_mask |= 1 << irq;
+ unsigned int irqmask = 1 << irq;
+
+ /*
+ * Lightweight spurious IRQ detection. We do not want
+ * to overdo spurious IRQ handling - it's usually a sign
+ * of hardware problems, so we only do the checks we can
+ * do without slowing down good hardware unnecesserily.
+ *
+ * Note that IRQ7 and IRQ15 (the two spurious IRQs
+ * usually resulting from the 8259A-1|2 PICs) occur
+ * even if the IRQ is masked in the 8259A. Thus we
+ * can check spurious 8259A IRQs without doing the
+ * quite slow i8259A_irq_real() call for every IRQ.
+ * This does not cover 100% of spurious interrupts,
+ * but should be enough to warn the user that there
+ * is something bad going on ...
+ */
+ if (cached_irq_mask & irqmask)
+ goto spurious_8259A_irq;
+ cached_irq_mask |= irqmask;
+
+handle_real_irq:
if (irq & 8) {
- inb(0xA1); /* DUMMY */
+ inb(0xA1); /* DUMMY - (do we need this?) */
outb(cached_A1,0xA1);
- outb(0x62,0x20); /* Specific EOI to cascade */
- outb(0x20,0xA0);
+ outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
+ outb(0x20,0xA0); /* 'generic EOI' to slave */
} else {
- inb(0x21); /* DUMMY */
+ inb(0x21); /* DUMMY - (do we need this?) */
outb(cached_21,0x21);
- outb(0x20,0x20);
+ outb(0x20,0x20); /* 'generic EOI' to master */
}
+ return;
+
+spurious_8259A_irq:
+ /*
+ * this is the slow path - should happen rarely.
+ */
+ if (i8259A_irq_real(irq))
+ /*
+ * oops, the IRQ _is_ in service according to the
+ * 8259A - not spurious, go handle it.
+ */
+ goto handle_real_irq;
+
+ {
+ static int spurious_irq_mask = 0;
+ /*
+ * At this point we can be sure the IRQ is spurious,
+ * lets ACK and report it. [once per IRQ]
+ */
+ if (!(spurious_irq_mask & irqmask)) {
+ printk("spurious 8259A interrupt: IRQ%d.\n", irq);
+ spurious_irq_mask |= irqmask;
+ }
+ irq_err_count++;
+ /*
+ * Theoretically we do not have to handle this IRQ,
+ * but in Linux this does not cause problems and is
+ * simpler for us.
+ */
+ goto handle_real_irq;
+ }
+}
+
+void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-2 */
+
+ /*
+ * outb_p - this has to work on a wide range of PC hardware.
+ */
+ outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
+ outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+ outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
+ if (auto_eoi)
+ outb_p(0x03, 0x21); /* master does Auto EOI */
+ else
+ outb_p(0x01, 0x21); /* master expects normal EOI */
+
+ outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
+ outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+ outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
+ outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
+ is to be investigated) */
+
+ if (auto_eoi)
+ /*
+ * in AEOI mode we just have to mask the interrupt
+ * when acking.
+ */
+ i8259A_irq_type.ack = disable_8259A_irq;
+
+ udelay(100); /* wait for 8259A to initialize */
+
+ outb(cached_21, 0x21); /* restore master IRQ mask */
+ outb(cached_A1, 0xA1); /* restore slave IRQ mask */
+
+ restore_flags(flags);
}
#ifndef CONFIG_VISWS
@@ -307,7 +379,7 @@ static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL };
* IRQ2 is cascade interrupt to second interrupt controller
*/
-static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL};
#endif
@@ -315,6 +387,8 @@ void init_ISA_irqs (void)
{
int i;
+ init_8259A(0);
+
for (i = 0; i < NR_IRQS; i++) {
irq_desc[i].status = IRQ_DISABLED;
irq_desc[i].action = 0;
@@ -357,9 +431,9 @@ void __init init_IRQ(void)
#ifdef __SMP__
/*
- IRQ0 must be given a fixed assignment and initialized
- before init_IRQ_SMP.
- */
+ * IRQ0 must be given a fixed assignment and initialized,
+ * because it's used before the IO-APIC is set up.
+ */
set_intr_gate(IRQ0_TRAP_VECTOR, interrupt[0]);
/*
@@ -371,17 +445,15 @@ void __init init_IRQ(void)
/* IPI for invalidation */
set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
- /* IPI for CPU halt */
- set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt);
-
/* self generated IPI for local APIC timer */
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
- /* IPI vector for APIC spurious interrupts */
+ /* IPI vectors for APIC spurious and error interrupts */
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
#endif
/*
@@ -397,13 +469,3 @@ void __init init_IRQ(void)
setup_irq(13, &irq13);
#endif
}
-
-#ifdef CONFIG_X86_IO_APIC
-void __init init_IRQ_SMP(void)
-{
- int i;
- for (i = 0; i < NR_IRQS ; i++)
- if (IO_APIC_VECTOR(i) > 0)
- set_intr_gate(IO_APIC_VECTOR(i), interrupt[i]);
-}
-#endif
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 34e3ff86f..9fb8bcd3a 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1,7 +1,7 @@
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
- * Copyright (C) 1997, 1998 Ingo Molnar, Hajnalka Szabo
+ * Copyright (C) 1997, 1998, 1999 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
@@ -18,15 +18,21 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <asm/io.h>
+#include <asm/desc.h>
#include <linux/irq.h>
+#undef __init
+#define __init
+
/*
* volatile is justified in this case, IO-APIC register contents
* might change spontaneously, GCC should not cache it
*/
#define IO_APIC_BASE(idx) ((volatile int *)__fix_to_virt(FIX_IO_APIC_BASE_0 + idx))
+extern int nmi_watchdog;
+
/*
* The structure of the IO-APIC:
*/
@@ -59,6 +65,11 @@ int nr_ioapic_registers[MAX_IO_APICS];
enum ioapic_irq_destination_types {
dest_Fixed = 0,
dest_LowestPrio = 1,
+ dest_SMI = 2,
+ dest__reserved_1 = 3,
+ dest_NMI = 4,
+ dest_INIT = 5,
+ dest__reserved_2 = 6,
dest_ExtINT = 7
};
@@ -94,14 +105,7 @@ struct IO_APIC_route_entry {
* MP-BIOS irq configuration table structures:
*/
-enum mp_irq_source_types {
- mp_INT = 0,
- mp_NMI = 1,
- mp_SMI = 2,
- mp_ExtINT = 3
-};
-
-struct mpc_config_ioapic mp_apics[MAX_IO_APICS];/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];/* I/O APIC entries */
int mp_irq_entries = 0; /* # of MP IRQ source entries */
struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* MP IRQ source entries */
@@ -202,16 +206,10 @@ static void name##_IO_APIC_irq(unsigned int irq) \
FINAL; \
}
-/*
- * We disable IO-APIC IRQs by setting their 'destination CPU mask' to
- * zero. Trick by Ramesh Nalluri.
- */
-DO_ACTION( disable, 1, &= 0x00ffffff, io_apic_sync(entry->apic))/* destination = 0x00 */
-DO_ACTION( enable, 1, |= 0xff000000, ) /* destination = 0xff */
DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync(entry->apic))/* mask = 1 */
DO_ACTION( unmask, 0, &= 0xfffeffff, ) /* mask = 0 */
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
@@ -289,7 +287,7 @@ static int __init find_irq_entry(int apic, int pin, int type)
for (i = 0; i < mp_irq_entries; i++)
if ( (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_dstapic == mp_apics[apic].mpc_apicid) &&
+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid) &&
(mp_irqs[i].mpc_dstirq == pin))
return i;
@@ -330,7 +328,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pci_pin)
int lbus = mp_irqs[i].mpc_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_apics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
break;
if ((apic || IO_APIC_IRQ(mp_irqs[i].mpc_dstirq)) &&
@@ -589,24 +587,30 @@ static int __init assign_irq_vector(int irq)
static int current_vector = IRQ0_TRAP_VECTOR, offset = 0;
if (IO_APIC_VECTOR(irq) > 0)
return IO_APIC_VECTOR(irq);
+ if (current_vector == 0xFF)
+ panic("ran out of interrupt sources!");
+next:
current_vector += 8;
- if (current_vector > 0xFE) {
+ if (current_vector == SYSCALL_VECTOR)
+ goto next;
+
+ if (current_vector > 0xFF) {
offset++;
current_vector = IRQ0_TRAP_VECTOR + offset;
- printk("WARNING: ASSIGN_IRQ_VECTOR wrapped back to %02X\n",
- current_vector);
}
- if (current_vector == SYSCALL_VECTOR)
- panic("ran out of interrupt sources!");
IO_APIC_VECTOR(irq) = current_vector;
return current_vector;
}
+extern void (*interrupt[NR_IRQS])(void);
+static struct hw_interrupt_type ioapic_level_irq_type;
+static struct hw_interrupt_type ioapic_edge_irq_type;
+
void __init setup_IO_APIC_irqs(void)
{
struct IO_APIC_route_entry entry;
- int apic, pin, idx, irq, first_notcon = 1;
+ int apic, pin, idx, irq, first_notcon = 1, vector;
printk("init IO_APIC IRQs\n");
@@ -621,15 +625,15 @@ void __init setup_IO_APIC_irqs(void)
entry.delivery_mode = dest_LowestPrio;
entry.dest_mode = 1; /* logical delivery */
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest = 0; /* but no route */
+ entry.dest.logical.logical_dest = APIC_ALL_CPUS; /* all CPUs */
idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
if (first_notcon) {
- printk(" IO-APIC (apicid-pin) %d-%d", mp_apics[apic].mpc_apicid, pin);
+ printk(" IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
first_notcon = 0;
} else
- printk(", %d-%d", mp_apics[apic].mpc_apicid, pin);
+ printk(", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
continue;
}
@@ -639,17 +643,29 @@ void __init setup_IO_APIC_irqs(void)
if (irq_trigger(idx)) {
entry.trigger = 1;
entry.mask = 1;
- entry.dest.logical.logical_dest = 0xff;
+ entry.dest.logical.logical_dest = APIC_ALL_CPUS;
}
- irq = pin_2_irq(idx,apic,pin);
+ irq = pin_2_irq(idx, apic, pin);
add_pin_to_irq(irq, apic, pin);
if (!apic && !IO_APIC_IRQ(irq))
continue;
- entry.vector = assign_irq_vector(irq);
+ if (IO_APIC_IRQ(irq)) {
+ vector = assign_irq_vector(irq);
+ entry.vector = vector;
+
+ if (IO_APIC_irq_trigger(irq))
+ irq_desc[irq].handler = &ioapic_level_irq_type;
+ else
+ irq_desc[irq].handler = &ioapic_edge_irq_type;
+ set_intr_gate(vector, interrupt[irq]);
+
+ if (!apic && (irq < 16))
+ disable_8259A_irq(irq);
+ }
io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
}
@@ -660,34 +676,47 @@ void __init setup_IO_APIC_irqs(void)
}
/*
- * Set up a certain pin as ExtINT delivered interrupt
+ * Set up the 8259A-master output pin as broadcast to all
+ * CPUs.
*/
-void __init setup_ExtINT_pin(unsigned int apic, unsigned int pin, int irq)
+void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
- /*
- * add it to the IO-APIC irq-routing table:
- */
memset(&entry,0,sizeof(entry));
- entry.delivery_mode = dest_ExtINT;
- entry.dest_mode = 0; /* physical delivery */
- entry.mask = 0; /* unmask IRQ now */
- /*
- * We use physical delivery to get the timer IRQ
- * to the boot CPU. 'boot_cpu_id' is the physical
- * APIC ID of the boot CPU.
- */
- entry.dest.physical.physical_dest = boot_cpu_id;
+ disable_8259A_irq(0);
- entry.vector = assign_irq_vector(irq);
+ apic_readaround(APIC_LVT0);
+ apic_write(APIC_LVT0, 0x00010700); // mask LVT0
+ init_8259A(1);
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = 1; /* logical delivery */
+ entry.mask = 0; /* unmask IRQ now */
+ entry.dest.logical.logical_dest = APIC_ALL_CPUS;
+ entry.delivery_mode = dest_LowestPrio;
entry.polarity = 0;
entry.trigger = 0;
+ entry.vector = vector;
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
+ /*
+ * The timer IRQ doesnt have to know that behind the
+ * scene we have a 8259A-master in AEOI mode ...
+ */
+ irq_desc[0].handler = &ioapic_edge_irq_type;
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+
+ enable_8259A_irq(0);
}
void __init UNEXPECTED_IO_APIC(void)
@@ -705,7 +734,7 @@ void __init print_IO_APIC(void)
printk("number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
- printk("number of IO-APIC #%d registers: %d.\n", mp_apics[i].mpc_apicid, nr_ioapic_registers[i]);
+ printk("number of IO-APIC #%d registers: %d.\n", mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -717,8 +746,10 @@ void __init print_IO_APIC(void)
*(int *)&reg_00 = io_apic_read(apic, 0);
*(int *)&reg_01 = io_apic_read(apic, 1);
- *(int *)&reg_02 = io_apic_read(apic, 2);
- printk("\nIO APIC #%d......\n", mp_apics[apic].mpc_apicid);
+ if (reg_01.version >= 0x10)
+ *(int *)&reg_02 = io_apic_read(apic, 2);
+
+ printk("\nIO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
printk(".... register #00: %08X\n", *(int *)&reg_00);
printk("....... : physical APIC id: %02X\n", reg_00.ID);
if (reg_00.__reserved_1 || reg_00.__reserved_2)
@@ -730,12 +761,15 @@ void __init print_IO_APIC(void)
(reg_01.entries != 0x17) && /* typical ISA+PCI boards */
(reg_01.entries != 0x1b) && /* Compaq Proliant boards */
(reg_01.entries != 0x1f) && /* dual Xeon boards */
- (reg_01.entries != 0x3F) /* bigger Xeon boards */
+ (reg_01.entries != 0x22) && /* bigger Xeon boards */
+ (reg_01.entries != 0x2E) &&
+ (reg_01.entries != 0x3F)
)
UNEXPECTED_IO_APIC();
printk("....... : IO APIC version: %04X\n", reg_01.version);
- if ( (reg_01.version != 0x10) && /* oldest IO-APICs */
+ if ( (reg_01.version != 0x01) && /* 82489DX IO-APICs */
+ (reg_01.version != 0x10) && /* oldest IO-APICs */
(reg_01.version != 0x11) && /* Pentium/Pro IO-APICs */
(reg_01.version != 0x13) /* Xeon IO-APICs */
)
@@ -743,10 +777,12 @@ void __init print_IO_APIC(void)
if (reg_01.__reserved_1 || reg_01.__reserved_2)
UNEXPECTED_IO_APIC();
- printk(".... register #02: %08X\n", *(int *)&reg_02);
- printk("....... : arbitration: %02X\n", reg_02.arbitration);
- if (reg_02.__reserved_1 || reg_02.__reserved_2)
- UNEXPECTED_IO_APIC();
+ if (reg_01.version >= 0x10) {
+ printk(".... register #02: %08X\n", *(int *)&reg_02);
+ printk("....... : arbitration: %02X\n", reg_02.arbitration);
+ if (reg_02.__reserved_1 || reg_02.__reserved_2)
+ UNEXPECTED_IO_APIC();
+ }
printk(".... IRQ redirection table:\n");
@@ -797,8 +833,116 @@ void __init print_IO_APIC(void)
return;
}
+static void print_APIC_bitfield (int base)
+{
+ unsigned int v;
+ int i, j;
+
+ printk("0123456789abcdef0123456789abcdef\n");
+ for (i = 0; i < 8; i++) {
+ v = apic_read(base + i*0x10);
+ for (j = 0; j < 32; j++) {
+ if (v & (1<<j))
+ printk("1");
+ else
+ printk("0");
+ }
+ printk("\n");
+ }
+}
+
+void /*__init*/ print_local_APIC(void * dummy)
+{
+ unsigned int v, ver, maxlvt;
+
+ printk("\nprinting local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ printk("... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ v = apic_read(APIC_LVR);
+ printk("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ v = apic_read(APIC_ARBPRI);
+ printk("... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ v = apic_read(APIC_PROCPRI);
+ printk("... APIC PROCPRI: %08x\n", v);
+ }
+
+ v = apic_read(APIC_EOI);
+ printk("... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_LDR);
+ printk("... APIC LDR: %08x\n", v);
+ v = apic_read(APIC_DFR);
+ printk("... APIC DFR: %08x\n", v);
+ v = apic_read(APIC_SPIV);
+ printk("... APIC SPIV: %08x\n", v);
+
+ printk("... APIC ISR field:\n");
+ print_APIC_bitfield(APIC_ISR);
+ printk("... APIC TMR field:\n");
+ print_APIC_bitfield(APIC_TMR);
+ printk("... APIC IRR field:\n");
+ print_APIC_bitfield(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ if (maxlvt > 3) {
+ apic_readaround(APIC_SPIV); // not strictly necessery
+ apic_write(APIC_ESR, 0);
+ }
+ v = apic_read(APIC_ESR);
+ printk("... APIC ESR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_ICR);
+ printk("... APIC ICR: %08x\n", v);
+ v = apic_read(APIC_ICR2);
+ printk("... APIC ICR2: %08x\n", v);
+
+ v = apic_read(APIC_LVTT);
+ printk("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk("... APIC TDCR: %08x\n", v);
+ printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+ smp_call_function(print_local_APIC, NULL, 1, 1);
+ print_local_APIC(NULL);
+}
+
static void __init init_sym_mode(void)
{
+ struct IO_APIC_reg_01 reg_01;
int i;
for (i = 0; i < PIN_MAP_SIZE; i++) {
@@ -809,24 +953,21 @@ static void __init init_sym_mode(void)
for (i = 0; i < MAX_PIRQS; i++)
pirq_entries[i] =- 1;
- printk("enabling symmetric IO mode... ");
-
- outb(0x70, 0x22);
- outb(0x01, 0x23);
-
- printk("...done.\n");
+ if (pic_mode) {
+ /*
+ * PIC mode, enable symmetric IO mode in the IMCR.
+ */
+ printk("leaving PIC mode, enabling symmetric IO mode.\n");
+ outb(0x70, 0x22);
+ outb(0x01, 0x23);
+ }
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
- {
- struct IO_APIC_reg_01 reg_01;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- *(int *)&reg_01 = io_apic_read(i, 1);
- nr_ioapic_registers[i] = reg_01.entries+1;
- }
+ for (i = 0; i < nr_ioapics; i++) {
+ *(int *)&reg_01 = io_apic_read(i, 1);
+ nr_ioapic_registers[i] = reg_01.entries+1;
}
/*
@@ -835,24 +976,41 @@ static void __init init_sym_mode(void)
clear_IO_APIC();
}
+static void clear_lapic_ints (void * dummy)
+{
+ int maxlvt;
+
+ maxlvt = get_maxlvt();
+ apic_write_around(APIC_LVTT, 0x00010000);
+ apic_write_around(APIC_LVT0, 0x00010000);
+ apic_write_around(APIC_LVT1, 0x00010000);
+ if (maxlvt >= 3)
+ apic_write_around(APIC_LVTERR, 0x00010000);
+ if (maxlvt >= 4)
+ apic_write_around(APIC_LVTPC, 0x00010000);
+}
+
/*
* Not an __init, needed by the reboot code
*/
void init_pic_mode(void)
{
/*
- * Clear the IO-APIC before rebooting:
+ * Clear the IO-APIC and local APICs before rebooting:
*/
clear_IO_APIC();
+ smp_call_function(clear_lapic_ints, NULL, 1, 1);
+ clear_lapic_ints(NULL);
/*
* Put it back into PIC mode (has an effect only on
- * certain boards)
+ * certain older boards)
*/
- printk("disabling symmetric IO mode... ");
+ if (pic_mode) {
+ printk("disabling symmetric IO mode, entering PIC mode.\n");
outb_p(0x70, 0x22);
outb_p(0x00, 0x23);
- printk("...done.\n");
+ }
}
static void __init setup_ioapic_id(void)
@@ -914,10 +1072,13 @@ static void __init construct_default_ISA_mptable(void)
* MP specification 1.4 defines some extra rules for default
* configurations, fix them up here:
*/
-
switch (mpc_default_type)
{
case 2:
+ /*
+ * IRQ0 is not connected:
+ */
+ mp_irqs[0].mpc_irqtype = mp_ExtINT;
break;
default:
/*
@@ -942,7 +1103,7 @@ static int __init timer_irq_works(void)
unsigned int t1 = jiffies;
sti();
- mdelay(100);
+ mdelay(40);
if (jiffies-t1>1)
return 1;
@@ -950,6 +1111,27 @@ static int __init timer_irq_works(void)
return 0;
}
+extern atomic_t nmi_counter[NR_CPUS];
+
+static int __init nmi_irq_works(void)
+{
+ atomic_t tmp[NR_CPUS];
+ int j, cpu;
+
+ memcpy(tmp, nmi_counter, sizeof(tmp));
+ sti();
+ mdelay(50);
+
+ for (j = 0; j < smp_num_cpus; j++) {
+ cpu = cpu_logical_map(j);
+ if (atomic_read(nmi_counter+cpu) - atomic_read(tmp+cpu) <= 3) {
+ printk("CPU#%d NMI appears to be stuck.\n", cpu);
+ return 0;
+ }
+ }
+ return 1;
+}
+
/*
* In the SMP+IOAPIC case it might happen that there are an unspecified
* number of pending IRQ events unhandled. These cases are very rare,
@@ -964,12 +1146,11 @@ static int __init timer_irq_works(void)
*/
static void enable_edge_ioapic_irq(unsigned int irq)
{
- enable_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq(irq);
}
static void disable_edge_ioapic_irq(unsigned int irq)
{
- disable_IO_APIC_irq(irq);
}
/*
@@ -995,8 +1176,17 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
}
#define shutdown_edge_ioapic_irq disable_edge_ioapic_irq
-void static ack_edge_ioapic_irq(unsigned int i)
+
+/*
+ * Once we have recorded IRQ_PENDING already, we can mask the
+ * interrupt for real. This prevents IRQ storms from unhandled
+ * devices.
+ */
+void static ack_edge_ioapic_irq(unsigned int irq)
{
+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
+ == (IRQ_PENDING | IRQ_DISABLED))
+ mask_IO_APIC_irq(irq);
ack_APIC_irq();
}
void static end_edge_ioapic_irq(unsigned int i){}
@@ -1055,7 +1245,8 @@ static struct hw_interrupt_type ioapic_level_irq_type = {
static inline void init_IO_APIC_traps(void)
{
- int i;
+ int irq;
+
/*
* NOTE! The local APIC isn't very good at handling
* multiple interrupts at the same interrupt level.
@@ -1067,36 +1258,62 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for (i = 0; i < NR_IRQS ; i++) {
- if (IO_APIC_VECTOR(i) > 0) {
- if (IO_APIC_irq_trigger(i))
- irq_desc[i].handler = &ioapic_level_irq_type;
- else
- irq_desc[i].handler = &ioapic_edge_irq_type;
- /*
- * disable it in the 8259A:
- */
- if (i < 16)
- disable_8259A_irq(i);
- } else {
- if (!IO_APIC_IRQ(i))
- continue;
-
+ for (irq = 0; irq < NR_IRQS ; irq++) {
+ if (IO_APIC_IRQ(irq) && !IO_APIC_VECTOR(irq)) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (i < 16) {
- make_8259A_irq(i);
- continue;
- }
-
- /* Strange. Oh, well.. */
- irq_desc[i].handler = &no_irq_type;
+ if (irq < 16)
+ make_8259A_irq(irq);
+ else
+ /* Strange. Oh, well.. */
+ irq_desc[irq].handler = &no_irq_type;
}
}
- init_IRQ_SMP();
+}
+
+void static ack_lapic_irq (unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+void static end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type = {
+ "local-APIC-edge",
+ NULL, /* startup_irq() not used for IRQ0 */
+ NULL, /* shutdown_irq() not used for IRQ0 */
+ NULL, /* enable_irq() not used for IRQ0 */
+ NULL, /* disable_irq() not used for IRQ0 */
+ ack_lapic_irq,
+ end_lapic_irq
+};
+
+static void enable_NMI_through_LVT0 (void * dummy)
+{
+ apic_readaround(APIC_LVT0);
+ apic_write(APIC_LVT0, 0x00000400); // unmask and set to NMI
+}
+
+static void setup_nmi (void)
+{
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+ * We put the 8259A master into AEOI mode and
+ * unmask on all local APICs LVT0 as NMI.
+ *
+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+ * is from Maciej W. Rozycki - so we do not have to EOI from
+ * the NMI handler or the timer interrupt.
+ */
+ printk("activating NMI Watchdog ...");
+
+ smp_call_function(enable_NMI_through_LVT0, NULL, 1, 1);
+ enable_NMI_through_LVT0(NULL);
+
+ printk(" done.\n");
}
/*
@@ -1108,45 +1325,78 @@ static inline void init_IO_APIC_traps(void)
static inline void check_timer(void)
{
int pin1, pin2;
+ int vector;
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ vector = assign_irq_vector(0);
+ set_intr_gate(vector, interrupt[0]);
pin1 = find_timer_pin(mp_INT);
pin2 = find_timer_pin(mp_ExtINT);
- enable_IO_APIC_irq(0);
- if (!timer_irq_works()) {
- if (pin1 != -1)
- printk("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- printk("...trying to set up timer as ExtINT... ");
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ if (timer_irq_works()) {
+ if (nmi_watchdog) {
+ disable_8259A_irq(0);
+ init_8259A(1);
+ setup_nmi();
+ enable_8259A_irq(0);
+ if (nmi_irq_works())
+ return;
+ } else
+ return;
+ }
- if (pin2 != -1) {
- printk(".. (found pin %d) ...", pin2);
- /*
- * legacy devices should be connected to IO APIC #0
- */
- setup_ExtINT_pin(0, pin2, 0);
- make_8259A_irq(0);
+ if (pin1 != -1) {
+ printk("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
+ clear_IO_APIC_pin(0, pin1);
+ }
+
+ printk("...trying to set up timer (IRQ0) through the 8259A ... ");
+ if (pin2 != -1) {
+ printk("\n..... (found pin %d) ...", pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ setup_ExtINT_IRQ0_pin(pin2, vector);
+ if (timer_irq_works()) {
+ printk("works.\n");
+ if (nmi_watchdog) {
+ setup_nmi();
+ if (nmi_irq_works())
+ return;
+ } else
+ return;
}
+ /*
+ * Cleanup, just in case ...
+ */
+ clear_IO_APIC_pin(0, pin2);
+ }
+ printk(" failed.\n");
- if (!timer_irq_works()) {
- printk(" failed.\n");
- printk("...trying to set up timer as BP IRQ...");
- /*
- * Just in case ...
- */
- if (pin1 != -1)
- clear_IO_APIC_pin(0, pin1);
- if (pin2 != -1)
- clear_IO_APIC_pin(0, pin2);
+ if (nmi_watchdog)
+ printk("timer doesnt work through the IO-APIC - cannot activate NMI Watchdog!\n");
- make_8259A_irq(0);
+ printk("...trying to set up timer as Virtual Wire IRQ...");
- if (!timer_irq_works()) {
- printk(" failed.\n");
- panic("IO-APIC + timer doesn't work!");
- }
- }
+ disable_8259A_irq(0);
+ irq_desc[0].handler = &lapic_irq_type;
+ init_8259A(1); // AEOI mode
+ apic_readaround(APIC_LVT0);
+ apic_write(APIC_LVT0, 0x00000000 | vector); // Fixed mode
+ enable_8259A_irq(0);
+
+ if (timer_irq_works()) {
printk(" works.\n");
+ return;
}
+ printk(" failed :(.\n");
+ panic("IO-APIC + timer doesn't work! pester mingo@redhat.com");
}
/*
@@ -1189,6 +1439,5 @@ void __init setup_IO_APIC(void)
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
-
print_IO_APIC();
}
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3106f1966..8ec329287 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -22,7 +22,6 @@
#include <linux/ptrace.h>
#include <linux/errno.h>
-#include <linux/kernel_stat.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
@@ -30,14 +29,13 @@
#include <linux/timex.h>
#include <linux/malloc.h>
#include <linux/random.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
+#include <linux/kernel_stat.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/bitops.h>
-#include <asm/smp.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
@@ -48,7 +46,7 @@
unsigned int local_bh_count[NR_CPUS];
unsigned int local_irq_count[NR_CPUS];
-atomic_t nmi_counter;
+extern atomic_t nmi_counter[NR_CPUS];
/*
* Linux has a controller-independent x86 interrupt architecture.
@@ -75,7 +73,8 @@ spinlock_t irq_controller_lock = SPIN_LOCK_UNLOCKED;
/*
* Controller mappings for all interrupt sources:
*/
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }};
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
+ { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }};
/*
* Special irq handlers.
@@ -84,6 +83,52 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &n
void no_action(int cpl, void *dev_id, struct pt_regs *regs) { }
/*
+ * Generic no controller code
+ */
+
+static void enable_none(unsigned int irq) { }
+static unsigned int startup_none(unsigned int irq) { return 0; }
+static void disable_none(unsigned int irq) { }
+static void ack_none(unsigned int irq)
+{
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves, it doesnt deserve
+ * a generic callback i think.
+ */
+#if CONFIG_X86
+ printk("unexpected IRQ trap at vector %02x\n", irq);
+#ifdef __SMP__
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ */
+ ack_APIC_irq();
+#endif
+#endif
+}
+
+/* startup is the same as "enable", shutdown is same as "disable" */
+#define shutdown_none disable_none
+#define end_none enable_none
+
+struct hw_interrupt_type no_irq_type = {
+ "none",
+ startup_none,
+ shutdown_none,
+ enable_none,
+ disable_none,
+ ack_none,
+ end_none
+};
+
+volatile unsigned long irq_err_count;
+
+/*
* Generic, controller-independent functions:
*/
@@ -106,22 +151,30 @@ int get_irq_list(char *buf)
#ifndef __SMP__
p += sprintf(p, "%10u ", kstat_irqs(i));
#else
- for (j=0; j<smp_num_cpus; j++)
+ for (j = 0; j < smp_num_cpus; j++)
p += sprintf(p, "%10u ",
kstat.irqs[cpu_logical_map(j)][i]);
#endif
p += sprintf(p, " %14s", irq_desc[i].handler->typename);
p += sprintf(p, " %s", action->name);
- for (action=action->next; action; action = action->next) {
+ for (action=action->next; action; action = action->next)
p += sprintf(p, ", %s", action->name);
- }
*p++ = '\n';
}
- p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
-#ifdef __SMP__
- p += sprintf(p, "ERR: %10lu\n", ipi_count);
-#endif
+ p += sprintf(p, "NMI: ");
+ for (j = 0; j < smp_num_cpus; j++)
+ p += sprintf(p, "%10u ",
+ atomic_read(nmi_counter+cpu_logical_map(j)));
+ p += sprintf(p, "\n");
+#if CONFIG_SMP
+ p += sprintf(p, "LOC: ");
+ for (j = 0; j < smp_num_cpus; j++)
+ p += sprintf(p, "%10u ",
+ apic_timer_irqs[cpu_logical_map(j)]);
+ p += sprintf(p, "\n");
+#endif
+ p += sprintf(p, "ERR: %10lu\n", irq_err_count);
return p - buf;
}
@@ -520,7 +573,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
kstat.irqs[cpu][irq]++;
desc = irq_desc + irq;
spin_lock(&irq_controller_lock);
- irq_desc[irq].handler->ack(irq);
+ desc->handler->ack(irq);
/*
REPLAY is when Linux resends an IRQ that was dropped earlier
WAITING is used by probe to mark irqs that are being tested
@@ -570,9 +623,8 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
spin_unlock(&irq_controller_lock);
}
desc->status &= ~IRQ_INPROGRESS;
- if (!(desc->status & IRQ_DISABLED)){
- irq_desc[irq].handler->end(irq);
- }
+ if (!(desc->status & IRQ_DISABLED))
+ desc->handler->end(irq);
spin_unlock(&irq_controller_lock);
/*
diff --git a/arch/i386/kernel/mtrr.c b/arch/i386/kernel/mtrr.c
index f76c68f59..f55e86b61 100644
--- a/arch/i386/kernel/mtrr.c
+++ b/arch/i386/kernel/mtrr.c
@@ -223,6 +223,8 @@
19990819 Alan Cox <alan@redhat.com>
Tested Zoltan's changes on a pre production Athlon - 100%
success.
+ 19991008 Manfred Spraul <manfreds@colorfullife.com>
+ replaced spin_lock_reschedule() with a normal semaphore.
*/
#include <linux/types.h>
#include <linux/errno.h>
@@ -303,8 +305,6 @@ typedef u8 mtrr_type;
TRUE)
#endif
-#define spin_lock_reschedule(lock) while (!spin_trylock(lock)) schedule ();
-
#ifndef CONFIG_PROC_FS
# define compute_ascii() while (0)
#endif
@@ -314,7 +314,7 @@ static char *ascii_buffer = NULL;
static unsigned int ascii_buf_bytes = 0;
#endif
static unsigned int *usage_table = NULL;
-static spinlock_t main_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_MUTEX(main_lock);
/* Private functions */
#ifdef CONFIG_PROC_FS
@@ -1172,7 +1172,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
increment = increment ? 1 : 0;
max = get_num_var_ranges ();
/* Search for existing MTRR */
- spin_lock_reschedule (&main_lock);
+ down(&main_lock);
for (i = 0; i < max; ++i)
{
(*get_mtrr) (i, &lbase, &lsize, &ltype);
@@ -1181,7 +1181,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
/* At this point we know there is some kind of overlap/enclosure */
if ( (base < lbase) || (base + size > lbase + lsize) )
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: 0x%lx,0x%lx overlaps existing 0x%lx,0x%lx\n",
base, size, lbase, lsize);
return -EINVAL;
@@ -1190,14 +1190,14 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
if (ltype != type)
{
if (type == MTRR_TYPE_UNCACHABLE) continue;
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ( "mtrr: type mismatch for %lx,%lx old: %s new: %s\n",
base, size, attrib_to_str (ltype), attrib_to_str (type) );
return -EINVAL;
}
if (increment) ++usage_table[i];
compute_ascii ();
- spin_unlock (&main_lock);
+ up(&main_lock);
return i;
}
/* Search for an empty MTRR */
@@ -1211,7 +1211,7 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
set_mtrr (i, base, size, type);
usage_table[i] = 1;
compute_ascii ();
- spin_unlock (&main_lock);
+ up(&main_lock);
return i;
} /* End Function mtrr_add */
@@ -1232,7 +1232,7 @@ int mtrr_del (int reg, unsigned long base, unsigned long size)
if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return -ENODEV;
max = get_num_var_ranges ();
- spin_lock_reschedule (&main_lock);
+ down(&main_lock);
if (reg < 0)
{
/* Search for existing MTRR */
@@ -1247,14 +1247,14 @@ int mtrr_del (int reg, unsigned long base, unsigned long size)
}
if (reg < 0)
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: no MTRR for %lx,%lx found\n", base, size);
return -EINVAL;
}
}
if (reg >= max)
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: register: %d too big\n", reg);
return -EINVAL;
}
@@ -1262,7 +1262,7 @@ int mtrr_del (int reg, unsigned long base, unsigned long size)
{
if ((reg == 3) && arr3_protected)
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: ARR3 cannot be changed\n");
return -EINVAL;
}
@@ -1270,19 +1270,19 @@ int mtrr_del (int reg, unsigned long base, unsigned long size)
(*get_mtrr) (reg, &lbase, &lsize, &ltype);
if (lsize < 1)
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: MTRR %d not used\n", reg);
return -EINVAL;
}
if (usage_table[reg] < 1)
{
- spin_unlock (&main_lock);
+ up(&main_lock);
printk ("mtrr: reg: %d has count=0\n", reg);
return -EINVAL;
}
if (--usage_table[reg] < 1) set_mtrr (reg, 0, 0, 0);
compute_ascii ();
- spin_unlock (&main_lock);
+ up(&main_lock);
return reg;
} /* End Function mtrr_del */
diff --git a/arch/i386/kernel/pci-i386.c b/arch/i386/kernel/pci-i386.c
new file mode 100644
index 000000000..af362611d
--- /dev/null
+++ b/arch/i386/kernel/pci-i386.c
@@ -0,0 +1,312 @@
+/*
+ * Low-Level PCI Access for i386 machines
+ *
+ * Copyright 1993, 1994 Drew Eckhardt
+ * Visionary Computing
+ * (Unix and Linux consulting and custom programming)
+ * Drew@Colorado.EDU
+ * +1 (303) 786-7975
+ *
+ * Drew's work was sponsored by:
+ * iX Multiuser Multitasking Magazine
+ * Hannover, Germany
+ * hm@ix.de
+ *
+ * Copyright 1997--1999 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * For more information, please consult the following manuals (look at
+ * http://www.pcisig.com/ for how to get them):
+ *
+ * PCI BIOS Specification
+ * PCI Local Bus Specification
+ * PCI to PCI Bridge Specification
+ * PCI System Design Guide
+ *
+ *
+ * CHANGELOG :
+ * Jun 17, 1994 : Modified to accommodate the broken pre-PCI BIOS SPECIFICATION
+ * Revision 2.0 present on <thys@dennis.ee.up.ac.za>'s ASUS mainboard.
+ *
+ * Jan 5, 1995 : Modified to probe PCI hardware at boot time by Frederic
+ * Potter, potter@cao-vlsi.ibp.fr
+ *
+ * Jan 10, 1995 : Modified to store the information about configured pci
+ * devices into a list, which can be accessed via /proc/pci by
+ * Curtis Varner, cvarner@cs.ucr.edu
+ *
+ * Jan 12, 1995 : CPU-PCI bridge optimization support by Frederic Potter.
+ * Alpha version. Intel & UMC chipset support only.
+ *
+ * Apr 16, 1995 : Source merge with the DEC Alpha PCI support. Most of the code
+ * moved to drivers/pci/pci.c.
+ *
+ * Dec 7, 1996 : Added support for direct configuration access of boards
+ * with Intel compatible access schemes (tsbogend@alpha.franken.de)
+ *
+ * Feb 3, 1997 : Set internal functions to static, save/restore flags
+ * avoid dead locks reading broken PCI BIOS, werner@suse.de
+ *
+ * Apr 26, 1997 : Fixed case when there is BIOS32, but not PCI BIOS
+ * (mj@atrey.karlin.mff.cuni.cz)
+ *
+ * May 7, 1997 : Added some missing cli()'s. [mj]
+ *
+ * Jun 20, 1997 : Corrected problems in "conf1" type accesses.
+ * (paubert@iram.es)
+ *
+ * Aug 2, 1997 : Split to PCI BIOS handling and direct PCI access parts
+ * and cleaned it up... Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Feb 6, 1998 : No longer using BIOS to find devices and device classes. [mj]
+ *
+ * May 1, 1998 : Support for peer host bridges. [mj]
+ *
+ * Jun 19, 1998 : Changed to use spinlocks, so that PCI configuration space
+ * can be accessed from interrupts even on SMP systems. [mj]
+ *
+ * August 1998 : Better support for peer host bridges and more paranoid
+ * checks for direct hardware access. Ugh, this file starts to look as
+ * a large gallery of common hardware bug workarounds (watch the comments)
+ * -- the PCI specs themselves are sane, but most implementors should be
+ * hit hard with \hammer scaled \magstep5. [mj]
+ *
+ * Jan 23, 1999 : More improvements to peer host bridge logic. i450NX fixup. [mj]
+ *
+ * Feb 8, 1999 : Added UM8886BF I/O address fixup. [mj]
+ *
+ * August 1999 : New resource management and configuration access stuff. [mj]
+ *
+ * Sep 19, 1999 : Use PCI IRQ routing tables for detection of peer host bridges.
+ * Based on ideas by Chris Frantz and David Hinds. [mj]
+ *
+ * Sep 28, 1999 : Handle unreported/unassigned IRQs. Thanks to Shuu Yamaguchi
+ * for a lot of patience during testing. [mj]
+ *
+ * Oct 8, 1999 : Split to pci-i386.c, pci-pc.c and pci-visws.c. [mj]
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/errno.h>
+
+#include "pci-i386.h"
+
+/*
+ * Assign new address to PCI resource. We hope our resource information
+ * is complete. On the PC, we don't re-assign resources unless we are
+ * forced to do so.
+ *
+ * Expects start=0, end=size-1, flags=resource type.
+ */
+
+static int __init pcibios_assign_resource(struct pci_dev *dev, int i)
+{
+ struct resource *r = &dev->resource[i];
+ struct resource *pr = pci_find_parent_resource(dev, r);
+ unsigned long size = r->end + 1;
+ u32 new, check;
+
+ if (!pr) {
+ printk(KERN_ERR "PCI: Cannot find parent resource for device %s\n", dev->slot_name);
+ return -EINVAL;
+ }
+ if (r->flags & IORESOURCE_IO) {
+ /*
+ * We need to avoid collisions with `mirrored' VGA ports and other strange
+ * ISA hardware, so we always want the addresses kilobyte aligned.
+ */
+ if (size > 0x100) {
+ printk(KERN_ERR "PCI: I/O Region %s/%d too large (%ld bytes)\n", dev->slot_name, i, size);
+ return -EFBIG;
+ }
+ if (allocate_resource(pr, r, size, 0x1000, ~0, 1024)) {
+ printk(KERN_ERR "PCI: Allocation of I/O region %s/%d (%ld bytes) failed\n", dev->slot_name, i, size);
+ return -EBUSY;
+ }
+ } else {
+ if (allocate_resource(pr, r, size, 0x10000000, ~0, size)) {
+ printk(KERN_ERR "PCI: Allocation of memory region %s/%d (%ld bytes) failed\n", dev->slot_name, i, size);
+ return -EBUSY;
+ }
+ }
+ if (i < 6) {
+ int reg = PCI_BASE_ADDRESS_0 + 4*i;
+ new = r->start | (r->flags & PCI_REGION_FLAG_MASK);
+ pci_write_config_dword(dev, reg, new);
+ pci_read_config_dword(dev, reg, &check);
+ if (new != check)
+ printk(KERN_ERR "PCI: Error while updating region %s/%d (%08x != %08x)\n", dev->slot_name, i, new, check);
+ } else if (i == PCI_ROM_RESOURCE) {
+ r->flags |= PCI_ROM_ADDRESS_ENABLE;
+ pci_write_config_dword(dev, dev->rom_base_reg, r->start | (r->flags & PCI_REGION_FLAG_MASK));
+ }
+ printk("PCI: Assigned addresses %08lx-%08lx to region %s/%d\n", r->start, r->end, dev->slot_name, i);
+ return 0;
+}
+
+/*
+ * Handle resources of PCI devices. If the world were perfect, we could
+ * just allocate all the resource regions and do nothing more. It isn't.
+ * On the other hand, we cannot just re-allocate all devices, as it would
+ * require us to know lots of host bridge internals. So we attempt to
+ * keep as much of the original configuration as possible, but tweak it
+ * when it's found to be wrong.
+ *
+ * Known BIOS problems we have to work around:
+ * - I/O or memory regions not configured
+ * - regions configured, but not enabled in the command register
+ * - bogus I/O addresses above 64K used
+ * - expansion ROMs left enabled (this may sound harmless, but given
+ * the fact the PCI specs explicitly allow address decoders to be
+ * shared between expansion ROMs and other resource regions, it's
+ * at least dangerous)
+ *
+ * Our solution:
+ * (1) Allocate resources for all buses behind PCI-to-PCI bridges.
+ * This gives us fixed barriers on where we can allocate.
+ * (2) Allocate resources for all enabled devices. If there is
+ * a collision, just mark the resource as unallocated. Also
+ * disable expansion ROMs during this step.
+ * (3) Try to allocate resources for disabled devices. If the
+ * resources were assigned correctly, everything goes well,
+ * if they weren't, they won't disturb allocation of other
+ * resources.
+ * (4) Assign new addresses to resources which were either
+ * not configured at all or misconfigured. If explicitly
+ * requested by the user, configure expansion ROM address
+ * as well. Finally enable the I/O and Memory bits.
+ */
+
+static void __init pcibios_allocate_bus_resources(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+ int idx;
+ struct resource *r, *pr;
+
+ /* Depth-First Search on bus tree */
+ while (bus) {
+ if ((dev = bus->self)) {
+ for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
+ r = &dev->resource[idx];
+ if (!r->start)
+ continue;
+ pr = pci_find_parent_resource(dev, r);
+ if (!pr || request_resource(pr, r) < 0)
+ printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, dev->slot_name);
+ }
+ }
+ if (bus->children)
+ pcibios_allocate_bus_resources(bus->children);
+ bus = bus->next;
+ }
+}
+
+static void __init pcibios_allocate_resources(int pass)
+{
+ struct pci_dev *dev;
+ int idx, disabled;
+ u16 command;
+ struct resource *r, *pr;
+
+ for(dev=pci_devices; dev; dev=dev->next) {
+ pci_read_config_word(dev, PCI_COMMAND, &command);
+ for(idx = 0; idx < 6; idx++) {
+ r = &dev->resource[idx];
+ if (r->parent) /* Already allocated */
+ continue;
+ if (!r->start) /* Address not assigned at all */
+ continue;
+ if (r->flags & IORESOURCE_IO)
+ disabled = !(command & PCI_COMMAND_IO);
+ else
+ disabled = !(command & PCI_COMMAND_MEMORY);
+ if (pass == disabled) {
+ DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n",
+ r->start, r->end, r->flags, disabled, pass);
+ pr = pci_find_parent_resource(dev, r);
+ if (!pr || request_resource(pr, r) < 0) {
+ printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, dev->slot_name);
+ /* We'll assign a new address later */
+ r->start -= r->end;
+ r->start = 0;
+ }
+ }
+ }
+ if (!pass) {
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ if (r->flags & PCI_ROM_ADDRESS_ENABLE) {
+ /* Turn the ROM off, leave the resource region, but keep it unregistered. */
+ u32 reg;
+ DBG("PCI: Switching off ROM of %s\n", dev->slot_name);
+ r->flags &= ~PCI_ROM_ADDRESS_ENABLE;
+ pci_read_config_dword(dev, dev->rom_base_reg, &reg);
+ pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE);
+ }
+ }
+ }
+}
+
+static void __init pcibios_assign_resources(void)
+{
+ struct pci_dev *dev;
+ u16 cmd, old_cmd;
+ int idx;
+ int fault = 0;
+ struct resource *r;
+
+ for(dev=pci_devices; dev; dev=dev->next) {
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ old_cmd = cmd;
+ for(idx=0; idx<6; idx++) {
+ r = &dev->resource[idx];
+ if (((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && idx < 4) ||
+ ((dev->class >> 8) == PCI_CLASS_DISPLAY_VGA && (r->flags & IORESOURCE_IO)))
+ /*
+ * Don't touch IDE controllers and I/O ports of video cards!
+ * Neither enable anything in their command registers.
+ */
+ continue;
+ if (!r->start && r->end) {
+ /*
+ * We shall assign a new address to this resource, either because
+ * the BIOS forgot to do so or because we have decided the old
+ * address was unusable for some reason.
+ */
+ if (pcibios_assign_resource(dev, idx) < 0)
+ fault = 1;
+ }
+ if (r->flags & IORESOURCE_IO)
+ cmd |= PCI_COMMAND_IO;
+ if (r->flags & IORESOURCE_MEM)
+ cmd |= PCI_COMMAND_MEMORY;
+ }
+
+ if (cmd != old_cmd) {
+ if (fault)
+ printk("PCI: Not enabling device %s because of resource collisions\n", dev->slot_name);
+ else {
+ printk("PCI: Enabling device %s (%04x -> %04x)\n", dev->slot_name, old_cmd, cmd);
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
+ }
+ }
+
+ if (pci_probe & PCI_ASSIGN_ROMS) {
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ r->end -= r->start;
+ r->start = 0;
+ if (r->end)
+ pcibios_assign_resource(dev, PCI_ROM_RESOURCE);
+ }
+ }
+}
+
+void __init pcibios_resource_survey(void)
+{
+ pcibios_allocate_bus_resources(pci_root);
+ pcibios_allocate_resources(0);
+ pcibios_allocate_resources(1);
+ pcibios_assign_resources();
+}
diff --git a/arch/i386/kernel/pci-i386.h b/arch/i386/kernel/pci-i386.h
new file mode 100644
index 000000000..41ac2b856
--- /dev/null
+++ b/arch/i386/kernel/pci-i386.h
@@ -0,0 +1,29 @@
+/*
+ * Low-Level PCI Access for i386 machines.
+ *
+ * (c) 1999 Martin Mares <mj@ucw.cz>
+ */
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+#define PCI_PROBE_BIOS 1
+#define PCI_PROBE_CONF1 2
+#define PCI_PROBE_CONF2 4
+#define PCI_NO_SORT 0x100
+#define PCI_BIOS_SORT 0x200
+#define PCI_NO_CHECKS 0x400
+#define PCI_NO_PEER_FIXUP 0x800
+#define PCI_ASSIGN_ROMS 0x1000
+#define PCI_NO_IRQ_SCAN 0x2000
+
+extern unsigned int pci_probe;
+
+/* pci-i386.c */
+
+void pcibios_resource_survey(void);
diff --git a/arch/i386/kernel/bios32.c b/arch/i386/kernel/pci-pc.c
index f0c63c938..be3076f30 100644
--- a/arch/i386/kernel/bios32.c
+++ b/arch/i386/kernel/pci-pc.c
@@ -1,119 +1,53 @@
/*
- * bios32.c - Low-Level PCI Access
+ * Low-Level PCI Support for PC
*
- * $Id: bios32.c,v 1.48 1998/09/26 08:06:55 mj Exp $
- *
- * Copyright 1993, 1994 Drew Eckhardt
- * Visionary Computing
- * (Unix and Linux consulting and custom programming)
- * Drew@Colorado.EDU
- * +1 (303) 786-7975
- *
- * Drew's work was sponsored by:
- * iX Multiuser Multitasking Magazine
- * Hannover, Germany
- * hm@ix.de
- *
- * Copyright 1997--1999 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- *
- * For more information, please consult the following manuals (look at
- * http://www.pcisig.com/ for how to get them):
- *
- * PCI BIOS Specification
- * PCI Local Bus Specification
- * PCI to PCI Bridge Specification
- * PCI System Design Guide
- *
- *
- * CHANGELOG :
- * Jun 17, 1994 : Modified to accommodate the broken pre-PCI BIOS SPECIFICATION
- * Revision 2.0 present on <thys@dennis.ee.up.ac.za>'s ASUS mainboard.
- *
- * Jan 5, 1995 : Modified to probe PCI hardware at boot time by Frederic
- * Potter, potter@cao-vlsi.ibp.fr
- *
- * Jan 10, 1995 : Modified to store the information about configured pci
- * devices into a list, which can be accessed via /proc/pci by
- * Curtis Varner, cvarner@cs.ucr.edu
- *
- * Jan 12, 1995 : CPU-PCI bridge optimization support by Frederic Potter.
- * Alpha version. Intel & UMC chipset support only.
- *
- * Apr 16, 1995 : Source merge with the DEC Alpha PCI support. Most of the code
- * moved to drivers/pci/pci.c.
- *
- * Dec 7, 1996 : Added support for direct configuration access of boards
- * with Intel compatible access schemes (tsbogend@alpha.franken.de)
- *
- * Feb 3, 1997 : Set internal functions to static, save/restore flags
- * avoid dead locks reading broken PCI BIOS, werner@suse.de
- *
- * Apr 26, 1997 : Fixed case when there is BIOS32, but not PCI BIOS
- * (mj@atrey.karlin.mff.cuni.cz)
- *
- * May 7, 1997 : Added some missing cli()'s. [mj]
- *
- * Jun 20, 1997 : Corrected problems in "conf1" type accesses.
- * (paubert@iram.es)
- *
- * Aug 2, 1997 : Split to PCI BIOS handling and direct PCI access parts
- * and cleaned it up... Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- *
- * Feb 6, 1998 : No longer using BIOS to find devices and device classes. [mj]
- *
- * May 1, 1998 : Support for peer host bridges. [mj]
- *
- * Jun 19, 1998 : Changed to use spinlocks, so that PCI configuration space
- * can be accessed from interrupts even on SMP systems. [mj]
- *
- * August 1998 : Better support for peer host bridges and more paranoid
- * checks for direct hardware access. Ugh, this file starts to look as
- * a large gallery of common hardware bug workarounds (watch the comments)
- * -- the PCI specs themselves are sane, but most implementors should be
- * hit hard with \hammer scaled \magstep5. [mj]
- *
- * Jan 23, 1999 : More improvements to peer host bridge logic. i450NX fixup. [mj]
- *
- * Feb 8, 1999 : Added UM8886BF I/O address fixup. [mj]
- *
- * August 1999 : New resource management and configuration access stuff. [mj]
+ * (c) 1999 Martin Mares <mj@ucw.cz>
*/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/pci.h>
#include <linux/init.h>
-#include <linux/ioport.h>
#include <linux/malloc.h>
-#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
#include <linux/irq.h>
-#include <linux/spinlock.h>
-#include <asm/page.h>
#include <asm/segment.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/smp.h>
-#undef DEBUG
+#include "pci-i386.h"
-#ifdef DEBUG
-#define DBG(x...) printk(x)
-#else
-#define DBG(x...)
-#endif
+unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2;
-#define PCI_PROBE_BIOS 1
-#define PCI_PROBE_CONF1 2
-#define PCI_PROBE_CONF2 4
-#define PCI_NO_SORT 0x100
-#define PCI_BIOS_SORT 0x200
-#define PCI_NO_CHECKS 0x400
-#define PCI_NO_PEER_FIXUP 0x800
-#define PCI_ASSIGN_ROMS 0x1000
+/*
+ * IRQ routing table provided by the BIOS
+ */
-static unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2;
+struct irq_info {
+ u8 bus, devfn; /* Bus, device and function */
+ struct {
+ u8 link; /* IRQ line ID, chipset dependent, 0=not routed */
+ u16 bitmap; /* Available IRQs */
+ } __attribute__((packed)) irq[4];
+ u8 slot; /* Slot number, 0=onboard */
+ u8 rfu;
+} __attribute__((packed));
+
+struct irq_routing_table {
+ u32 signature; /* PIRQ_SIGNATURE should be here */
+ u16 version; /* PIRQ_VERSION */
+ u16 size; /* Table size in bytes */
+ u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */
+ u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */
+ u32 miniport_data; /* Crap */
+ u8 rfu[11];
+ u8 checksum; /* Modulo 256 checksum must give zero */
+ struct irq_info slots[0];
+} __attribute__((packed));
/*
* Direct access to PCI hardware...
@@ -129,55 +63,55 @@ static unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CON
static int pci_conf1_read_config_byte(struct pci_dev *dev, int where, u8 *value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- *value = inb(0xCFC + (where&3));
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inb(0xCFC + (where&3));
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf1_read_config_word(struct pci_dev *dev, int where, u16 *value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- *value = inw(0xCFC + (where&2));
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inw(0xCFC + (where&2));
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf1_read_config_dword(struct pci_dev *dev, int where, u32 *value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- *value = inl(0xCFC);
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inl(0xCFC);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf1_write_config_byte(struct pci_dev *dev, int where, u8 value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- outb(value, 0xCFC + (where&3));
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outb(value, 0xCFC + (where&3));
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf1_write_config_word(struct pci_dev *dev, int where, u16 value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- outw(value, 0xCFC + (where&2));
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outw(value, 0xCFC + (where&2));
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf1_write_config_dword(struct pci_dev *dev, int where, u32 value)
{
- outl(CONFIG_CMD(dev,where), 0xCF8);
- outl(value, 0xCFC);
- return PCIBIOS_SUCCESSFUL;
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outl(value, 0xCFC);
+ return PCIBIOS_SUCCESSFUL;
}
#undef CONFIG_CMD
static struct pci_ops pci_direct_conf1 = {
- pci_conf1_read_config_byte,
- pci_conf1_read_config_word,
- pci_conf1_read_config_dword,
- pci_conf1_write_config_byte,
- pci_conf1_write_config_word,
- pci_conf1_write_config_dword
+ pci_conf1_read_config_byte,
+ pci_conf1_read_config_word,
+ pci_conf1_read_config_dword,
+ pci_conf1_write_config_byte,
+ pci_conf1_write_config_word,
+ pci_conf1_write_config_dword
};
/*
@@ -192,50 +126,50 @@ static struct pci_ops pci_direct_conf1 = {
static int pci_conf2_read_config_byte(struct pci_dev *dev, int where, u8 *value)
{
- SET(dev);
- *value = inb(IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ *value = inb(IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf2_read_config_word(struct pci_dev *dev, int where, u16 *value)
{
- SET(dev);
- *value = inw(IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ *value = inw(IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf2_read_config_dword(struct pci_dev *dev, int where, u32 *value)
{
- SET(dev);
- *value = inl (IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ *value = inl (IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf2_write_config_byte(struct pci_dev *dev, int where, u8 value)
{
- SET(dev);
- outb (value, IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ outb (value, IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf2_write_config_word(struct pci_dev *dev, int where, u16 value)
{
- SET(dev);
- outw (value, IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ outw (value, IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
static int pci_conf2_write_config_dword(struct pci_dev *dev, int where, u32 value)
{
- SET(dev);
- outl (value, IOADDR(dev->devfn,where));
- outb (0, 0xCF8);
- return PCIBIOS_SUCCESSFUL;
+ SET(dev);
+ outl (value, IOADDR(dev->devfn,where));
+ outb (0, 0xCF8);
+ return PCIBIOS_SUCCESSFUL;
}
#undef SET
@@ -243,12 +177,12 @@ static int pci_conf2_write_config_dword(struct pci_dev *dev, int where, u32 valu
#undef FUNC
static struct pci_ops pci_direct_conf2 = {
- pci_conf2_read_config_byte,
- pci_conf2_read_config_word,
- pci_conf2_read_config_dword,
- pci_conf2_write_config_byte,
- pci_conf2_write_config_word,
- pci_conf2_write_config_dword
+ pci_conf2_read_config_byte,
+ pci_conf2_read_config_word,
+ pci_conf2_read_config_dword,
+ pci_conf2_write_config_byte,
+ pci_conf2_write_config_word,
+ pci_conf2_write_config_dword
};
/*
@@ -267,10 +201,6 @@ static int __init pci_sanity_check(struct pci_ops *o)
struct pci_bus bus; /* Fake bus and device */
struct pci_dev dev;
-#ifdef CONFIG_VISWS
- return 1; /* Lithium PCI Bridges are non-standard */
-#endif
-
if (pci_probe & PCI_NO_CHECKS)
return 1;
bus.number = 0;
@@ -347,6 +277,8 @@ static struct pci_ops * __init pci_check_direct(void)
#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
+#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
+#define PCIBIOS_SET_PCI_HW_INT 0xb10f
/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -427,7 +359,7 @@ static unsigned long bios32_service(unsigned long service)
printk("bios32_service(0x%lx): not present\n", service);
return 0;
default: /* Shouldn't happen */
- printk("bios32_service(0x%lx): returned 0x%x, report to <mj@ucw.cz>.\n",
+ printk("bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
service, return_code);
return 0;
}
@@ -489,31 +421,6 @@ static int __init check_pcibios(void)
return 0;
}
-#if 0 /* Not used */
-
-static int pci_bios_find_class (unsigned int class_code, unsigned short index,
- unsigned char *bus, unsigned char *device_fn)
-{
- unsigned long bx;
- unsigned long ret;
-
- __asm__ ("lcall (%%edi)\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=b" (bx),
- "=a" (ret)
- : "1" (PCIBIOS_FIND_PCI_CLASS_CODE),
- "c" (class_code),
- "S" ((int) index),
- "D" (&pci_indirect));
- *bus = (bx >> 8) & 0xff;
- *device_fn = bx & 0xff;
- return (int) (ret & 0xff00) >> 8;
-}
-
-#endif
-
static int __init pci_bios_find_device (unsigned short vendor, unsigned short device_id,
unsigned short index, unsigned char *bus, unsigned char *device_fn)
{
@@ -757,85 +664,73 @@ static void __init pcibios_sort(void)
*last = NULL;
}
-#endif
-
/*
- * Several BIOS'es forget to assign addresses to I/O ranges. Try to fix it.
+ * Ask BIOS for IRQ Routing Table
*/
-static void __init pcibios_fixup_io_addr(struct pci_dev *dev, int idx)
-{
- unsigned int reg = PCI_BASE_ADDRESS_0 + 4*idx;
- struct resource *r = &dev->resource[idx];
- unsigned int size = r->end - r->start + 1;
+struct irq_routing_options {
+ u16 size;
+ struct irq_info *table;
+ u16 segment;
+} __attribute__((packed));
- if (((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && idx < 4) ||
- (dev->class >> 8) == PCI_CLASS_DISPLAY_VGA) {
- /*
- * In case the BIOS didn't assign an address 0--3 to an IDE
- * controller, we don't try to fix it as it means "use default
- * addresses" at least with several broken chips and the IDE
- * driver needs the original settings to recognize which devices
- * correspond to the primary controller.
- *
- * We don't assign VGA I/O ranges as well.
- */
- return;
- }
- /*
- * We need to avoid collisions with `mirrored' VGA ports and other strange
- * ISA hardware, so we always want the addresses kilobyte aligned.
- */
- if (!size || size > 256) {
- printk(KERN_ERR "PCI: Cannot assign I/O space to device %s, %d bytes are too much.\n", dev->name, size);
- return;
- } else {
- u32 try;
+static unsigned long pcibios_irq_page __initdata = 0;
- r->start = 0;
- r->end = size - 1;
- if (pci_assign_resource(dev, idx)) {
- printk(KERN_ERR "PCI: Unable to find free %d bytes of I/O space for device %s.\n", size, dev->name);
- return;
- }
- printk("PCI: Assigned I/O space %04lx-%04lx to device %s\n", r->start, r->end, dev->name);
- pci_read_config_dword(dev, reg, &try);
- if ((try & PCI_BASE_ADDRESS_IO_MASK) != r->start) {
- r->start = 0;
- pci_write_config_dword(dev, reg, 0);
- printk(KERN_ERR "PCI: I/O address setup failed, got %04x\n", try);
- }
- }
+static inline void __init pcibios_free_irq_routing_table(void)
+{
+ if (pcibios_irq_page)
+ free_page(pcibios_irq_page);
}
-/*
- * Assign address to expansion ROM. This is a highly experimental feature
- * and you must enable it by "pci=rom". It's even not guaranteed to work
- * with all cards since the PCI specs allow address decoders to be shared
- * between the ROM space and one of the standard regions (sigh!).
- */
-static void __init pcibios_fixup_rom_addr(struct pci_dev *dev)
+static struct irq_routing_table * __init pcibios_get_irq_routing_table(void)
{
- int reg = (dev->hdr_type == 1) ? PCI_ROM_ADDRESS1 : PCI_ROM_ADDRESS;
- struct resource *r = &dev->resource[PCI_ROM_RESOURCE];
- unsigned long rom_size = r->end - r->start + 1;
-
- r->start = 0;
- r->end = rom_size - 1;
- if (pci_assign_resource(dev, PCI_ROM_RESOURCE))
- printk(KERN_ERR "PCI: Unable to find free space for expansion ROM of device %s (0x%lx bytes)\n",
- dev->name, rom_size);
- else {
- DBG("PCI: Assigned address %08lx to expansion ROM of %s (0x%lx bytes)\n", r->start, dev->name, rom_size);
- pci_write_config_dword(dev, reg, r->start | PCI_ROM_ADDRESS_ENABLE);
- r->flags |= PCI_ROM_ADDRESS_ENABLE;
+ struct irq_routing_options opt;
+ struct irq_routing_table *rt;
+ int ret, map;
+
+ if (pci_probe & PCI_NO_IRQ_SCAN)
+ return NULL;
+ pcibios_irq_page = __get_free_page(GFP_KERNEL);
+ if (!pcibios_irq_page)
+ return 0;
+ rt = (void *) pcibios_irq_page;
+ opt.table = rt->slots;
+ opt.size = PAGE_SIZE - sizeof(struct irq_routing_table);
+ opt.segment = __KERNEL_DS;
+
+ DBG("PCI: Fetching IRQ routing table... ");
+ __asm__("push %%es\n\t"
+ "push %%ds\n\t"
+ "pop %%es\n\t"
+ "lcall (%%esi)\n\t"
+ "pop %%es\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (ret),
+ "=b" (map)
+ : "0" (PCIBIOS_GET_ROUTING_OPTIONS),
+ "1" (0),
+ "D" ((long) &opt),
+ "S" (&pci_indirect));
+ DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
+ if (ret & 0xff00) {
+ printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
+ return 0;
}
+
+ memset(rt, 0, sizeof(struct irq_routing_table));
+ rt->size = opt.size + sizeof(struct irq_routing_table);
+ printk("PCI: Using BIOS Interrupt Routing Table\n");
+ return rt;
}
+#endif
+
/*
* Several buggy motherboards address only 16 devices and mirror
* them to next 16 IDs. We try to detect this `feature' on all
- * primary busses (those containing host bridges as they are
+ * primary buses (those containing host bridges as they are
* expected to be unique) and remove the ghost devices.
*/
@@ -868,7 +763,7 @@ static void __init pcibios_fixup_ghosts(struct pci_bus *b)
}
if (!seen_host_bridge)
return;
- printk("PCI: Ignoring ghost devices on bus %d\n", b->number);
+ printk("PCI: Ignoring ghost devices on bus %02x\n", b->number);
for(e=b->devices; e->sibling != d; e=e->sibling);
e->sibling = NULL;
for(z=&pci_devices; (d=*z);)
@@ -893,16 +788,11 @@ static void __init pcibios_fixup_peer_bridges(void)
struct pci_dev *d;
struct pci_ops *ops = pci_root->ops;
-#ifdef CONFIG_VISWS
- pci_scan_bus(1, ops, NULL);
- return;
-#endif
-
#ifdef CONFIG_PCI_DIRECT
/*
* Don't search for peer host bridges if we use config type 2
- * since it reads bogus values for non-existent busses and
- * chipsets supporting multiple primary busses use conf1 anyway.
+ * since it reads bogus values for non-existent buses and
+ * chipsets supporting multiple primary buses use conf1 anyway.
*/
if (ops == &pci_direct_conf2)
return;
@@ -966,7 +856,7 @@ static void __init pci_fixup_i450nx(struct pci_dev *d)
*/
int pxb, reg;
u8 busno, suba, subb;
- printk("PCI: Searching for i450NX host bridges on %s\n", d->name);
+ printk("PCI: Searching for i450NX host bridges on %s\n", d->slot_name);
reg = 0xd0;
for(pxb=0; pxb<2; pxb++) {
pci_read_config_byte(d, reg++, &busno);
@@ -989,113 +879,214 @@ static void __init pci_fixup_umc_ide(struct pci_dev *d)
*/
int i;
- printk("PCI: Fixing base address flags for device %s\n", d->name);
+ printk("PCI: Fixing base address flags for device %s\n", d->slot_name);
for(i=0; i<4; i++)
d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
}
+static void __init pci_fixup_ide_bases(struct pci_dev *d)
+{
+ int i;
+
+ /*
+ * PCI IDE controllers use non-standard I/O port decoding, respect it.
+ */
+ if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
+ return;
+ DBG("PCI: IDE base address fixup for %s\n", d->slot_name);
+ for(i=0; i<4; i++) {
+ struct resource *r = &d->resource[i];
+ if ((r->start & ~0x80) == 0x374) {
+ r->start |= 2;
+ r->end = r->start;
+ }
+ }
+}
+
struct pci_fixup pcibios_fixups[] = {
{ PCI_FIXUP_HEADER, PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx },
{ PCI_FIXUP_HEADER, PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide },
+ { PCI_FIXUP_HEADER, PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases },
{ 0 }
};
/*
- * Allocate resources for all PCI devices. We need to do that before
- * we try to fix up anything.
+ * Fix up IRQs of all PCI devices.
*/
-static void __init pcibios_claim_resources(struct pci_bus *bus)
-{
- struct pci_dev *dev;
- int idx;
+extern int skip_ioapic_setup;
- while (bus) {
- for (dev=bus->devices; dev; dev=dev->sibling)
- for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
- struct resource *r = &dev->resource[idx];
- struct resource *pr;
- if (!r->start)
- continue;
- pr = pci_find_parent_resource(dev, r);
- if (!pr || request_resource(pr, r) < 0) {
- printk(KERN_ERR "PCI: Address space collision on region %d of device %s\n", idx, dev->name);
- /* We probably should disable the region, shouldn't we? */
- }
+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
+#define PIRQ_VERSION 0x0100
+
+/*
+ * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
+ */
+
+static struct irq_routing_table * __init pcibios_find_irq_routing_table(void)
+{
+ u8 *addr;
+ struct irq_routing_table *rt;
+ int i;
+ u8 sum;
+
+ for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
+ rt = (struct irq_routing_table *) addr;
+ if (rt->signature != PIRQ_SIGNATURE ||
+ rt->version != PIRQ_VERSION ||
+ rt->size % 16 ||
+ rt->size < sizeof(struct irq_routing_table))
+ continue;
+ sum = 0;
+ for(i=0; i<rt->size; i++)
+ sum += addr[i];
+ if (!sum) {
+ printk("PCI: Interrupt Routing Table found at 0x%p [router type %04x/%04x]\n",
+ rt, rt->rtr_vendor, rt->rtr_device);
+ return rt;
}
- if (bus->children)
- pcibios_claim_resources(bus->children);
- bus = bus->next;
}
+ return NULL;
}
/*
- * Fix base addresses, I/O and memory enables and IRQ's (mostly work-arounds
- * for buggy PCI BIOS'es :-[).
+ * If we have a IRQ routing table, use it to search for peer host
+ * bridges. It's a gross hack, but since there are no other known
+ * ways to get a list of buses, we have to go this way.
*/
-extern int skip_ioapic_setup;
-
-static void __init pcibios_fixup_devices(void)
+static void __init pcibios_irq_peer_trick(struct irq_routing_table *rt)
{
- struct pci_dev *dev;
- int i, has_io, has_mem;
- unsigned short cmd;
+ u8 busmap[256];
+ int i;
+ struct irq_info *e;
- for(dev = pci_devices; dev; dev=dev->next) {
+ memset(busmap, 0, sizeof(busmap));
+ for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
+ e = &rt->slots[i];
+ DBG("b=%02x d=%02x s=%02x\n", e->bus, e->devfn, e->slot);
+ busmap[e->bus] = 1;
+ }
+ for(i=1; i<256; i++)
/*
- * There are buggy BIOSes that forget to enable I/O and memory
- * access to PCI devices. We try to fix this, but we need to
- * be sure that the BIOS didn't forget to assign an address
- * to the device. [mj]
+ * It might be a secondary bus, but in this case its parent is already
+ * known (ascending bus order) and therefore pci_scan_bus returns immediately.
*/
- has_io = has_mem = 0;
- for(i=0; i<6; i++) {
- struct resource *r = &dev->resource[i];
- if (r->flags & PCI_BASE_ADDRESS_SPACE_IO) {
- has_io = 1;
- if (!r->start || r->start == PCI_BASE_ADDRESS_IO_MASK)
- pcibios_fixup_io_addr(dev, i);
- } else if (r->start)
- has_mem = 1;
- }
+ if (busmap[i] && pci_scan_bus(i, pci_root->ops, NULL))
+ printk("PCI: Discovered primary peer bus %02x [IRQ]\n", i);
+ pci_probe |= PCI_NO_PEER_FIXUP;
+}
+
+/*
+ * In case BIOS forgets to tell us about IRQ, we try to look it up in the routing
+ * table, but unfortunately we have to know the interrupt router chip.
+ */
+
+static char * __init pcibios_lookup_irq(struct pci_dev *dev, struct irq_routing_table *rt, int pin)
+{
+ struct irq_info *q;
+ struct pci_dev *router;
+ int i, pirq, newirq;
+ u32 rtrid, mask;
+ u8 x;
+
+ pin--;
+ DBG("IRQ for %s(%d)", dev->slot_name, pin);
+ while (dev->bus->self) {
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ dev = dev->bus->self;
+ DBG(" -> %s(%d)", dev->slot_name, pin);
+ }
+ for(q = rt->slots, i = rt->size - sizeof(struct irq_routing_table);
+ i && (q->bus != dev->bus->number || PCI_SLOT(q->devfn) != PCI_SLOT(dev->devfn));
+ i -= sizeof(struct irq_info), q++)
+ ;
+ if (!i) {
+ DBG(" -> not found in routing table\n");
+ return NULL;
+ }
+ pirq = q->irq[pin].link;
+ mask = q->irq[pin].bitmap;
+ if (!pirq) {
+ DBG(" -> not routed\n");
+ return NULL;
+ }
+ DBG(" -> PIRQ %02x, mask %04x", pirq, mask);
+ if ((dev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
+ newirq = 0;
+ else for(newirq = 15; newirq && !(mask & (1 << newirq)); newirq--)
+ ;
+ if (!(router = pci_find_slot(rt->rtr_bus, rt->rtr_devfn))) {
+ DBG(" -> router not found\n");
+ return NULL;
+ }
+#define ID(x,y) ((x << 16) | y)
+ rtrid = ID(rt->rtr_vendor, rt->rtr_device);
+ if (!rtrid) {
/*
- * Don't enable VGA-compatible cards since they have
- * fixed I/O and memory space.
- *
- * Don't enabled disabled IDE interfaces either because
- * some BIOSes may reallocate the same address when they
- * find that no devices are attached.
+ * Several BIOSes forget to set the router type. In such cases, we
+ * use chip vendor/device. This doesn't guarantee us semantics of
+ * PIRQ values, but was found to work in practice and it's still
+ * better than not trying.
*/
- if (((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) &&
- ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)) {
- pci_read_config_word(dev, PCI_COMMAND, &cmd);
- if (has_io && !(cmd & PCI_COMMAND_IO)) {
- printk("PCI: Enabling I/O for device %s\n", dev->name);
- cmd |= PCI_COMMAND_IO;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
- }
- if (has_mem && !(cmd & PCI_COMMAND_MEMORY)) {
- printk("PCI: Enabling memory for device %s\n", dev->name);
- cmd |= PCI_COMMAND_MEMORY;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
- }
+ DBG(" [%s]", router->slot_name);
+ rtrid = ID(router->vendor, router->device);
+ }
+ switch (rtrid) {
+ case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371FB_0):
+ case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371SB_0):
+ case ID(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_0):
+ /* Intel PIIX: PIRQ holds configuration register address */
+ pci_read_config_byte(router, pirq, &x);
+ if (x < 16) {
+ DBG(" -> [PIIX] %02x\n", x);
+ dev->irq = x;
+ return "PIIX";
+ } else if (newirq) {
+ DBG(" -> [PIIX] set to %02x\n", newirq);
+ pci_write_config_byte(router, pirq, newirq);
+ dev->irq = newirq;
+ return "PIIX-NEW";
}
- /*
- * Assign address to expansion ROM if requested.
- */
- if ((pci_probe & PCI_ASSIGN_ROMS) && dev->resource[PCI_ROM_RESOURCE].end)
- pcibios_fixup_rom_addr(dev);
+ DBG(" -> [PIIX] sink\n");
+ return NULL;
+ default:
+ DBG(" -> unknown router %04x/%04x\n", rt->rtr_vendor, rt->rtr_device);
+ if (newirq && mask == (1 << newirq)) {
+ /* Only one IRQ available -> use it */
+ dev->irq = newirq;
+ return "guess";
+ }
+ return NULL;
+ }
+#undef ID
+}
+
+static void __init pcibios_fixup_irqs(void)
+{
+ struct irq_routing_table *rtable;
+ struct pci_dev *dev;
+ u8 pin;
+
+ rtable = pcibios_find_irq_routing_table();
+#ifdef CONFIG_PCI_BIOS
+ if (!rtable && pci_bios_present)
+ rtable = pcibios_get_irq_routing_table();
+#endif
+
+ if (rtable)
+ pcibios_irq_peer_trick(rtable);
+
+ for(dev=pci_devices; dev; dev=dev->next) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
#if defined(CONFIG_X86_IO_APIC)
/*
- * Recalculate IRQ numbers if we use the I/O APIC
+ * Recalculate IRQ numbers if we use the I/O APIC.
*/
if(!skip_ioapic_setup)
{
int irq;
- unsigned char pin;
- pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (pin) {
pin--; /* interrupt pins are numbered starting from 1 */
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
@@ -1115,14 +1106,24 @@ static void __init pcibios_fixup_devices(void)
dev->irq = irq;
}
}
+ rtable = NULL; /* Avoid IRQ assignment below */
}
#endif
/*
- * Fix out-of-range IRQ numbers
+ * Fix out-of-range IRQ numbers and missing IRQs.
*/
if (dev->irq >= NR_IRQS)
dev->irq = 0;
+ if (pin && !dev->irq && rtable && rtable->version) {
+ char *msg = pcibios_lookup_irq(dev, rtable, pin);
+ if (msg)
+ printk("PCI: Assigned IRQ %d to device %s [%s]\n", dev->irq, dev->slot_name, msg);
+ }
}
+
+#ifdef CONFIG_PCI_BIOS
+ pcibios_free_irq_routing_table();
+#endif
}
/*
@@ -1133,6 +1134,7 @@ static void __init pcibios_fixup_devices(void)
void __init pcibios_fixup_bus(struct pci_bus *b)
{
pcibios_fixup_ghosts(b);
+ pci_read_bridge_bases(b);
}
/*
@@ -1170,10 +1172,10 @@ void __init pcibios_init(void)
printk("PCI: Probing PCI hardware\n");
pci_scan_bus(0, ops, NULL);
+ pcibios_fixup_irqs();
if (!(pci_probe & PCI_NO_PEER_FIXUP))
pcibios_fixup_peer_bridges();
- pcibios_claim_resources(pci_root);
- pcibios_fixup_devices();
+ pcibios_resource_survey();
#ifdef CONFIG_PCI_BIOS
if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
@@ -1197,6 +1199,9 @@ char * __init pcibios_setup(char *str)
} else if (!strcmp(str, "nosort")) {
pci_probe |= PCI_NO_SORT;
return NULL;
+ } else if (!strcmp(str, "noirq")) {
+ pci_probe |= PCI_NO_IRQ_SCAN;
+ return NULL;
}
#endif
#ifdef CONFIG_PCI_DIRECT
diff --git a/arch/i386/kernel/pci-visws.c b/arch/i386/kernel/pci-visws.c
new file mode 100644
index 000000000..31a767a22
--- /dev/null
+++ b/arch/i386/kernel/pci-visws.c
@@ -0,0 +1,131 @@
+/*
+ * Low-Level PCI Support for SGI Visual Workstation
+ *
+ * (c) 1999 Martin Mares <mj@ucw.cz>
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+
+#include <asm/smp.h>
+#include <asm/lithium.h>
+
+#include "pci-i386.h"
+
+unsigned int pci_probe = 0;
+
+/*
+ * The VISWS uses configuration access type 1 only.
+ */
+
+#define CONFIG_CMD(dev, where) (0x80000000 | (dev->bus->number << 16) | (dev->devfn << 8) | (where & ~3))
+
+static int pci_conf1_read_config_byte(struct pci_dev *dev, int where, u8 *value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inb(0xCFC + (where&3));
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_conf1_read_config_word(struct pci_dev *dev, int where, u16 *value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inw(0xCFC + (where&2));
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_conf1_read_config_dword(struct pci_dev *dev, int where, u32 *value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ *value = inl(0xCFC);
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_conf1_write_config_byte(struct pci_dev *dev, int where, u8 value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outb(value, 0xCFC + (where&3));
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_conf1_write_config_word(struct pci_dev *dev, int where, u16 value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outw(value, 0xCFC + (where&2));
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_conf1_write_config_dword(struct pci_dev *dev, int where, u32 value)
+{
+ outl(CONFIG_CMD(dev,where), 0xCF8);
+ outl(value, 0xCFC);
+ return PCIBIOS_SUCCESSFUL;
+}
+
+#undef CONFIG_CMD
+
+static struct pci_ops visws_pci_ops = {
+ pci_conf1_read_config_byte,
+ pci_conf1_read_config_word,
+ pci_conf1_read_config_dword,
+ pci_conf1_write_config_byte,
+ pci_conf1_write_config_word,
+ pci_conf1_write_config_dword
+};
+
+static void __init pcibios_fixup_irqs(void)
+{
+ struct pci_dev *dev, *p;
+ u8 pin;
+ int irq;
+
+ for(dev=pci_devices; dev; dev=dev->next) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ dev->irq = 0;
+ if (!pin)
+ continue;
+ pin--;
+ if (dev->bus->parent) {
+ p = dev->bus->parent->self;
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ } else
+ p = dev;
+ irq = visws_get_PCI_irq_vector(p->bus->number, PCI_SLOT(p->devfn), pin+1);
+ if (irq >= 0)
+ dev->irq = irq;
+ DBG("PCI IRQ: %s pin %d -> %d\n", dev->slot_name, pin, irq);
+ }
+}
+
+void __init pcibios_fixup_bus(struct pci_bus *b)
+{
+ pci_read_bridge_bases(b);
+}
+
+#if 0
+static struct resource visws_pci_bus_resources[2] = {
+ { "Host bus 1", 0xf4000000, 0xf7ffffff, 0 },
+ { "Host bus 2", 0xf0000000, 0xf3ffffff, 0 }
+};
+#endif
+
+void __init pcibios_init(void)
+{
+ unsigned int sec_bus = li_pcib_read16(LI_PCI_BUSNUM) & 0xff;
+
+ printk("PCI: Probing PCI hardware on host buses 00 and %02x\n", sec_bus);
+ pci_scan_bus(0, &visws_pci_ops, NULL);
+ pci_scan_bus(sec_bus, &visws_pci_ops, NULL);
+ pcibios_fixup_irqs();
+ pcibios_resource_survey();
+}
+
+char * __init pcibios_setup(char *str)
+{
+ return str;
+}
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index f44234eb7..e2253ccca 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -1,63 +1,61 @@
/*
- * Intel MP v1.1/v1.4 specification support routines for multi-pentium
- * hosts.
+ * Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998 Ingo Molnar
- *
- * Supported by Caldera http://www.caldera.com.
- * Much of the core SMP work is based on previous work by Thomas Radke, to
- * whom a great many thanks are extended.
- *
- * Thanks to Intel for making available several different Pentium,
- * Pentium Pro and Pentium-II/Xeon MP machines.
+ * (c) 1998-99 Ingo Molnar <mingo@redhat.com>
*
* This code is released under the GNU public license version 2 or
* later.
- *
- * Fixes
- * Felix Koop : NR_CPUS used properly
- * Jose Renau : Handle single CPU case.
- * Alan Cox : By repeated request 8) - Total BogoMIP report.
- * Greg Wright : Fix for kernel stacks panic.
- * Erich Boleyn : MP v1.4 and additional changes.
- * Matthias Sattler : Changes for 2.1 kernel map.
- * Michel Lespinasse : Changes for 2.1 kernel map.
- * Michael Chastain : Change trampoline.S to gnu as.
- * Alan Cox : Dumb bug: 'B' step PPro's are fine
- * Ingo Molnar : Added APIC timers, based on code
- * from Jose Renau
- * Alan Cox : Added EBDA scanning
- * Ingo Molnar : various cleanups and rewrites
- * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
*/
#include <linux/config.h>
+#include <linux/init.h>
+
#include <linux/mm.h>
#include <linux/kernel_stat.h>
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
#include <linux/smp_lock.h>
-#include <linux/init.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-
#include <linux/irq.h>
-#define JIFFIE_TIMEOUT 100
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/mtrr.h>
-extern void update_one_process( struct task_struct *p,
- unsigned long ticks, unsigned long user,
- unsigned long system, int cpu);
/*
* Some notes on processor bugs:
*
- * Pentium and Pentium Pro (and all CPUs) have bugs. The Linux issues
- * for SMP are handled as follows.
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP erratas are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP erratas are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
*
* Pentium Pro
- * Occasional delivery of 'spurious interrupt' as trap #16. This
- * is very rare. The kernel logs the event and recovers
+ * None of 1AP-9AP erratas are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
*
* Pentium
* There is a marginal case where REP MOVS on 100MHz SMP
@@ -77,1351 +75,34 @@ extern void update_one_process( struct task_struct *p,
* 4AP. Linux never generated 3 interrupts of the same priority
* to cause a lost local interrupt.
* 5AP. Remote read is never used
- * 9AP. XXX NEED TO CHECK WE HANDLE THIS XXX
- * 10AP. XXX NEED TO CHECK WE HANDLE THIS XXX
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
* 11AP. Linux reads the APIC between writes to avoid this, as per
* the documentation. Make sure you preserve this as it affects
* the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
*
- * If this sounds worrying believe me these bugs are ___RARE___ and
- * there's about nothing of note with C stepping upwards.
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
*/
-
-/* Kernel spinlock */
+/* The 'big kernel lock' */
spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
-/*
- * function prototypes:
- */
-static void cache_APIC_registers (void);
-static void stop_this_cpu (void);
-
-static int smp_b_stepping = 0; /* Set if we find a B stepping CPU */
-
-static int max_cpus = -1; /* Setup configured maximum number of CPUs to activate */
-int smp_found_config=0; /* Have we found an SMP box */
-
-unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */
-unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */
-int smp_num_cpus = 0; /* Total count of live CPUs */
-int smp_threads_ready=0; /* Set when the idlers are all forked */
-volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */
-volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */
-static volatile unsigned long cpu_callin_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */
-static volatile unsigned long cpu_callout_map[NR_CPUS] = {0,}; /* We always use 0 the rest is ready for parallel delivery */
-volatile unsigned long smp_invalidate_needed; /* Used for the invalidate map that's also checked in the spinlock */
-volatile unsigned long kstack_ptr; /* Stack vector for booting CPUs */
-struct cpuinfo_x86 cpu_data[NR_CPUS]; /* Per CPU bogomips and other parameters */
-static unsigned int num_processors = 1; /* Internal processor count */
-unsigned long mp_ioapic_addr = 0xFEC00000; /* Address of the I/O apic (not yet used) */
-unsigned char boot_cpu_id = 0; /* Processor that is doing the boot up */
-static int smp_activated = 0; /* Tripped once we need to start cross invalidating */
-int apic_version[NR_CPUS]; /* APIC version number */
-unsigned long apic_retval; /* Just debugging the assembler.. */
-
-volatile unsigned long kernel_counter=0; /* Number of times the processor holds the lock */
-volatile unsigned long syscall_count=0; /* Number of times the processor holds the syscall lock */
-
-volatile unsigned long ipi_count; /* Number of IPIs delivered */
-
-const char lk_lockmsg[] = "lock from interrupt context at %p\n";
-
-int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
-extern int nr_ioapics;
-extern struct mpc_config_ioapic mp_apics [MAX_IO_APICS];
-extern int mp_irq_entries;
-extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
-extern int mpc_default_type;
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, };
-int mp_current_pci_id = 0;
-unsigned long mp_lapic_addr = 0;
-int skip_ioapic_setup = 0; /* 1 if "noapic" boot option passed */
-
-/* #define SMP_DEBUG */
-
-#ifdef SMP_DEBUG
-#define SMP_PRINTK(x) printk x
-#else
-#define SMP_PRINTK(x)
-#endif
-
-/*
- * IA s/w dev Vol 3, Section 7.4
- */
-#define APIC_DEFAULT_PHYS_BASE 0xfee00000
-
-#define CLEAR_TSC wrmsr(0x10, 0x00001000, 0x00001000)
-
-/*
- * Setup routine for controlling SMP activation
- *
- * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
- * activation entirely (the MPS table probe still happens, though).
- *
- * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
- * greater than 0, limits the maximum number of CPUs activated in
- * SMP mode to <NUM>.
- */
-
-static int __init nosmp(char *str)
-{
- max_cpus = 0;
- return 1;
-}
-
-__setup("nosmp", nosmp);
-
-static int __init maxcpus(char *str)
-{
- get_option(&str, &max_cpus);
- return 1;
-}
-
-__setup("maxcpus=", maxcpus);
-
-void ack_APIC_irq(void)
-{
- /* Clear the IPI */
-
- /* Dummy read */
- apic_read(APIC_SPIV);
-
- /* Docs say use 0 for future compatibility */
- apic_write(APIC_EOI, 0);
-}
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-#ifndef CONFIG_X86_VISWS_APIC
-/*
- * Checksum an MP configuration block.
- */
-
-static int mpf_checksum(unsigned char *mp, int len)
-{
- int sum=0;
- while(len--)
- sum+=*mp++;
- return sum&0xFF;
-}
-
-/*
- * Processor encoding in an MP configuration block
- */
-
-static char *mpc_family(int family,int model)
-{
- static char n[32];
- static char *model_defs[]=
- {
- "80486DX","80486DX",
- "80486SX","80486DX/2 or 80487",
- "80486SL","Intel5X2(tm)",
- "Unknown","Unknown",
- "80486DX/4"
- };
- if (family==0x6)
- return("Pentium(tm) Pro");
- if (family==0x5)
- return("Pentium(tm)");
- if (family==0x0F && model==0x0F)
- return("Special controller");
- if (family==0x04 && model<9)
- return model_defs[model];
- sprintf(n,"Unknown CPU [%d:%d]",family, model);
- return n;
-}
-
-
-/*
- * Read the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
- char str[16];
- int count=sizeof(*mpc);
- int ioapics = 0;
- unsigned char *mpt=((unsigned char *)mpc)+count;
-
- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
- {
- panic("SMP mptable: bad signature [%c%c%c%c]!\n",
- mpc->mpc_signature[0],
- mpc->mpc_signature[1],
- mpc->mpc_signature[2],
- mpc->mpc_signature[3]);
- return 1;
- }
- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
- {
- panic("SMP mptable: checksum error!\n");
- return 1;
- }
- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
- {
- printk("Bad Config Table version (%d)!!\n",mpc->mpc_spec);
- return 1;
- }
- memcpy(str,mpc->mpc_oem,8);
- str[8]=0;
- printk("OEM ID: %s ",str);
-
- memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
-
- printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
-
- /* save the local APIC address, it might be non-default */
- mp_lapic_addr = mpc->mpc_lapic;
-
- /*
- * Now process the configuration blocks.
- */
-
- while(count<mpc->mpc_length)
- {
- switch(*mpt)
- {
- case MP_PROCESSOR:
- {
- struct mpc_config_processor *m=
- (struct mpc_config_processor *)mpt;
- if (m->mpc_cpuflag&CPU_ENABLED)
- {
- printk("Processor #%d %s APIC version %d\n",
- m->mpc_apicid,
- mpc_family((m->mpc_cpufeature&
- CPU_FAMILY_MASK)>>8,
- (m->mpc_cpufeature&
- CPU_MODEL_MASK)>>4),
- m->mpc_apicver);
-#ifdef SMP_DEBUG
- if (m->mpc_featureflag&(1<<0))
- printk(" Floating point unit present.\n");
- if (m->mpc_featureflag&(1<<7))
- printk(" Machine Exception supported.\n");
- if (m->mpc_featureflag&(1<<8))
- printk(" 64 bit compare & exchange supported.\n");
- if (m->mpc_featureflag&(1<<9))
- printk(" Internal APIC present.\n");
-#endif
- if (m->mpc_cpuflag&CPU_BOOTPROCESSOR)
- {
- SMP_PRINTK((" Bootup CPU\n"));
- boot_cpu_id=m->mpc_apicid;
- }
- else /* Boot CPU already counted */
- num_processors++;
-
- if (m->mpc_apicid>NR_CPUS)
- printk("Processor #%d unused. (Max %d processors).\n",m->mpc_apicid, NR_CPUS);
- else
- {
- int ver = m->mpc_apicver;
-
- cpu_present_map|=(1<<m->mpc_apicid);
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk("BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
- }
- }
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_BUS:
- {
- struct mpc_config_bus *m=
- (struct mpc_config_bus *)mpt;
- memcpy(str,m->mpc_bustype,6);
- str[6]=0;
- SMP_PRINTK(("Bus #%d is %s\n",
- m->mpc_busid,
- str));
- if (strncmp(m->mpc_bustype,"ISA",3) == 0)
- mp_bus_id_to_type[m->mpc_busid] =
- MP_BUS_ISA;
- else
- if (strncmp(m->mpc_bustype,"EISA",4) == 0)
- mp_bus_id_to_type[m->mpc_busid] =
- MP_BUS_EISA;
- if (strncmp(m->mpc_bustype,"PCI",3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] =
- MP_BUS_PCI;
- mp_bus_id_to_pci_bus[m->mpc_busid] =
- mp_current_pci_id;
- mp_current_pci_id++;
- }
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_IOAPIC:
- {
- struct mpc_config_ioapic *m=
- (struct mpc_config_ioapic *)mpt;
- if (m->mpc_flags&MPC_APIC_USABLE)
- {
- ioapics++;
- printk("I/O APIC #%d Version %d at 0x%lX.\n",
- m->mpc_apicid,m->mpc_apicver,
- m->mpc_apicaddr);
- mp_apics [nr_ioapics] = *m;
- if (++nr_ioapics > MAX_IO_APICS)
- --nr_ioapics;
- }
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_INTSRC:
- {
- struct mpc_config_intsrc *m=
- (struct mpc_config_intsrc *)mpt;
-
- mp_irqs [mp_irq_entries] = *m;
- if (++mp_irq_entries == MAX_IRQ_SOURCES) {
- printk("Max irq sources exceeded!!\n");
- printk("Skipping remaining sources.\n");
- --mp_irq_entries;
- }
-
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_LINTSRC:
- {
- struct mpc_config_intlocal *m=
- (struct mpc_config_intlocal *)mpt;
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- }
- }
- if (ioapics > MAX_IO_APICS)
- {
- printk("Warning: Max I/O APICs exceeded (max %d, found %d).\n", MAX_IO_APICS, ioapics);
- printk("Warning: switching to non APIC mode.\n");
- skip_ioapic_setup=1;
- }
- return num_processors;
-}
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-
-static int __init smp_scan_config(unsigned long base, unsigned long length)
-{
- unsigned long *bp=phys_to_virt(base);
- struct intel_mp_floating *mpf;
-
- SMP_PRINTK(("Scan SMP from %p for %ld bytes.\n",
- bp,length));
- if (sizeof(*mpf)!=16)
- printk("Error: MPF size\n");
-
- while (length>0)
- {
- if (*bp==SMP_MAGIC_IDENT)
- {
- mpf=(struct intel_mp_floating *)bp;
- if (mpf->mpf_length==1 &&
- !mpf_checksum((unsigned char *)bp,16) &&
- (mpf->mpf_specification == 1
- || mpf->mpf_specification == 4) )
- {
- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
- if (mpf->mpf_feature2&(1<<7))
- printk(" IMCR and PIC compatibility mode.\n");
- else
- printk(" Virtual Wire compatibility mode.\n");
- smp_found_config=1;
- /*
- * Now see if we need to read further.
- */
- if (mpf->mpf_feature1!=0)
- {
- unsigned long cfg;
-
- /* local APIC has default address */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- /*
- * We need to know what the local
- * APIC id of the boot CPU is!
- */
-
-/*
- *
- * HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK
- *
- * It's not just a crazy hack. ;-)
- */
- /*
- * Standard page mapping
- * functions don't work yet.
- * We know that page 0 is not
- * used. Steal it for now!
- */
-
- cfg=pg0[0];
- pg0[0] = (mp_lapic_addr | _PAGE_RW | _PAGE_PRESENT);
- local_flush_tlb();
-
- boot_cpu_id = GET_APIC_ID(*((volatile unsigned long *) APIC_ID));
-
- /*
- * Give it back
- */
-
- pg0[0]= cfg;
- local_flush_tlb();
-
-/*
- *
- * END OF HACK END OF HACK END OF HACK END OF HACK END OF HACK
- *
- */
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- cpu_present_map=3;
- num_processors=2;
- printk("I/O APIC at 0xFEC00000.\n");
-
- /*
- * Save the default type number, we
- * need it later to set the IO-APIC
- * up properly:
- */
- mpc_default_type = mpf->mpf_feature1;
-
- printk("Bus #0 is ");
- }
- switch(mpf->mpf_feature1)
- {
- case 1:
- case 5:
- printk("ISA\n");
- break;
- case 2:
- printk("EISA with no IRQ8 chaining\n");
- break;
- case 6:
- case 3:
- printk("EISA\n");
- break;
- case 4:
- case 7:
- printk("MCA\n");
- break;
- case 0:
- break;
- default:
- printk("???\nUnknown standard configuration %d\n",
- mpf->mpf_feature1);
- return 1;
- }
- if (mpf->mpf_feature1>4)
- {
- printk("Bus #1 is PCI\n");
-
- /*
- * Set local APIC version to
- * the integrated form.
- * It's initialized to zero
- * otherwise, representing
- * a discrete 82489DX.
- */
- apic_version[0] = 0x10;
- apic_version[1] = 0x10;
- }
- /*
- * Read the physical hardware table.
- * Anything here will override the
- * defaults.
- */
- if (mpf->mpf_physptr)
- smp_read_mpc((void *)mpf->mpf_physptr);
-
- __cpu_logical_map[0] = boot_cpu_id;
- global_irq_holder = boot_cpu_id;
- current->processor = boot_cpu_id;
-
- printk("Processors: %d\n", num_processors);
- /*
- * Only use the first configuration found.
- */
- return 1;
- }
- }
- bp+=4;
- length-=16;
- }
-
- return 0;
-}
-
-void __init init_intel_smp (void)
-{
- /*
- * FIXME: Linux assumes you have 640K of base ram..
- * this continues the error...
- *
- * 1) Scan the bottom 1K for a signature
- * 2) Scan the top 1K of base RAM
- * 3) Scan the 64K of bios
- */
- if (!smp_scan_config(0x0,0x400) &&
- !smp_scan_config(639*0x400,0x400) &&
- !smp_scan_config(0xF0000,0x10000)) {
- /*
- * If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
- * extended bios data area.
- *
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E, calculate and scan it here.
- *
- * NOTE! There are Linux loaders that will corrupt the EBDA
- * area, and as such this kind of SMP config may be less
- * trustworthy, simply because the SMP table may have been
- * stomped on during early boot. These loaders are buggy and
- * should be fixed.
- */
- unsigned int address;
-
- address = *(unsigned short *)phys_to_virt(0x40E);
- address<<=4;
- smp_scan_config(address, 0x1000);
- if (smp_found_config)
- printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.rutgers.edu if you experience SMP problems!\n");
- }
-}
-
-#else
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesnt have a BIOS(-configuration table).
- * No problem for Linux.
- */
-void __init init_visws_smp(void)
-{
- smp_found_config = 1;
-
- cpu_present_map |= 2; /* or in id 1 */
- apic_version[1] |= 0x10; /* integrated APIC */
- apic_version[0] |= 0x10;
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-}
-
-#endif
-
-/*
- * - Intel MP Configuration Table
- * - or SGI Visual Workstation configuration
- */
-void __init init_smp_config (void)
-{
-#ifndef CONFIG_VISWS
- init_intel_smp();
-#else
- init_visws_smp();
-#endif
-}
-
-
-
-/*
- * Trampoline 80x86 program as an array.
- */
-
-extern unsigned char trampoline_data [];
-extern unsigned char trampoline_end [];
-static unsigned char *trampoline_base;
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-
-static unsigned long __init setup_trampoline(void)
-{
- memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
- return virt_to_phys(trampoline_base);
-}
-
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-unsigned long __init smp_alloc_memory(unsigned long mem_base)
-{
- if (virt_to_phys((void *)mem_base) >= 0x9F000)
- panic("smp_alloc_memory: Insufficient low memory for kernel trampoline 0x%lx.", mem_base);
- trampoline_base = (void *)mem_base;
- return mem_base + PAGE_SIZE;
-}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-
-void __init smp_store_cpu_info(int id)
-{
- struct cpuinfo_x86 *c=&cpu_data[id];
-
- *c = boot_cpu_data;
- c->pte_quick = 0;
- c->pgd_quick = 0;
- c->pgtable_cache_sz = 0;
- identify_cpu(c);
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- smp_b_stepping=1; /* Remember we have B step Pentia with bugs */
-}
-
-/*
- * Architecture specific routine called by the kernel just before init is
- * fired off. This allows the BP to have everything in order [we hope].
- * At the end of this all the APs will hit the system scheduling and off
- * we go. Each AP will load the system gdt's and jump through the kernel
- * init into idle(). At this point the scheduler will one day take over
- * and give them jobs to do. smp_callin is a standard routine
- * we use to track CPUs as they power up.
- */
-
-static atomic_t smp_commenced = ATOMIC_INIT(0);
-
-void __init smp_commence(void)
-{
- /*
- * Lets the callins below out of their loop.
- */
- SMP_PRINTK(("Setting commenced=1, go go go\n"));
-
- wmb();
- atomic_set(&smp_commenced,1);
-}
-
-void __init enable_local_APIC(void)
-{
- unsigned long value;
-
- value = apic_read(APIC_SPIV);
- value |= (1<<8); /* Enable APIC (bit==1) */
-#if 0
- value &= ~(1<<9); /* Enable focus processor (bit==0) */
-#else
- value |= (1<<9); /* Disable focus processor (bit==1) */
-#endif
- value |= 0xff; /* Set spurious IRQ vector to 0xff */
- apic_write(APIC_SPIV,value);
-
- /*
- * Set Task Priority to 'accept all'
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write(APIC_TASKPRI,value);
-
- /*
- * Clear the logical destination ID, just to be safe.
- * also, put the APIC into flat delivery mode.
- */
- value = apic_read(APIC_LDR);
- value &= ~APIC_LDR_MASK;
- apic_write(APIC_LDR,value);
-
- value = apic_read(APIC_DFR);
- value |= SET_APIC_DFR(0xf);
- apic_write(APIC_DFR, value);
-
- udelay(100); /* B safe */
-}
-
-unsigned long __init init_smp_mappings(unsigned long memory_start)
-{
- unsigned long apic_phys;
-
- memory_start = PAGE_ALIGN(memory_start);
- if (smp_found_config) {
- apic_phys = mp_lapic_addr;
- } else {
- /*
- * set up a fake all zeroes page to simulate the
- * local APIC and another one for the IO-APIC. We
- * could use the real zero-page, but it's safer
- * this way if some buggy code writes to this page ...
- */
- apic_phys = __pa(memory_start);
- memset((void *)memory_start, 0, PAGE_SIZE);
- memory_start += PAGE_SIZE;
- }
- set_fixmap(FIX_APIC_BASE,apic_phys);
- printk("mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys);
-
-#ifdef CONFIG_X86_IO_APIC
- {
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_apics[i].mpc_apicaddr;
- } else {
- ioapic_phys = __pa(memory_start);
- memset((void *)memory_start, 0, PAGE_SIZE);
- memory_start += PAGE_SIZE;
- }
- set_fixmap(idx,ioapic_phys);
- printk("mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
- }
-#endif
-
- return memory_start;
-}
-
-extern void calibrate_delay(void);
-
-void __init smp_callin(void)
-{
- int cpuid;
- unsigned long timeout;
-
- /*
- * (This works even if the APIC is not enabled.)
- */
- cpuid = GET_APIC_ID(apic_read(APIC_ID));
-
- SMP_PRINTK(("CPU#%d waiting for CALLOUT\n", cpuid));
-
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
-
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies,timeout))
- {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (test_bit(cpuid, (unsigned long *)&cpu_callout_map[0]))
- break;
- }
-
- while (!time_before(jiffies,timeout)) {
- printk("BUG: CPU%d started up but did not get a callout!\n",
- cpuid);
- stop_this_cpu();
- }
-
- /*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
- */
-
- SMP_PRINTK(("CALLIN, before enable_local_APIC().\n"));
- enable_local_APIC();
-
- /*
- * Set up our APIC timer.
- */
- setup_APIC_clock();
-
- __sti();
-
-#ifdef CONFIG_MTRR
- /* Must be done before calibration delay is computed */
- mtrr_init_secondary_cpu ();
-#endif
- /*
- * Get our bogomips.
- */
- calibrate_delay();
- SMP_PRINTK(("Stack at about %p\n",&cpuid));
-
- /*
- * Save our processor parameters
- */
- smp_store_cpu_info(cpuid);
-
- /*
- * Allow the master to continue.
- */
- set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]);
-}
-
-int cpucount = 0;
-
-extern int cpu_idle(void);
-
-/*
- * Activate a secondary processor.
- */
-int __init start_secondary(void *unused)
-{
- /*
- * Dont put anything before smp_callin(), SMP
- * booting is too fragile that we want to limit the
- * things done here to the most necessary things.
- */
- cpu_init();
- smp_callin();
- while (!atomic_read(&smp_commenced))
- /* nothing */ ;
- return cpu_idle();
-}
-
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __init initialize_secondary(void)
-{
- /*
- * We don't actually need to load the full TSS,
- * basically just the stack pointer and the eip.
- */
-
- asm volatile(
- "movl %0,%%esp\n\t"
- "jmp *%1"
- :
- :"r" (current->thread.esp),"r" (current->thread.eip));
-}
-
-extern struct {
- void * esp;
- unsigned short ss;
-} stack_start;
-
-static int __init fork_by_hand(void)
-{
- struct pt_regs regs;
- /* don't care about the eip and regs settings since we'll never
- reschedule the forked task. */
- return do_fork(CLONE_VM|CLONE_PID, 0, &regs);
-}
-
-static void __init do_boot_cpu(int i)
-{
- unsigned long cfg;
- pgd_t maincfg;
- struct task_struct *idle;
- unsigned long send_status, accept_status;
- int timeout, num_starts, j;
- unsigned long start_eip;
-
- cpucount++;
- /* We can't use kernel_thread since we must _avoid_ to reschedule
- the child. */
- if (fork_by_hand() < 0)
- panic("failed fork for CPU %d", i);
-
- /*
- * We remove it from the pidhash and the runqueue
- * once we got the process:
- */
- idle = init_task.prev_task;
- if (!idle)
- panic("No idle process for CPU %d", i);
-
- idle->processor = i;
- __cpu_logical_map[cpucount] = i;
- cpu_number_map[i] = cpucount;
- idle->has_cpu = 1; /* we schedule the first task manually */
- idle->thread.eip = (unsigned long) start_secondary;
-
- del_from_runqueue(idle);
- unhash_process(idle);
- init_tasks[cpucount] = idle;
-
- /* start_eip had better be page-aligned! */
- start_eip = setup_trampoline();
-
- printk("Booting processor %d eip %lx\n", i, start_eip); /* So we see what's up */
- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
-
- /*
- * This grunge runs the startup process for
- * the targeted processor.
- */
-
- SMP_PRINTK(("Setting warm reset code and vector.\n"));
-
- CMOS_WRITE(0xa, 0xf);
- local_flush_tlb();
- SMP_PRINTK(("1.\n"));
- *((volatile unsigned short *) phys_to_virt(0x469)) = start_eip >> 4;
- SMP_PRINTK(("2.\n"));
- *((volatile unsigned short *) phys_to_virt(0x467)) = start_eip & 0xf;
- SMP_PRINTK(("3.\n"));
-
- maincfg=swapper_pg_dir[0];
- ((unsigned long *)swapper_pg_dir)[0]=0x102007;
-
- /*
- * Be paranoid about clearing APIC errors.
- */
-
- if ( apic_version[i] & 0xF0 )
- {
- apic_write(APIC_ESR, 0);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
- }
-
- /*
- * Status is now clean
- */
-
- send_status = 0;
- accept_status = 0;
-
- /*
- * Starting actual IPI sequence...
- */
-
- SMP_PRINTK(("Asserting INIT.\n"));
-
- /*
- * Turn INIT on
- */
-
- cfg=apic_read(APIC_ICR2);
- cfg&=0x00FFFFFF;
- apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */
- cfg=apic_read(APIC_ICR);
- cfg&=~0xCDFFF; /* Clear bits */
- cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | APIC_DEST_DM_INIT);
- apic_write(APIC_ICR, cfg); /* Send IPI */
-
- udelay(200);
- SMP_PRINTK(("Deasserting INIT.\n"));
-
- cfg=apic_read(APIC_ICR2);
- cfg&=0x00FFFFFF;
- apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */
- cfg=apic_read(APIC_ICR);
- cfg&=~0xCDFFF; /* Clear bits */
- cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT);
- apic_write(APIC_ICR, cfg); /* Send IPI */
-
- /*
- * Should we send STARTUP IPIs ?
- *
- * Determine this based on the APIC version.
- * If we don't have an integrated APIC, don't
- * send the STARTUP IPIs.
- */
-
- if ( apic_version[i] & 0xF0 )
- num_starts = 2;
- else
- num_starts = 0;
-
- /*
- * Run STARTUP IPI loop.
- */
-
- for (j = 1; !(send_status || accept_status)
- && (j <= num_starts) ; j++)
- {
- SMP_PRINTK(("Sending STARTUP #%d.\n",j));
- apic_write(APIC_ESR, 0);
- SMP_PRINTK(("After apic_write.\n"));
-
- /*
- * STARTUP IPI
- */
-
- cfg=apic_read(APIC_ICR2);
- cfg&=0x00FFFFFF;
- apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); /* Target chip */
- cfg=apic_read(APIC_ICR);
- cfg&=~0xCDFFF; /* Clear bits */
- cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12)); /* Boot on the stack */
- SMP_PRINTK(("Before start apic_write.\n"));
- apic_write(APIC_ICR, cfg); /* Kick the second */
-
- SMP_PRINTK(("Startup point 1.\n"));
-
- timeout = 0;
- SMP_PRINTK(("Waiting for send to finish...\n"));
- do {
- SMP_PRINTK(("+"));
- udelay(100);
- send_status = apic_read(APIC_ICR) & 0x1000;
- } while (send_status && (timeout++ < 1000));
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
- }
- SMP_PRINTK(("After Startup.\n"));
-
- if (send_status) /* APIC never delivered?? */
- printk("APIC never delivered???\n");
- if (accept_status) /* Send accept error */
- printk("APIC delivery error (%lx).\n", accept_status);
-
- if ( !(send_status || accept_status) )
- {
- /*
- * allow APs to start initializing.
- */
- SMP_PRINTK(("Before Callout %d.\n", i));
- set_bit(i, (unsigned long *)&cpu_callout_map[0]);
- SMP_PRINTK(("After Callout %d.\n", i));
-
- for(timeout=0;timeout<50000;timeout++)
- {
- if (cpu_callin_map[0]&(1<<i))
- break; /* It has booted */
- udelay(100); /* Wait 5s total for a response */
- }
- if (cpu_callin_map[0]&(1<<i))
- {
- /* number CPUs logically, starting from 1 (BSP is 0) */
-#if 0
- cpu_number_map[i] = cpucount;
- __cpu_logical_map[cpucount] = i;
-#endif
- printk("OK.\n");
- printk("CPU%d: ", i);
- print_cpu_info(&cpu_data[i]);
- }
- else
- {
- if (*((volatile unsigned char *)phys_to_virt(8192))==0xA5)
- printk("Stuck ??\n");
- else
- printk("Not responding.\n");
- }
- SMP_PRINTK(("CPU has booted.\n"));
- }
- else
- {
- __cpu_logical_map[cpucount] = -1;
- cpu_number_map[i] = -1;
- cpucount--;
- }
-
- swapper_pg_dir[0]=maincfg;
- local_flush_tlb();
-
- /* mark "stuck" area as not stuck */
- *((volatile unsigned long *)phys_to_virt(8192)) = 0;
-}
-
-cycles_t cacheflush_time;
-extern unsigned long cpu_hz;
-
-static void smp_tune_scheduling (void)
-{
- unsigned long cachesize;
- /*
- * Rough estimation for SMP scheduling, this is the number of
- * cycles it takes for a fully memory-limited process to flush
- * the SMP-local cache.
- *
- * (For a P5 this pretty much means we will choose another idle
- * CPU almost always at wakeup time (this is due to the small
- * L1 cache), on PIIs it's around 50-100 usecs, depending on
- * the cache size)
- */
-
- if (!cpu_hz) {
- /*
- * this basically disables processor-affinity
- * scheduling on SMP without a TSC.
- */
- cacheflush_time = 0;
- return;
- } else {
- cachesize = boot_cpu_data.x86_cache_size;
- if (cachesize == -1)
- cachesize = 8; /* Pentiums */
-
- cacheflush_time = cpu_hz/1024*cachesize/5000;
- }
-
- printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
- (long)cacheflush_time/(cpu_hz/1000000),
- ((long)cacheflush_time*100/(cpu_hz/1000000)) % 100);
-}
-
-unsigned int prof_multiplier[NR_CPUS];
-unsigned int prof_old_multiplier[NR_CPUS];
-unsigned int prof_counter[NR_CPUS];
-
-/*
- * Cycle through the processors sending APIC IPIs to boot each.
- */
-
-void __init smp_boot_cpus(void)
-{
- int i;
-
-#ifdef CONFIG_MTRR
- /* Must be done before other processors booted */
- mtrr_init_boot_cpu ();
-#endif
- /*
- * Initialize the logical to physical CPU number mapping
- * and the per-CPU profiling counter/multiplier
- */
-
- for (i = 0; i < NR_CPUS; i++) {
- cpu_number_map[i] = -1;
- prof_counter[i] = 1;
- prof_old_multiplier[i] = 1;
- prof_multiplier[i] = 1;
- }
-
- /*
- * Setup boot CPU information
- */
-
- smp_store_cpu_info(boot_cpu_id); /* Final full version of the data */
- smp_tune_scheduling();
- printk("CPU%d: ", boot_cpu_id);
- print_cpu_info(&cpu_data[boot_cpu_id]);
-
- /*
- * not necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- * (and for the case when a non-SMP board boots an SMP kernel)
- */
- cpu_present_map |= (1 << hard_smp_processor_id());
-
- cpu_number_map[boot_cpu_id] = 0;
-
- init_idle();
-
- /*
- * If we couldnt find an SMP configuration at boot time,
- * get out of here now!
- */
-
- if (!smp_found_config)
- {
- printk(KERN_NOTICE "SMP motherboard not detected. Using dummy APIC emulation.\n");
-#ifndef CONFIG_VISWS
- io_apic_irqs = 0;
-#endif
- cpu_online_map = cpu_present_map;
- smp_num_cpus = 1;
- goto smp_done;
- }
-
- /*
- * If SMP should be disabled, then really disable it!
- */
-
- if (!max_cpus)
- {
- smp_found_config = 0;
- printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
- }
-
-#ifdef SMP_DEBUG
- {
- int reg;
-
- /*
- * This is to verify that we're looking at
- * a real local APIC. Check these against
- * your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-
- reg = apic_read(APIC_VERSION);
- SMP_PRINTK(("Getting VERSION: %x\n", reg));
-
- apic_write(APIC_VERSION, 0);
- reg = apic_read(APIC_VERSION);
- SMP_PRINTK(("Getting VERSION: %x\n", reg));
-
- /*
- * The two version reads above should print the same
- * NON-ZERO!!! numbers. If the second one is zero,
- * there is a problem with the APIC write/read
- * definitions.
- *
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
-
-
- reg = apic_read(APIC_LVT0);
- SMP_PRINTK(("Getting LVT0: %x\n", reg));
-
- reg = apic_read(APIC_LVT1);
- SMP_PRINTK(("Getting LVT1: %x\n", reg));
- }
-#endif
-
- enable_local_APIC();
-
- /*
- * Set up our local APIC timer:
- */
- setup_APIC_clock ();
-
- /*
- * Now scan the CPU present map and fire up the other CPUs.
- */
-
- /*
- * Add all detected CPUs. (later on we can down individual
- * CPUs which will change cpu_online_map but not necessarily
- * cpu_present_map. We are pretty much ready for hot-swap CPUs.)
- */
- cpu_online_map = cpu_present_map;
- mb();
-
- SMP_PRINTK(("CPU map: %lx\n", cpu_present_map));
-
- for(i=0;i<NR_CPUS;i++)
- {
- /*
- * Don't even attempt to start the boot CPU!
- */
- if (i == boot_cpu_id)
- continue;
-
- if ((cpu_online_map & (1 << i))
- && (max_cpus < 0 || max_cpus > cpucount+1))
- {
- do_boot_cpu(i);
- }
-
- /*
- * Make sure we unmap all failed CPUs
- */
-
- if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) {
- printk("CPU #%d not responding. Removing from cpu_online_map.\n",i);
- cpu_online_map &= ~(1 << i);
- }
- }
-
- /*
- * Cleanup possible dangling ends...
- */
-
-#ifndef CONFIG_VISWS
- {
- unsigned long cfg;
-
- /*
- * Install writable page 0 entry.
- */
- cfg = pg0[0];
- pg0[0] = _PAGE_RW | _PAGE_PRESENT; /* writeable, present, addr 0 */
- local_flush_tlb();
-
- /*
- * Paranoid: Set warm reset code and vector here back
- * to default values.
- */
-
- CMOS_WRITE(0, 0xf);
-
- *((volatile long *) phys_to_virt(0x467)) = 0;
-
- /*
- * Restore old page 0 entry.
- */
-
- pg0[0] = cfg;
- local_flush_tlb();
- }
-#endif
-
- /*
- * Allow the user to impress friends.
- */
-
- SMP_PRINTK(("Before bogomips.\n"));
- if (!cpucount) {
- printk(KERN_ERR "Error: only one processor found.\n");
- cpu_online_map = (1<<hard_smp_processor_id());
- } else {
- unsigned long bogosum = 0;
- for(i = 0; i < 32; i++)
- if (cpu_online_map&(1<<i))
- bogosum+=cpu_data[i].loops_per_sec;
- printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
- cpucount+1,
- (bogosum+2500)/500000,
- ((bogosum+2500)/5000)%100);
- SMP_PRINTK(("Before bogocount - setting activated=1.\n"));
- smp_activated = 1;
- }
- smp_num_cpus = cpucount + 1;
-
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
- SMP_PRINTK(("Boot done.\n"));
-
- cache_APIC_registers();
-#ifndef CONFIG_VISWS
- /*
- * Here we can be sure that there is an IO-APIC in the system. Let's
- * go and set it up:
- */
- if (!skip_ioapic_setup)
- setup_IO_APIC();
-#endif
-
-smp_done:
- /*
- * now we know the other CPUs have fired off and we know our
- * APIC ID, so we can go init the TSS and stuff:
- */
- cpu_init();
-}
-
+volatile unsigned long smp_invalidate_needed;
/*
* the following functions deal with sending IPIs between CPUs.
@@ -1429,17 +110,6 @@ smp_done:
* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
*/
-
-/*
- * Silly serialization to work around CPU bug in P5s.
- * We can safely turn it off on a 686.
- */
-#ifdef CONFIG_X86_GOOD_APIC
-# define FORCE_APIC_SERIALIZATION 0
-#else
-# define FORCE_APIC_SERIALIZATION 1
-#endif
-
static unsigned int cached_APIC_ICR;
static unsigned int cached_APIC_ICR2;
@@ -1462,7 +132,7 @@ void cache_APIC_registers (void)
static inline unsigned int __get_ICR (void)
{
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
/*
* Wait for the APIC to become ready - this should never occur. It's
* a debugging check really.
@@ -1473,11 +143,11 @@ static inline unsigned int __get_ICR (void)
while (count < 1000)
{
cfg = slow_ICR;
- if (!(cfg&(1<<12))) {
- if (count)
- atomic_add(count, (atomic_t*)&ipi_count);
+ if (!(cfg&(1<<12)))
return cfg;
- }
+ printk("CPU #%d: ICR still busy [%08x]\n",
+ smp_processor_id(), cfg);
+ irq_err_count++;
count++;
udelay(10);
}
@@ -1491,19 +161,25 @@ static inline unsigned int __get_ICR (void)
static inline unsigned int __get_ICR2 (void)
{
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
return slow_ICR2;
#else
return cached_APIC_ICR2;
#endif
}
+#define LOGICAL_DELIVERY 1
+
static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
unsigned int cfg;
cfg = __get_ICR();
- cfg |= APIC_DEST_DM_FIXED|shortcut|vector;
+ cfg |= APIC_DEST_DM_FIXED|shortcut|vector
+#if LOGICAL_DELIVERY
+ |APIC_DEST_LOGICAL
+#endif
+ ;
return cfg;
}
@@ -1513,7 +189,11 @@ static inline int __prepare_ICR2 (unsigned int dest)
unsigned int cfg;
cfg = __get_ICR2();
+#if LOGICAL_DELIVERY
+ cfg |= SET_APIC_DEST_FIELD((1<<dest));
+#else
cfg |= SET_APIC_DEST_FIELD(dest);
+#endif
return cfg;
}
@@ -1526,7 +206,7 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
* have to lock out interrupts to be safe. Otherwise it's just one
* single atomic write to the APIC, no need for cli/sti.
*/
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
unsigned long flags;
__save_flags(flags);
@@ -1536,21 +216,26 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
/*
* No need to touch the target chip field
*/
-
cfg = __prepare_ICR(shortcut, vector);
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write(APIC_ICR, cfg);
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
__restore_flags(flags);
#endif
}
static inline void send_IPI_allbutself(int vector)
{
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+ /*
+ * if there are no other CPUs in the system then
+ * we get an APIC send error if we try to broadcast.
+ * thus we have to avoid sending IPIs in this case.
+ */
+ if (smp_num_cpus > 1)
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
}
static inline void send_IPI_all(int vector)
@@ -1566,7 +251,7 @@ void send_IPI_self(int vector)
static inline void send_IPI_single(int dest, int vector)
{
unsigned long cfg;
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
unsigned long flags;
__save_flags(flags);
@@ -1589,7 +274,7 @@ static inline void send_IPI_single(int dest, int vector)
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write(APIC_ICR, cfg);
-#if FORCE_APIC_SERIALIZATION
+#if FORCE_READ_AROUND_WRITE
__restore_flags(flags);
#endif
}
@@ -1715,200 +400,97 @@ void smp_send_reschedule(int cpu)
}
/*
- * this function sends a 'stop' IPI to all other CPUs in the system.
- * it goes straight through.
- */
-
-void smp_send_stop(void)
-{
- send_IPI_allbutself(STOP_CPU_VECTOR);
-}
-
-/* Structure and data for smp_call_function(). This is designed to minimise
+ * Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
*/
-struct smp_call_function_struct {
+static volatile struct call_data_struct {
void (*func) (void *info);
void *info;
- atomic_t unstarted_count;
- atomic_t unfinished_count;
+ atomic_t started;
+ atomic_t finished;
int wait;
-};
-static volatile struct smp_call_function_struct *smp_call_function_data = NULL;
+} *call_data = NULL;
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
*/
-int smp_call_function (void (*func) (void *info), void *info, int retry,
- int wait)
-/* [SUMMARY] Run a function on all other CPUs.
- <func> The function to run. This must be fast and non-blocking.
- <info> An arbitrary pointer to pass to the function.
- <retry> If true, keep retrying until ready.
- <wait> If true, wait until function has completed on other CPUs.
- [RETURNS] 0 on success, else a negative status code. Does not return until
- remote CPUs are nearly ready to execute <<func>> or are or have executed.
-*/
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+ int wait)
+/*
+ * [SUMMARY] Run a function on all other CPUs.
+ * <func> The function to run. This must be fast and non-blocking.
+ * <info> An arbitrary pointer to pass to the function.
+ * <nonatomic> If true, we might schedule away to lock the mutex
+ * <wait> If true, wait (atomically) until function has completed on other CPUs.
+ * [RETURNS] 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ */
{
+ struct call_data_struct data;
+ int ret, cpus = smp_num_cpus-1;
+ static DECLARE_MUTEX(lock);
unsigned long timeout;
- struct smp_call_function_struct data;
- static spinlock_t lock = SPIN_LOCK_UNLOCKED;
-
- if (retry) {
- while (1) {
- if (smp_call_function_data) {
- schedule (); /* Give a mate a go */
- continue;
- }
- spin_lock (&lock);
- if (smp_call_function_data) {
- spin_unlock (&lock); /* Bad luck */
- continue;
- }
- /* Mine, all mine! */
- break;
- }
- }
- else {
- if (smp_call_function_data) return -EBUSY;
- spin_lock (&lock);
- if (smp_call_function_data) {
- spin_unlock (&lock);
+
+ if (nonatomic)
+ down(&lock);
+ else
+ if (down_trylock(&lock))
return -EBUSY;
- }
- }
- smp_call_function_data = &data;
- spin_unlock (&lock);
+
+ if (call_data) // temporary debugging check
+ BUG();
+
+ call_data = &data;
data.func = func;
data.info = info;
- atomic_set (&data.unstarted_count, smp_num_cpus - 1);
+ atomic_set(&data.started, 0);
data.wait = wait;
- if (wait) atomic_set (&data.unfinished_count, smp_num_cpus - 1);
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself (CALL_FUNCTION_VECTOR);
- /* Wait for response */
- timeout = jiffies + JIFFIE_TIMEOUT;
- while ( (atomic_read (&data.unstarted_count) > 0) &&
- time_before (jiffies, timeout) )
- barrier ();
- if (atomic_read (&data.unstarted_count) > 0) {
- smp_call_function_data = NULL;
- return -ETIMEDOUT;
- }
if (wait)
- while (atomic_read (&data.unfinished_count) > 0)
- barrier ();
- smp_call_function_data = NULL;
+ atomic_set(&data.finished, 0);
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ timeout = jiffies + HZ;
+ while ((atomic_read(&data.started) != cpus)
+ && time_before(jiffies, timeout))
+ barrier();
+ ret = -ETIMEDOUT;
+ if (atomic_read(&data.started) != cpus)
+ goto out;
+ ret = 0;
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ barrier();
+out:
+ call_data = NULL;
+ up(&lock);
return 0;
}
-static unsigned int calibration_result;
-
-void setup_APIC_timer(unsigned int clocks);
-
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-void smp_local_timer_interrupt(struct pt_regs * regs)
+static void stop_this_cpu (void * dummy)
{
- int user = (user_mode(regs) != 0);
- int cpu = smp_processor_id();
-
/*
- * The profiling function is SMP safe. (nothing can mess
- * around with "current", and the profiling counters are
- * updated with atomic operations). This is especially
- * useful with a profiling multiplier != 1
+ * Remove this CPU:
*/
- if (!user)
- x86_do_profile(regs->eip);
-
- if (!--prof_counter[cpu]) {
- int system = 1 - user;
- struct task_struct * p = current;
-
- /*
- * The multiplier may have changed since the last time we got
- * to this point as a result of the user writing to
- * /proc/profile. In this case we need to adjust the APIC
- * timer accordingly.
- *
- * Interrupts are already masked off at this point.
- */
- prof_counter[cpu] = prof_multiplier[cpu];
- if (prof_counter[cpu] != prof_old_multiplier[cpu]) {
- setup_APIC_timer(calibration_result/prof_counter[cpu]);
- prof_old_multiplier[cpu] = prof_counter[cpu];
- }
-
- /*
- * After doing the above, we need to make like
- * a normal interrupt - otherwise timer interrupts
- * ignore the global interrupt lock, which is the
- * WrongThing (tm) to do.
- */
-
- irq_enter(cpu, 0);
- update_one_process(p, 1, user, system, cpu);
- if (p->pid) {
- p->counter -= 1;
- if (p->counter <= 0) {
- p->counter = 0;
- p->need_resched = 1;
- }
- if (p->priority < DEF_PRIORITY) {
- kstat.cpu_nice += user;
- kstat.per_cpu_nice[cpu] += user;
- } else {
- kstat.cpu_user += user;
- kstat.per_cpu_user[cpu] += user;
- }
- kstat.cpu_system += system;
- kstat.per_cpu_system[cpu] += system;
-
- }
- irq_exit(cpu, 0);
- }
+ clear_bit(smp_processor_id(), &cpu_online_map);
- /*
- * We take the 'long' return path, and there every subsystem
- * grabs the apropriate locks (kernel lock/ irq lock).
- *
- * we might want to decouple profiling from the 'long path',
- * and do the profiling totally in assembly.
- *
- * Currently this isn't too much of an issue (performance wise),
- * we can take more than 100K local irqs per second on a 100 MHz P5.
- */
+ if (cpu_data[smp_processor_id()].hlt_works_ok)
+ for(;;) __asm__("hlt");
+ for (;;);
}
/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesnt support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
+ * this function calls the 'stop' function on all other CPUs in the system.
*/
-void smp_apic_timer_interrupt(struct pt_regs * regs)
+
+void smp_send_stop(void)
{
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow, and we
- * want to be able to accept NMI tlb invalidates
- * during this time.
- */
- ack_APIC_irq();
- smp_local_timer_interrupt(regs);
+ smp_call_function(stop_this_cpu, NULL, 1, 0);
}
/*
@@ -1944,39 +526,24 @@ asmlinkage void smp_invalidate_interrupt(void)
}
-static void stop_this_cpu (void)
+asmlinkage void smp_call_function_interrupt(void)
{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ ack_APIC_irq();
/*
- * Remove this CPU:
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
*/
- clear_bit(smp_processor_id(), &cpu_online_map);
-
- if (cpu_data[smp_processor_id()].hlt_works_ok)
- for(;;) __asm__("hlt");
- for (;;);
-}
-
-/*
- * CPU halt call-back
- */
-asmlinkage void smp_stop_cpu_interrupt(void)
-{
- stop_this_cpu();
-}
-
-asmlinkage void smp_call_function_interrupt(void)
-{
- void (*func) (void *info) = smp_call_function_data->func;
- void *info = smp_call_function_data->info;
- int wait = smp_call_function_data->wait;
-
- ack_APIC_irq ();
- /* Notify initiating CPU that I've grabbed the data and am about to
- execute the function */
- atomic_dec (&smp_call_function_data->unstarted_count);
- /* At this point the structure may be out of scope unless wait==1 */
- (*func) (info);
- if (wait) atomic_dec (&smp_call_function_data->unfinished_count);
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the structure may be out of scope unless wait==1
+ */
+ (*func)(info);
+ if (wait)
+ atomic_inc(&call_data->finished);
}
/*
@@ -1991,6 +558,34 @@ asmlinkage void smp_spurious_interrupt(void)
}
/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+
+static spinlock_t err_lock;
+
+asmlinkage void smp_error_interrupt(void)
+{
+ unsigned long v;
+
+ spin_lock(&err_lock);
+
+ v = apic_read(APIC_ESR);
+ printk("APIC error interrupt on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ printk("... APIC ESR0: %08lx\n", v);
+
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ printk("... APIC ESR1: %08lx\n", v);
+
+ ack_APIC_irq();
+
+ irq_err_count++;
+
+ spin_unlock(&err_lock);
+}
+
+/*
* This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
* per second. We assume that the caller has already set up the local
* APIC.
@@ -1999,6 +594,10 @@ asmlinkage void smp_spurious_interrupt(void)
* closely follows bus clocks.
*/
+int prof_multiplier[NR_CPUS] = { 1, };
+int prof_old_multiplier[NR_CPUS] = { 1, };
+int prof_counter[NR_CPUS] = { 1, };
+
/*
* The timer chip is already set up at HZ interrupts per second here,
* but we do not accept timer interrupts yet. We only allow the BP
@@ -2015,66 +614,102 @@ static unsigned int __init get_8254_timer_count(void)
return count;
}
+void __init wait_8254_wraparound(void)
+{
+ unsigned int curr_count, prev_count=~0;
+ int delta;
+
+ curr_count = get_8254_timer_count();
+
+ do {
+ prev_count = curr_count;
+ curr_count = get_8254_timer_count();
+ delta = curr_count-prev_count;
+
+ /*
+ * This limit for delta seems arbitrary, but it isn't, it's
+ * slightly above the level of error a buggy Mercury/Neptune
+ * chipset timer can cause.
+ */
+
+ } while (delta<300);
+}
+
/*
* This function sets up the local APIC timer, with a timeout of
* 'clocks' APIC bus clock. During calibration we actually call
- * this function twice, once with a bogus timeout value, second
- * time for real. The other (noncalibrating) CPUs call this
- * function only once, with the real value.
- *
- * We are strictly in irqs off mode here, as we do not want to
- * get an APIC interrupt go off accidentally.
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
*
* We do reads before writes even if unnecessary, to get around the
- * APIC double write bug.
+ * P5 APIC double write bug.
*/
#define APIC_DIVISOR 16
-void setup_APIC_timer(unsigned int clocks)
+void __setup_APIC_LVTT(unsigned int clocks)
{
- unsigned long lvtt1_value;
- unsigned int tmp_value;
+ unsigned int lvtt1_value, tmp_value;
- /*
- * Unfortunately the local APIC timer cannot be set up into NMI
- * mode. With the IO APIC we can re-route the external timer
- * interrupt and broadcast it as an NMI to all CPUs, so no pain.
- */
tmp_value = apic_read(APIC_LVTT);
- lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
- apic_write(APIC_LVTT , lvtt1_value);
+ lvtt1_value = SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV) |
+ APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+ apic_write(APIC_LVTT, lvtt1_value);
/*
* Divide PICLK by 16
*/
tmp_value = apic_read(APIC_TDCR);
- apic_write(APIC_TDCR , (tmp_value & ~APIC_TDR_DIV_1 )
- | APIC_TDR_DIV_16);
+ apic_write(APIC_TDCR, (tmp_value
+ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+ | APIC_TDR_DIV_16);
tmp_value = apic_read(APIC_TMICT);
apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
}
-void __init wait_8254_wraparound(void)
+void setup_APIC_timer(void * data)
{
- unsigned int curr_count, prev_count=~0;
+ unsigned int clocks = (unsigned int) data, slice, t0, t1, nr;
+ unsigned long flags;
int delta;
- curr_count = get_8254_timer_count();
-
- do {
- prev_count = curr_count;
- curr_count = get_8254_timer_count();
- delta = curr_count-prev_count;
+ __save_flags(flags);
+ __sti();
+ /*
+ * ok, Intel has some smart code in their APIC that knows
+ * if a CPU was in 'hlt' lowpower mode, and this increases
+ * its APIC arbitration priority. To avoid the external timer
+ * IRQ APIC event being in synchron with the APIC clock we
+ * introduce an interrupt skew to spread out timer events.
+ *
+ * The number of slices within a 'big' timeslice is smp_num_cpus+1
+ */
+ slice = clocks / (smp_num_cpus+1);
+ nr = cpu_number_map[smp_processor_id()] + 1;
+ printk("cpu: %d, clocks: %d, slice: %d, nr: %d.\n",
+ smp_processor_id(), clocks, slice, nr);
/*
- * This limit for delta seems arbitrary, but it isn't, it's
- * slightly above the level of error a buggy Mercury/Neptune
- * chipset timer can cause.
+ * Wait for IRQ0's slice:
*/
+ wait_8254_wraparound();
- } while (delta<300);
+ __setup_APIC_LVTT(clocks);
+
+ t0 = apic_read(APIC_TMCCT)*APIC_DIVISOR;
+ do {
+ t1 = apic_read(APIC_TMCCT)*APIC_DIVISOR;
+ delta = (int)(t0 - t1 - slice*nr);
+ } while (delta < 0);
+
+ __setup_APIC_LVTT(clocks);
+
+ printk("CPU%d<C0:%d,C:%d,D:%d,S:%d,C:%d>\n",
+ smp_processor_id(), t0, t1, delta, slice, clocks);
+
+ __restore_flags(flags);
}
/*
@@ -2092,10 +727,11 @@ void __init wait_8254_wraparound(void)
int __init calibrate_APIC_clock(void)
{
- unsigned long long t1,t2;
- long tt1,tt2;
- long calibration_result;
+ unsigned long long t1 = 0, t2 = 0;
+ long tt1, tt2;
+ long result;
int i;
+ const int LOOPS = HZ/10;
printk("calibrating APIC timer ... ");
@@ -2104,7 +740,7 @@ int __init calibrate_APIC_clock(void)
* value into the APIC clock, we just want to get the
* counter running for calibration.
*/
- setup_APIC_timer(1000000000);
+ __setup_APIC_LVTT(1000000000);
/*
* The timer chip counts down to zero. Let's wait
@@ -2112,23 +748,24 @@ int __init calibrate_APIC_clock(void)
* (the current tick might have been already half done)
*/
- wait_8254_wraparound ();
+ wait_8254_wraparound();
/*
* We wrapped around just now. Let's start:
*/
- rdtscll(t1);
- tt1=apic_read(APIC_TMCCT);
+ if (cpu_has_tsc)
+ rdtscll(t1);
+ tt1 = apic_read(APIC_TMCCT);
-#define LOOPS (HZ/10)
/*
* Let's wait LOOPS wraprounds:
*/
- for (i=0; i<LOOPS; i++)
- wait_8254_wraparound ();
+ for (i = 0; i < LOOPS; i++)
+ wait_8254_wraparound();
- tt2=apic_read(APIC_TMCCT);
- rdtscll(t2);
+ tt2 = apic_read(APIC_TMCCT);
+ if (cpu_has_tsc)
+ rdtscll(t2);
/*
* The APIC bus clock counter is 32 bits only, it
@@ -2138,71 +775,37 @@ int __init calibrate_APIC_clock(void)
* underflown to be exact, as the timer counts down ;)
*/
- calibration_result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
- SMP_PRINTK(("\n..... %ld CPU clocks in 1 timer chip tick.",
- (unsigned long)(t2-t1)/LOOPS));
-
- SMP_PRINTK(("\n..... %ld APIC bus clocks in 1 timer chip tick.",
- calibration_result));
+ result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
+ if (cpu_has_tsc)
+ printk("\n..... CPU clock speed is %ld.%04ld MHz.\n",
+ ((long)(t2-t1)/LOOPS)/(1000000/HZ),
+ ((long)(t2-t1)/LOOPS)%(1000000/HZ));
- printk("\n..... CPU clock speed is %ld.%04ld MHz.\n",
- ((long)(t2-t1)/LOOPS)/(1000000/HZ),
- ((long)(t2-t1)/LOOPS)%(1000000/HZ) );
+ printk("..... host bus clock speed is %ld.%04ld MHz.\n",
+ result/(1000000/HZ),
+ result%(1000000/HZ));
- printk("..... system bus clock speed is %ld.%04ld MHz.\n",
- calibration_result/(1000000/HZ),
- calibration_result%(1000000/HZ) );
-#undef LOOPS
-
- return calibration_result;
+ return result;
}
-void __init setup_APIC_clock(void)
+static unsigned int calibration_result;
+
+void __init setup_APIC_clocks(void)
{
unsigned long flags;
- static volatile int calibration_lock;
-
__save_flags(flags);
__cli();
- SMP_PRINTK(("setup_APIC_clock() called.\n"));
-
- /*
- * [ setup_APIC_clock() is called from all CPUs, but we want
- * to do this part of the setup only once ... and it fits
- * here best ]
- */
- if (!test_and_set_bit(0,&calibration_lock)) {
-
- calibration_result=calibrate_APIC_clock();
- /*
- * Signal completion to the other CPU[s]:
- */
- calibration_lock = 3;
-
- } else {
- /*
- * Other CPU is calibrating, wait for finish:
- */
- SMP_PRINTK(("waiting for other CPU calibrating APIC ... "));
- while (calibration_lock == 1);
- SMP_PRINTK(("done, continuing.\n"));
- }
-
-/*
- * Now set up the timer for real.
- */
+ calibration_result = calibrate_APIC_clock();
- setup_APIC_timer (calibration_result);
+ smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1);
/*
- * We ACK the APIC, just in case there is something pending.
+ * Now set up the timer for real.
*/
-
- ack_APIC_irq ();
+ setup_APIC_timer((void *)calibration_result);
__restore_flags(flags);
}
@@ -2224,9 +827,9 @@ int setup_profiling_timer(unsigned int multiplier)
return -EINVAL;
/*
- * Set the new multiplier for each CPU. CPUs don't start using the
+ * Set the new multiplier for each CPU. CPUs don't start using the
* new values until the next timer interrupt in which they do process
- * accounting. At that time they also adjust their APIC timers
+ * accounting. At that time they also adjust their APIC timers
* accordingly.
*/
for (i = 0; i < NR_CPUS; ++i)
@@ -2237,3 +840,111 @@ int setup_profiling_timer(unsigned int multiplier)
#undef APIC_DIVISOR
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick, statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+
+inline void smp_local_timer_interrupt(struct pt_regs * regs)
+{
+ int user = (user_mode(regs) != 0);
+ int cpu = smp_processor_id();
+
+ /*
+ * The profiling function is SMP safe. (nothing can mess
+ * around with "current", and the profiling counters are
+ * updated with atomic operations). This is especially
+ * useful with a profiling multiplier != 1
+ */
+ if (!user)
+ x86_do_profile(regs->eip);
+
+ if (--prof_counter[cpu] <= 0) {
+ int system = 1 - user;
+ struct task_struct * p = current;
+
+ /*
+ * The multiplier may have changed since the last time we got
+ * to this point as a result of the user writing to
+ * /proc/profile. In this case we need to adjust the APIC
+ * timer accordingly.
+ *
+ * Interrupts are already masked off at this point.
+ */
+ prof_counter[cpu] = prof_multiplier[cpu];
+ if (prof_counter[cpu] != prof_old_multiplier[cpu]) {
+ __setup_APIC_LVTT(calibration_result/prof_counter[cpu]);
+ prof_old_multiplier[cpu] = prof_counter[cpu];
+ }
+
+ /*
+ * After doing the above, we need to make like
+ * a normal interrupt - otherwise timer interrupts
+ * ignore the global interrupt lock, which is the
+ * WrongThing (tm) to do.
+ */
+
+ irq_enter(cpu, 0);
+ update_one_process(p, 1, user, system, cpu);
+ if (p->pid) {
+ p->counter -= 1;
+ if (p->counter <= 0) {
+ p->counter = 0;
+ p->need_resched = 1;
+ }
+ if (p->priority < DEF_PRIORITY) {
+ kstat.cpu_nice += user;
+ kstat.per_cpu_nice[cpu] += user;
+ } else {
+ kstat.cpu_user += user;
+ kstat.per_cpu_user[cpu] += user;
+ }
+ kstat.cpu_system += system;
+ kstat.per_cpu_system[cpu] += system;
+
+ }
+ irq_exit(cpu, 0);
+ }
+
+ /*
+ * We take the 'long' return path, and there every subsystem
+ * grabs the apropriate locks (kernel lock/ irq lock).
+ *
+ * we might want to decouple profiling from the 'long path',
+ * and do the profiling totally in assembly.
+ *
+ * Currently this isn't too much of an issue (performance wise),
+ * we can take more than 100K local irqs per second on a 100 MHz P5.
+ */
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesnt support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+unsigned int apic_timer_irqs [NR_CPUS] = { 0, };
+
+void smp_apic_timer_interrupt(struct pt_regs * regs)
+{
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+ apic_timer_irqs[smp_processor_id()]++;
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+ smp_local_timer_interrupt(regs);
+}
+
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
new file mode 100644
index 000000000..46335ee8f
--- /dev/null
+++ b/arch/i386/kernel/smpboot.c
@@ -0,0 +1,1650 @@
+/*
+ * Intel MP v1.1/v1.4 specification compliant parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Much of the core SMP work is based on previous work by Thomas Radke, to
+ * whom a great many thanks are extended.
+ *
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
+ * Original development of Linux SMP code supported by Caldera.
+ *
+ * This code is released under the GNU public license version 2 or
+ * later.
+ *
+ * Fixes
+ * Felix Koop : NR_CPUS used properly
+ * Jose Renau : Handle single CPU case.
+ * Alan Cox : By repeated request 8) - Total BogoMIP report.
+ * Greg Wright : Fix for kernel stacks panic.
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map.
+ * Michel Lespinasse : Changes for 2.1 kernel map.
+ * Michael Chastain : Change trampoline.S to gnu as.
+ * Alan Cox : Dumb bug: 'B' step PPro's are fine
+ * Ingo Molnar : Added APIC timers, based on code
+ * from Jose Renau
+ * Alan Cox : Added EBDA scanning
+ * Ingo Molnar : various cleanups and rewrites
+ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
+ * Maciej W. Rozycki : Bits for genuine 82489DX timers
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/irq.h>
+
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <asm/mtrr.h>
+
+/* Set if we find a B stepping CPU */
+static int smp_b_stepping = 0;
+
+/* Setup configured maximum number of CPUs to activate */
+static int max_cpus = -1;
+/* 1 if "noapic" boot option passed */
+int skip_ioapic_setup = 0;
+
+/* Total count of live CPUs */
+int smp_num_cpus = 0;
+/* Internal processor count */
+static unsigned int num_processors = 1;
+
+/* Have we found an SMP box */
+int smp_found_config = 0;
+
+/* Bitmask of physically existing CPUs */
+unsigned long cpu_present_map = 0;
+/* Bitmask of currently online CPUs */
+unsigned long cpu_online_map = 0;
+
+/* which CPU maps to which logical number */
+volatile int cpu_number_map[NR_CPUS];
+/* which logical number maps to which CPU */
+volatile int __cpu_logical_map[NR_CPUS];
+
+static volatile unsigned long cpu_callin_map = 0;
+static volatile unsigned long cpu_callout_map = 0;
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS];
+/* Processor that is doing the boot up */
+static unsigned int boot_cpu_id = 0;
+
+/* Tripped once we need to start cross invalidating */
+static int smp_activated = 0;
+/* Set when the idlers are all forked */
+int smp_threads_ready = 0;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+int apic_version [NR_CPUS];
+int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
+extern int nr_ioapics;
+extern struct mpc_config_ioapic mp_ioapics [MAX_IO_APICS];
+extern int mp_irq_entries;
+extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
+extern int mpc_default_type;
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, };
+int mp_current_pci_id = 0;
+unsigned long mp_lapic_addr = 0;
+int pic_mode;
+
+extern void cache_APIC_registers (void);
+
+#define SMP_DEBUG 1
+
+#if SMP_DEBUG
+#define dprintk(x...) printk(##x)
+#else
+#define dprintk(x...)
+#endif
+
+/*
+ * IA s/w dev Vol 3, Section 7.4
+ */
+#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+static int __init nosmp(char *str)
+{
+ max_cpus = 0;
+ return 1;
+}
+
+__setup("nosmp", nosmp);
+
+static int __init maxcpus(char *str)
+{
+ get_option(&str, &max_cpus);
+ return 1;
+}
+
+__setup("maxcpus=", maxcpus);
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+#ifndef CONFIG_X86_VISWS_APIC
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum=0;
+ while(len--)
+ sum+=*mp++;
+ return sum&0xFF;
+}
+
+/*
+ * Processor encoding in an MP configuration block
+ */
+
+static char __init *mpc_family(int family,int model)
+{
+ static char n[32];
+ static char *model_defs[]=
+ {
+ "80486DX","80486DX",
+ "80486SX","80486DX/2 or 80487",
+ "80486SL","80486SX/2",
+ "Unknown","80486DX/2-WB",
+ "80486DX/4","80486DX/4-WB"
+ };
+
+ switch (family) {
+ case 0x04:
+ if (model < 10)
+ return model_defs[model];
+ break;
+
+ case 0x05:
+ return("Pentium(tm)");
+
+ case 0x06:
+ return("Pentium(tm) Pro");
+
+ case 0x0F:
+ if (model == 0x0F)
+ return("Special controller");
+ }
+ sprintf(n,"Unknown CPU [%d:%d]",family, model);
+ return n;
+}
+
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+ int ver;
+
+ if (!(m->mpc_cpuflag & CPU_ENABLED))
+ return;
+
+ printk("Processor #%d %s APIC version %d\n",
+ m->mpc_apicid,
+ mpc_family( (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8 ,
+ (m->mpc_cpufeature & CPU_MODEL_MASK)>>4),
+ m->mpc_apicver);
+
+#ifdef SMP_DEBUG
+ if (m->mpc_featureflag&(1<<0))
+ printk(" Floating point unit present.\n");
+ if (m->mpc_featureflag&(1<<7))
+ printk(" Machine Exception supported.\n");
+ if (m->mpc_featureflag&(1<<8))
+ printk(" 64 bit compare & exchange supported.\n");
+ if (m->mpc_featureflag&(1<<9))
+ printk(" Internal APIC present.\n");
+#endif
+
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ dprintk(" Bootup CPU\n");
+ boot_cpu_id = m->mpc_apicid;
+ } else
+ /* Boot CPU already counted */
+ num_processors++;
+
+ if (m->mpc_apicid > NR_CPUS) {
+ printk("Processor #%d unused. (Max %d processors).\n",
+ m->mpc_apicid, NR_CPUS);
+ return;
+ }
+ ver = m->mpc_apicver;
+
+ cpu_present_map |= (1<<m->mpc_apicid);
+ /*
+ * Validate version
+ */
+ if (ver == 0x0) {
+ printk("BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
+ ver = 0x10;
+ }
+ apic_version[m->mpc_apicid] = ver;
+}
+
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+ char str[7];
+
+ memcpy(str, m->mpc_bustype, 6);
+ str[6] = 0;
+ dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+
+ if (strncmp(str, "ISA", 3) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+ } else {
+ if (strncmp(str, "EISA", 4) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ } else {
+ if (strncmp(str, "PCI", 3) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+ mp_current_pci_id++;
+ } else {
+ printk("Unknown bustype %s\n", str);
+ panic("cannot handle bus - mail to linux-smp@vger.rutgers.edu");
+ } } }
+}
+
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+ if (!(m->mpc_flags & MPC_APIC_USABLE))
+ return;
+
+ printk("I/O APIC #%d Version %d at 0x%lX.\n",
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk("Max # of I/O APICs (%d) exceeded (found %d).\n",
+ MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+ }
+ mp_ioapics[nr_ioapics] = *m;
+ nr_ioapics++;
+}
+
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+ mp_irqs [mp_irq_entries] = *m;
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+ /*
+ * Well it seems all SMP boards in existence
+ * use ExtINT/LVT1 == LINT0 and
+ * NMI/LVT2 == LINT1 - the following check
+	 * will show us if this assumption is false.
+ * Until then we do not have to add baggage.
+ */
+ if ((m->mpc_irqtype == mp_ExtINT) &&
+ (m->mpc_destapiclint != 0))
+ BUG();
+ if ((m->mpc_irqtype == mp_NMI) &&
+ (m->mpc_destapiclint != 1))
+ BUG();
+}
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+ char str[16];
+ int count=sizeof(*mpc);
+ unsigned char *mpt=((unsigned char *)mpc)+count;
+
+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
+ {
+ panic("SMP mptable: bad signature [%c%c%c%c]!\n",
+ mpc->mpc_signature[0],
+ mpc->mpc_signature[1],
+ mpc->mpc_signature[2],
+ mpc->mpc_signature[3]);
+ return 1;
+ }
+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
+ {
+ panic("SMP mptable: checksum error!\n");
+ return 1;
+ }
+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
+ {
+ printk("Bad Config Table version (%d)!!\n",mpc->mpc_spec);
+ return 1;
+ }
+ memcpy(str,mpc->mpc_oem,8);
+ str[8]=0;
+ printk("OEM ID: %s ",str);
+
+ memcpy(str,mpc->mpc_productid,12);
+ str[12]=0;
+ printk("Product ID: %s ",str);
+
+ printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+
+ /* save the local APIC address, it might be non-default */
+ mp_lapic_addr = mpc->mpc_lapic;
+
+ /*
+ * Now process the configuration blocks.
+ */
+ while (count < mpc->mpc_length) {
+ switch(*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m=
+ (struct mpc_config_processor *)mpt;
+ MP_processor_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m=
+ (struct mpc_config_bus *)mpt;
+ MP_bus_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ struct mpc_config_ioapic *m=
+ (struct mpc_config_ioapic *)mpt;
+ MP_ioapic_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ case MP_INTSRC:
+ {
+ struct mpc_config_intsrc *m=
+ (struct mpc_config_intsrc *)mpt;
+
+ MP_intsrc_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m=
+ (struct mpc_config_lintsrc *)mpt;
+ MP_lintsrc_info(m);
+ mpt+=sizeof(*m);
+ count+=sizeof(*m);
+ break;
+ }
+ }
+ }
+ return num_processors;
+}
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+static int __init smp_get_mpf(struct intel_mp_floating *mpf)
+{
+ printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+ if (mpf->mpf_feature2 & (1<<7)) {
+ printk(" IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ } else {
+ printk(" Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ }
+ smp_found_config = 1;
+ /*
+ * default CPU id - if it's different in the mptable
+ * then we change it before first using it.
+ */
+ boot_cpu_id = 0;
+ /*
+ * Now see if we need to read further.
+ */
+ if (mpf->mpf_feature1 != 0) {
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ cpu_present_map = 3;
+ num_processors = 2;
+
+ nr_ioapics = 1;
+ mp_ioapics[0].mpc_apicaddr = 0xFEC00000;
+ /*
+ * Save the default type number, we
+ * need it later to set the IO-APIC
+ * up properly:
+ */
+ mpc_default_type = mpf->mpf_feature1;
+
+ printk("Bus #0 is ");
+ }
+
+ switch (mpf->mpf_feature1) {
+ case 1:
+ case 5:
+ printk("ISA\n");
+ break;
+ case 2:
+ printk("EISA with no IRQ0 and no IRQ13 DMA chaining\n");
+ break;
+ case 6:
+ case 3:
+ printk("EISA\n");
+ break;
+ case 4:
+ case 7:
+ printk("MCA\n");
+ break;
+ case 0:
+ if (!mpf->mpf_physptr)
+ BUG();
+ break;
+ default:
+ printk("???\nUnknown standard configuration %d\n",
+ mpf->mpf_feature1);
+ return 1;
+ }
+ if (mpf->mpf_feature1 > 4) {
+ printk("Bus #1 is PCI\n");
+
+ /*
+ * Set local APIC version to the integrated form.
+ * It's initialized to zero otherwise, representing
+ * a discrete 82489DX.
+ */
+ apic_version[0] = 0x10;
+ apic_version[1] = 0x10;
+ }
+ /*
+ * Read the physical hardware table. Anything here will override the
+ * defaults.
+ */
+ if (mpf->mpf_physptr)
+ smp_read_mpc((void *)mpf->mpf_physptr);
+
+ __cpu_logical_map[0] = boot_cpu_id;
+ global_irq_holder = boot_cpu_id;
+ current->processor = boot_cpu_id;
+
+ printk("Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+ return 1;
+}
+
+static int __init smp_scan_config(unsigned long base, unsigned long length)
+{
+ unsigned long *bp = phys_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+ dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+ if (sizeof(*mpf) != 16)
+ printk("Error: MPF size\n");
+
+ while (length > 0) {
+ mpf = (struct intel_mp_floating *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4)) ) {
+
+ printk("found SMP MP-table at %08ld\n",
+ virt_to_phys(mpf));
+ smp_get_mpf(mpf);
+ return 1;
+ }
+ bp += 4;
+ length -= 16;
+ }
+ return 0;
+}
+
+void __init init_intel_smp (void)
+{
+ unsigned int address;
+
+ /*
+ * FIXME: Linux assumes you have 640K of base ram..
+ * this continues the error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0,0x400) ||
+ smp_scan_config(639*0x400,0x400) ||
+ smp_scan_config(0xF0000,0x10000))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA/MCA bus machine with an
+ * extended bios data area.
+ *
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E, calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
+ */
+
+ address = *(unsigned short *)phys_to_virt(0x40E);
+ address <<= 4;
+ smp_scan_config(address, 0x1000);
+ if (smp_found_config)
+ printk(KERN_WARNING "WARNING: MP table in the EBDA can be UNSAFE, contact linux-smp@vger.rutgers.edu if you experience SMP problems!\n");
+}
+
+#else
+
+/*
+ * The Visual Workstation is Intel MP compliant in the hardware
+ * sense, but it doesn't have a BIOS(-configuration table).
+ * No problem for Linux.
+ */
+void __init init_visws_smp(void)
+{
+ smp_found_config = 1;
+
+ cpu_present_map |= 2; /* or in id 1 */
+ apic_version[1] |= 0x10; /* integrated APIC */
+ apic_version[0] |= 0x10;
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+}
+
+#endif
+
+/*
+ * - Intel MP Configuration Table
+ * - or SGI Visual Workstation configuration
+ */
+void __init init_smp_config (void)
+{
+#ifndef CONFIG_VISWS
+ init_intel_smp();
+#else
+ init_visws_smp();
+#endif
+}
+
+
+
+/*
+ * Trampoline 80x86 program as an array.
+ */
+
+extern unsigned char trampoline_data [];
+extern unsigned char trampoline_end [];
+static unsigned char *trampoline_base;
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ */
+
+static unsigned long __init setup_trampoline(void)
+{
+ memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
+ return virt_to_phys(trampoline_base);
+}
+
+/*
+ * We are called very early to get the low memory for the
+ * SMP bootup trampoline page.
+ */
+unsigned long __init smp_alloc_memory(unsigned long mem_base)
+{
+ if (virt_to_phys((void *)mem_base) >= 0x9F000)
+ BUG();
+ trampoline_base = (void *)mem_base;
+ return mem_base + PAGE_SIZE;
+}
+
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
+
+void __init smp_store_cpu_info(int id)
+{
+ struct cpuinfo_x86 *c=&cpu_data[id];
+
+ *c = boot_cpu_data;
+ c->pte_quick = 0;
+ c->pgd_quick = 0;
+ c->pgtable_cache_sz = 0;
+ identify_cpu(c);
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3)
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ smp_b_stepping = 1;
+}
+
+/*
+ * Architecture specific routine called by the kernel just before init is
+ * fired off. This allows the BP to have everything in order [we hope].
+ * At the end of this all the APs will hit the system scheduling and off
+ * we go. Each AP will load the system gdt's and jump through the kernel
+ * init into idle(). At this point the scheduler will one day take over
+ * and give them jobs to do. smp_callin is a standard routine
+ * we use to track CPUs as they power up.
+ */
+
+static atomic_t smp_commenced = ATOMIC_INIT(0);
+
+void __init smp_commence(void)
+{
+ /*
+ * Lets the callins below out of their loop.
+ */
+ dprintk("Setting commenced=1, go go go\n");
+
+ wmb();
+ atomic_set(&smp_commenced,1);
+}
+
+extern void __error_in_io_apic_c(void);
+
+
+int get_maxlvt(void)
+{
+ unsigned int v, ver, maxlvt;
+
+ v = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(v);
+ /* 82489DXs do not report # of LVT entries. */
+ maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
+ return maxlvt;
+}
+
+void __init setup_local_APIC(void)
+{
+ unsigned long value, ver, maxlvt;
+
+ if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
+ __error_in_io_apic_c();
+
+ value = apic_read(APIC_SPIV);
+ value = 0xf;
+ /*
+ * Enable APIC
+ */
+ value |= (1<<8);
+#if 0
+ /* Enable focus processor (bit==0) */
+ value &= ~(1<<9);
+#else
+ /* Disable focus processor (bit==1) */
+ value |= (1<<9);
+#endif
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV,value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the BP's LINT0. This is not
+	 * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ if (hard_smp_processor_id() == boot_cpu_id) {
+ value = 0x00000700;
+ printk("enabled ExtINT on CPU#%d\n", hard_smp_processor_id());
+ } else {
+ value = 0x00010700;
+ printk("masked ExtINT on CPU#%d\n", hard_smp_processor_id());
+ }
+ apic_write_around(APIC_LVT0,value);
+
+ /*
+ * only the BP should see the LINT1 NMI signal, obviously.
+ */
+ if (hard_smp_processor_id() == boot_cpu_id)
+ value = 0x00000400; // unmask NMI
+ else
+ value = 0x00010400; // mask NMI
+ apic_write_around(APIC_LVT1,value);
+
+ value = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(value);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ maxlvt = get_maxlvt();
+ /*
+ * Due to the Pentium erratum 3AP.
+ */
+ if (maxlvt > 3) {
+ apic_readaround(APIC_SPIV); // not strictly necessery
+ apic_write(APIC_ESR, 0);
+ }
+ value = apic_read(APIC_ESR);
+ printk("ESR value before enabling vector: %08lx\n", value);
+
+ value = apic_read(APIC_LVTERR);
+ value = ERROR_APIC_VECTOR; // enables sending errors
+ apic_write(APIC_LVTERR,value);
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt != 3) {
+ apic_readaround(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ }
+ value = apic_read(APIC_ESR);
+ printk("ESR value after enabling vector: %08lx\n", value);
+ } else
+ printk("No ESR for 82489DX.\n");
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write(APIC_TASKPRI,value);
+
+ /*
+ * Set up the logical destination ID and put the
+ * APIC into flat delivery mode.
+ */
+ value = apic_read(APIC_LDR);
+ value &= ~APIC_LDR_MASK;
+ value |= (1<<(smp_processor_id()+24));
+ apic_write(APIC_LDR,value);
+
+ value = apic_read(APIC_DFR);
+ value |= SET_APIC_DFR(0xf);
+ apic_write(APIC_DFR, value);
+}
+
+unsigned long __init init_smp_mappings(unsigned long memory_start)
+{
+ unsigned long apic_phys;
+
+ memory_start = PAGE_ALIGN(memory_start);
+ if (smp_found_config) {
+ apic_phys = mp_lapic_addr;
+ } else {
+ /*
+ * set up a fake all zeroes page to simulate the
+ * local APIC and another one for the IO-APIC. We
+ * could use the real zero-page, but it's safer
+ * this way if some buggy code writes to this page ...
+ */
+ apic_phys = __pa(memory_start);
+ memset((void *)memory_start, 0, PAGE_SIZE);
+ memory_start += PAGE_SIZE;
+ }
+ set_fixmap(FIX_APIC_BASE,apic_phys);
+ dprintk("mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys);
+
+#ifdef CONFIG_X86_IO_APIC
+ {
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ } else {
+ ioapic_phys = __pa(memory_start);
+ memset((void *)memory_start, 0, PAGE_SIZE);
+ memory_start += PAGE_SIZE;
+ }
+ set_fixmap(idx,ioapic_phys);
+ dprintk("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+ }
+#endif
+
+ return memory_start;
+}
+
+/*
+ * TSC synchronization.
+ *
+ * We first check whether all CPUs have their TSCs synchronized,
+ * then we print a warning if not, and always resync.
+ */
+
+static atomic_t tsc_start_flag = ATOMIC_INIT(0);
+static atomic_t tsc_count_start = ATOMIC_INIT(0);
+static atomic_t tsc_count_stop = ATOMIC_INIT(0);
+static unsigned long long tsc_values[NR_CPUS] = { 0, };
+
+#define NR_LOOPS 5
+
+extern unsigned long fast_gettimeoffset_quotient;
+
+/*
+ * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
+ * multiplication. Not terribly optimized but we need it at boot time only
+ * anyway.
+ *
+ * result == a / b
+ * == (a1 + a2*(2^32)) / b
+ * == a1/b + a2*(2^32/b)
+ * == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
+ * ^---- (this multiplication can overflow)
+ */
+
+static unsigned long long div64 (unsigned long long a, unsigned long b0)
+{
+ unsigned int a1, a2;
+ unsigned long long res;
+
+ a1 = ((unsigned int*)&a)[0];
+ a2 = ((unsigned int*)&a)[1];
+
+ res = a1/b0 +
+ (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
+ a2 / b0 +
+ (a2 * (0xffffffff % b0)) / b0;
+
+ return res;
+}
+
+static void __init synchronize_tsc_bp (void)
+{
+ int i;
+ unsigned long long t0;
+ unsigned long long sum, avg;
+ long long delta;
+ unsigned long one_usec;
+ int buggy = 0;
+
+ printk("checking TSC synchronization across CPUs: ");
+
+ one_usec = ((1<<30)/fast_gettimeoffset_quotient)*(1<<2);
+
+ atomic_set(&tsc_start_flag, 1);
+ wmb();
+
+ /*
+ * We loop a few times to get a primed instruction cache,
+ * then the last pass is more or less synchronized and
+ * the BP and APs set their cycle counters to zero all at
+ * once. This reduces the chance of having random offsets
+ * between the processors, and guarantees that the maximum
+ * delay between the cycle counters is never bigger than
+ * the latency of information-passing (cachelines) between
+ * two CPUs.
+ */
+ for (i = 0; i < NR_LOOPS; i++) {
+ /*
+ * all APs synchronize but they loop on '== num_cpus'
+ */
+ while (atomic_read(&tsc_count_start) != smp_num_cpus-1) mb();
+ atomic_set(&tsc_count_stop, 0);
+ wmb();
+ /*
+ * this lets the APs save their current TSC:
+ */
+ atomic_inc(&tsc_count_start);
+
+ rdtscll(tsc_values[smp_processor_id()]);
+ /*
+ * We clear the TSC in the last loop:
+ */
+ if (i == NR_LOOPS-1)
+ write_tsc(0, 0);
+
+ /*
+ * Wait for all APs to leave the synchronization point:
+ */
+ while (atomic_read(&tsc_count_stop) != smp_num_cpus-1) mb();
+ atomic_set(&tsc_count_start, 0);
+ wmb();
+ atomic_inc(&tsc_count_stop);
+ }
+
+ sum = 0;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!(cpu_online_map & (1 << i)))
+ continue;
+
+ t0 = tsc_values[i];
+ sum += t0;
+ }
+ avg = div64(sum, smp_num_cpus);
+
+ sum = 0;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!(cpu_online_map & (1 << i)))
+ continue;
+
+ delta = tsc_values[i] - avg;
+ if (delta < 0)
+ delta = -delta;
+ /*
+ * We report bigger than 2 microseconds clock differences.
+ */
+ if (delta > 2*one_usec) {
+ long realdelta;
+ if (!buggy) {
+ buggy = 1;
+ printk("\n");
+ }
+ realdelta = div64(delta, one_usec);
+ if (tsc_values[i] < avg)
+ realdelta = -realdelta;
+
+ printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
+ i, realdelta);
+ }
+
+ sum += delta;
+ }
+ if (!buggy)
+ printk("passed.\n");
+}
+
+static void __init synchronize_tsc_ap (void)
+{
+ int i;
+
+ /*
+ * smp_num_cpus is not necessarily known at the time
+ * this gets called, so we first wait for the BP to
+ * finish SMP initialization:
+ */
+ while (!atomic_read(&tsc_start_flag)) mb();
+
+ for (i = 0; i < NR_LOOPS; i++) {
+ atomic_inc(&tsc_count_start);
+ while (atomic_read(&tsc_count_start) != smp_num_cpus) mb();
+
+ rdtscll(tsc_values[smp_processor_id()]);
+ if (i == NR_LOOPS-1)
+ write_tsc(0, 0);
+
+ atomic_inc(&tsc_count_stop);
+ while (atomic_read(&tsc_count_stop) != smp_num_cpus) mb();
+ }
+}
+#undef NR_LOOPS
+
+extern void calibrate_delay(void);
+
+void __init smp_callin(void)
+{
+ int cpuid;
+ unsigned long timeout;
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ */
+ cpuid = GET_APIC_ID(apic_read(APIC_ID));
+
+ dprintk("CPU#%d waiting for CALLOUT\n", cpuid);
+
+ /*
+ * STARTUP IPIs are fragile beasts as they might sometimes
+ * trigger some glue motherboard logic. Complete APIC bus
+ * silence for 1 second, this overestimates the time the
+ * boot CPU is spending to send the up to 2 STARTUP IPIs
+ * by a factor of two. This should be enough.
+ */
+
+ /*
+ * Waiting 2s total for startup (udelay is not yet working)
+ */
+ timeout = jiffies + 2*HZ;
+ while (time_before(jiffies, timeout)) {
+ /*
+		 * Has the boot CPU finished its STARTUP sequence?
+ */
+ if (test_bit(cpuid, &cpu_callout_map))
+ break;
+ }
+
+ if (!time_before(jiffies, timeout)) {
+ printk("BUG: CPU%d started up but did not get a callout!\n",
+ cpuid);
+ BUG();
+ }
+
+ /*
+ * the boot CPU has finished the init stage and is spinning
+ * on callin_map until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
+ */
+
+ dprintk("CALLIN, before setup_local_APIC().\n");
+ setup_local_APIC();
+
+ sti();
+
+#ifdef CONFIG_MTRR
+ /*
+ * Must be done before calibration delay is computed
+ */
+ mtrr_init_secondary_cpu ();
+#endif
+ /*
+ * Get our bogomips.
+ */
+ calibrate_delay();
+ dprintk("Stack at about %p\n",&cpuid);
+
+ /*
+ * Save our processor parameters
+ */
+ smp_store_cpu_info(cpuid);
+
+ /*
+ * Allow the master to continue.
+ */
+ set_bit(cpuid, &cpu_callin_map);
+
+ /*
+ * Synchronize the TSC with the BP
+ */
+ if (cpu_has_tsc)
+ synchronize_tsc_ap ();
+}
+
+int cpucount = 0;
+
+extern int cpu_idle(void);
+
+/*
+ * Activate a secondary processor.
+ */
+int __init start_secondary(void *unused)
+{
+ /*
+	 * Don't put anything before smp_callin(), SMP
+ * booting is too fragile that we want to limit the
+ * things done here to the most necessary things.
+ */
+ cpu_init();
+ smp_callin();
+ while (!atomic_read(&smp_commenced))
+ /* nothing */ ;
+ return cpu_idle();
+}
+
+/*
+ * Everything has been set up for the secondary
+ * CPUs - they just need to reload everything
+ * from the task structure
+ * This function must not return.
+ */
+void __init initialize_secondary(void)
+{
+ /*
+ * We don't actually need to load the full TSS,
+ * basically just the stack pointer and the eip.
+ */
+
+ asm volatile(
+ "movl %0,%%esp\n\t"
+ "jmp *%1"
+ :
+ :"r" (current->thread.esp),"r" (current->thread.eip));
+}
+
+extern struct {
+ void * esp;
+ unsigned short ss;
+} stack_start;
+
+static int __init fork_by_hand(void)
+{
+ struct pt_regs regs;
+ /*
+ * don't care about the eip and regs settings since
+ * we'll never reschedule the forked task.
+ */
+ return do_fork(CLONE_VM|CLONE_PID, 0, &regs);
+}
+
+static void __init do_boot_cpu(int i)
+{
+ unsigned long cfg;
+ pgd_t maincfg;
+ struct task_struct *idle;
+ unsigned long send_status, accept_status;
+ int timeout, num_starts, j;
+ unsigned long start_eip;
+
+ cpucount++;
+ /*
+ * We can't use kernel_thread since we must avoid to
+ * reschedule the child.
+ */
+ if (fork_by_hand() < 0)
+ panic("failed fork for CPU %d", i);
+
+ /*
+ * We remove it from the pidhash and the runqueue
+ * once we got the process:
+ */
+ idle = init_task.prev_task;
+ if (!idle)
+ panic("No idle process for CPU %d", i);
+
+ idle->processor = i;
+ __cpu_logical_map[cpucount] = i;
+ cpu_number_map[i] = cpucount;
+ idle->has_cpu = 1; /* we schedule the first task manually */
+ idle->thread.eip = (unsigned long) start_secondary;
+
+ del_from_runqueue(idle);
+ unhash_process(idle);
+ init_tasks[cpucount] = idle;
+
+ /* start_eip had better be page-aligned! */
+ start_eip = setup_trampoline();
+
+ /* So we see what's up */
+ printk("Booting processor %d eip %lx\n", i, start_eip);
+ stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
+
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+
+ dprintk("Setting warm reset code and vector.\n");
+
+ CMOS_WRITE(0xa, 0xf);
+ local_flush_tlb();
+ dprintk("1.\n");
+ *((volatile unsigned short *) phys_to_virt(0x469)) = start_eip >> 4;
+ dprintk("2.\n");
+ *((volatile unsigned short *) phys_to_virt(0x467)) = start_eip & 0xf;
+ dprintk("3.\n");
+
+ maincfg=swapper_pg_dir[0];
+ ((unsigned long *)swapper_pg_dir)[0]=0x102007;
+
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+
+ if (APIC_INTEGRATED(apic_version[i])) {
+ apic_readaround(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ }
+
+ /*
+ * Status is now clean
+ */
+ send_status = 0;
+ accept_status = 0;
+
+ /*
+ * Starting actual IPI sequence...
+ */
+
+ dprintk("Asserting INIT.\n");
+
+ /*
+ * Turn INIT on
+ */
+ cfg = apic_read(APIC_ICR2);
+ cfg &= 0x00FFFFFF;
+
+ /*
+ * Target chip
+ */
+ apic_write(APIC_ICR2, cfg | SET_APIC_DEST_FIELD(i));
+
+ /*
+ * Send IPI
+ */
+ cfg = apic_read(APIC_ICR);
+ cfg &= ~0xCDFFF;
+ cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | APIC_DEST_DM_INIT);
+ apic_write(APIC_ICR, cfg);
+
+ udelay(200);
+ dprintk("Deasserting INIT.\n");
+
+ /* Target chip */
+ cfg = apic_read(APIC_ICR2);
+ cfg &= 0x00FFFFFF;
+ apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i));
+
+ /* Send IPI */
+ cfg = apic_read(APIC_ICR);
+ cfg &= ~0xCDFFF;
+ cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT);
+ apic_write(APIC_ICR, cfg);
+
+ /*
+ * Should we send STARTUP IPIs ?
+ *
+ * Determine this based on the APIC version.
+ * If we don't have an integrated APIC, don't
+ * send the STARTUP IPIs.
+ */
+
+ if (APIC_INTEGRATED(apic_version[i]))
+ num_starts = 2;
+ else
+ num_starts = 0;
+
+ /*
+ * Run STARTUP IPI loop.
+ */
+
+ for (j = 1; j <= num_starts; j++) {
+ dprintk("Sending STARTUP #%d.\n",j);
+ apic_readaround(APIC_SPIV);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ dprintk("After apic_write.\n");
+
+ /*
+ * STARTUP IPI
+ */
+
+ /* Target chip */
+ cfg = apic_read(APIC_ICR2);
+ cfg &= 0x00FFFFFF;
+ apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i));
+
+ /* Boot on the stack */
+ cfg = apic_read(APIC_ICR);
+ cfg &= ~0xCDFFF;
+ cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12));
+
+ /* Kick the second */
+ apic_write(APIC_ICR, cfg);
+
+ dprintk("Startup point 1.\n");
+
+ dprintk("Waiting for send to finish...\n");
+ timeout = 0;
+ do {
+ dprintk("+");
+ udelay(100);
+ send_status = apic_read(APIC_ICR) & 0x1000;
+ } while (send_status && (timeout++ < 1000));
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (send_status || accept_status)
+ break;
+ }
+ dprintk("After Startup.\n");
+
+ if (send_status)
+ printk("APIC never delivered???\n");
+ if (accept_status)
+ printk("APIC delivery error (%lx).\n", accept_status);
+
+ if (!send_status && !accept_status) {
+ /*
+ * allow APs to start initializing.
+ */
+ dprintk("Before Callout %d.\n", i);
+ set_bit(i, &cpu_callout_map);
+ dprintk("After Callout %d.\n", i);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (test_bit(i, &cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (test_bit(i, &cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ printk("OK.\n");
+ printk("CPU%d: ", i);
+ print_cpu_info(&cpu_data[i]);
+ } else {
+ if (*((volatile unsigned char *)phys_to_virt(8192))
+ == 0xA5) /* trampoline code not run */
+ printk("Stuck ??\n");
+ else
+ printk("CPU booted but not responding.\n");
+ }
+ dprintk("CPU has booted.\n");
+ } else {
+ __cpu_logical_map[cpucount] = -1;
+ cpu_number_map[i] = -1;
+ cpucount--;
+ }
+
+ swapper_pg_dir[0]=maincfg;
+ local_flush_tlb();
+
+ /* mark "stuck" area as not stuck */
+ *((volatile unsigned long *)phys_to_virt(8192)) = 0;
+}
+
+cycles_t cacheflush_time;
+extern unsigned long cpu_hz;
+
+static void smp_tune_scheduling (void)
+{
+ unsigned long cachesize;
+ /*
+ * Rough estimation for SMP scheduling, this is the number of
+ * cycles it takes for a fully memory-limited process to flush
+ * the SMP-local cache.
+ *
+ * (For a P5 this pretty much means we will choose another idle
+ * CPU almost always at wakeup time (this is due to the small
+ * L1 cache), on PIIs it's around 50-100 usecs, depending on
+ * the cache size)
+ */
+
+ if (!cpu_hz) {
+ /*
+ * this basically disables processor-affinity
+ * scheduling on SMP without a TSC.
+ */
+ cacheflush_time = 0;
+ return;
+ } else {
+ cachesize = boot_cpu_data.x86_cache_size;
+ if (cachesize == -1)
+ cachesize = 8; /* Pentiums */
+
+ cacheflush_time = cpu_hz/1024*cachesize/5000;
+ }
+
+ printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
+ (long)cacheflush_time/(cpu_hz/1000000),
+ ((long)cacheflush_time*100/(cpu_hz/1000000)) % 100);
+}
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+extern int prof_multiplier[NR_CPUS];
+extern int prof_old_multiplier[NR_CPUS];
+extern int prof_counter[NR_CPUS];
+
+void __init smp_boot_cpus(void)
+{
+ int i;
+
+#ifdef CONFIG_MTRR
+ /* Must be done before other processors booted */
+ mtrr_init_boot_cpu ();
+#endif
+ /*
+ * Initialize the logical to physical CPU number mapping
+ * and the per-CPU profiling counter/multiplier
+ */
+
+ for (i = 0; i < NR_CPUS; i++) {
+ cpu_number_map[i] = -1;
+ prof_counter[i] = 1;
+ prof_old_multiplier[i] = 1;
+ prof_multiplier[i] = 1;
+ }
+
+ /*
+ * Setup boot CPU information
+ */
+
+ smp_store_cpu_info(boot_cpu_id); /* Final full version of the data */
+ smp_tune_scheduling();
+ printk("CPU%d: ", boot_cpu_id);
+ print_cpu_info(&cpu_data[boot_cpu_id]);
+
+ /*
+ * not necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ * (and for the case when a non-SMP board boots an SMP kernel)
+ */
+ cpu_present_map |= (1 << hard_smp_processor_id());
+
+ cpu_number_map[boot_cpu_id] = 0;
+
+ init_idle();
+
+ /*
+	 * If we couldn't find an SMP configuration at boot time,
+ * get out of here now!
+ */
+
+ if (!smp_found_config) {
+ printk(KERN_NOTICE "SMP motherboard not detected. Using dummy APIC emulation.\n");
+#ifndef CONFIG_VISWS
+ io_apic_irqs = 0;
+#endif
+ cpu_online_map = cpu_present_map;
+ smp_num_cpus = 1;
+ goto smp_done;
+ }
+
+ /*
+ * If SMP should be disabled, then really disable it!
+ */
+
+ if (!max_cpus) {
+ smp_found_config = 0;
+ printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+ }
+
+#ifdef SMP_DEBUG
+ {
+ int reg;
+
+ /*
+ * This is to verify that we're looking at
+ * a real local APIC. Check these against
+ * your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+
+ reg = apic_read(APIC_LVR);
+ dprintk("Getting VERSION: %x\n", reg);
+
+ apic_write(APIC_LVR, 0);
+ reg = apic_read(APIC_LVR);
+ dprintk("Getting VERSION: %x\n", reg);
+
+ /*
+ * The two version reads above should print the same
+ * NON-ZERO!!! numbers. If the second one is zero,
+ * there is a problem with the APIC write/read
+ * definitions.
+ *
+ * The next two are just to see if we have sane values.
+ * They're only really relevant if we're in Virtual Wire
+ * compatibility mode, but most boxes are anymore.
+ */
+
+
+ reg = apic_read(APIC_LVT0);
+ dprintk("Getting LVT0: %x\n", reg);
+
+ reg = apic_read(APIC_LVT1);
+ dprintk("Getting LVT1: %x\n", reg);
+ }
+#endif
+
+ setup_local_APIC();
+
+ if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
+ BUG();
+
+ /*
+ * Now scan the CPU present map and fire up the other CPUs.
+ */
+
+ /*
+ * Add all detected CPUs. (later on we can down individual
+ * CPUs which will change cpu_online_map but not necessarily
+ * cpu_present_map. We are pretty much ready for hot-swap CPUs.)
+ */
+ cpu_online_map = cpu_present_map;
+ mb();
+
+ dprintk("CPU map: %lx\n", cpu_present_map);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ /*
+ * Don't even attempt to start the boot CPU!
+ */
+ if (i == boot_cpu_id)
+ continue;
+
+ if ((cpu_online_map & (1 << i))
+ && (max_cpus < 0 || max_cpus > cpucount+1)) {
+ do_boot_cpu(i);
+ }
+
+ /*
+ * Make sure we unmap all failed CPUs
+ */
+ if (cpu_number_map[i] == -1 && (cpu_online_map & (1 << i))) {
+ printk("CPU #%d not responding - cannot use it.\n",i);
+ cpu_online_map &= ~(1 << i);
+ }
+ }
+
+ /*
+ * Cleanup possible dangling ends...
+ */
+
+#ifndef CONFIG_VISWS
+ {
+ unsigned long cfg;
+
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ cfg = pg0[0];
+ /* writeable, present, addr 0 */
+ pg0[0] = _PAGE_RW | _PAGE_PRESENT | 0;
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ CMOS_WRITE(0, 0xf);
+
+ *((volatile long *) phys_to_virt(0x467)) = 0;
+
+ /*
+ * Restore old page 0 entry.
+ */
+ pg0[0] = cfg;
+ local_flush_tlb();
+ }
+#endif
+
+ /*
+ * Allow the user to impress friends.
+ */
+
+ dprintk("Before bogomips.\n");
+ if (!cpucount) {
+ printk(KERN_ERR "Error: only one processor found.\n");
+ cpu_online_map = (1<<hard_smp_processor_id());
+ } else {
+ unsigned long bogosum = 0;
+ for(i = 0; i < 32; i++)
+ if (cpu_online_map&(1<<i))
+ bogosum+=cpu_data[i].loops_per_sec;
+ printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ cpucount+1,
+ (bogosum+2500)/500000,
+ ((bogosum+2500)/5000)%100);
+ dprintk("Before bogocount - setting activated=1.\n");
+ smp_activated = 1;
+ }
+ smp_num_cpus = cpucount + 1;
+
+ if (smp_b_stepping)
+ printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+ dprintk("Boot done.\n");
+
+ cache_APIC_registers();
+#ifndef CONFIG_VISWS
+ /*
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ if (!skip_ioapic_setup)
+ setup_IO_APIC();
+#endif
+
+smp_done:
+ /*
+ * now we know the other CPUs have fired off and we know our
+ * APIC ID, so we can go init the TSS and stuff:
+ */
+ cpu_init();
+
+ /*
+ * Set up all local APIC timers in the system:
+ */
+ setup_APIC_clocks();
+
+ /*
+ * Synchronize the TSC with the AP
+ */
+ if (cpu_has_tsc && cpucount)
+ synchronize_tsc_bp();
+}
+
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 9d18999a0..d3f0d3109 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -74,7 +74,7 @@ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
* Equal to 2^32 * (1 / (clocks per usec) ).
* Initialized in time_init.
*/
-static unsigned long fast_gettimeoffset_quotient=0;
+unsigned long fast_gettimeoffset_quotient=0;
extern rwlock_t xtime_lock;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f3e6f75aa..ebd1cd002 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -58,10 +58,17 @@ struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
*/
struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+extern int console_loglevel;
+
+static inline void console_silent(void)
+{
+ console_loglevel = 0;
+}
+
static inline void console_verbose(void)
{
- extern int console_loglevel;
- console_loglevel = 15;
+ if (console_loglevel)
+ console_loglevel = 15;
}
#define DO_ERROR(trapnr, signr, str, name, tsk) \
@@ -202,8 +209,6 @@ void die(const char * str, struct pt_regs * regs, long err)
printk("%s: %04lx\n", str, err & 0xffff);
show_registers(regs);
-spin_lock_irq(&die_lock);
-
spin_unlock_irq(&die_lock);
do_exit(SIGSEGV);
}
@@ -292,7 +297,11 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
printk("You probably have a hardware problem with your RAM chips\n");
-}
+
+ /* Clear and disable the memory parity error line. */
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
+}
static void io_check_error(unsigned char reason, struct pt_regs * regs)
{
@@ -301,8 +310,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs)
printk("NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
- /* Re-enable the IOCK line, wait for a few seconds */
- reason |= 8;
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
outb(reason, 0x61);
i = 2000;
while (--i) udelay(1000);
@@ -325,18 +334,107 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
printk("Do you have a strange power saving mode enabled?\n");
}
+atomic_t nmi_counter[NR_CPUS];
+
+#if CONFIG_SMP
+
+int nmi_watchdog = 1;
+
+static int __init setup_nmi_watchdog(char *str)
+{
+ get_option(&str, &nmi_watchdog);
+ return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+extern spinlock_t console_lock;
+static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+
+inline void nmi_watchdog_tick(struct pt_regs * regs)
+{
+ /*
+ * the best way to detect wether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are broadcasted to every CPU, here
+ * we only have to check the current processor.
+ *
+ * since NMIs dont listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up console_lock first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+ static unsigned int last_irq_sums [NR_CPUS] = { 0, },
+ alert_counter [NR_CPUS] = { 0, };
+
+ /*
+ * Since current-> is always on the stack, and we always switch
+ * the stack NMI-atomically, it's safe to use smp_processor_id().
+ */
+ int sum, cpu = smp_processor_id();
+
+ sum = apic_timer_irqs[cpu];
+
+ if (last_irq_sums[cpu] == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+ if (alert_counter[cpu] == 5*HZ) {
+ spin_lock(&nmi_print_lock);
+ spin_unlock(&console_lock); // we are in trouble anyway
+ printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
+ show_registers(regs);
+ printk("console shuts up ...\n");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ do_exit(SIGSEGV);
+ }
+ } else {
+ last_irq_sums[cpu] = sum;
+ alert_counter[cpu] = 0;
+ }
+}
+#endif
+
asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
{
unsigned char reason = inb(0x61);
- extern atomic_t nmi_counter;
- atomic_inc(&nmi_counter);
+ atomic_inc(nmi_counter+smp_processor_id());
+ if (!(reason & 0xc0)) {
+#if CONFIG_SMP
+ /*
+ * Ok, so this is none of the documented NMI sources,
+ * so it must be the NMI watchdog.
+ */
+ if (nmi_watchdog) {
+ nmi_watchdog_tick(regs);
+ return;
+ } else
+ unknown_nmi_error(reason, regs);
+#else
+ unknown_nmi_error(reason, regs);
+#endif
+ return;
+ }
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
io_check_error(reason, regs);
- if (!(reason & 0xc0))
- unknown_nmi_error(reason, regs);
+ /*
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered.
+ */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
}
/*
@@ -455,6 +553,7 @@ asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs,
asmlinkage void math_state_restore(struct pt_regs regs)
{
__asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */
+
if(current->used_math)
__asm__("frstor %0": :"m" (current->thread.i387));
else
@@ -489,7 +588,6 @@ void __init trap_init_f00f_bug(void)
pmd_t * pmd;
pte_t * pte;
-return;
/*
* Allocate a new page in virtual address space,
* move the IDT into it and write protect this page.
@@ -658,7 +756,7 @@ cobalt_init(void)
*/
set_fixmap(FIX_APIC_BASE, APIC_PHYS_BASE);
printk("Local APIC ID %lx\n", apic_read(APIC_ID));
- printk("Local APIC Version %lx\n", apic_read(APIC_VERSION));
+ printk("Local APIC Version %lx\n", apic_read(APIC_LVR));
set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
printk("Cobalt Revision %lx\n", co_cpu_read(CO_CPU_REV));
@@ -679,7 +777,7 @@ void __init trap_init(void)
set_trap_gate(0,&divide_error);
set_trap_gate(1,&debug);
- set_trap_gate(2,&nmi);
+ set_intr_gate(2,&nmi);
set_system_gate(3,&int3); /* int3-5 can be called from all */
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);