1 files changed, 684 insertions, 310 deletions
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 62d074508..24c33be65 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -1,7 +1,7 @@
 /*
  *	linux/arch/i386/kernel/irq.c
  *
- *	Copyright (C) 1992 Linus Torvalds
+ *	Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
  *
  * This file contains the code used by various IRQ handling routines:
  * asking for different IRQ's should be done through these routines
@@ -26,6 +26,7 @@
 #include <linux/malloc.h>
 #include <linux/random.h>
 #include <linux/smp.h>
+#include <linux/tasks.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 
@@ -35,150 +36,231 @@
 #include <asm/bitops.h>
 #include <asm/smp.h>
 #include <asm/pgtable.h>
+#include <asm/delay.h>
 
 #include "irq.h"
 
-#ifdef __SMP_PROF__
-extern volatile unsigned long smp_local_timer_ticks[1+NR_CPUS];
-#endif
-
+unsigned int local_bh_count[NR_CPUS];
 unsigned int local_irq_count[NR_CPUS];
-#ifdef __SMP__
-atomic_t __intel_bh_counter;
-#else
-int __intel_bh_counter;
-#endif
-
-#ifdef __SMP_PROF__
-static unsigned int int_count[NR_CPUS][NR_IRQS] = {{0},};
-#endif
 
 atomic_t nmi_counter;
 
 /*
- * This contains the irq mask for both irq controllers
+ * About the IO-APIC, the architecture is 'merged' into our
+ * current irq architecture, seamlessly. (i hope). It is only
+ * visible through 8 more hardware interrupt lines, but otherwise
+ * drivers are unaffected. The main code is believed to be
+ * NR_IRQS-safe (nothing anymore thinks we have 16
+ * irq lines only), but there might be some places left ...
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ * and on SMP the extended IO-APIC IRQs 16-23. The IO-APIC
+ * uses this mask too, in probe_irq*().
+ *
+ * (0x0000ffff for NR_IRQS==16, 0x00ffffff for NR_IRQS==24)
  */
-static unsigned int cached_irq_mask = 0xffff;
+static unsigned int cached_irq_mask = (1<<NR_IRQS)-1;
 
-#define cached_21	(((char *)(&cached_irq_mask))[0])
-#define cached_A1	(((char *)(&cached_irq_mask))[1])
+#define cached_21	((cached_irq_mask | io_apic_irqs) & 0xff)
+#define cached_A1	(((cached_irq_mask | io_apic_irqs) >> 8) & 0xff)
 
 spinlock_t irq_controller_lock;
 
+static unsigned int irq_events [NR_IRQS] = { -1, };
+static int disabled_irq [NR_IRQS] = { 0, };
+
 /*
- * This is always called from an interrupt context
- * with local interrupts disabled. Don't worry about
- * irq-safe locks.
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt and sometimes the keyboard interrupt is
+ * not connected to any IO-APIC pin, it's fed to the CPU ExtInt IRQ line
+ * directly.
  *
- * Note that we always ack the primary irq controller,
- * even if the interrupt came from the secondary, as
- * the primary will still have routed it. Oh, the joys
- * of PC hardware.
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs us one more branch in do_IRQ,
+ * but we have _much_ higher compatibility and robustness this way.
  */
-static inline void mask_and_ack_irq(int irq_nr)
+
+/*
+ * Default to all normal IRQ's _not_ using the IO APIC.
+ *
+ * To get IO-APIC interrupts you should either:
+ *  - turn some of them into IO-APIC interrupts at runtime
+ *    with some magic system call interface.
+ *  - explicitly use irq 16-19 depending on which PCI irq
+ *    line your PCI controller uses.
+ */
+unsigned int io_apic_irqs = 0;
+
+struct hw_interrupt_type {	/* per-controller hooks, selected through irq_handles[] */
+	void (*handle)(unsigned int irq, int cpu, struct pt_regs * regs);	/* invoked by do_IRQ() */
+	void (*enable)(unsigned int irq);	/* invoked by enable_irq(), no lock held by the caller */
+	void (*disable)(unsigned int irq);	/* invoked by disable_irq() under irq_controller_lock */
+};
+
+
+static void do_8259A_IRQ (unsigned int irq, int cpu, struct pt_regs * regs);
+static void enable_8259A_irq (unsigned int irq);
+static void disable_8259A_irq (unsigned int irq);
+
+static struct hw_interrupt_type i8259A_irq_type = {
+	do_8259A_IRQ,
+	enable_8259A_irq,
+	disable_8259A_irq
+};
+
+
+#ifdef __SMP__
+static void do_ioapic_IRQ (unsigned int irq, int cpu, struct pt_regs * regs);
+static void enable_ioapic_irq (unsigned int irq);
+static void disable_ioapic_irq (unsigned int irq);
+
+static struct hw_interrupt_type ioapic_irq_type = {
+	do_ioapic_IRQ,
+	enable_ioapic_irq,
+	disable_ioapic_irq
+};
+#endif
+
+struct hw_interrupt_type *irq_handles[NR_IRQS] =
 {
-	spin_lock(&irq_controller_lock);
-	cached_irq_mask |= 1 << irq_nr;
-	if (irq_nr & 8) {
-		inb(0xA1);	/* DUMMY */
+	[0 ... 15] = &i8259A_irq_type			/* standard ISA IRQs */
+#ifdef __SMP__
+	, [16 ... NR_IRQS-1] = &ioapic_irq_type		/* 'high' PCI IRQs */
+#endif
+};
+
+
+/*
+ * These have to be protected by the irq controller spinlock
+ * before being called.
+ */
+
+static inline void mask_8259A(unsigned int irq)
+{
+	cached_irq_mask |= 1 << irq;
+	if (irq & 8) {
 		outb(cached_A1,0xA1);
-		outb(0x62,0x20);	/* Specific EOI to cascade */
-		outb(0x20,0xA0);
 	} else {
-		inb(0x21);	/* DUMMY */
 		outb(cached_21,0x21);
-		outb(0x20,0x20);
 	}
-	spin_unlock(&irq_controller_lock);
 }
 
-static inline void set_irq_mask(int irq_nr)
+static inline void unmask_8259A(unsigned int irq)
 {
-	if (irq_nr & 8) {
+	cached_irq_mask &= ~(1 << irq);
+	if (irq & 8) {
 		outb(cached_A1,0xA1);
 	} else {
 		outb(cached_21,0x21);
 	}
 }
 
-/*
- * These have to be protected by the spinlock
- * before being called.
- */
-static inline void mask_irq(unsigned int irq_nr)
+void set_8259A_irq_mask(unsigned int irq)
 {
-	cached_irq_mask |= 1 << irq_nr;
-	set_irq_mask(irq_nr);
+	/*
+	 * (it might happen that we see IRQ>15 on a UP box, with SMP
+	 * emulation)
+	 */
+	if (irq < 16) {
+		if (irq & 8) {
+			outb(cached_A1,0xA1);
+		} else {
+			outb(cached_21,0x21);
+		}
+	}
 }
 
-static inline void unmask_irq(unsigned int irq_nr)
+void unmask_generic_irq(unsigned int irq)
 {
-	cached_irq_mask &= ~(1 << irq_nr);
-	set_irq_mask(irq_nr);
+	if (IO_APIC_IRQ(irq))
+		enable_IO_APIC_irq(irq);
+	else {
+		cached_irq_mask &= ~(1 << irq);
+		set_8259A_irq_mask(irq);
+	}
 }
 
-void disable_irq(unsigned int irq_nr)
-{
-	unsigned long flags;
+/*
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
 
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	mask_irq(irq_nr);
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-	synchronize_irq();
-}
 
-void enable_irq(unsigned int irq_nr)
-{
-	unsigned long flags;
+BUILD_COMMON_IRQ()
+/*
+ * ISA PIC or IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ */
+BUILD_IRQ(0) BUILD_IRQ(1) BUILD_IRQ(2) BUILD_IRQ(3)
+BUILD_IRQ(4) BUILD_IRQ(5) BUILD_IRQ(6) BUILD_IRQ(7)
+BUILD_IRQ(8) BUILD_IRQ(9) BUILD_IRQ(10) BUILD_IRQ(11)
+BUILD_IRQ(12) BUILD_IRQ(13) BUILD_IRQ(14) BUILD_IRQ(15)
 
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	unmask_irq(irq_nr);
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
+#ifdef __SMP__
 
 /*
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ * The IO-APIC (present only in SMP boards) has 8 more hardware
+ * interrupt pins, for all of them we define an IRQ vector:
  *
- * These macros create the low-level assembly IRQ routines that do all
- * the operations that are needed to keep the AT interrupt-controller
- * happy. They are also written to be fast - and to disable interrupts
- * as little as humanly possible.
+ * raw PCI interrupts 0-3, basically these are the ones used
+ * heavily:
  */
+BUILD_IRQ(16) BUILD_IRQ(17) BUILD_IRQ(18) BUILD_IRQ(19)
 
-#if NR_IRQS != 16
-#error make irq stub building NR_IRQS dependent and remove me.
-#endif
+/*
+ * [FIXME: anyone with 2 separate PCI buses and 2 IO-APICs, please
+ *	   speak up if problems and request experimental patches.
+ *         --mingo ]
+ */
 
-BUILD_COMMON_IRQ()
-BUILD_IRQ(FIRST,0,0x01)
-BUILD_IRQ(FIRST,1,0x02)
-BUILD_IRQ(FIRST,2,0x04)
-BUILD_IRQ(FIRST,3,0x08)
-BUILD_IRQ(FIRST,4,0x10)
-BUILD_IRQ(FIRST,5,0x20)
-BUILD_IRQ(FIRST,6,0x40)
-BUILD_IRQ(FIRST,7,0x80)
-BUILD_IRQ(SECOND,8,0x01)
-BUILD_IRQ(SECOND,9,0x02)
-BUILD_IRQ(SECOND,10,0x04)
-BUILD_IRQ(SECOND,11,0x08)
-BUILD_IRQ(SECOND,12,0x10)
-BUILD_IRQ(SECOND,13,0x20)
-BUILD_IRQ(SECOND,14,0x40)
-BUILD_IRQ(SECOND,15,0x80)
+/*
+ * MIRQ (motherboard IRQ) interrupts 0-1:
+ */
+BUILD_IRQ(20) BUILD_IRQ(21)
 
-#ifdef __SMP__
+/*
+ * 'nondefined general purpose interrupt'.
+ */
+BUILD_IRQ(22)
+/*
+ * optionally rerouted SMI interrupt:
+ */
+BUILD_IRQ(23)
+
+/*
+ * The following vectors are part of the Linux architecture, there
+ * is no hardware IRQ pin equivalent for them, they are triggered
+ * through the ICC by us (IPIs), via smp_message_pass():
+ */
 BUILD_SMP_INTERRUPT(reschedule_interrupt)
 BUILD_SMP_INTERRUPT(invalidate_interrupt)
 BUILD_SMP_INTERRUPT(stop_cpu_interrupt)
+
+/*
+ * every pentium local APIC has two 'local interrupts', with a
+ * soft-definable vector attached to both interrupts, one of
+ * which is a timer interrupt, the other one is error counter
+ * overflow. Linux uses the local APIC timer interrupt to get
+ * a much simpler SMP time architecture:
+ */
 BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt)
+
 #endif
 
-static void (*interrupt[17])(void) = {
+static void (*interrupt[NR_IRQS])(void) = {
 	IRQ0_interrupt, IRQ1_interrupt, IRQ2_interrupt, IRQ3_interrupt,
 	IRQ4_interrupt, IRQ5_interrupt, IRQ6_interrupt, IRQ7_interrupt,
 	IRQ8_interrupt, IRQ9_interrupt, IRQ10_interrupt, IRQ11_interrupt,
-	IRQ12_interrupt, IRQ13_interrupt, IRQ14_interrupt, IRQ15_interrupt	
+	IRQ12_interrupt, IRQ13_interrupt, IRQ14_interrupt, IRQ15_interrupt
+#ifdef __SMP__
+	,IRQ16_interrupt, IRQ17_interrupt, IRQ18_interrupt, IRQ19_interrupt,
+	IRQ20_interrupt, IRQ21_interrupt, IRQ22_interrupt, IRQ23_interrupt
+#endif
 };
 
 /*
@@ -202,7 +284,7 @@ static void no_action(int cpl, void *dev_id, struct pt_regs *regs) { }
 static void math_error_irq(int cpl, void *dev_id, struct pt_regs *regs)
 {
 	outb(0,0xF0);
-	if (ignore_irq13 || !hard_math)
+	if (ignore_irq13 || !boot_cpu_data.hard_math)
 		return;
 	math_error();
 }
@@ -214,135 +296,59 @@ static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL };
  */
 static struct irqaction irq2  = { no_action, 0, 0, "cascade", NULL, NULL};
 
-static struct irqaction *irq_action[16] = {
+static struct irqaction *irq_action[NR_IRQS] = {
 	NULL, NULL, NULL, NULL,
 	NULL, NULL, NULL, NULL,
 	NULL, NULL, NULL, NULL,
 	NULL, NULL, NULL, NULL
+#ifdef __SMP__
+	,NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL
+#endif
 };
 
 int get_irq_list(char *buf)
 {
-	int i;
+	int i, j;
 	struct irqaction * action;
 	char *p = buf;
 
+	p += sprintf(p, "           ");
+	for (j=0; j<smp_num_cpus; j++)
+		p += sprintf(p, "CPU%d       ",j);
+	*p++ = '\n';
+
 	for (i = 0 ; i < NR_IRQS ; i++) {
 		action = irq_action[i];
 		if (!action) 
 			continue;
-		p += sprintf(p, "%3d: %10u   %s",
-			i, kstat.interrupts[i], action->name);
+		p += sprintf(p, "%3d: ",i);
+#ifndef __SMP__
+		p += sprintf(p, "%10u ", kstat_irqs(i));
+#else
+		for (j=0; j<smp_num_cpus; j++)
+			p += sprintf(p, "%10u ",
+				kstat.irqs[cpu_logical_map(j)][i]);
+#endif
+
+		if (IO_APIC_IRQ(i))
+			p += sprintf(p, " IO-APIC ");
+		else
+			p += sprintf(p, "  XT PIC ");
+		p += sprintf(p, "  %s", action->name);
+
 		for (action=action->next; action; action = action->next) {
 			p += sprintf(p, ", %s", action->name);
 		}
 		*p++ = '\n';
 	}
 	p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
-#ifdef __SMP_PROF__
+#ifdef __SMP__
 	p += sprintf(p, "IPI: %10lu\n", ipi_count);
 #endif		
 	return p - buf;
 }
 
-#ifdef __SMP_PROF__
-
-extern unsigned int prof_multiplier[NR_CPUS];
-extern unsigned int prof_counter[NR_CPUS];
-
-int get_smp_prof_list(char *buf) {
-	int i,j, len = 0;
-	struct irqaction * action;
-	unsigned long sum_spins = 0;
-	unsigned long sum_spins_syscall = 0;
-	unsigned long sum_spins_sys_idle = 0;
-	unsigned long sum_smp_idle_count = 0;
-	unsigned long sum_local_timer_ticks = 0;
-
-	for (i=0;i<smp_num_cpus;i++) {
-		int cpunum = cpu_logical_map[i];
-		sum_spins+=smp_spins[cpunum];
-		sum_spins_syscall+=smp_spins_syscall[cpunum];
-		sum_spins_sys_idle+=smp_spins_sys_idle[cpunum];
-		sum_smp_idle_count+=smp_idle_count[cpunum];
-		sum_local_timer_ticks+=smp_local_timer_ticks[cpunum];
-	}
-
-	len += sprintf(buf+len,"CPUS: %10i \n", smp_num_cpus);
-	len += sprintf(buf+len,"            SUM ");
-	for (i=0;i<smp_num_cpus;i++)
-		len += sprintf(buf+len,"        P%1d ",cpu_logical_map[i]);
-	len += sprintf(buf+len,"\n");
-	for (i = 0 ; i < NR_IRQS ; i++) {
-		action = *(i + irq_action);
-		if (!action || !action->handler)
-			continue;
-		len += sprintf(buf+len, "%3d: %10d ",
-			i, kstat.interrupts[i]);
-		for (j=0;j<smp_num_cpus;j++)
-			len+=sprintf(buf+len, "%10d ",
-				int_count[cpu_logical_map[j]][i]);
-		len += sprintf(buf+len, "  %s", action->name);
-		for (action=action->next; action; action = action->next) {
-			len += sprintf(buf+len, ", %s", action->name);
-		}
-		len += sprintf(buf+len, "\n");
-	}
-	len+=sprintf(buf+len, "LCK: %10lu",
-		sum_spins);
-
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10lu",smp_spins[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   spins from int\n");
-
-	len+=sprintf(buf+len, "LCK: %10lu",
-		sum_spins_syscall);
-
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10lu",smp_spins_syscall[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   spins from syscall\n");
-
-	len+=sprintf(buf+len, "LCK: %10lu",
-		sum_spins_sys_idle);
-
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10lu",smp_spins_sys_idle[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   spins from sysidle\n");
-	len+=sprintf(buf+len,"IDLE %10lu",sum_smp_idle_count);
-
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10lu",smp_idle_count[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   idle ticks\n");
-
-	len+=sprintf(buf+len,"TICK %10lu",sum_local_timer_ticks);
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10lu",smp_local_timer_ticks[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   local APIC timer ticks\n");
-
-	len+=sprintf(buf+len,"MULT:          ");
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10u",prof_multiplier[cpu_logical_map[i]]);
-	len +=sprintf(buf+len,"   profiling multiplier\n");
-
-	len+=sprintf(buf+len,"COUNT:         ");
-	for (i=0;i<smp_num_cpus;i++)
-		len+=sprintf(buf+len," %10u",prof_counter[cpu_logical_map[i]]);
-
-	len +=sprintf(buf+len,"   profiling counter\n");
-
-	len+=sprintf(buf+len, "IPI: %10lu   received\n",
-		ipi_count);
-
-	return len;
-}
-#endif 
-
-
 /*
  * Global interrupt locks for SMP. Allow interrupts to come in on any
  * CPU, yet make cli/sti act globally to protect critical regions..
@@ -352,8 +358,8 @@ unsigned char global_irq_holder = NO_PROC_ID;
 unsigned volatile int global_irq_lock;
 atomic_t global_irq_count;
 
-#define irq_active(cpu) \
-	(global_irq_count != local_irq_count[cpu])
+atomic_t global_bh_count;
+atomic_t global_bh_lock;
 
 /*
  * "global_cli()" is a special case, in that it can hold the
@@ -371,37 +377,123 @@ static inline void check_smp_invalidate(int cpu)
 	}
 }
 
-static unsigned long previous_irqholder;
+static void show(char * str)
+{
+	/* Dump the counters, then stack words above 'str' in (init_task_union, vsprintf) */
+	unsigned long *sp = (unsigned long *) &str;
+	unsigned long lo = (unsigned long) &init_task_union;
+	unsigned long hi = (unsigned long) &vsprintf;
+	int words = 40;
+
+	printk("\n%s, CPU %d:\n", str, smp_processor_id());
+	printk("irq:  %d [%d %d]\n",
+		atomic_read(&global_irq_count), local_irq_count[0], local_irq_count[1]);
+	printk("bh:   %d [%d %d]\n",
+		atomic_read(&global_bh_count), local_bh_count[0], local_bh_count[1]);
+	while (words--) {
+		unsigned long addr = *++sp;
+		if (addr > lo && addr < hi)
+			printk("<[%08lx]> ", addr);
+	}
+}
+	
+
+#define MAXCOUNT 100000000
 
-static inline void wait_on_irq(int cpu, unsigned long where)
+static inline void wait_on_bh(void)
 {
-	int local_count = local_irq_count[cpu];
+	int count = MAXCOUNT;
+	do {
+		if (!--count) {
+			show("wait_on_bh");
+			count = ~0;
+		}
+		/* nothing .. wait for the other bh's to go away */
+	} while (atomic_read(&global_bh_count) != 0);
+}
 
-	/* Are we the only one in an interrupt context? */
-	while (local_count != atomic_read(&global_irq_count)) {
-		/*
-		 * No such luck. Now we need to release the lock,
-		 * _and_ release our interrupt context, because
-		 * otherwise we'd have dead-locks and live-locks
-		 * and other fun things.
-		 */
-		atomic_sub(local_count, &global_irq_count);
-		global_irq_lock = 0;
+/*
+ * I had a lockup scenario where a tight loop doing
+ * spin_unlock()/spin_lock() on CPU#1 was racing with
+ * spin_lock() on CPU#0. CPU#0 should have noticed spin_unlock(), but
+ * apparently the spin_unlock() information did not make it
+ * through to CPU#0 ... nasty, is this by design, do we have to limit
+ * 'memory update oscillation frequency' artificially like here?
+ *
+ * Such 'high frequency update' races can be avoided by careful design, but
+ * some of our major constructs like spinlocks use similar techniques,
+ * it would be nice to clarify this issue. Set this define to 0 if you
+ * want to check whether your system freezes. I suspect the delay done
+ * by SYNC_OTHER_CORES() is in correlation with 'snooping latency', but
+ * i thought that such things are guaranteed by design, since we use
+ * the 'LOCK' prefix.
+ */
+#define SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND 1
+
+#if SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND
+# define SYNC_OTHER_CORES(x) udelay(x+1)
+#else
+/*
+ * We have to allow irqs to arrive between __sti and __cli
+ */
+# define SYNC_OTHER_CORES(x) __asm__ __volatile__ ("nop")
+#endif
+
+static inline void wait_on_irq(int cpu)
+{
+	int count = MAXCOUNT;
+
+	for (;;) {
 
 		/*
-		 * Wait for everybody else to go away and release
-		 * their things before trying to get the lock again.
+		 * Wait until all interrupts are gone. Wait
+		 * for bottom half handlers unless we're
+		 * already executing in one..
 		 */
+		if (!atomic_read(&global_irq_count)) {
+			if (local_bh_count[cpu] || !atomic_read(&global_bh_count))
+				break;
+		}
+
+		/* Duh, we have to loop. Release the lock to avoid deadlocks */
+		clear_bit(0,&global_irq_lock);
+
 		for (;;) {
+			if (!--count) {
+				show("wait_on_irq");
+				count = ~0;
+			}
+			__sti();
+			SYNC_OTHER_CORES(cpu);
+			__cli();
 			check_smp_invalidate(cpu);
 			if (atomic_read(&global_irq_count))
 				continue;
 			if (global_irq_lock)
 				continue;
+			if (!local_bh_count[cpu] && atomic_read(&global_bh_count))
+				continue;
 			if (!test_and_set_bit(0,&global_irq_lock))
 				break;
 		}
-		atomic_add(local_count, &global_irq_count);
+	}
+}
+
+/*
+ * This is called when we want to synchronize with
+ * bottom half handlers. We need to wait until
+ * no other CPU is executing any bottom half handler.
+ *
+ * Don't wait if we're already running in an interrupt
+ * context or are inside a bh handler.
+ */
+void synchronize_bh(void)
+{
+	if (atomic_read(&global_bh_count)) {
+		int me = smp_processor_id();
+		int busy = local_irq_count[me] || local_bh_count[me];
+		if (!busy)	/* not in irq or bh context ourselves: safe to wait */
+			wait_on_bh();
 	}
 }
 
@@ -411,23 +503,17 @@ static inline void wait_on_irq(int cpu, unsigned long where)
  * stop sending interrupts: but to make sure there
  * are no interrupts that are executing on another
  * CPU we need to call this function.
- *
- * On UP this is a no-op.
  */
 void synchronize_irq(void)
 {
-	int cpu = smp_processor_id();
-	int local_count = local_irq_count[cpu];
-
-	/* Do we need to wait? */
-	if (local_count != atomic_read(&global_irq_count)) {
-		/* The stupid way to do this */
+	if (atomic_read(&global_irq_count)) {
+		/* Stupid approach */
 		cli();
 		sti();
 	}
 }
 
-static inline void get_irqlock(int cpu, unsigned long where)
+static inline void get_irqlock(int cpu)
 {
 	if (test_and_set_bit(0,&global_irq_lock)) {
 		/* do we already hold the lock? */
@@ -440,105 +526,313 @@ static inline void get_irqlock(int cpu, unsigned long where)
 			} while (test_bit(0,&global_irq_lock));
 		} while (test_and_set_bit(0,&global_irq_lock));		
 	}
-	/*
-	 * Ok, we got the lock bit.
-	 * But that's actually just the easy part.. Now
-	 * we need to make sure that nobody else is running
+	/* 
+	 * We also have to make sure that nobody else is running
 	 * in an interrupt context. 
 	 */
-	wait_on_irq(cpu, where);
+	wait_on_irq(cpu);
 
 	/*
-	 * Finally.
+	 * Ok, finally..
 	 */
 	global_irq_holder = cpu;
-	previous_irqholder = where;
 }
 
+/*
+ * A global "cli()" while in an interrupt context
+ * turns into just a local cli(). Interrupts
+ * should use spinlocks for the (very unlikely)
+ * case that they ever want to protect against
+ * each other.
+ */
 void __global_cli(void)
 {
 	int cpu = smp_processor_id();
-	unsigned long where;
 
-	__asm__("movl 16(%%esp),%0":"=r" (where));
 	__cli();
-	get_irqlock(cpu, where);
+	if (!local_irq_count[cpu])
+		get_irqlock(cpu);
 }
 
 void __global_sti(void)
 {
-	release_irqlock(smp_processor_id());
+	int cpu = smp_processor_id();
+
+	if (!local_irq_count[cpu])
+		release_irqlock(cpu);
 	__sti();
 }
 
 unsigned long __global_save_flags(void)
 {
-	return global_irq_holder == (unsigned char) smp_processor_id();
+	if (!local_irq_count[smp_processor_id()])
+		return global_irq_holder == (unsigned char) smp_processor_id();
+	else {
+		unsigned long x;
+		__save_flags(x);
+		return x;
+	}
 }
 
 void __global_restore_flags(unsigned long flags)
 {
-	switch (flags) {
-	case 0:
-		release_irqlock(smp_processor_id());
-		__sti();
-		break;
-	case 1:
-		__global_cli();
-		break;
-	default:
-		printk("global_restore_flags: %08lx (%08lx)\n",
-			flags, (&flags)[-1]);
-	}
+	if (!local_irq_count[smp_processor_id()]) {
+		switch (flags) {
+		case 0:
+			__global_sti();
+			break;
+		case 1:
+			__global_cli();
+			break;
+		default:
+			printk("global_restore_flags: %08lx (%08lx)\n",
+				flags, (&flags)[-1]);
+		}
+	} else
+		__restore_flags(flags);
 }
 
 #endif
 
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-asmlinkage void do_IRQ(struct pt_regs regs)
+static int handle_IRQ_event(unsigned int irq, struct pt_regs * regs)
 {
-	int irq = regs.orig_eax & 0xff;
 	struct irqaction * action;
-	int status, cpu;
-
-	/* 
-	 * mask and ack quickly, we don't want the irq controller
-	 * thinking we're snobs just because some other CPU has
-	 * disabled global interrupts (we have already done the
-	 * INT_ACK cycles, it's too late to try to pretend to the
-	 * controller that we aren't taking the interrupt).
-	 */
-	mask_and_ack_irq(irq);
+	int status;
 
-	cpu = smp_processor_id();
-	irq_enter(cpu, irq);
-	kstat.interrupts[irq]++;
-
-	/* Return with this interrupt masked if no action */
 	status = 0;
 	action = *(irq + irq_action);
+
 	if (action) {
+		status |= 1;
+
 		if (!(action->flags & SA_INTERRUPT))
 			__sti();
 
 		do {
 			status |= action->flags;
-			action->handler(irq, action->dev_id, &regs);
+			action->handler(irq, action->dev_id, regs);
 			action = action->next;
 		} while (action);
 		if (status & SA_SAMPLE_RANDOM)
 			add_interrupt_randomness(irq);
 		__cli();
+	}
+
+	return status;
+}
+
+
+void disable_irq(unsigned int irq)
+{
+	unsigned long flags;
+	/* Run the controller-specific disable hook for this irq under the lock */
+	spin_lock_irqsave(&irq_controller_lock, flags);
+	irq_handles[irq]->disable(irq);
+	spin_unlock_irqrestore(&irq_controller_lock, flags);
+	/* Lock dropped; synchronize_irq() then deals with handlers still running */
+	synchronize_irq();
+}
+
+/*
+ * disable/enable_irq() wait for all irq contexts to finish
+ * executing. Also it's recursive.
+ */
+static void disable_8259A_irq(unsigned int irq)
+{
+	disabled_irq[irq]++;		/* nesting count, decremented by enable_8259A_irq() */
+	cached_irq_mask |= 1 << irq;	/* set the irq's bit in the cached mask */
+	set_8259A_irq_mask(irq);	/* write the cached mask byte out (irq < 16 only) */
+}
+
+#ifdef __SMP__
+static void disable_ioapic_irq(unsigned int irq)
+{
+	disabled_irq[irq]++;
+	/*
+	 * Not masked in hardware: do_ioapic_IRQ() checks disabled_irq[] instead.
+	 */
+}
+#endif
+
+void enable_8259A_irq (unsigned int irq)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&irq_controller_lock, flags);
+	/*
+	 * Undo one disable_8259A_irq(). An unbalanced enable is ignored, and
+	 * the line is unmasked only once the outermost disable is undone, so
+	 * nested disable_irq()/enable_irq() pairs keep the irq masked.
+	 */
+	if (disabled_irq[irq] && !--disabled_irq[irq]) {
+		cached_irq_mask &= ~(1 << irq);
+		set_8259A_irq_mask(irq);
+	}
+	spin_unlock_irqrestore(&irq_controller_lock, flags);
+}
+
+#ifdef __SMP__
+void enable_ioapic_irq (unsigned int irq)
+{
+	unsigned long flags;
+	int cpu = smp_processor_id(), should_handle_irq;
+
+	spin_lock_irqsave(&irq_controller_lock, flags);
+	if (disabled_irq[irq])
+		disabled_irq[irq]--;
+	else {
+		spin_unlock_irqrestore(&irq_controller_lock, flags);
+		return;
+	}
+	/*
+	 * In the SMP+IOAPIC case it might happen that an unspecified number
+	 * of pending IRQ events is still unhandled. We protect against multiple
+	 * enable_irq()'s executing them via disabled_irq[irq]++
+	 */
+	if (!disabled_irq[irq] && irq_events[irq]) {
+		struct pt_regs regs; /* FIXME: fake for now -- never initialized, handlers get garbage */
+
+		disabled_irq[irq]++;
+		spin_unlock(&irq_controller_lock);
+		release_irqlock(cpu);
+		irq_enter(cpu, irq);
+again:
+		handle_IRQ_event(irq, &regs);
+		/* drop our temporary disable count, then see whether more events are pending */
 		spin_lock(&irq_controller_lock);
-		unmask_irq(irq);
+		disabled_irq[irq]--;
+		should_handle_irq=0;
+		if (--irq_events[irq] && !disabled_irq[irq]) {
+			should_handle_irq=1;
+			disabled_irq[irq]++;
+		}
+		spin_unlock(&irq_controller_lock);
+
+		if (should_handle_irq)
+			goto again;
+
+		irq_exit(cpu, irq);
+		__restore_flags(flags);
+	} else
+		spin_unlock_irqrestore(&irq_controller_lock, flags);
+}
+#endif
+
+void enable_irq(unsigned int irq)
+{
+	irq_handles[irq]->enable(irq);	/* both enable hooks take irq_controller_lock themselves */
+}
+
+void make_8259A_irq (unsigned int irq)
+{
+	io_apic_irqs &= ~(1<<irq);		/* clear the "routed through the IO-APIC" bit */
+	irq_handles[irq] = &i8259A_irq_type;	/* switch to the 8259A handle/enable/disable hooks */
+	disable_irq(irq);			/* both calls now go through the 8259A hooks: */
+	enable_irq(irq);			/* mask, then unmask, rewriting the mask byte */
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static inline void mask_and_ack_8259A(unsigned int irq)
+{
+	spin_lock(&irq_controller_lock);	/* plain spin_lock, no irqsave variant here */
+	cached_irq_mask |= 1 << irq;		/* mask first ... */
+	if (irq & 8) {				/* bit 3 set: ports 0xA0/0xA1 are used */
+		inb(0xA1);	/* DUMMY */
+		outb(cached_A1,0xA1);
+		outb(0x62,0x20);	/* Specific EOI to cascade */
+		outb(0x20,0xA0);		/* ... written only after the cascade EOI */
+	} else {				/* bit 3 clear: ports 0x20/0x21 are used */
+		inb(0x21);	/* DUMMY */
+		outb(cached_21,0x21);
+		outb(0x20,0x20);		/* ... then the write to port 0x20 */
+	}
+	spin_unlock(&irq_controller_lock);
+}
+
+static void do_8259A_IRQ(unsigned int irq, int cpu, struct pt_regs * regs)
+{
+	mask_and_ack_8259A(irq);
+	/* the irq stays masked in cached_irq_mask while its handlers run */
+	irq_enter(cpu, irq);
+	/* non-zero return: an action was registered, so unmask again; else leave it masked */
+	if (handle_IRQ_event(irq, regs)) {
+		spin_lock(&irq_controller_lock);
+		unmask_8259A(irq);
 		spin_unlock(&irq_controller_lock);
 	}
 
 	irq_exit(cpu, irq);
+}
+
+#ifdef __SMP__
+static void do_ioapic_IRQ(unsigned int irq, int cpu, struct pt_regs * regs)
+{
+	int should_handle_irq = 0;
+	/* ack first; this path never masks the irq, it only counts events */
+	ack_APIC_irq();
+
+	spin_lock(&irq_controller_lock);
+	/* count the event; run handlers only for the first pending one, and only if not disabled */
+	if (!irq_events[irq]++ && !disabled_irq[irq])
+		should_handle_irq = 1;
+
+	spin_unlock(&irq_controller_lock);
+
+	irq_enter(cpu, irq);
+
+	if (should_handle_irq) {
+again:
+		handle_IRQ_event(irq, regs);
+		/* one event consumed; go round again if more were counted and irq is not disabled */
+		spin_lock(&irq_controller_lock);
+		should_handle_irq=0;
+		if (--irq_events[irq] && !disabled_irq[irq])
+			should_handle_irq=1;
+		spin_unlock(&irq_controller_lock);
+
+		if (should_handle_irq)
+			goto again;
+	}
+
+	irq_exit(cpu, irq);
+}
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ *
+ * the biggest change on SMP is the fact that we no more mask
+ * interrupts in hardware, please believe me, this is unavoidable,
+ * the hardware is largely message-oriented, i tried to force our
+ * state-driven irq handling scheme onto the IO-APIC, but to no avail.
+ *
+ * so we soft-disable interrupts via 'event counters', the first 'incl'
+ * will do the IRQ handling. This also has the nice side effect of increased
+ * overlapping ... i saw no driver problem so far.
+ */
+asmlinkage void do_IRQ(struct pt_regs regs)
+{	
+	/* 
+	 * We ack quickly, we don't want the irq controller
+	 * thinking we're snobs just because some other CPU has
+	 * disabled global interrupts (we have already done the
+	 * INT_ACK cycles, it's too late to try to pretend to the
+	 * controller that we aren't taking the interrupt).
+	 *
+	 * 0 return value means that this irq is already being
+	 * handled by some other CPU. (or is disabled)
+	 */
+	unsigned int irq = regs.orig_eax & 0xff;
+	int cpu = smp_processor_id();
+
+	kstat.irqs[cpu][irq]++;
+	irq_handles[irq]->handle(irq, cpu, &regs);
+
 	/*
 	 * This should be conditional: we should really get
 	 * a return code from the irq handler to tell us
@@ -551,7 +845,7 @@ asmlinkage void do_IRQ(struct pt_regs regs)
 	}
 }
 
-int setup_x86_irq(int irq, struct irqaction * new)
+int setup_x86_irq(unsigned int irq, struct irqaction * new)
 {
 	int shared = 0;
 	struct irqaction *old, **p;
@@ -580,7 +874,18 @@ int setup_x86_irq(int irq, struct irqaction * new)
 
 	if (!shared) {
 		spin_lock(&irq_controller_lock);
-		unmask_irq(irq);
+#ifdef __SMP__
+		if (IO_APIC_IRQ(irq)) {
+			irq_handles[irq] = &ioapic_irq_type;
+			/*
+			 * First disable it in the 8259A:
+			 */
+			cached_irq_mask |= 1 << irq;
+			if (irq < 16)
+				set_8259A_irq_mask(irq);
+		}
+#endif
+		unmask_generic_irq(irq);
 		spin_unlock(&irq_controller_lock);
 	}
 	restore_flags(flags);
@@ -596,12 +901,13 @@ int request_irq(unsigned int irq,
 	int retval;
 	struct irqaction * action;
 
-	if (irq > 15)
+	if (irq >= NR_IRQS)
 		return -EINVAL;
 	if (!handler)
 		return -EINVAL;
 
-	action = (struct irqaction *)kmalloc(sizeof(struct irqaction), GFP_KERNEL);
+	action = (struct irqaction *)
+			kmalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
 		return -ENOMEM;
 
@@ -624,7 +930,7 @@ void free_irq(unsigned int irq, void *dev_id)
 	struct irqaction * action, **p;
 	unsigned long flags;
 
-	if (irq > 15) {
+	if (irq >= NR_IRQS) {
 		printk("Trying to free IRQ%d\n",irq);
 		return;
 	}
@@ -643,42 +949,104 @@ void free_irq(unsigned int irq, void *dev_id)
 	printk("Trying to free free IRQ%d\n",irq);
 }
 
+/*
+ * probing is always single threaded [FIXME: is this true?]
+ */
+static unsigned int probe_irqs[NR_CPUS][NR_IRQS];
+
 unsigned long probe_irq_on (void)
 {
-	unsigned int i, irqs = 0;
+	unsigned int i, j, irqs = 0;
 	unsigned long delay;
 
-	/* first, enable any unassigned irqs */
-	for (i = 15; i > 0; i--) {
+	/*
+	 * snapshot the per-CPU irq counts; probe_irq_off() diffs against it
+	 */
+	memcpy(probe_irqs,kstat.irqs,NR_CPUS*NR_IRQS*sizeof(int));
+
+	/*
+	 * first, enable any unassigned irqs (bit i of irqs == irq i probed)
+	 */
+	for (i = NR_IRQS-1; i > 0; i--) {
 		if (!irq_action[i]) {
-			enable_irq(i);
+			unsigned long flags;
+			spin_lock_irqsave(&irq_controller_lock, flags);
+			unmask_generic_irq(i);
 			irqs |= (1 << i);
+			spin_unlock_irqrestore(&irq_controller_lock, flags);
 		}
 	}
 
-	/* wait for spurious interrupts to mask themselves out again */
+	/*
+	 * wait for spurious interrupts to increase counters
+	 */
 	for (delay = jiffies + HZ/10; delay > jiffies; )
-		/* about 100ms delay */;
+		/* about 100ms delay */ synchronize_irq();
 
-	/* now filter out any obviously spurious interrupts */
-	return irqs & ~cached_irq_mask;
+	/*
+	 * drop every irq whose count moved during the wait: it is spurious
+	 */
+	for (i=0; i<NR_IRQS; i++)
+		for (j=0; j<NR_CPUS; j++)
+			if (kstat.irqs[j][i] != probe_irqs[j][i])
+				irqs &= ~(1<<i);	/* clear bit i (was ~(i<<1)) */
+
+	return irqs;
 }
 
 int probe_irq_off (unsigned long irqs)
 {
-	unsigned int i;
+	int i,j, irq_found = -1;	/* -1: no candidate seen yet */
 
-#ifdef DEBUG
-	printk("probe_irq_off: irqs=0x%04lx irqmask=0x%04x\n", irqs, cached_irq_mask);
-#endif
-	irqs &= cached_irq_mask;
-	if (!irqs)
-		return 0;
-	i = ffz(~irqs);
-	if (irqs != (irqs & (1 << i)))
-		i = -i;
-	return i;
+	for (i=0; i<NR_IRQS; i++) {
+		int sum = 0;	/* events on irq i since the probe_irq_on() snapshot */
+		for (j=0; j<NR_CPUS; j++) {
+			sum += kstat.irqs[j][i];
+			sum -= probe_irqs[j][i];
+		}
+		if (sum && (irqs & (1UL<<i))) {	/* bit i of the probe mask (was i<<1) */
+			if (irq_found != -1) {
+				irq_found = -irq_found;	/* second hit: ambiguous, return negative */
+				goto out;
+			} else
+				irq_found = i;
+		}
+	}
+	if (irq_found == -1)
+		irq_found = 0;	/* nothing fired: report 0 */
+out:
+	return irq_found;
+}
+
+#ifdef __SMP__
+void init_IO_APIC_traps(void)
+{
+	int irq;
+	/*
+	 * The local APIC copes badly with several interrupts
+	 * pending at one interrupt level, and the level is the
+	 * vector number shifted right by 4. IO-APIC vectors are
+	 * therefore spread out (IO_APIC_GATE_OFFSET + irq*8) so
+	 * that they do not all land in the same level.
+	 *
+	 * Gate 0x80 must not be trashed either, since int 0x80
+	 * is in use; vectors above 0xfe are skipped by the check
+	 * below.
+	 */
+	for (irq = 0; irq < NR_IRQS ; irq++) {
+		if (IO_APIC_GATE_OFFSET+(irq<<3) > 0xfe)  /* HACK */
+			continue;
+		if (!IO_APIC_IRQ(irq))
+			continue;
+		/* Route this irq through the IO-APIC handlers ... */
+		irq_handles[irq] = &ioapic_irq_type;
+		/* ... after first disabling it in the 8259A: */
+		cached_irq_mask |= 1 << irq;
+		if (irq < 16)
+			set_8259A_irq_mask(irq);
+	}
 }
+#endif
 
 __initfunc(void init_IRQ(void))
 {
@@ -689,18 +1057,22 @@ __initfunc(void init_IRQ(void))
 	outb_p(LATCH & 0xff , 0x40);	/* LSB */
 	outb(LATCH >> 8 , 0x40);	/* MSB */
 
-	for (i = 0; i < NR_IRQS ; i++)
+	printk("INIT IRQ\n");
+	for (i=0; i<NR_IRQS; i++) {
+		irq_events[i] = 0;
+		disabled_irq[i] = 0;
+	}
+	/*
+	 * 16 old-style INTA-cycle interrupt gates:
+	 */
+	for (i = 0; i < 16; i++)
 		set_intr_gate(0x20+i,interrupt[i]);
 
 #ifdef __SMP__	
-	/*
-	 * NOTE! The local APIC isn't very good at handling
-	 * multiple interrupts at the same interrupt level.
-	 * As the interrupt level is determined by taking the
-	 * vector number and shifting that right by 4, we
-	 * want to spread these out a bit so that they don't
-	 * all fall in the same interrupt level
-	 */
+
+	for (i = 0; i < NR_IRQS ; i++)
+		if (IO_APIC_GATE_OFFSET+(i<<3) <= 0xfe)  /* hack -- mingo */
+			set_intr_gate(IO_APIC_GATE_OFFSET+(i<<3),interrupt[i]);
 
 	/*
 	 * The reschedule interrupt slowly changes it's functionality,
@@ -711,21 +1083,23 @@ __initfunc(void init_IRQ(void))
 	 * [ It has to be here .. it doesn't work if you put
 	 *   it down the bottom - assembler explodes 8) ]
 	 */
-	/* IRQ '16' (trap 0x30) - IPI for rescheduling */
-	set_intr_gate(0x20+i, reschedule_interrupt);
 
+	/* IPI for rescheduling */
+	set_intr_gate(0x30, reschedule_interrupt);
 
-	/* IRQ '17' (trap 0x31) - IPI for invalidation */
-	set_intr_gate(0x21+i, invalidate_interrupt);
+	/* IPI for invalidation */
+	set_intr_gate(0x31, invalidate_interrupt);
 
-	/* IRQ '18' (trap 0x40) - IPI for CPU halt */
-	set_intr_gate(0x30+i, stop_cpu_interrupt);
+	/* IPI for CPU halt */
+	set_intr_gate(0x40, stop_cpu_interrupt);
+
+	/* self generated IPI for local APIC timer */
+	set_intr_gate(0x41, apic_timer_interrupt);
 
-	/* IRQ '19' (trap 0x41) - self generated IPI for local APIC timer */
-	set_intr_gate(0x31+i, apic_timer_interrupt);
 #endif	
 	request_region(0x20,0x20,"pic1");
 	request_region(0xa0,0x20,"pic2");
 	setup_x86_irq(2, &irq2);
 	setup_x86_irq(13, &irq13);
 } 
+