| author    | Ralf Baechle <ralf@linux-mips.org>                          | 2001-01-11 04:02:40 +0000 |
|-----------|-------------------------------------------------------------|---------------------------|
| committer | Ralf Baechle <ralf@linux-mips.org>                          | 2001-01-11 04:02:40 +0000 |
| commit    | e47f00743fc4776491344f2c618cc8dc2c23bcbc (patch)            |                           |
| tree      | 13e03a113a82a184c51c19c209867cfd3a59b3b9 /arch/ia64/kernel  |                           |
| parent    | b2ad5f821b1381492d792ca10b1eb7a107b48f14 (diff)             |                           |
Merge with Linux 2.4.0.
Diffstat (limited to 'arch/ia64/kernel')
30 files changed, 1933 insertions, 1538 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 7a49511d3..e4ffb3ae6 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -9,20 +9,20 @@ all: kernel.o head.o init_task.o -obj-y := acpi.o entry.o gate.o efi.o efi_stub.o irq.o irq_ia64.o irq_sapic.o ivt.o \ - machvec.o pal.o pci-dma.o process.o perfmon.o ptrace.o sal.o semaphore.o setup.o \ - signal.o sys_ia64.o traps.o time.o unaligned.o unwind.o +O_TARGET := kernel.o -obj-$(CONFIG_IA64_GENERIC) += machvec.o +obj-y := acpi.o entry.o gate.o efi.o efi_stub.o irq.o irq_ia64.o irq_sapic.o ivt.o \ + machvec.o pal.o process.o perfmon.o ptrace.o sal.o semaphore.o setup.o \ + signal.o sys_ia64.o traps.o time.o unaligned.o unwind.o +obj-$(CONFIG_IA64_GENERIC) += machvec.o iosapic.o +obj-$(CONFIG_IA64_DIG) += iosapic.o obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_IA64_MCA) += mca.o mca_asm.o obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o -O_TARGET := kernel.o -O_OBJS := $(obj-y) -OX_OBJS := ia64_ksyms.o +export-objs := ia64_ksyms.o clean:: diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index a8c1ead1f..35ed564c9 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -6,6 +6,12 @@ * * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000 Intel Corp. + * Copyright (C) 2000 J.I. Lee <jung-ik.lee@intel.com> + * ACPI based kernel configuration manager. + * ACPI 2.0 & IA64 ext 0.71 */ #include <linux/config.h> @@ -36,29 +42,87 @@ int __initdata total_cpus; void (*pm_idle)(void); +asm (".weak iosapic_register_legacy_irq"); +asm (".weak iosapic_init"); + +const char * +acpi_get_sysname (void) +{ + /* the following should go away once we have an ACPI parser: */ +#ifdef CONFIG_IA64_GENERIC + return "hpsim"; +#else +# if defined (CONFIG_IA64_HP_SIM) + return "hpsim"; +# elif defined (CONFIG_IA64_SGI_SN1) + return "sn1"; +# elif defined (CONFIG_IA64_DIG) + return "dig"; +# else +# error Unknown platform. Fix acpi.c. +# endif +#endif + +} + /* - * Identify usable CPU's and remember them for SMP bringup later. + * Configure legacy IRQ information. */ static void __init -acpi_lsapic(char *p) +acpi_legacy_irq (char *p) { - int add = 1; - - acpi_entry_lsapic_t *lsapic = (acpi_entry_lsapic_t *) p; + acpi_entry_int_override_t *legacy = (acpi_entry_int_override_t *) p; + unsigned long polarity = 0, edge_triggered = 0; - if ((lsapic->flags & LSAPIC_PRESENT) == 0) + /* + * If the platform we're running doesn't define + * iosapic_register_legacy_irq(), we ignore this info... + */ + if (!iosapic_register_legacy_irq) return; + switch (legacy->flags) { + case 0x5: polarity = 1; edge_triggered = 1; break; + case 0x7: polarity = 0; edge_triggered = 1; break; + case 0xd: polarity = 1; edge_triggered = 0; break; + case 0xf: polarity = 0; edge_triggered = 0; break; + default: + printk(" ACPI Legacy IRQ 0x%02x: Unknown flags 0x%x\n", legacy->isa_irq, + legacy->flags); + break; + } + iosapic_register_legacy_irq(legacy->isa_irq, legacy->pin, polarity, edge_triggered); +} + +/* + * ACPI 2.0 tables parsing functions + */ + +static unsigned long +readl_unaligned(void *p) +{ + unsigned long ret; + + memcpy(&ret, p, sizeof(long)); + return ret; +} + +/* + * Identify usable CPU's and remember them for SMP bringup later. 
+ */ +static void __init +acpi20_lsapic (char *p) +{ + int add = 1; + + acpi20_entry_lsapic_t *lsapic = (acpi20_entry_lsapic_t *) p; printk(" CPU %d (%.04x:%.04x): ", total_cpus, lsapic->eid, lsapic->id); if ((lsapic->flags & LSAPIC_ENABLED) == 0) { printk("Disabled.\n"); add = 0; - } else if (lsapic->flags & LSAPIC_PERFORMANCE_RESTRICTED) { - printk("Performance Restricted; ignoring.\n"); - add = 0; } - + #ifdef CONFIG_SMP smp_boot_data.cpu_phys_id[total_cpus] = -1; #endif @@ -73,87 +137,234 @@ acpi_lsapic(char *p) } /* - * Configure legacy IRQ information in iosapic_vector + * Info on platform interrupt sources: NMI. PMI, INIT, etc. */ static void __init -acpi_legacy_irq(char *p) +acpi20_platform (char *p) { - /* - * This is not good. ACPI is not necessarily limited to CONFIG_IA64_DIG, yet - * ACPI does not necessarily imply IOSAPIC either. Perhaps there should be - * a means for platform_setup() to register ACPI handlers? - */ -#ifdef CONFIG_IA64_IRQ_ACPI - acpi_entry_int_override_t *legacy = (acpi_entry_int_override_t *) p; - unsigned char vector; - int i; + acpi20_entry_platform_src_t *plat = (acpi20_entry_platform_src_t *) p; + + printk("PLATFORM: IOSAPIC %x -> Vector %x on CPU %.04u:%.04u\n", + plat->iosapic_vector, plat->global_vector, plat->eid, plat->id); +} - vector = isa_irq_to_vector(legacy->isa_irq); +/* + * Override the physical address of the local APIC in the MADT stable header. + */ +static void __init +acpi20_lapic_addr_override (char *p) +{ + acpi20_entry_lapic_addr_override_t * lapic = (acpi20_entry_lapic_addr_override_t *) p; + + if (lapic->lapic_address) { + iounmap((void *)ipi_base_addr); + ipi_base_addr = (unsigned long) ioremap(lapic->lapic_address, 0); + + printk("LOCAL ACPI override to 0x%lx(p=0x%lx)\n", + ipi_base_addr, lapic->lapic_address); + } +} + +/* + * Parse the ACPI Multiple APIC Description Table + */ +static void __init +acpi20_parse_madt (acpi_madt_t *madt) +{ + acpi_entry_iosapic_t *iosapic; + char *p, *end; + + /* Base address of IPI Message Block */ + if (madt->lapic_address) { + ipi_base_addr = (unsigned long) ioremap(madt->lapic_address, 0); + printk("Lapic address set to 0x%lx\n", ipi_base_addr); + } else + printk("Lapic address set to default 0x%lx\n", ipi_base_addr); + + p = (char *) (madt + 1); + end = p + (madt->header.length - sizeof(acpi_madt_t)); /* - * Clobber any old pin mapping. It may be that it gets replaced later on + * Splitted entry parsing to ensure ordering. */ - for (i = 0; i < IA64_MAX_VECTORED_IRQ; i++) { - if (i == vector) - continue; - if (iosapic_pin(i) == iosapic_pin(vector)) - iosapic_pin(i) = 0xff; - } - iosapic_pin(vector) = legacy->pin; - iosapic_bus(vector) = BUS_ISA; /* This table only overrides the ISA devices */ - iosapic_busdata(vector) = 0; - - /* - * External timer tick is special... 
- */ - if (vector != TIMER_IRQ) - iosapic_dmode(vector) = IO_SAPIC_LOWEST_PRIORITY; - else - iosapic_dmode(vector) = IO_SAPIC_FIXED; + while (p < end) { + switch (*p) { + case ACPI20_ENTRY_LOCAL_APIC_ADDR_OVERRIDE: + printk("ACPI 2.0 MADT: LOCAL APIC Override\n"); + acpi20_lapic_addr_override(p); + break; + + case ACPI20_ENTRY_LOCAL_SAPIC: + printk("ACPI 2.0 MADT: LOCAL SAPIC\n"); + acpi20_lsapic(p); + break; - /* See MPS 1.4 section 4.3.4 */ - switch (legacy->flags) { - case 0x5: - iosapic_polarity(vector) = IO_SAPIC_POL_HIGH; - iosapic_trigger(vector) = IO_SAPIC_EDGE; - break; - case 0x8: - iosapic_polarity(vector) = IO_SAPIC_POL_LOW; - iosapic_trigger(vector) = IO_SAPIC_EDGE; - break; - case 0xd: - iosapic_polarity(vector) = IO_SAPIC_POL_HIGH; - iosapic_trigger(vector) = IO_SAPIC_LEVEL; - break; - case 0xf: - iosapic_polarity(vector) = IO_SAPIC_POL_LOW; - iosapic_trigger(vector) = IO_SAPIC_LEVEL; - break; - default: - printk(" ACPI Legacy IRQ 0x%02x: Unknown flags 0x%x\n", legacy->isa_irq, - legacy->flags); - break; + case ACPI20_ENTRY_IO_SAPIC: + iosapic = (acpi_entry_iosapic_t *) p; + if (iosapic_init) + iosapic_init(iosapic->address, iosapic->irq_base); + break; + + case ACPI20_ENTRY_PLATFORM_INT_SOURCE: + printk("ACPI 2.0 MADT: PLATFORM INT SOUCE\n"); + acpi20_platform(p); + break; + + case ACPI20_ENTRY_LOCAL_APIC: + printk("ACPI 2.0 MADT: LOCAL APIC entry\n"); break; + case ACPI20_ENTRY_IO_APIC: + printk("ACPI 2.0 MADT: IO APIC entry\n"); break; + case ACPI20_ENTRY_NMI_SOURCE: + printk("ACPI 2.0 MADT: NMI SOURCE entry\n"); break; + case ACPI20_ENTRY_LOCAL_APIC_NMI: + printk("ACPI 2.0 MADT: LOCAL APIC NMI entry\n"); break; + case ACPI20_ENTRY_INT_SRC_OVERRIDE: + break; + default: + printk("ACPI 2.0 MADT: unknown entry skip\n"); break; + break; + } + + p += p[1]; + } + + p = (char *) (madt + 1); + end = p + (madt->header.length - sizeof(acpi_madt_t)); + + while (p < end) { + + switch (*p) { + case ACPI20_ENTRY_INT_SRC_OVERRIDE: + printk("ACPI 2.0 MADT: INT SOURCE Override\n"); + acpi_legacy_irq(p); + break; + default: + break; + } + + p += p[1]; + } + + /* Make bootup pretty */ + printk(" %d CPUs available, %d CPUs total\n", + available_cpus, total_cpus); +} + +int __init +acpi20_parse (acpi20_rsdp_t *rsdp20) +{ + acpi_xsdt_t *xsdt; + acpi_desc_table_hdr_t *hdrp; + int tables, i; + + if (strncmp(rsdp20->signature, ACPI_RSDP_SIG, ACPI_RSDP_SIG_LEN)) { + printk("ACPI 2.0 RSDP signature incorrect!\n"); + return 0; + } else { + printk("ACPI 2.0 Root System Description Ptr at 0x%lx\n", + (unsigned long)rsdp20); + } + + xsdt = __va(rsdp20->xsdt); + hdrp = &xsdt->header; + if (strncmp(hdrp->signature, + ACPI_XSDT_SIG, ACPI_XSDT_SIG_LEN)) { + printk("ACPI 2.0 XSDT signature incorrect. Trying RSDT\n"); + /* RSDT parsing here */ + return 0; + } else { + printk("ACPI 2.0 XSDT at 0x%lx (p=0x%lx)\n", + (unsigned long)xsdt, (unsigned long)rsdp20->xsdt); + } + + printk("ACPI 2.0: %.6s %.8s %d.%d\n", + hdrp->oem_id, + hdrp->oem_table_id, + hdrp->oem_revision >> 16, + hdrp->oem_revision & 0xffff); + +#ifdef CONFIG_ACPI_KERNEL_CONFIG + acpi_cf_init((void *)rsdp20); +#endif + + tables =(hdrp->length -sizeof(acpi_desc_table_hdr_t))>>3; + + for (i = 0; i < tables; i++) { + hdrp = (acpi_desc_table_hdr_t *) __va(readl_unaligned(&xsdt->entry_ptrs[i])); + printk(" :table %4.4s found\n", hdrp->signature); + + /* Only interested int the MADT table for now ... 
*/ + if (strncmp(hdrp->signature, + ACPI_MADT_SIG, ACPI_MADT_SIG_LEN) != 0) + continue; + + acpi20_parse_madt((acpi_madt_t *) hdrp); + } + +#ifdef CONFIG_ACPI_KERNEL_CONFIG + acpi_cf_terminate(); +#endif + +#ifdef CONFIG_SMP + if (available_cpus == 0) { + printk("ACPI: Found 0 CPUS; assuming 1\n"); + available_cpus = 1; /* We've got at least one of these, no? */ + } + smp_boot_data.cpu_count = available_cpus; +#endif + return 1; +} +/* + * ACPI 1.0b with 0.71 IA64 extensions functions; should be removed once all + * platforms start supporting ACPI 2.0 + */ + +/* + * Identify usable CPU's and remember them for SMP bringup later. + */ +static void __init +acpi_lsapic (char *p) +{ + int add = 1; + + acpi_entry_lsapic_t *lsapic = (acpi_entry_lsapic_t *) p; + + if ((lsapic->flags & LSAPIC_PRESENT) == 0) + return; + + printk(" CPU %d (%.04x:%.04x): ", total_cpus, lsapic->eid, lsapic->id); + + if ((lsapic->flags & LSAPIC_ENABLED) == 0) { + printk("Disabled.\n"); + add = 0; + } else if (lsapic->flags & LSAPIC_PERFORMANCE_RESTRICTED) { + printk("Performance Restricted; ignoring.\n"); + add = 0; } -# ifdef ACPI_DEBUG - printk("Legacy ISA IRQ %x -> IA64 Vector %x IOSAPIC Pin %x Active %s %s Trigger\n", - legacy->isa_irq, vector, iosapic_pin(vector), - ((iosapic_polarity(vector) == IO_SAPIC_POL_LOW) ? "Low" : "High"), - ((iosapic_trigger(vector) == IO_SAPIC_LEVEL) ? "Level" : "Edge")); -# endif /* ACPI_DEBUG */ -#endif /* CONFIG_IA64_IRQ_ACPI */ +#ifdef CONFIG_SMP + smp_boot_data.cpu_phys_id[total_cpus] = -1; +#endif + if (add) { + printk("Available.\n"); + available_cpus++; +#ifdef CONFIG_SMP + smp_boot_data.cpu_phys_id[total_cpus] = (lsapic->id << 8) | lsapic->eid; +#endif /* CONFIG_SMP */ + } + total_cpus++; } /* * Info on platform interrupt sources: NMI. PMI, INIT, etc. */ static void __init -acpi_platform(char *p) +acpi_platform (char *p) { acpi_entry_platform_src_t *plat = (acpi_entry_platform_src_t *) p; - printk("PLATFORM: IOSAPIC %x -> Vector %lx on CPU %.04u:%.04u\n", + printk("PLATFORM: IOSAPIC %x -> Vector %x on CPU %.04u:%.04u\n", plat->iosapic_vector, plat->global_vector, plat->eid, plat->id); } @@ -161,8 +372,9 @@ acpi_platform(char *p) * Parse the ACPI Multiple SAPIC Table */ static void __init -acpi_parse_msapic(acpi_sapic_t *msapic) +acpi_parse_msapic (acpi_sapic_t *msapic) { + acpi_entry_iosapic_t *iosapic; char *p, *end; /* Base address of IPI Message Block */ @@ -172,41 +384,31 @@ acpi_parse_msapic(acpi_sapic_t *msapic) end = p + (msapic->header.length - sizeof(acpi_sapic_t)); while (p < end) { - switch (*p) { - case ACPI_ENTRY_LOCAL_SAPIC: + case ACPI_ENTRY_LOCAL_SAPIC: acpi_lsapic(p); break; - case ACPI_ENTRY_IO_SAPIC: - platform_register_iosapic((acpi_entry_iosapic_t *) p); + case ACPI_ENTRY_IO_SAPIC: + iosapic = (acpi_entry_iosapic_t *) p; + if (iosapic_init) + iosapic_init(iosapic->address, iosapic->irq_base); break; - case ACPI_ENTRY_INT_SRC_OVERRIDE: + case ACPI_ENTRY_INT_SRC_OVERRIDE: acpi_legacy_irq(p); break; - - case ACPI_ENTRY_PLATFORM_INT_SOURCE: + + case ACPI_ENTRY_PLATFORM_INT_SOURCE: acpi_platform(p); break; - - default: + + default: break; } /* Move to next table entry. */ -#define BAD_ACPI_TABLE -#ifdef BAD_ACPI_TABLE - /* - * Some prototype Lion's have a bad ACPI table - * requiring this fix. Without this fix, those - * machines crash during bootup. 
- */ - if (p[1] == 0) - p = end; - else -#endif - p += p[1]; + p += p[1]; } /* Make bootup pretty */ @@ -214,24 +416,18 @@ acpi_parse_msapic(acpi_sapic_t *msapic) } int __init -acpi_parse(acpi_rsdp_t *rsdp) +acpi_parse (acpi_rsdp_t *rsdp) { acpi_rsdt_t *rsdt; acpi_desc_table_hdr_t *hdrp; long tables, i; - if (!rsdp) { - printk("Uh-oh, no ACPI Root System Description Pointer table!\n"); - return 0; - } - if (strncmp(rsdp->signature, ACPI_RSDP_SIG, ACPI_RSDP_SIG_LEN)) { printk("Uh-oh, ACPI RSDP signature incorrect!\n"); return 0; } - rsdp->rsdt = __va(rsdp->rsdt); - rsdt = rsdp->rsdt; + rsdt = __va(rsdp->rsdt); if (strncmp(rsdt->header.signature, ACPI_RSDT_SIG, ACPI_RSDT_SIG_LEN)) { printk("Uh-oh, ACPI RDST signature incorrect!\n"); return 0; @@ -256,7 +452,7 @@ acpi_parse(acpi_rsdp_t *rsdp) } #ifdef CONFIG_ACPI_KERNEL_CONFIG - acpi_cf_terminate(); + acpi_cf_terminate(); #endif #ifdef CONFIG_SMP @@ -268,22 +464,3 @@ acpi_parse(acpi_rsdp_t *rsdp) #endif return 1; } - -const char * -acpi_get_sysname (void) -{ - /* the following should go away once we have an ACPI parser: */ -#ifdef CONFIG_IA64_GENERIC - return "hpsim"; -#else -# if defined (CONFIG_IA64_HP_SIM) - return "hpsim"; -# elif defined (CONFIG_IA64_SGI_SN1) - return "sn1"; -# elif defined (CONFIG_IA64_DIG) - return "dig"; -# else -# error Unknown platform. Fix acpi.c. -# endif -#endif -} diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 759db7f52..1ac4e04f4 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -333,6 +333,9 @@ efi_init (void) if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { efi.mps = __va(config_tables[i].table); printk(" MPS=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { + efi.acpi20 = __va(config_tables[i].table); + printk(" ACPI 2.0=0x%lx", config_tables[i].table); } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { efi.acpi = __va(config_tables[i].table); printk(" ACPI=0x%lx", config_tables[i].table); @@ -364,7 +367,7 @@ efi_init (void) #if EFI_DEBUG /* print EFI memory map: */ { - efi_memory_desc_t *md = p; + efi_memory_desc_t *md; void *p; for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) { diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index ffb1760ea..f8c647386 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -11,6 +11,17 @@ * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com> */ /* + * ia64_switch_to now places correct virtual mapping in in TR2 for + * kernel stack. This allows us to handle interrupts without changing + * to physical mode. + * + * ar.k4 is now used to hold last virtual map address + * + * Jonathan Nickin <nicklin@missioncriticallinux.com> + * Patrick O'Rourke <orourke@missioncriticallinux.com> + * 11/07/2000 + / +/* * Global (preserved) predicate usage on syscall entry/exit path: * * pKern: See entry.h. 
@@ -27,7 +38,8 @@ #include <asm/processor.h> #include <asm/unistd.h> #include <asm/asmmacro.h> - +#include <asm/pgtable.h> + #include "entry.h" .text @@ -98,6 +110,8 @@ GLOBAL_ENTRY(sys_clone) br.ret.sptk.many rp END(sys_clone) +#define KSTACK_TR 2 + /* * prev_task <- ia64_switch_to(struct task_struct *next) */ @@ -108,22 +122,55 @@ GLOBAL_ENTRY(ia64_switch_to) UNW(.body) adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 - dep r18=-1,r0,0,61 // build mask 0x1fffffffffffffff + mov r27=ar.k4 + dep r20=0,in0,61,3 // physical address of "current" + ;; + st8 [r22]=sp // save kernel stack pointer of old task + shr.u r26=r20,_PAGE_SIZE_256M + ;; + cmp.eq p7,p6=r26,r0 // check < 256M adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 ;; - st8 [r22]=sp // save kernel stack pointer of old task - ld8 sp=[r21] // load kernel stack pointer of new task - and r20=in0,r18 // physical address of "current" + /* + * If we've already mapped this task's page, we can skip doing it + * again. + */ +(p6) cmp.eq p7,p6=r26,r27 +(p6) br.cond.dpnt.few .map + ;; +.done: ld8 sp=[r21] // load kernel stack pointer of new task +(p6) ssm psr.ic // if we we had to map, renable the psr.ic bit FIRST!!! ;; - mov ar.k6=r20 // copy "current" into ar.k6 - mov r8=r13 // return pointer to previously running task - mov r13=in0 // set "current" pointer +(p6) srlz.d + mov ar.k6=r20 // copy "current" into ar.k6 + mov r8=r13 // return pointer to previously running task + mov r13=in0 // set "current" pointer ;; +(p6) ssm psr.i // renable psr.i AFTER the ic bit is serialized DO_LOAD_SWITCH_STACK( ) + #ifdef CONFIG_SMP - sync.i // ensure "fc"s done by this CPU are visible on other CPUs -#endif - br.ret.sptk.few rp + sync.i // ensure "fc"s done by this CPU are visible on other CPUs +#endif + br.ret.sptk.few rp // boogie on out in new context + +.map: + rsm psr.i | psr.ic + movl r25=__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RWX + ;; + srlz.d + or r23=r25,r20 // construct PA | page properties + mov r25=_PAGE_SIZE_256M<<2 + ;; + mov cr.itir=r25 + mov cr.ifa=in0 // VA of next task... + ;; + mov r25=KSTACK_TR // use tr entry #2... + mov ar.k4=r26 // remember last page we mapped... + ;; + itr.d dtr[r25]=r23 // wire in new mapping... + br.cond.sptk.many .done + ;; END(ia64_switch_to) #ifndef CONFIG_IA64_NEW_UNWIND @@ -503,7 +550,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) ;; ld4 r2=[r2] ;; - shl r2=r2,SMP_LOG_CACHE_BYTES // can't use shladd here... + shl r2=r2,SMP_CACHE_SHIFT // can't use shladd here... 
;; add r3=r2,r3 #else @@ -542,7 +589,7 @@ back_from_resched: // check & deliver pending signals: (p2) br.call.spnt.few rp=handle_signal_delivery .ret9: -#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_IA64_SOFTSDV_HACKS) +#ifdef CONFIG_IA64_SOFTSDV_HACKS // Check for lost ticks rsm psr.i mov r2 = ar.itc @@ -611,14 +658,13 @@ restore_all: mov ar.ccv=r1 mov ar.fpsr=r13 mov b0=r14 - // turn off interrupts, interrupt collection, & data translation - rsm psr.i | psr.ic | psr.dt + // turn off interrupts, interrupt collection + rsm psr.i | psr.ic ;; srlz.i // EAS 2.5 mov b7=r15 ;; invala // invalidate ALAT - dep r12=0,r12,61,3 // convert sp to physical address bsw.0;; // switch back to bank 0 (must be last in insn group) ;; #ifdef CONFIG_ITANIUM_ASTEP_SPECIFIC @@ -757,7 +803,7 @@ END(invoke_schedule_tail) #endif /* CONFIG_SMP */ -#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_IA64_SOFTSDV_HACKS) +#ifdef CONFIG_IA64_SOFTSDV_HACKS ENTRY(invoke_ia64_reset_itm) UNW(.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)) @@ -772,7 +818,7 @@ ENTRY(invoke_ia64_reset_itm) br.ret.sptk.many rp END(invoke_ia64_reset_itm) -#endif /* CONFIG_ITANIUM_ASTEP_SPECIFIC || CONFIG_IA64_SOFTSDV_HACKS */ +#endif /* CONFIG_IA64_SOFTSDV_HACKS */ /* * Invoke do_softirq() while preserving in0-in7, which may be needed @@ -1091,7 +1137,7 @@ sys_call_table: data8 sys_setpriority data8 sys_statfs data8 sys_fstatfs - data8 ia64_ni_syscall + data8 ia64_ni_syscall // 1105 data8 sys_semget data8 sys_semop data8 sys_semctl diff --git a/arch/ia64/kernel/fw-emu.c b/arch/ia64/kernel/fw-emu.c index 34316fe58..e16f23426 100644 --- a/arch/ia64/kernel/fw-emu.c +++ b/arch/ia64/kernel/fw-emu.c @@ -402,7 +402,6 @@ sys_fw_init (const char *args, int arglen) sal_systab->sal_rev_minor = 1; sal_systab->sal_rev_major = 0; sal_systab->entry_count = 1; - sal_systab->ia32_bios_present = 0; #ifdef CONFIG_IA64_GENERIC strcpy(sal_systab->oem_id, "Generic"); diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index e6298b297..abee408f1 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -74,8 +74,8 @@ GLOBAL_ENTRY(_start) ;; #ifdef CONFIG_IA64_EARLY_PRINTK - mov r2=6 - mov r3=(8<<8) | (28<<2) + mov r3=(6<<8) | (28<<2) + movl r2=6<<61 ;; mov rr[r2]=r3 ;; @@ -168,6 +168,11 @@ GLOBAL_ENTRY(ia64_save_debug_regs) add r19=IA64_NUM_DBG_REGS*8,in0 ;; 1: mov r16=dbr[r18] +#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC) \ + || defined(CONFIG_ITANIUM_C0_SPECIFIC) + ;; + srlz.d +#endif mov r17=ibr[r18] add r18=1,r18 ;; @@ -181,7 +186,8 @@ END(ia64_save_debug_regs) GLOBAL_ENTRY(ia64_load_debug_regs) alloc r16=ar.pfs,1,0,0,0 -#if !(defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC)) +#if !(defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) \ + || defined(CONFIG_ITANIUM_B0_SPECIFIC) || defined(CONFIG_ITANIUM_B1_SPECIFIC)) lfetch.nta [in0] #endif mov r20=ar.lc // preserve ar.lc @@ -194,6 +200,11 @@ GLOBAL_ENTRY(ia64_load_debug_regs) add r18=1,r18 ;; mov dbr[r18]=r16 +#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC) \ + || defined(CONFIG_ITANIUM_C0_SPECIFIC) + ;; + srlz.d +#endif mov ibr[r18]=r17 br.cloop.sptk.few 1b ;; @@ -754,7 +765,7 @@ GLOBAL_ENTRY(ia64_spinlock_contention) mov tmp=ar.itc (p15) br.cond.sptk .wait ;; - ld1 tmp=[r31] + ld4 tmp=[r31] ;; cmp.ne p15,p0=tmp,r0 mov tmp=ar.itc @@ -764,7 +775,7 @@ GLOBAL_ENTRY(ia64_spinlock_contention) mov tmp=1 ;; IA64_SEMFIX_INSN - cmpxchg1.acq tmp=[r31],tmp,ar.ccv + 
cmpxchg4.acq tmp=[r31],tmp,ar.ccv ;; cmp.eq p15,p0=tmp,r0 diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index d3d2416cf..f831f86d9 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -24,9 +24,8 @@ EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strtok); -#include <linux/pci.h> -EXPORT_SYMBOL(pci_alloc_consistent); -EXPORT_SYMBOL(pci_free_consistent); +#include <asm/hw_irq.h> +EXPORT_SYMBOL(isa_irq_to_vector_map); #include <linux/in6.h> #include <asm/checksum.h> @@ -49,14 +48,6 @@ EXPORT_SYMBOL(disable_irq_nosync); #include <asm/page.h> EXPORT_SYMBOL(clear_page); -#include <asm/pci.h> -EXPORT_SYMBOL(pci_dma_sync_sg); -EXPORT_SYMBOL(pci_dma_sync_single); -EXPORT_SYMBOL(pci_map_sg); -EXPORT_SYMBOL(pci_map_single); -EXPORT_SYMBOL(pci_unmap_sg); -EXPORT_SYMBOL(pci_unmap_single); - #include <asm/processor.h> EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(kernel_thread); @@ -92,6 +83,9 @@ EXPORT_SYMBOL(__global_restore_flags); #include <asm/uaccess.h> EXPORT_SYMBOL(__copy_user); EXPORT_SYMBOL(__do_clear_user); +EXPORT_SYMBOL(__strlen_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__strnlen_user); #include <asm/unistd.h> EXPORT_SYMBOL(__ia64_syscall); diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c new file mode 100644 index 000000000..9d8408c3f --- /dev/null +++ b/arch/ia64/kernel/iosapic.c @@ -0,0 +1,498 @@ +/* + * I/O SAPIC support. + * + * Copyright (C) 1999 Intel Corp. + * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com> + * Copyright (C) 1999-2000 Hewlett-Packard Co. + * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * + * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O APIC code. + * In particular, we now have separate handlers for edge + * and level triggered interrupts. + * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation + * PCI to vector mapping, shared PCI interrupts. + * 00/10/27 D. Mosberger Document things a bit more to make them more understandable. + * Clean up much of the old IOSAPIC cruft. + */ +/* + * Here is what the interrupt logic between a PCI device and the CPU looks like: + * + * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD). The + * device is uniquely identified by its bus-, device-, and slot-number (the function + * number does not matter here because all functions share the same interrupt + * lines). + * + * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC controller. + * Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level + * triggered and use the same polarity). Each interrupt line has a unique IOSAPIC + * irq number which can be calculated as the sum of the controller's base irq number + * and the IOSAPIC pin number to which the line connects. + * + * (3) The IOSAPIC uses an internal table to map the IOSAPIC pin into the IA-64 interrupt + * vector. This interrupt vector is then sent to the CPU. + * + * In other words, there are two levels of indirections involved: + * + * pci pin -> iosapic irq -> IA-64 vector + * + * Note: outside this module, IA-64 vectors are called "irqs". This is because that's + * the traditional name Linux uses for interrupt vectors. 
+ */ +#include <linux/config.h> + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/string.h> +#include <linux/irq.h> + +#include <asm/acpi-ext.h> +#include <asm/delay.h> +#include <asm/io.h> +#include <asm/iosapic.h> +#include <asm/machvec.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> + +#ifdef CONFIG_ACPI_KERNEL_CONFIG +# include <asm/acpikcfg.h> +#endif + +#undef DEBUG_IRQ_ROUTING + +static spinlock_t iosapic_lock = SPIN_LOCK_UNLOCKED; + +/* PCI pin to IOSAPIC irq routing information. This info typically comes from ACPI. */ + +static struct { + int num_routes; + struct pci_vector_struct *route; +} pci_irq; + +/* This tables maps IA-64 vectors to the IOSAPIC pin that generates this vector. */ + +static struct iosapic_irq { + char *addr; /* base address of IOSAPIC */ + unsigned char base_irq; /* first irq assigned to this IOSAPIC */ + char pin; /* IOSAPIC pin (-1 => not an IOSAPIC irq) */ + unsigned char dmode : 3; /* delivery mode (see iosapic.h) */ + unsigned char polarity : 1; /* interrupt polarity (see iosapic.h) */ + unsigned char trigger : 1; /* trigger mode (see iosapic.h) */ +} iosapic_irq[NR_IRQS]; + +/* + * Translate IOSAPIC irq number to the corresponding IA-64 interrupt vector. If no + * entry exists, return -1. + */ +static int +iosapic_irq_to_vector (int irq) +{ + int vector; + + for (vector = 0; vector < NR_IRQS; ++vector) + if (iosapic_irq[vector].base_irq + iosapic_irq[vector].pin == irq) + return vector; + return -1; +} + +/* + * Map PCI pin to the corresponding IA-64 interrupt vector. If no such mapping exists, + * return -1. + */ +static int +pci_pin_to_vector (int bus, int slot, int pci_pin) +{ + struct pci_vector_struct *r; + + for (r = pci_irq.route; r < pci_irq.route + pci_irq.num_routes; ++r) + if (r->bus == bus && (r->pci_id >> 16) == slot && r->pin == pci_pin) + return iosapic_irq_to_vector(r->irq); + return -1; +} + +static void +set_rte (unsigned int vector, unsigned long dest) +{ + unsigned long pol, trigger, dmode; + u32 low32, high32; + char *addr; + int pin; + + pin = iosapic_irq[vector].pin; + if (pin < 0) + return; /* not an IOSAPIC interrupt */ + + addr = iosapic_irq[vector].addr; + pol = iosapic_irq[vector].polarity; + trigger = iosapic_irq[vector].trigger; + dmode = iosapic_irq[vector].dmode; + + low32 = ((pol << IOSAPIC_POLARITY_SHIFT) | + (trigger << IOSAPIC_TRIGGER_SHIFT) | + (dmode << IOSAPIC_DELIVERY_SHIFT) | + vector); + +#ifdef CONFIG_IA64_AZUSA_HACKS + /* set Flush Disable bit */ + if (addr != (char *) 0xc0000000fec00000) + low32 |= (1 << 17); +#endif + + /* dest contains both id and eid */ + high32 = (dest << IOSAPIC_DEST_SHIFT); + + writel(IOSAPIC_RTE_HIGH(pin), addr + IOSAPIC_REG_SELECT); + writel(high32, addr + IOSAPIC_WINDOW); + writel(IOSAPIC_RTE_LOW(pin), addr + IOSAPIC_REG_SELECT); + writel(low32, addr + IOSAPIC_WINDOW); +} + +static void +nop (unsigned int vector) +{ + /* do nothing... */ +} + +static void +mask_irq (unsigned int vector) +{ + unsigned long flags; + char *addr; + u32 low32; + int pin; + + addr = iosapic_irq[vector].addr; + pin = iosapic_irq[vector].pin; + + if (pin < 0) + return; /* not an IOSAPIC interrupt! 
*/ + + spin_lock_irqsave(&iosapic_lock, flags); + { + writel(IOSAPIC_RTE_LOW(pin), addr + IOSAPIC_REG_SELECT); + low32 = readl(addr + IOSAPIC_WINDOW); + + low32 |= (1 << IOSAPIC_MASK_SHIFT); /* set only the mask bit */ + writel(low32, addr + IOSAPIC_WINDOW); + } + spin_unlock_irqrestore(&iosapic_lock, flags); +} + +static void +unmask_irq (unsigned int vector) +{ + unsigned long flags; + char *addr; + u32 low32; + int pin; + + addr = iosapic_irq[vector].addr; + pin = iosapic_irq[vector].pin; + if (pin < 0) + return; /* not an IOSAPIC interrupt! */ + + spin_lock_irqsave(&iosapic_lock, flags); + { + writel(IOSAPIC_RTE_LOW(pin), addr + IOSAPIC_REG_SELECT); + low32 = readl(addr + IOSAPIC_WINDOW); + + low32 &= ~(1 << IOSAPIC_MASK_SHIFT); /* clear only the mask bit */ + writel(low32, addr + IOSAPIC_WINDOW); + } + spin_unlock_irqrestore(&iosapic_lock, flags); +} + + +static void +iosapic_set_affinity (unsigned int vector, unsigned long mask) +{ + printk("iosapic_set_affinity: not implemented yet\n"); +} + +/* + * Handlers for level-triggered interrupts. + */ + +static unsigned int +iosapic_startup_level_irq (unsigned int vector) +{ + unmask_irq(vector); + return 0; +} + +static void +iosapic_end_level_irq (unsigned int vector) +{ + writel(vector, iosapic_irq[vector].addr + IOSAPIC_EOI); +} + +#define iosapic_shutdown_level_irq mask_irq +#define iosapic_enable_level_irq unmask_irq +#define iosapic_disable_level_irq mask_irq +#define iosapic_ack_level_irq nop + +struct hw_interrupt_type irq_type_iosapic_level = { + typename: "IO-SAPIC-level", + startup: iosapic_startup_level_irq, + shutdown: iosapic_shutdown_level_irq, + enable: iosapic_enable_level_irq, + disable: iosapic_disable_level_irq, + ack: iosapic_ack_level_irq, + end: iosapic_end_level_irq, + set_affinity: iosapic_set_affinity +}; + +/* + * Handlers for edge-triggered interrupts. + */ + +static unsigned int +iosapic_startup_edge_irq (unsigned int vector) +{ + unmask_irq(vector); + /* + * IOSAPIC simply drops interrupts pended while the + * corresponding pin was masked, so we can't know if an + * interrupt is pending already. Let's hope not... + */ + return 0; +} + +static void +iosapic_ack_edge_irq (unsigned int vector) +{ + /* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ + if ((irq_desc[vector].status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED)) + mask_irq(vector); +} + +#define iosapic_enable_edge_irq unmask_irq +#define iosapic_disable_edge_irq nop +#define iosapic_end_edge_irq nop + +struct hw_interrupt_type irq_type_iosapic_edge = { + typename: "IO-SAPIC-edge", + startup: iosapic_startup_edge_irq, + shutdown: iosapic_disable_edge_irq, + enable: iosapic_enable_edge_irq, + disable: iosapic_disable_edge_irq, + ack: iosapic_ack_edge_irq, + end: iosapic_end_edge_irq, + set_affinity: iosapic_set_affinity +}; + +static unsigned int +iosapic_version (char *addr) +{ + /* + * IOSAPIC Version Register return 32 bit structure like: + * { + * unsigned int version : 8; + * unsigned int reserved1 : 8; + * unsigned int pins : 8; + * unsigned int reserved2 : 8; + * } + */ + writel(IOSAPIC_VERSION, addr + IOSAPIC_REG_SELECT); + return readl(IOSAPIC_WINDOW + addr); +} + +/* + * ACPI calls this when it finds an entry for a legacy ISA interrupt. Note that the + * irq_base and IOSAPIC address must be set in iosapic_init(). 
+ */ +void +iosapic_register_legacy_irq (unsigned long irq, + unsigned long pin, unsigned long polarity, + unsigned long edge_triggered) +{ + unsigned int vector = isa_irq_to_vector(irq); + +#ifdef DEBUG_IRQ_ROUTING + printk("ISA: IRQ %u -> IOSAPIC irq 0x%02x (%s, %s) -> vector %02x\n", + (unsigned) irq, (unsigned) pin, + polarity ? "high" : "low", edge_triggered ? "edge" : "level", + vector); +#endif + + iosapic_irq[vector].pin = pin; + iosapic_irq[vector].dmode = IOSAPIC_LOWEST_PRIORITY; + iosapic_irq[vector].polarity = polarity ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW; + iosapic_irq[vector].trigger = edge_triggered ? IOSAPIC_EDGE : IOSAPIC_LEVEL; +} + +void __init +iosapic_init (unsigned long phys_addr, unsigned int base_irq) +{ + struct hw_interrupt_type *irq_type; + int i, irq, max_pin, vector; + unsigned int ver; + char *addr; + static int first_time = 1; + + if (first_time) { + first_time = 0; + + for (vector = 0; vector < NR_IRQS; ++vector) + iosapic_irq[vector].pin = -1; /* mark as unused */ + + /* + * Fetch the PCI interrupt routing table: + */ +#ifdef CONFIG_ACPI_KERNEL_CONFIG + acpi_cf_get_pci_vectors(&pci_irq.route, &pci_irq.num_routes); +#else + pci_irq.route = + (struct pci_vector_struct *) __va(ia64_boot_param.pci_vectors); + pci_irq.num_routes = ia64_boot_param.num_pci_vectors; +#endif + } + + addr = ioremap(phys_addr, 0); + + ver = iosapic_version(addr); + max_pin = (ver >> 16) & 0xff; + + printk("IOSAPIC: version %x.%x, address 0x%lx, IRQs 0x%02x-0x%02x\n", + (ver & 0xf0) >> 4, (ver & 0x0f), phys_addr, base_irq, base_irq + max_pin); + + if (base_irq == 0) + /* + * Map the legacy ISA devices into the IOSAPIC data. Some of these may + * get reprogrammed later on with data from the ACPI Interrupt Source + * Override table. + */ + for (irq = 0; irq < 16; ++irq) { + vector = isa_irq_to_vector(irq); + iosapic_irq[vector].addr = addr; + iosapic_irq[vector].base_irq = 0; + if (iosapic_irq[vector].pin == -1) + iosapic_irq[vector].pin = irq; + iosapic_irq[vector].dmode = IOSAPIC_LOWEST_PRIORITY; + iosapic_irq[vector].trigger = IOSAPIC_EDGE; + iosapic_irq[vector].polarity = IOSAPIC_POL_HIGH; +#ifdef DEBUG_IRQ_ROUTING + printk("ISA: IRQ %u -> IOSAPIC irq 0x%02x (high, edge) -> vector 0x%02x\n", + irq, iosapic_irq[vector].base_irq + iosapic_irq[vector].pin, + vector); +#endif + irq_type = &irq_type_iosapic_edge; + if (irq_desc[vector].handler != irq_type) { + if (irq_desc[vector].handler != &no_irq_type) + printk("iosapic_init: changing vector 0x%02x from %s to " + "%s\n", irq, irq_desc[vector].handler->typename, + irq_type->typename); + irq_desc[vector].handler = irq_type; + } + + /* program the IOSAPIC routing table: */ + set_rte(vector, (ia64_get_lid() >> 16) & 0xffff); + } + +#ifndef CONFIG_IA64_SOFTSDV_HACKS + for (i = 0; i < pci_irq.num_routes; i++) { + irq = pci_irq.route[i].irq; + + if ((unsigned) (irq - base_irq) > max_pin) + /* the interrupt route is for another controller... 
*/ + continue; + + if (irq < 16) + vector = isa_irq_to_vector(irq); + else { + vector = iosapic_irq_to_vector(irq); + if (vector < 0) + /* new iosapic irq: allocate a vector for it */ + vector = ia64_alloc_irq(); + } + + iosapic_irq[vector].addr = addr; + iosapic_irq[vector].base_irq = base_irq; + iosapic_irq[vector].pin = (irq - base_irq); + iosapic_irq[vector].dmode = IOSAPIC_LOWEST_PRIORITY; + iosapic_irq[vector].trigger = IOSAPIC_LEVEL; + iosapic_irq[vector].polarity = IOSAPIC_POL_LOW; + +# ifdef DEBUG_IRQ_ROUTING + printk("PCI: (B%d,I%d,P%d) -> IOSAPIC irq 0x%02x -> vector 0x%02x\n", + pci_irq.route[i].bus, pci_irq.route[i].pci_id>>16, pci_irq.route[i].pin, + iosapic_irq[vector].base_irq + iosapic_irq[vector].pin, vector); +# endif + irq_type = &irq_type_iosapic_level; + if (irq_desc[vector].handler != irq_type){ + if (irq_desc[vector].handler != &no_irq_type) + printk("iosapic_init: changing vector 0x%02x from %s to %s\n", + vector, irq_desc[vector].handler->typename, + irq_type->typename); + irq_desc[vector].handler = irq_type; + } + + /* program the IOSAPIC routing table: */ + set_rte(vector, (ia64_get_lid() >> 16) & 0xffff); + } +#endif /* !CONFIG_IA64_SOFTSDV_HACKS */ +} + +void +iosapic_pci_fixup (int phase) +{ + struct pci_dev *dev; + unsigned char pin; + int vector; + + if (phase != 1) + return; + + pci_for_each_dev(dev) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (pin) { + pin--; /* interrupt pins are numbered starting from 1 */ + vector = pci_pin_to_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + if (vector < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; + + if (bridge) { + /* allow for multiple bridges on an adapter */ + do { + /* do the bridge swizzle... */ + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + vector = pci_pin_to_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), + pin); + } while (vector < 0 && (bridge = bridge->bus->self)); + } + if (vector >= 0) + printk(KERN_WARNING + "PCI: using PPB(B%d,I%d,P%d) to get vector %02x\n", + bridge->bus->number, PCI_SLOT(bridge->devfn), + pin, vector); + else + printk(KERN_WARNING + "PCI: Couldn't map irq for (B%d,I%d,P%d)o\n", + bridge->bus->number, PCI_SLOT(bridge->devfn), + pin); + } + if (vector >= 0) { + printk("PCI->APIC IRQ transform: (B%d,I%d,P%d) -> 0x%02x\n", + dev->bus->number, PCI_SLOT(dev->devfn), pin, vector); + dev->irq = vector; + } + } + /* + * Nothing to fixup + * Fix out-of-range IRQ numbers + */ + if (dev->irq >= NR_IRQS) + dev->irq = 15; /* Spurious interrupts */ + } +} diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index b3646e275..ab8961a54 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -541,6 +541,18 @@ void enable_irq(unsigned int irq) spin_unlock_irqrestore(&desc->lock, flags); } +void do_IRQ_per_cpu(unsigned long irq, struct pt_regs *regs) +{ + irq_desc_t *desc = irq_desc + irq; + int cpu = smp_processor_id(); + + kstat.irqs[cpu][irq]++; + + desc->handler->ack(irq); + handle_IRQ_event(irq, regs, desc->action); + desc->handler->end(irq); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific @@ -581,8 +593,7 @@ unsigned int do_IRQ(unsigned long irq, struct pt_regs *regs) if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { action = desc->action; status &= ~IRQ_PENDING; /* we commit to handling */ - if (!(status & IRQ_PER_CPU)) - status |= IRQ_INPROGRESS; /* we are handling it */ + status |= IRQ_INPROGRESS; /* we are handling it */ } 
desc->status = status; diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 2166e205f..155ee66b7 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -7,6 +7,9 @@ * * 6/10/99: Updated to bring in sync with x86 version to facilitate * support for SMP and different interrupt controllers. + * + * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector + * PCI to vector allocation routine. */ #include <linux/config.h> @@ -35,38 +38,28 @@ #define IRQ_DEBUG 0 -#ifdef CONFIG_ITANIUM_A1_SPECIFIC -spinlock_t ivr_read_lock; -#endif - /* default base addr of IPI table */ unsigned long ipi_base_addr = (__IA64_UNCACHED_OFFSET | IPI_DEFAULT_BASE_ADDR); /* - * Legacy IRQ to IA-64 vector translation table. Any vector not in - * this table maps to itself (ie: irq 0x30 => IA64 vector 0x30) + * Legacy IRQ to IA-64 vector translation table. */ __u8 isa_irq_to_vector_map[16] = { /* 8259 IRQ translation, first 16 entries */ - 0x60, 0x50, 0x10, 0x51, 0x52, 0x53, 0x43, 0x54, - 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x40, 0x41 + 0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, + 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21 }; -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - -int usbfix; - -static int __init -usbfix_option (char *str) +int +ia64_alloc_irq (void) { - printk("irq: enabling USB workaround\n"); - usbfix = 1; - return 1; -} + static int next_irq = FIRST_DEVICE_IRQ; -__setup("usbfix", usbfix_option); - -#endif /* CONFIG_ITANIUM_A1_SPECIFIC */ + if (next_irq > LAST_DEVICE_IRQ) + /* XXX could look for sharable vectors instead of panic'ing... */ + panic("ia64_alloc_irq: out of interrupt vectors!"); + return next_irq++; +} /* * That's where the IVT branches when we get an external @@ -77,42 +70,6 @@ void ia64_handle_irq (unsigned long vector, struct pt_regs *regs) { unsigned long saved_tpr; -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - unsigned long eoi_ptr; - -# ifdef CONFIG_USB - extern void reenable_usb (void); - extern void disable_usb (void); - - if (usbfix) - disable_usb(); -# endif - /* - * Stop IPIs by getting the ivr_read_lock - */ - spin_lock(&ivr_read_lock); - { - unsigned int tmp; - /* - * Disable PCI writes - */ - outl(0x80ff81c0, 0xcf8); - tmp = inl(0xcfc); - outl(tmp | 0x400, 0xcfc); - eoi_ptr = inl(0xcfc); - vector = ia64_get_ivr(); - /* - * Enable PCI writes - */ - outl(tmp, 0xcfc); - } - spin_unlock(&ivr_read_lock); - -# ifdef CONFIG_USB - if (usbfix) - reenable_usb(); -# endif -#endif /* CONFIG_ITANIUM_A1_SPECIFIC */ #if IRQ_DEBUG { @@ -161,7 +118,10 @@ ia64_handle_irq (unsigned long vector, struct pt_regs *regs) ia64_set_tpr(vector); ia64_srlz_d(); - do_IRQ(vector, regs); + if ((irq_desc[vector].status & IRQ_PER_CPU) != 0) + do_IRQ_per_cpu(vector, regs); + else + do_IRQ(vector, regs); /* * Disable interrupts and send EOI: @@ -169,9 +129,6 @@ ia64_handle_irq (unsigned long vector, struct pt_regs *regs) local_irq_disable(); ia64_set_tpr(saved_tpr); ia64_eoi(); -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - break; -#endif vector = ia64_get_ivr(); } while (vector != IA64_SPURIOUS_INT); } @@ -194,8 +151,8 @@ init_IRQ (void) * Disable all local interrupts */ ia64_set_itv(0, 1); - ia64_set_lrr0(0, 1); - ia64_set_lrr1(0, 1); + ia64_set_lrr0(0, 1); + ia64_set_lrr1(0, 1); irq_desc[IA64_SPURIOUS_INT].handler = &irq_type_ia64_sapic; #ifdef CONFIG_SMP @@ -217,14 +174,11 @@ init_IRQ (void) } void -ipi_send (int cpu, int vector, int delivery_mode, int redirect) +ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect) { unsigned long ipi_addr; unsigned long 
ipi_data; unsigned long phys_cpu_id; -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - unsigned long flags; -#endif #ifdef CONFIG_SMP phys_cpu_id = cpu_physical_id(cpu); @@ -239,13 +193,5 @@ ipi_send (int cpu, int vector, int delivery_mode, int redirect) ipi_data = (delivery_mode << 8) | (vector & 0xff); ipi_addr = ipi_base_addr | (phys_cpu_id << 4) | ((redirect & 1) << 3); -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - spin_lock_irqsave(&ivr_read_lock, flags); -#endif - writeq(ipi_data, ipi_addr); - -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - spin_unlock_irqrestore(&ivr_read_lock, flags); -#endif } diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index fa0ad0993..b75cd9dbc 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -6,6 +6,7 @@ * Copyright (C) 1998-2000 David Mosberger <davidm@hpl.hp.com> * * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP + * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT. */ /* * This file defines the interrupt vector table used by the CPU. @@ -44,23 +45,13 @@ #include <asm/system.h> #include <asm/unistd.h> -#define MINSTATE_START_SAVE_MIN /* no special action needed */ -#define MINSTATE_END_SAVE_MIN \ - or r2=r2,r14; /* make first base a kernel virtual address */ \ - or r12=r12,r14; /* make sp a kernel virtual address */ \ - or r13=r13,r14; /* make `current' a kernel virtual address */ \ - bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ - ;; - +#define MINSTATE_VIRT /* needed by minstate.h */ #include "minstate.h" #define FAULT(n) \ - rsm psr.dt; /* avoid nested faults due to TLB misses... */ \ - ;; \ - srlz.d; /* ensure everyone knows psr.dt is off... */ \ mov r31=pr; \ mov r19=n;; /* prepare to save predicates */ \ - br.cond.sptk.many dispatch_to_fault_handler + br.sptk.many dispatch_to_fault_handler /* * As we don't (hopefully) use the space available, we need to fill it with @@ -122,15 +113,14 @@ ia64_ivt: (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place srlz.d // ensure "rsm psr.dt" has taken effect (p6) movl r19=__pa(SWAPPER_PGD_ADDR) // region 5 is rooted at swapper_pg_dir -(p6) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-1 -(p7) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-4 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) cmp.eq p7,p6=0,r21 // unused address bits all zeroes? shr.u r18=r16,PMD_SHIFT // shift L2 index into position ;; -(p6) cmp.eq p7,p6=-1,r21 // unused address bits all ones? ld8 r17=[r17] // fetch the L1 entry (may be 0) ;; (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? @@ -145,7 +135,7 @@ ia64_ivt: (p7) ld8 r18=[r21] // read the L3 PTE mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss ;; -(p7) tbit.z p6,p7=r18,0 // page present bit cleared? +(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? mov r22=cr.iha // get the VHPT address that caused the TLB miss ;; // avoid RAW on p7 (p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss? @@ -153,7 +143,7 @@ ia64_ivt: ;; (p10) itc.i r18 // insert the instruction TLB entry (p11) itc.d r18 // insert the data TLB entry -(p6) br.spnt.few page_fault // handle bad address/page not present (page fault) +(p6) br.spnt.many page_fault // handle bad address/page not present (page fault) mov cr.ifa=r22 // Now compute and insert the TLB entry for the virtual page table. 
@@ -183,212 +173,117 @@ ia64_ivt: mov pr=r31,-1 // restore predicate registers rfi + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x0400 Entry 1 (size 64 bundles) ITLB (21) /* - * The ITLB basically does the same as the VHPT handler except - * that we always insert exactly one instruction TLB entry. - */ - /* - * Attempt to lookup PTE through virtual linear page table. - * The speculative access will fail if there is no TLB entry - * for the L3 page table page we're trying to access. + * The ITLB handler accesses the L3 PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the L3 PTE read + * and go on normally after that. */ +itlb_fault: mov r16=cr.ifa // get virtual address - mov r19=cr.iha // get virtual address of L3 PTE - ;; - ld8.s r17=[r19] // try to read L3 PTE + mov r29=b0 // save b0 mov r31=pr // save predicates + mov r17=cr.iha // get virtual address of L3 PTE + movl r30=1f // load nested fault continuation point ;; - tnat.nz p6,p0=r17 // did read succeed? -(p6) br.cond.spnt.many 1f +1: ld8 r18=[r17] // read L3 PTE ;; - itc.i r17 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.cond.spnt.many page_fault + ;; + itc.i r18 ;; #ifdef CONFIG_SMP - ld8.s r18=[r19] // try to read L3 PTE again and see if same + ld8 r19=[r17] // read L3 PTE again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; - cmp.eq p6,p7=r17,r18 + cmp.ne p7,p0=r18,r19 ;; (p7) ptc.l r16,r20 #endif mov pr=r31,-1 rfi - -#ifdef CONFIG_DISABLE_VHPT -itlb_fault: -#endif -1: rsm psr.dt // use physical addressing for data - mov r19=ar.k7 // get page table base address - shl r21=r16,3 // shift bit 60 into sign bit - shr.u r17=r16,61 // get the region number into r17 ;; - cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? - shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of the faulting address - ;; -(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place - srlz.d // ensure "rsm psr.dt" has taken effect -(p6) movl r19=__pa(SWAPPER_PGD_ADDR) // region 5 is rooted at swapper_pg_dir -(p6) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-1 -(p7) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-4 - ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) - cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r16,PMD_SHIFT // shift L2 index into position - ;; -(p6) cmp.eq p7,p6=-1,r21 // unused address bits all ones? - ld8 r17=[r17] // fetch the L1 entry (may be 0) - ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry - ;; -(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r16,PAGE_SHIFT // shift L3 index into position - ;; -(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? - dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry - ;; -(p7) ld8 r18=[r17] // read the L3 PTE - ;; -(p7) tbit.z p6,p7=r18,0 // page present bit cleared? 
- ;; -(p7) itc.i r18 // insert the instruction TLB entry -(p6) br.spnt.few page_fault // handle bad address/page not present (page fault) - ;; -#ifdef CONFIG_SMP - ld8 r19=[r17] // re-read the PTE and check if same - ;; - cmp.eq p6,p7=r18,r19 - mov r20=PAGE_SHIFT<<2 - ;; -(p7) ptc.l r16,r20 // PTE changed purge translation -#endif - - mov pr=r31,-1 // restore predicate registers - rfi .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) /* - * The DTLB basically does the same as the VHPT handler except - * that we always insert exactly one data TLB entry. - */ - /* - * Attempt to lookup PTE through virtual linear page table. - * The speculative access will fail if there is no TLB entry - * for the L3 page table page we're trying to access. + * The DTLB handler accesses the L3 PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the L3 PTE read + * and go on normally after that. */ +dtlb_fault: mov r16=cr.ifa // get virtual address - mov r19=cr.iha // get virtual address of L3 PTE - ;; - ld8.s r17=[r19] // try to read L3 PTE + mov r29=b0 // save b0 mov r31=pr // save predicates + mov r17=cr.iha // get virtual address of L3 PTE + movl r30=1f // load nested fault continuation point + ;; +1: ld8 r18=[r17] // read L3 PTE ;; - tnat.nz p6,p0=r17 // did read succeed? -(p6) br.cond.spnt.many 1f + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.cond.spnt.many page_fault ;; - itc.d r17 + itc.d r18 ;; #ifdef CONFIG_SMP - ld8.s r18=[r19] // try to read L3 PTE again and see if same + ld8 r19=[r17] // read L3 PTE again and see if same mov r20=PAGE_SHIFT<<2 // setup page size for purge ;; - cmp.eq p6,p7=r17,r18 + cmp.ne p7,p0=r18,r19 ;; (p7) ptc.l r16,r20 #endif mov pr=r31,-1 rfi - -#ifdef CONFIG_DISABLE_VHPT -dtlb_fault: -#endif -1: rsm psr.dt // use physical addressing for data - mov r19=ar.k7 // get page table base address - shl r21=r16,3 // shift bit 60 into sign bit - shr.u r17=r16,61 // get the region number into r17 - ;; - cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? - shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of the faulting address ;; -(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place - srlz.d // ensure "rsm psr.dt" has taken effect -(p6) movl r19=__pa(SWAPPER_PGD_ADDR) // region 5 is rooted at swapper_pg_dir -(p6) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-1 -(p7) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-4 - ;; -(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 -(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) - cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r16,PMD_SHIFT // shift L2 index into position - ;; -(p6) cmp.eq p7,p6=-1,r21 // unused address bits all ones? - ld8 r17=[r17] // fetch the L1 entry (may be 0) - ;; -(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? - dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry - ;; -(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r16,PAGE_SHIFT // shift L3 index into position - ;; -(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? - dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry - ;; -(p7) ld8 r18=[r17] // read the L3 PTE - ;; -(p7) tbit.z p6,p7=r18,0 // page present bit cleared? 
- ;; -(p7) itc.d r18 // insert the instruction TLB entry -(p6) br.spnt.few page_fault // handle bad address/page not present (page fault) - ;; -#ifdef CONFIG_SMP - ld8 r19=[r17] // re-read the PTE and check if same - ;; - cmp.eq p6,p7=r18,r19 - mov r20=PAGE_SHIFT<<2 - ;; -(p7) ptc.l r16,r20 // PTE changed purge translation -#endif - mov pr=r31,-1 // restore predicate registers - rfi .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) mov r16=cr.ifa // get address that caused the TLB miss -#ifdef CONFIG_DISABLE_VHPT + movl r17=__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RWX + mov r21=cr.ipsr mov r31=pr ;; - shr.u r21=r16,61 // get the region number into r21 +#ifdef CONFIG_DISABLE_VHPT + shr.u r22=r16,61 // get the region number into r21 ;; - cmp.gt p6,p0=6,r21 // user mode -(p6) br.cond.dptk.many itlb_fault + cmp.gt p8,p0=6,r22 // user mode ;; - mov pr=r31,-1 -#endif - movl r17=__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RX +(p8) thash r17=r16 ;; +(p8) mov cr.iha=r17 +(p8) br.cond.dptk.many itlb_fault +#endif + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl shr.u r18=r16,57 // move address bit 61 to bit 4 - dep r16=0,r16,IA64_MAX_PHYS_BITS,(64-IA64_MAX_PHYS_BITS) // clear ed & reserved bits + dep r19=0,r16,IA64_MAX_PHYS_BITS,(64-IA64_MAX_PHYS_BITS) // clear ed & reserved bits ;; andcm r18=0x10,r18 // bit 4=~address-bit(61) - dep r16=r17,r16,0,12 // insert PTE control bits into r16 + cmp.ne p8,p0=r0,r23 // psr.cpl != 0? + dep r19=r17,r19,0,12 // insert PTE control bits into r19 ;; - or r16=r16,r18 // set bit 4 (uncached) if the access was to region 6 + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 +(p8) br.cond.spnt.many page_fault ;; - itc.i r16 // insert the TLB entry + itc.i r19 // insert the TLB entry + mov pr=r31,-1 rfi + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) mov r16=cr.ifa // get address that caused the TLB miss - movl r17=__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RW + movl r17=__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RWX mov r20=cr.isr mov r21=cr.ipsr mov r31=pr @@ -396,29 +291,40 @@ dtlb_fault: #ifdef CONFIG_DISABLE_VHPT shr.u r22=r16,61 // get the region number into r21 ;; - cmp.gt p8,p0=6,r22 // user mode + cmp.gt p8,p0=6,r22 // access to region 0-5 + ;; +(p8) thash r17=r16 + ;; +(p8) mov cr.iha=r17 (p8) br.cond.dptk.many dtlb_fault #endif + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? 
shr.u r18=r16,57 // move address bit 61 to bit 4 - dep r16=0,r16,IA64_MAX_PHYS_BITS,(64-IA64_MAX_PHYS_BITS) // clear ed & reserved bits + dep r19=0,r16,IA64_MAX_PHYS_BITS,(64-IA64_MAX_PHYS_BITS) // clear ed & reserved bits ;; - dep r21=-1,r21,IA64_PSR_ED_BIT,1 andcm r18=0x10,r18 // bit 4=~address-bit(61) - dep r16=r17,r16,0,12 // insert PTE control bits into r16 + cmp.ne p8,p0=r0,r23 +(p8) br.cond.spnt.many page_fault + + dep r21=-1,r21,IA64_PSR_ED_BIT,1 + dep r19=r17,r19,0,12 // insert PTE control bits into r19 ;; - or r16=r16,r18 // set bit 4 (uncached) if the access was to region 6 + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 (p6) mov cr.ipsr=r21 ;; -(p7) itc.d r16 // insert the TLB entry +(p7) itc.d r19 // insert the TLB entry mov pr=r31,-1 rfi - ;; //----------------------------------------------------------------------------------- - // call do_page_fault (predicates are in r31, psr.dt is off, r16 is faulting address) + // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address) page_fault: + ssm psr.dt + ;; + srlz.i + ;; SAVE_MIN_WITH_COVER // // Copy control registers to temporary registers, then turn on psr bits, @@ -430,7 +336,7 @@ page_fault: mov r9=cr.isr adds r3=8,r2 // set up second base pointer ;; - ssm psr.ic | psr.dt + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -445,36 +351,37 @@ page_fault: mov rp=r14 ;; adds out2=16,r12 // out2 = pointer to pt_regs - br.call.sptk.few b6=ia64_do_page_fault // ignore return address + br.call.sptk.many b6=ia64_do_page_fault // ignore return address + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) // - // In the absence of kernel bugs, we get here when the Dirty-bit, Instruction - // Access-bit, or Data Access-bit faults cause a nested fault because the - // dTLB entry for the virtual page table isn't present. In such a case, - // we lookup the pte for the faulting address by walking the page table - // and return to the continuation point passed in register r30. - // In accessing the page tables, we don't need to check for NULL entries - // because if the page tables didn't map the faulting address, it would not - // be possible to receive one of the above faults. + // In the absence of kernel bugs, we get here when the virtually mapped linear page + // table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction + // Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page + // table is missing, a nested TLB miss fault is triggered and control is transferred + // to this point. When this happens, we lookup the pte for the faulting address + // by walking the page table in physical mode and return to the continuation point + // passed in register r30 (or call page_fault if the address is not mapped). 
// // Input: r16: faulting address // r29: saved b0 // r30: continuation address + // r31: saved pr // // Output: r17: physical address of L3 PTE of faulting address // r29: saved b0 // r30: continuation address + // r31: saved pr // - // Clobbered: b0, r18, r19, r21, r31, psr.dt (cleared) + // Clobbered: b0, r18, r19, r21, psr.dt (cleared) // rsm psr.dt // switch to using physical data addressing mov r19=ar.k7 // get the page table base address shl r21=r16,3 // shift bit 60 into sign bit ;; - mov r31=pr // save the predicate registers shr.u r17=r16,61 // get the region number into r17 ;; cmp.eq p6,p7=5,r17 // is faulting address in region 5? @@ -482,26 +389,30 @@ page_fault: ;; (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place srlz.d -(p6) movl r17=__pa(SWAPPER_PGD_ADDR) // region 5 is rooted at swapper_pg_dir -(p6) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-1 -(p7) shr r21=r21,PGDIR_SHIFT+PAGE_SHIFT-4 +(p6) movl r19=__pa(SWAPPER_PGD_ADDR) // region 5 is rooted at swapper_pg_dir +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 ;; -(p6) dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) + cmp.eq p7,p6=0,r21 // unused address bits all zeroes? shr.u r18=r16,PMD_SHIFT // shift L2 index into position ;; - ld8 r17=[r17] // fetch the L1 entry + ld8 r17=[r17] // fetch the L1 entry (may be 0) mov b0=r30 ;; +(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry ;; - ld8 r17=[r17] // fetch the L2 entry +(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) shr.u r19=r16,PAGE_SHIFT // shift L3 index into position ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry ;; - mov pr=r31,-1 // restore predicates - br.cond.sptk.few b0 // return to continuation point +(p6) br.cond.spnt.many page_fault + br.sptk.many b0 // return to continuation point + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -526,33 +437,19 @@ page_fault: // a nested TLB miss hit where we look up the physical address of the L3 PTE // and then continue at label 1 below. 
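In C terms, the lookup the nested-TLB handler above performs (walking the page table in physical mode and bailing out to page_fault when an intermediate entry is NULL) looks roughly like the sketch below. The page-size constant, the types and the helper name are assumptions made for illustration only — they are not the kernel's actual definitions, and the real handler additionally folds the region bits into the pgd index and special-cases region 5 (swapper_pg_dir).

#include <stdint.h>

#define PAGE_SHIFT_X   14                              /* assumed 16KB pages, illustrative */
#define PTRS_PER_PT_X  (1UL << (PAGE_SHIFT_X - 3))     /* 8-byte entries per level */
#define PMD_SHIFT_X    (PAGE_SHIFT_X + (PAGE_SHIFT_X - 3))
#define PGDIR_SHIFT_X  (PMD_SHIFT_X + (PAGE_SHIFT_X - 3))

/* Return the address of the L3 PTE for 'addr', or NULL to signal page_fault. */
uint64_t *lookup_l3_pte(uint64_t *pgd, uint64_t addr)
{
    uint64_t l1_entry, l2_entry;
    uint64_t *pmd, *pte;

    l1_entry = pgd[(addr >> PGDIR_SHIFT_X) & (PTRS_PER_PT_X - 1)];
    if (!l1_entry)
        return NULL;                                   /* L1 entry NULL -> page_fault */

    pmd = (uint64_t *)(uintptr_t) l1_entry;
    l2_entry = pmd[(addr >> PMD_SHIFT_X) & (PTRS_PER_PT_X - 1)];
    if (!l2_entry)
        return NULL;                                   /* L2 entry NULL -> page_fault */

    pte = (uint64_t *)(uintptr_t) l2_entry;
    return &pte[(addr >> PAGE_SHIFT_X) & (PTRS_PER_PT_X - 1)];
}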
// -#ifndef CONFIG_SMP mov r16=cr.ifa // get the address that caused the fault movl r30=1f // load continuation point in case of nested fault ;; thash r17=r16 // compute virtual address of L3 PTE mov r29=b0 // save b0 in case of nested fault - ;; -1: ld8 r18=[r17] - ;; // avoid RAW on r18 - or r18=_PAGE_D,r18 // set the dirty bit - mov b0=r29 // restore b0 - ;; - st8 [r17]=r18 // store back updated PTE - itc.d r18 // install updated PTE -#else - mov r16=cr.ifa // get the address that caused the fault - movl r30=1f // load continuation point in case of nested fault - ;; - thash r17=r16 // compute virtual address of L3 PTE + mov r31=pr // save pr +#ifdef CONFIG_SMP mov r28=ar.ccv // save ar.ccv - mov r29=b0 // save b0 in case of nested fault - mov r27=pr ;; 1: ld8 r18=[r17] ;; // avoid RAW on r18 mov ar.ccv=r18 // set compare value for cmpxchg - or r25=_PAGE_D,r18 // set the dirty bit + or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits ;; cmpxchg8.acq r26=[r17],r25,ar.ccv mov r24=PAGE_SHIFT<<2 @@ -568,70 +465,46 @@ page_fault: (p7) ptc.l r16,r24 mov b0=r29 // restore b0 mov ar.ccv=r28 - mov pr=r27,-1 +#else + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + mov b0=r29 // restore b0 + ;; + st8 [r17]=r18 // store back updated PTE + itc.d r18 // install updated PTE #endif + mov pr=r31,-1 // restore pr rfi + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) // Like Entry 8, except for instruction access mov r16=cr.ifa // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + mov r31=pr // save predicates #ifdef CONFIG_ITANIUM /* - * Erratum 10 (IFA may contain incorrect address) now has - * "NoFix" status. There are no plans for fixing this. + * Erratum 10 (IFA may contain incorrect address) has "NoFix" status. */ mov r17=cr.ipsr - mov r31=pr // save predicates ;; mov r18=cr.iip tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set? ;; (p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa - mov pr=r31,-1 #endif /* CONFIG_ITANIUM */ - -#ifndef CONFIG_SMP - movl r30=1f // load continuation point in case of nested fault ;; thash r17=r16 // compute virtual address of L3 PTE mov r29=b0 // save b0 in case of nested fault) - ;; -1: ld8 r18=[r17] -#if defined(CONFIG_IA32_SUPPORT) && \ - (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC)) - // - // Erratum 85 (Access bit fault could be reported before page not present fault) - // If the PTE is indicates the page is not present, then just turn this into a - // page fault. - // - mov r31=pr // save predicates - ;; - tbit.nz p6,p0=r18,0 // page present bit set? 
-(p6) br.cond.sptk 1f - ;; // avoid WAW on p6 - mov pr=r31,-1 - br.cond.sptk page_fault // page wasn't present -1: mov pr=r31,-1 -#else - ;; // avoid RAW on r18 -#endif - or r18=_PAGE_A,r18 // set the accessed bit - mov b0=r29 // restore b0 - ;; - st8 [r17]=r18 // store back updated PTE - itc.i r18 // install updated PTE -#else - movl r30=1f // load continuation point in case of nested fault - ;; - thash r17=r16 // compute virtual address of L3 PTE +#ifdef CONFIG_SMP mov r28=ar.ccv // save ar.ccv - mov r29=b0 // save b0 in case of nested fault) - mov r27=pr ;; 1: ld8 r18=[r17] -#if defined(CONFIG_IA32_SUPPORT) && \ +# if defined(CONFIG_IA32_SUPPORT) && \ (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC)) // // Erratum 85 (Access bit fault could be reported before page not present fault) @@ -639,15 +512,9 @@ page_fault: // page fault. // ;; - tbit.nz p6,p0=r18,0 // page present bit set? -(p6) br.cond.sptk 1f - ;; // avoid WAW on p6 - mov pr=r27,-1 - br.cond.sptk page_fault // page wasn't present -1: -#else - ;; // avoid RAW on r18 -#endif + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.sptk page_fault // page wasn't present +# endif mov ar.ccv=r18 // set compare value for cmpxchg or r25=_PAGE_A,r18 // set the accessed bit ;; @@ -665,36 +532,42 @@ page_fault: (p7) ptc.l r16,r24 mov b0=r29 // restore b0 mov ar.ccv=r28 - mov pr=r27,-1 -#endif +#else /* !CONFIG_SMP */ + ;; +1: ld8 r18=[r17] + ;; +# if defined(CONFIG_IA32_SUPPORT) && \ + (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC)) + // + // Erratum 85 (Access bit fault could be reported before page not present fault) + // If the PTE is indicates the page is not present, then just turn this into a + // page fault. + // + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? 
+(p6) br.sptk page_fault // page wasn't present +# endif + or r18=_PAGE_A,r18 // set the accessed bit + mov b0=r29 // restore b0 + ;; + st8 [r17]=r18 // store back updated PTE + itc.i r18 // install updated PTE +#endif /* !CONFIG_SMP */ + mov pr=r31,-1 rfi + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55) // Like Entry 8, except for data access -#ifndef CONFIG_SMP mov r16=cr.ifa // get the address that caused the fault movl r30=1f // load continuation point in case of nested fault ;; thash r17=r16 // compute virtual address of L3 PTE + mov r31=pr mov r29=b0 // save b0 in case of nested fault) - ;; -1: ld8 r18=[r17] - ;; // avoid RAW on r18 - or r18=_PAGE_A,r18 // set the accessed bit - mov b0=r29 // restore b0 - ;; - st8 [r17]=r18 // store back updated PTE - itc.d r18 // install updated PTE -#else - mov r16=cr.ifa // get the address that caused the fault - movl r30=1f // load continuation point in case of nested fault - ;; - thash r17=r16 // compute virtual address of L3 PTE +#ifdef CONFIG_SMP mov r28=ar.ccv // save ar.ccv - mov r29=b0 // save b0 in case of nested fault - mov r27=pr ;; 1: ld8 r18=[r17] ;; // avoid RAW on r18 @@ -713,11 +586,20 @@ page_fault: cmp.eq p6,p7=r18,r25 // is it same as the newly installed ;; (p7) ptc.l r16,r24 - mov b0=r29 // restore b0 mov ar.ccv=r28 - mov pr=r27,-1 +#else + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + or r18=_PAGE_A,r18 // set the accessed bit + ;; + st8 [r17]=r18 // store back updated PTE + itc.d r18 // install updated PTE #endif + mov b0=r29 // restore b0 + mov pr=r31,-1 rfi + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -725,16 +607,14 @@ page_fault: mov r16=cr.iim mov r17=__IA64_BREAK_SYSCALL mov r31=pr // prepare to save predicates - rsm psr.dt // avoid nested faults due to TLB misses... ;; - srlz.d // ensure everyone knows psr.dt is off... cmp.eq p0,p7=r16,r17 // is this a system call? (p7 <- false, if so) (p7) br.cond.spnt.many non_syscall SAVE_MIN // uses r31; defines r2: - // turn interrupt collection and data translation back on: - ssm psr.ic | psr.dt + // turn interrupt collection back on: + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 @@ -746,14 +626,13 @@ page_fault: adds r3=8,r2 // set up second base pointer for SAVE_REST ;; SAVE_REST - ;; // avoid WAW on r2 & r3 + br.call.sptk rp=demine_args // clear NaT bits in (potential) syscall args mov r3=255 adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024 adds r2=IA64_TASK_PTRACE_OFFSET,r13 // r2 = ¤t->ptrace - ;; - cmp.geu.unc p6,p7=r3,r15 // (syscall > 0 && syscall <= 1024+255) ? + cmp.geu p6,p7=r3,r15 // (syscall > 0 && syscall <= 1024+255) ? 
movl r16=sys_call_table ;; (p6) shladd r16=r15,3,r16 @@ -788,40 +667,61 @@ page_fault: ;; st8 [r16]=r18 // store new value for cr.isr -(p8) br.call.sptk.few b6=b6 // ignore this return addr - br.call.sptk.few rp=ia64_trace_syscall // rp will be overwritten (ignored) +(p8) br.call.sptk.many b6=b6 // ignore this return addr + br.call.sptk.many rp=ia64_trace_syscall // rp will be overwritten (ignored) // NOT REACHED + .proc demine_args +demine_args: + alloc r2=ar.pfs,8,0,0,0 + tnat.nz p8,p0=in0 + tnat.nz p9,p0=in1 + ;; +(p8) mov in0=-1 + tnat.nz p10,p0=in2 + tnat.nz p11,p0=in3 + +(p9) mov in1=-1 + tnat.nz p12,p0=in4 + tnat.nz p13,p0=in5 + ;; +(p10) mov in2=-1 + tnat.nz p14,p0=in6 + tnat.nz p15,p0=in7 + +(p11) mov in3=-1 +(p12) mov in4=-1 +(p13) mov in5=-1 + ;; +(p14) mov in6=-1 +(p15) mov in7=-1 + br.ret.sptk.many rp + .endp demine_args + .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) - rsm psr.dt // avoid nested faults due to TLB misses... - ;; - srlz.d // ensure everyone knows psr.dt is off... mov r31=pr // prepare to save predicates ;; SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 - ssm psr.ic | psr.dt // turn interrupt collection and data translation back on + ssm psr.ic // turn interrupt collection ;; adds r3=8,r2 // set up second base pointer for SAVE_REST - srlz.i // ensure everybody knows psr.ic and psr.dt are back on + srlz.i // ensure everybody knows psr.ic is back on ;; SAVE_REST ;; alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group -#ifdef CONFIG_ITANIUM_A1_SPECIFIC - mov out0=r0 // defer reading of cr.ivr to handle_irq... -#else mov out0=cr.ivr // pass cr.ivr as first arg -#endif add out1=16,sp // pass pointer to pt_regs as second arg ;; srlz.d // make sure we see the effect of cr.ivr movl r14=ia64_leave_kernel ;; mov rp=r14 - br.call.sptk.few b6=ia64_handle_irq + br.call.sptk.many b6=ia64_handle_irq + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -855,7 +755,7 @@ dispatch_illegal_op_fault: // The "alloc" can cause a mandatory store which could lead to // an "Alt DTLB" fault which we can handle only if psr.ic is on. 
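Looking back at the break-instruction (system call) path above: the handler biases the incoming number by -1024 and accepts only 256 entries before indexing sys_call_table. A compact, self-contained C restatement of just that range check follows; the table contents, the stub and the sample numbers are placeholders, not the kernel's.

#include <stdio.h>

typedef long (*syscall_fn)(void);

#define SYSCALL_BASE 1024UL
#define NR_ENTRIES   256UL

static long sys_ni_syscall(void) { return -1; }        /* stand-in for the "not implemented" stub */
static long sys_example(void)    { return 0;  }

static syscall_fn sys_call_table[NR_ENTRIES] = { sys_example /* slot 0 == syscall 1024 */ };

static syscall_fn pick_syscall(unsigned long nr)
{
    unsigned long idx = nr - SYSCALL_BASE;             /* "adds r15=-1024,r15" */

    if (idx <= NR_ENTRIES - 1)                         /* "cmp.geu p6,p7=r3,r15" with r3=255 */
        return sys_call_table[idx] ? sys_call_table[idx] : sys_ni_syscall;
    return sys_ni_syscall;                             /* out of range (includes nr < 1024: idx wraps) */
}

int main(void)
{
    printf("%ld %ld\n", pick_syscall(1024)(), pick_syscall(2000)());  /* prints: 0 -1 */
    return 0;
}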
// - ssm psr.ic | psr.dt + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -867,7 +767,7 @@ dispatch_illegal_op_fault: ;; SAVE_REST ;; - br.call.sptk.few rp=ia64_illegal_op_fault + br.call.sptk.many rp=ia64_illegal_op_fault .ret0: ;; alloc r14=ar.pfs,0,0,3,0 // must be first in insn group mov out0=r9 @@ -881,6 +781,7 @@ dispatch_illegal_op_fault: cmp.ne p6,p0=0,r8 (p6) br.call.dpnt b6=b6 // call returns to ia64_leave_kernel br.sptk ia64_leave_kernel + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -900,7 +801,7 @@ dispatch_to_ia32_handler: SAVE_MIN ;; mov r14=cr.isr - ssm psr.ic | psr.dt + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -913,7 +814,7 @@ dispatch_to_ia32_handler: shr r14=r14,16 // Get interrupt number ;; cmp.ne p6,p0=r14,r15 -(p6) br.call.dpnt.few b6=non_ia32_syscall +(p6) br.call.dpnt.many b6=non_ia32_syscall adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp @@ -924,7 +825,7 @@ dispatch_to_ia32_handler: alloc r15=ar.pfs,0,0,6,0 // must first in an insn group ;; ld4 r8=[r14],8 // r8 == EAX (syscall number) - mov r15=190 // sys_vfork - last implemented system call + mov r15=222 // sys_vfork - last implemented system call ;; cmp.leu.unc p6,p7=r8,r15 ld4 out1=[r14],8 // r9 == ecx @@ -961,11 +862,12 @@ non_ia32_syscall: mov out0=r14 // interrupt # add out1=16,sp // pointer to pt_regs ;; // avoid WAW on CFM - br.call.sptk.few rp=ia32_bad_interrupt + br.call.sptk.many rp=ia32_bad_interrupt .ret1: movl r15=ia64_leave_kernel ;; mov rp=r15 br.ret.sptk.many rp + ;; #endif /* CONFIG_IA32_SUPPORT */ @@ -985,8 +887,8 @@ non_syscall: mov r8=cr.iim // get break immediate (must be done while psr.ic is off) adds r3=8,r2 // set up second base pointer for SAVE_REST - // turn interrupt collection and data translation back on: - ssm psr.ic | psr.dt + // turn interrupt collection back on: + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -1000,7 +902,8 @@ non_syscall: SAVE_REST mov rp=r15 ;; - br.call.sptk.few b6=ia64_bad_break // avoid WAW on CFM and ignore return addr + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1023,7 +926,7 @@ dispatch_unaligned_handler: // wouldn't get the state to recover. 
// mov r15=cr.ifa - ssm psr.ic | psr.dt + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -1039,7 +942,8 @@ dispatch_unaligned_handler: adds out1=16,sp // out1 = pointer to pt_regs ;; mov rp=r14 - br.sptk.few ia64_prepare_handle_unaligned + br.sptk.many ia64_prepare_handle_unaligned + ;; .align 1024 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1055,7 +959,6 @@ dispatch_to_fault_handler: // // Input: // psr.ic: off - // psr.dt: off // r19: fault vector number (e.g., 24 for General Exception) // r31: contains saved predicates (pr) // @@ -1071,7 +974,7 @@ dispatch_to_fault_handler: mov r10=cr.iim mov r11=cr.itir ;; - ssm psr.ic | psr.dt + ssm psr.ic ;; srlz.i // guarantee that interrupt collection is enabled ;; @@ -1089,7 +992,9 @@ dispatch_to_fault_handler: movl r14=ia64_leave_kernel ;; mov rp=r14 - br.call.sptk.few b6=ia64_fault + br.call.sptk.many b6=ia64_fault + ;; + // // --- End of long entries, Beginning of short entries // @@ -1099,16 +1004,16 @@ dispatch_to_fault_handler: // 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) mov r16=cr.ifa rsm psr.dt -#if 1 - // If you disable this, you MUST re-enable to update_mmu_cache() code in pgtable.h + // The Linux page fault handler doesn't expect non-present pages to be in + // the TLB. Flush the existing entry now, so we meet that expectation. mov r17=_PAGE_SIZE_4K<<2 ;; ptc.l r16,r17 -#endif ;; mov r31=pr srlz.d - br.cond.sptk.many page_fault + br.sptk.many page_fault + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1118,7 +1023,8 @@ dispatch_to_fault_handler: mov r31=pr ;; srlz.d - br.cond.sptk.many page_fault + br.sptk.many page_fault + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1128,7 +1034,8 @@ dispatch_to_fault_handler: mov r31=pr ;; srlz.d - br.cond.sptk.many page_fault + br.sptk.many page_fault + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1138,31 +1045,32 @@ dispatch_to_fault_handler: mov r31=pr ;; srlz.d - br.cond.sptk.many page_fault + br.sptk.many page_fault + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) mov r16=cr.isr mov r31=pr - rsm psr.dt // avoid nested faults due to TLB misses... ;; - srlz.d // ensure everyone knows psr.dt is off... 
cmp4.eq p6,p0=0,r16 (p6) br.sptk dispatch_illegal_op_fault ;; mov r19=24 // fault number - br.cond.sptk.many dispatch_to_fault_handler + br.sptk.many dispatch_to_fault_handler + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// // 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) - rsm psr.dt | psr.dfh // ensure we can access fph + rsm psr.dfh // ensure we can access fph ;; srlz.d mov r31=pr mov r19=25 - br.cond.sptk.many dispatch_to_fault_handler + br.sptk.many dispatch_to_fault_handler + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1204,6 +1112,7 @@ dispatch_to_fault_handler: ;; rfi // and go back + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1218,12 +1127,11 @@ dispatch_to_fault_handler: .align 256 ///////////////////////////////////////////////////////////////////////////////////////// // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) - rsm psr.dt // avoid nested faults due to TLB misses... mov r16=cr.ipsr mov r31=pr // prepare to save predicates ;; - srlz.d // ensure everyone knows psr.dt is off - br.cond.sptk.many dispatch_unaligned_handler + br.sptk.many dispatch_unaligned_handler + ;; .align 256 ///////////////////////////////////////////////////////////////////////////////////////// @@ -1304,9 +1212,6 @@ dispatch_to_fault_handler: ///////////////////////////////////////////////////////////////////////////////////////// // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) #ifdef CONFIG_IA32_SUPPORT - rsm psr.dt - ;; - srlz.d mov r31=pr mov r16=cr.isr ;; @@ -1325,7 +1230,7 @@ dispatch_to_fault_handler: ;; mov pr=r31,-1 // restore predicate registers rfi - + ;; 1: #endif // CONFIG_IA32_SUPPORT FAULT(46) @@ -1334,11 +1239,9 @@ dispatch_to_fault_handler: ///////////////////////////////////////////////////////////////////////////////////////// // 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74) #ifdef CONFIG_IA32_SUPPORT - rsm psr.dt - ;; - srlz.d mov r31=pr - br.cond.sptk.many dispatch_to_ia32_handler + br.sptk.many dispatch_to_ia32_handler + ;; #else FAULT(47) #endif diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c index 2afb5613e..df19a8d6f 100644 --- a/arch/ia64/kernel/machvec.c +++ b/arch/ia64/kernel/machvec.c @@ -1,11 +1,13 @@ #include <linux/config.h> + +#ifdef CONFIG_IA64_GENERIC + #include <linux/kernel.h> +#include <linux/string.h> #include <asm/page.h> #include <asm/machvec.h> -#ifdef CONFIG_IA64_GENERIC - struct ia64_machine_vector ia64_mv; /* diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 333258d35..1456b8d96 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -19,6 +19,7 @@ #include <linux/irq.h> #include <linux/smp_lock.h> +#include <asm/machvec.h> #include <asm/page.h> #include <asm/ptrace.h> #include <asm/system.h> @@ -26,7 +27,6 @@ #include <asm/mca.h> #include <asm/irq.h> -#include <asm/machvec.h> typedef struct ia64_fptr { @@ -365,7 +365,7 @@ ia64_mca_wakeup_ipi_wait(void) void ia64_mca_wakeup(int cpu) { - ipi_send(cpu, IA64_MCA_WAKEUP_INT_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpu, IA64_MCA_WAKEUP_INT_VECTOR, IA64_IPI_DM_INT, 0); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; } diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S index 15993525d..b148d8b9c 100644 --- a/arch/ia64/kernel/mca_asm.S +++ b/arch/ia64/kernel/mca_asm.S @@ -3,11 
+3,11 @@ // // Mods by cfleck to integrate into kernel build // 00/03/15 davidm Added various stop bits to get a clean compile -// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp kstack, -// switch modes, jump to C INIT handler +// +// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp +// kstack, switch modes, jump to C INIT handler // #include <linux/config.h> - #include <asm/pgtable.h> #include <asm/processor.h> #include <asm/mca_asm.h> @@ -17,14 +17,7 @@ * When we get an machine check, the kernel stack pointer is no longer * valid, so we need to set a new stack pointer. */ -#define MINSTATE_START_SAVE_MIN \ -(pKern) movl sp=ia64_init_stack+IA64_STK_OFFSET-IA64_PT_REGS_SIZE; \ - ;; - -#define MINSTATE_END_SAVE_MIN \ - or r12=r12,r14; /* make sp a kernel virtual address */ \ - or r13=r13,r14; /* make `current' a kernel virtual address */ \ - ;; +#define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */ #include "minstate.h" diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h index 8790d49c3..2ea6f1791 100644 --- a/arch/ia64/kernel/minstate.h +++ b/arch/ia64/kernel/minstate.h @@ -20,6 +20,72 @@ #define rR1 r20 /* + * Here start the source dependent macros. + */ + +/* + * For ivt.s we want to access the stack virtually so we dont have to disable translation + * on interrupts. + */ +#define MINSTATE_START_SAVE_MIN_VIRT \ + dep r1=-1,r1,61,3; /* r1 = current (virtual) */ \ +(p7) mov ar.rsc=r0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ + ;; \ +(p7) addl rKRBS=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ +(p7) mov rARRNAT=ar.rnat; \ +(pKern) mov r1=sp; /* get sp */ \ + ;; \ +(p7) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(p7) mov rARBSPSTORE=ar.bspstore; /* save ar.bspstore */ \ + ;; \ +(pKern) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ +(p7) mov ar.bspstore=rKRBS; /* switch to kernel RBS */ \ + ;; \ +(p7) mov r18=ar.bsp; \ +(p7) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ + +#define MINSTATE_END_SAVE_MIN_VIRT \ + or r13=r13,r14; /* make `current' a kernel virtual address */ \ + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ + ;; + +/* + * For mca_asm.S we want to access the stack physically since the state is saved before we + * go virtual and dont want to destroy the iip or ipsr. 
+ */ +#define MINSTATE_START_SAVE_MIN_PHYS \ +(pKern) movl sp=ia64_init_stack+IA64_STK_OFFSET-IA64_PT_REGS_SIZE; \ +(p7) mov ar.rsc=r0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ +(p7) addl rKRBS=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ + ;; \ +(p7) mov rARRNAT=ar.rnat; \ +(pKern) dep r1=0,sp,61,3; /* compute physical addr of sp */ \ +(p7) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(p7) mov rARBSPSTORE=ar.bspstore; /* save ar.bspstore */ \ +(p7) dep rKRBS=-1,rKRBS,61,3; /* compute kernel virtual addr of RBS */\ + ;; \ +(pKern) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ +(p7) mov ar.bspstore=rKRBS; /* switch to kernel RBS */ \ + ;; \ +(p7) mov r18=ar.bsp; \ +(p7) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ + +#define MINSTATE_END_SAVE_MIN_PHYS \ + or r12=r12,r14; /* make sp a kernel virtual address */ \ + or r13=r13,r14; /* make `current' a kernel virtual address */ \ + ;; + +#ifdef MINSTATE_VIRT +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT +#endif + +#ifdef MINSTATE_PHYS +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS +#endif + +/* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back * on. @@ -31,7 +97,6 @@ * * Upon exit, the state is as follows: * psr.ic: off - * psr.dt: off * r2 = points to &pt_regs.r16 * r12 = kernel sp (kernel virtual address) * r13 = points to current task_struct (kernel virtual address) @@ -50,7 +115,7 @@ mov rCRIPSR=cr.ipsr; \ mov rB6=b6; /* rB6 = branch reg 6 */ \ mov rCRIIP=cr.iip; \ - mov r1=ar.k6; /* r1 = current */ \ + mov r1=ar.k6; /* r1 = current (physical) */ \ ;; \ invala; \ extr.u r16=rCRIPSR,32,2; /* extract psr.cpl */ \ @@ -58,25 +123,11 @@ cmp.eq pKern,p7=r0,r16; /* are we in kernel mode already? (psr.cpl==0) */ \ /* switch from user to kernel RBS: */ \ COVER; \ - ;; \ + ;; \ MINSTATE_START_SAVE_MIN \ -(p7) mov ar.rsc=r0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ -(p7) addl rKRBS=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ - ;; \ -(p7) mov rARRNAT=ar.rnat; \ -(pKern) dep r1=0,sp,61,3; /* compute physical addr of sp */ \ -(p7) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ -(p7) mov rARBSPSTORE=ar.bspstore; /* save ar.bspstore */ \ -(p7) dep rKRBS=-1,rKRBS,61,3; /* compute kernel virtual addr of RBS */ \ - ;; \ -(pKern) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ -(p7) mov ar.bspstore=rKRBS; /* switch to kernel RBS */ \ - ;; \ -(p7) mov r18=ar.bsp; \ -(p7) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ - \ - mov r16=r1; /* initialize first base pointer */ \ - adds r17=8,r1; /* initialize second base pointer */ \ + ;; \ + mov r16=r1; /* initialize first base pointer */ \ + adds r17=8,r1; /* initialize second base pointer */ \ ;; \ st8 [r16]=rCRIPSR,16; /* save cr.ipsr */ \ st8 [r17]=rCRIIP,16; /* save cr.iip */ \ diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S index 2e56a428e..fc14cc31c 100644 --- a/arch/ia64/kernel/pal.S +++ b/arch/ia64/kernel/pal.S @@ -52,10 +52,9 @@ END(ia64_pal_default_handler) /* * Make a PAL call using the static calling convention. 
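Stepping back to the minstate.h rework just shown: there is now a single DO_SAVE_MIN body, and each includer picks its stack-addressing flavor by defining MINSTATE_VIRT or MINSTATE_PHYS before the #include (mca_asm.S chooses the physical variant, as seen earlier). Reduced to a self-contained C toy with invented names, the select-by-macro-before-include idiom works like this:

#include <stdio.h>

#define SAVE_MODE_PHYS                 /* the "client" picks a mode up front... */

/* ...and the shared "header" part selects the matching implementation: */
#if defined(SAVE_MODE_VIRT)
# define START_SAVE()  puts("saving state via the virtually mapped stack")
#elif defined(SAVE_MODE_PHYS)
# define START_SAVE()  puts("saving state with physical stack addressing")
#else
# error "define SAVE_MODE_VIRT or SAVE_MODE_PHYS first"
#endif

int main(void)
{
    START_SAVE();                      /* expands to the physical-mode variant here */
    return 0;
}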
* - * in0 Pointer to struct ia64_pal_retval - * in1 Index of PAL service - * in2 - in4 Remaining PAL arguments - * in5 1 ==> clear psr.ic, 0 ==> don't clear psr.ic + * in0 Index of PAL service + * in1 - in3 Remaining PAL arguments + * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic * */ GLOBAL_ENTRY(ia64_pal_call_static) @@ -69,7 +68,7 @@ GLOBAL_ENTRY(ia64_pal_call_static) } ;; ld8 loc2 = [loc2] // loc2 <- entry point - tbit.nz p6,p7 = in5, 0 + tbit.nz p6,p7 = in4, 0 adds r8 = 1f-1b,r8 ;; mov loc3 = psr diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c deleted file mode 100644 index 6293cdfa0..000000000 --- a/arch/ia64/kernel/pci-dma.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Dynamic DMA mapping support. - * - * This implementation is for IA-64 platforms that do not support - * I/O TLBs (aka DMA address translation hardware). - * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> - * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> - */ - -#include <linux/config.h> - -#include <linux/mm.h> -#include <linux/pci.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/io.h> -#include <asm/pci.h> -#include <asm/dma.h> - -#ifdef CONFIG_SWIOTLB - -#include <linux/init.h> -#include <linux/bootmem.h> - -#define ALIGN(val, align) ((unsigned long) (((unsigned long) (val) + ((align) - 1)) & ~((align) - 1))) - -/* - * log of the size of each IO TLB slab. The number of slabs is command line - * controllable. - */ -#define IO_TLB_SHIFT 11 - -/* - * Used to do a quick range check in pci_unmap_single and pci_sync_single, to see if the - * memory was in fact allocated by this API. - */ -static char *io_tlb_start, *io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and io_tlb_end. - * This is command line adjustable via setup_io_tlb_npages. - */ -unsigned long io_tlb_nslabs = 1024; - -/* - * This is a free list describing the number of free entries available from each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; - -/* - * We need to save away the original address corresponding to a mapped entry for the sync - * operations. - */ -static unsigned char **io_tlb_orig_addr; - -/* - * Protect the above data structures in the map and unmap calls - */ -spinlock_t io_tlb_lock = SPIN_LOCK_UNLOCKED; - -static int __init -setup_io_tlb_npages (char *str) -{ - io_tlb_nslabs = simple_strtoul(str, NULL, 0) << (PAGE_SHIFT - IO_TLB_SHIFT); - return 1; -} -__setup("swiotlb=", setup_io_tlb_npages); - -/* - * Statically reserve bounce buffer space and initialize bounce buffer - * data structures for the software IO TLB used to implement the PCI DMA API - */ -void -setup_swiotlb (void) -{ - int i; - - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT)); - if (!io_tlb_start) - BUG(); - io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size 2^IO_TLB_SHIFT between - * io_tlb_start and io_tlb_end. 
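The comment above describes how the pci-dma.c being deleted finds contiguous bounce-buffer space: io_tlb_list[i] records how many free slabs start at slot i, so a request for nslots slabs only needs to find an entry >= nslots. A small self-contained model of that free list (fixed sizes, no wrap-around or stride refinement, all names invented for the example; nslots is assumed non-zero):

#include <stdio.h>

#define NSLABS 16

static unsigned int io_tlb_list[NSLABS];

static void init_free_list(void)
{
    /* slot i has (NSLABS - i) free slabs ahead of it when everything is free */
    for (unsigned int i = 0; i < NSLABS; i++)
        io_tlb_list[i] = NSLABS - i;
}

/* Returns the first index of a run of 'nslots' free slabs, or -1 if none. */
static int alloc_slots(unsigned int nslots)
{
    for (unsigned int index = 0; index < NSLABS; index += nslots) {
        if (io_tlb_list[index] >= nslots) {
            for (unsigned int i = index; i < index + nslots; i++)
                io_tlb_list[i] = 0;        /* mark the run as in use */
            return (int) index;
        }
    }
    return -1;
}

int main(void)
{
    init_free_list();
    printf("first 4-slab run at slot %d\n", alloc_slots(4));   /* prints 0 */
    printf("next  4-slab run at slot %d\n", alloc_slots(4));   /* prints 4 */
    return 0;
}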
- */ - io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); - for (i = 0; i < io_tlb_nslabs; i++) - io_tlb_list[i] = io_tlb_nslabs - i; - io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); - - printk("Placing software IO TLB between 0x%p - 0x%p\n", - (void *) io_tlb_start, (void *) io_tlb_end); -} - -/* - * Allocates bounce buffer and returns its kernel virtual address. - */ -static void * -__pci_map_single (struct pci_dev *hwdev, char *buffer, size_t size, int direction) -{ - unsigned long flags; - char *dma_addr; - unsigned int i, nslots, stride, index, wrap; - - /* - * For mappings greater than a page size, we limit the stride (and hence alignment) - * to a page size. - */ - nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size > (1 << PAGE_SHIFT)) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = nslots; - - if (!nslots) - BUG(); - - /* - * Find suitable number of IO TLB entries size that will fit this request and allocate a buffer - * from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - { - wrap = index = ALIGN(io_tlb_index, stride); - do { - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot and mark the - * entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - for (i = index; i < index + nslots; i++) - io_tlb_list[i] = 0; - dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next round. - */ - io_tlb_index = (index + nslots) < io_tlb_nslabs ? (index + nslots) : 0; - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - - /* - * XXX What is a suitable recovery mechanism here? We cannot - * sleep because we are called from with in interrupts! - */ - panic("__pci_map_single: could not allocate software IO TLB (%ld bytes)", size); -found: - } - spin_unlock_irqrestore(&io_tlb_lock, flags); - - /* - * Save away the mapping from the original address to the DMA address. This is needed - * when we sync the memory. Then we sync the buffer if needed. - */ - io_tlb_orig_addr[index] = buffer; - if (direction == PCI_DMA_TODEVICE || direction == PCI_DMA_BIDIRECTIONAL) - memcpy(dma_addr, buffer, size); - - return dma_addr; -} - -/* - * dma_addr is the kernel virtual address of the bounce buffer to unmap. - */ -static void -__pci_unmap_single (struct pci_dev *hwdev, char *dma_addr, size_t size, int direction) -{ - unsigned long flags; - int i, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - char *buffer = io_tlb_orig_addr[index]; - - /* - * First, sync the memory before unmapping the entry - */ - if ((direction == PCI_DMA_FROMDEVICE) || (direction == PCI_DMA_BIDIRECTIONAL)) - /* - * bounce... copy the data back into the original buffer - * and delete the bounce buffer. - */ - memcpy(buffer, dma_addr, size); - - /* - * Return the buffer to the free list by setting the corresponding entries to indicate - * the number of contigous entries available. - * While returning the entries to the free list, we merge the entries with slots below - * and above the pool being returned. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - { - int count = ((index + nslots) < io_tlb_nslabs ? 
io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) - io_tlb_list[i] = ++count; - /* - * Step 2: merge the returned slots with the preceeding slots, if available (non zero) - */ - for (i = index - 1; (i >= 0) && io_tlb_list[i]; i--) - io_tlb_list[i] += io_tlb_list[index]; - } - spin_unlock_irqrestore(&io_tlb_lock, flags); -} - -static void -__pci_sync_single (struct pci_dev *hwdev, char *dma_addr, size_t size, int direction) -{ - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - char *buffer = io_tlb_orig_addr[index]; - - /* - * bounce... copy the data back into/from the original buffer - * XXX How do you handle PCI_DMA_BIDIRECTIONAL here ? - */ - if (direction == PCI_DMA_FROMDEVICE) - memcpy(buffer, dma_addr, size); - else if (direction == PCI_DMA_TODEVICE) - memcpy(dma_addr, buffer, size); - else - BUG(); -} - -/* - * Map a single buffer of the indicated size for DMA in streaming mode. - * The PCI address to use is returned. - * - * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single is performed. - */ -dma_addr_t -pci_map_single (struct pci_dev *hwdev, void *ptr, size_t size, int direction) -{ - unsigned long pci_addr = virt_to_phys(ptr); - - if (direction == PCI_DMA_NONE) - BUG(); - /* - * Check if the PCI device can DMA to ptr... if so, just return ptr - */ - if ((pci_addr & ~hwdev->dma_mask) == 0) - /* - * Device is bit capable of DMA'ing to the - * buffer... just return the PCI address of ptr - */ - return pci_addr; - - /* - * get a bounce buffer: - */ - pci_addr = virt_to_phys(__pci_map_single(hwdev, ptr, size, direction)); - - /* - * Ensure that the address returned is DMA'ble: - */ - if ((pci_addr & ~hwdev->dma_mask) != 0) - panic("__pci_map_single: bounce buffer is not DMA'ble"); - - return pci_addr; -} - -/* - * Unmap a single streaming mode DMA translation. The dma_addr and size - * must match what was provided for in a previous pci_map_single call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guarenteed to see - * whatever the device wrote there. - */ -void -pci_unmap_single (struct pci_dev *hwdev, dma_addr_t pci_addr, size_t size, int direction) -{ - char *dma_addr = phys_to_virt(pci_addr); - - if (direction == PCI_DMA_NONE) - BUG(); - if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) - __pci_unmap_single(hwdev, dma_addr, size, direction); -} - -/* - * Make physical memory consistent for a single - * streaming mode DMA translation after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, the - * device again owns the buffer. - */ -void -pci_dma_sync_single (struct pci_dev *hwdev, dma_addr_t pci_addr, size_t size, int direction) -{ - char *dma_addr = phys_to_virt(pci_addr); - - if (direction == PCI_DMA_NONE) - BUG(); - if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) - __pci_sync_single(hwdev, dma_addr, size, direction); -} - -/* - * Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scather-gather version of the - * above pci_map_single interface. 
Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -int -pci_map_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) -{ - int i; - - if (direction == PCI_DMA_NONE) - BUG(); - - for (i = 0; i < nelems; i++, sg++) { - sg->orig_address = sg->address; - if ((virt_to_phys(sg->address) & ~hwdev->dma_mask) != 0) { - sg->address = __pci_map_single(hwdev, sg->address, sg->length, direction); - } - } - return nelems; -} - -/* - * Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -void -pci_unmap_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) -{ - int i; - - if (direction == PCI_DMA_NONE) - BUG(); - - for (i = 0; i < nelems; i++, sg++) - if (sg->orig_address != sg->address) { - __pci_unmap_single(hwdev, sg->address, sg->length, direction); - sg->address = sg->orig_address; - } -} - -/* - * Make physical memory consistent for a set of streaming mode DMA - * translations after a transfer. - * - * The same as pci_dma_sync_single but for a scatter-gather list, - * same rules and usage. - */ -void -pci_dma_sync_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) -{ - int i; - - if (direction == PCI_DMA_NONE) - BUG(); - - for (i = 0; i < nelems; i++, sg++) - if (sg->orig_address != sg->address) - __pci_sync_single(hwdev, sg->address, sg->length, direction); -} - -#else -/* - * Map a single buffer of the indicated size for DMA in streaming mode. - * The 32-bit bus address to use is returned. - * - * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single is performed. - */ -dma_addr_t -pci_map_single (struct pci_dev *hwdev, void *ptr, size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - return virt_to_bus(ptr); -} - -/* - * Unmap a single streaming mode DMA translation. The dma_addr and size - * must match what was provided for in a previous pci_map_single call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guarenteed to see - * whatever the device wrote there. - */ -void -pci_unmap_single (struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - /* Nothing to do */ -} -/* - * Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scather-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. 
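Throughout the deleted mapping routines the same test decides whether a buffer can be handed to the device directly or must go through a bounce buffer: does its physical address fit under the device's dma_mask? A minimal sketch of just that decision, with an invented device struct and sample addresses (virt_to_phys() and the bounce allocator are omitted):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct fake_pci_dev {
    uint64_t dma_mask;                       /* e.g. 0xffffffff for a 32-bit-only device */
};

/* same test as "(pci_addr & ~hwdev->dma_mask) != 0" in the removed code */
static bool needs_bounce(const struct fake_pci_dev *dev, uint64_t phys)
{
    return (phys & ~dev->dma_mask) != 0;
}

int main(void)
{
    struct fake_pci_dev dev = { .dma_mask = 0xffffffffULL };

    printf("0x%llx -> %s\n", 0xff000000ULL,
           needs_bounce(&dev, 0xff000000ULL) ? "bounce" : "direct");   /* direct */
    printf("0x%llx -> %s\n", 0x100000000ULL,
           needs_bounce(&dev, 0x100000000ULL) ? "bounce" : "direct");  /* bounce */
    return 0;
}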
- */ -int -pci_map_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - return nents; -} - -/* - * Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -void -pci_unmap_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - /* Nothing to do */ -} -/* - * Make physical memory consistent for a single - * streaming mode DMA translation after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, the - * device again owns the buffer. - */ -void -pci_dma_sync_single (struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - /* Nothing to do */ -} - -/* - * Make physical memory consistent for a set of streaming mode DMA - * translations after a transfer. - * - * The same as pci_dma_sync_single but for a scatter-gather list, - * same rules and usage. - */ -void -pci_dma_sync_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) -{ - if (direction == PCI_DMA_NONE) - BUG(); - /* Nothing to do */ -} - -#endif /* CONFIG_SWIOTLB */ - -void * -pci_alloc_consistent (struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle) -{ - unsigned long pci_addr; - int gfp = GFP_ATOMIC; - void *ret; - - if (!hwdev || hwdev->dma_mask <= 0xffffffff) - gfp |= GFP_DMA; /* XXX fix me: should change this to GFP_32BIT or ZONE_32BIT */ - ret = (void *)__get_free_pages(gfp, get_order(size)); - if (!ret) - return NULL; - - memset(ret, 0, size); - pci_addr = virt_to_phys(ret); - if ((pci_addr & ~hwdev->dma_mask) != 0) - panic("pci_alloc_consistent: allocated memory is out of range for PCI device"); - *dma_handle = pci_addr; - return ret; -} - -void -pci_free_consistent (struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) -{ - free_pages((unsigned long) vaddr, get_order(size)); -} diff --git a/arch/ia64/kernel/pci.c b/arch/ia64/kernel/pci.c index 2d814b443..37dbf811a 100644 --- a/arch/ia64/kernel/pci.c +++ b/arch/ia64/kernel/pci.c @@ -1,10 +1,8 @@ /* - * pci.c - Low-Level PCI Access in IA64 + * pci.c - Low-Level PCI Access in IA-64 * * Derived from bios32.c of i386 tree. - * */ - #include <linux/config.h> #include <linux/types.h> @@ -44,19 +42,16 @@ * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ - spinlock_t pci_lock = SPIN_LOCK_UNLOCKED; -struct pci_fixup pcibios_fixups[] = { { 0 } }; - -#define PCI_NO_CHECKS 0x400 -#define PCI_NO_PEER_FIXUP 0x800 - -static unsigned int pci_probe = PCI_NO_CHECKS; +struct pci_fixup pcibios_fixups[] = { + { 0 } +}; /* Macro to build a PCI configuration address to be passed as a parameter to SAL. 
*/ -#define PCI_CONFIG_ADDRESS(dev, where) (((u64) dev->bus->number << 16) | ((u64) (dev->devfn & 0xff) << 8) | (where & 0xff)) +#define PCI_CONFIG_ADDRESS(dev, where) \ + (((u64) dev->bus->number << 16) | ((u64) (dev->devfn & 0xff) << 8) | (where & 0xff)) static int pci_conf_read_config_byte(struct pci_dev *dev, int where, u8 *value) @@ -109,8 +104,7 @@ pci_conf_write_config_dword (struct pci_dev *dev, int where, u32 value) return ia64_sal_pci_config_write(PCI_CONFIG_ADDRESS(dev, where), 4, value); } - -static struct pci_ops pci_conf = { +struct pci_ops pci_conf = { pci_conf_read_config_byte, pci_conf_read_config_word, pci_conf_read_config_dword, @@ -120,36 +114,21 @@ static struct pci_ops pci_conf = { }; /* - * Try to find PCI BIOS. This will always work for IA64. - */ - -static struct pci_ops * __init -pci_find_bios(void) -{ - return &pci_conf; -} - -/* * Initialization. Uses the SAL interface */ - -#define PCI_BUSES_TO_SCAN 255 - void __init -pcibios_init(void) +pcibios_init (void) { - struct pci_ops *ops = NULL; +# define PCI_BUSES_TO_SCAN 255 int i; - if ((ops = pci_find_bios()) == NULL) { - printk("PCI: No PCI bus detected\n"); - return; - } + platform_pci_fixup(0); /* phase 0 initialization (before PCI bus has been scanned) */ printk("PCI: Probing PCI hardware\n"); for (i = 0; i < PCI_BUSES_TO_SCAN; i++) - pci_scan_bus(i, ops, NULL); - platform_pci_fixup(); + pci_scan_bus(i, &pci_conf, NULL); + + platform_pci_fixup(1); /* phase 1 initialization (after PCI bus has been scanned) */ return; } @@ -157,16 +136,15 @@ pcibios_init(void) * Called after each bus is probed, but before its children * are examined. */ - void __init -pcibios_fixup_bus(struct pci_bus *b) +pcibios_fixup_bus (struct pci_bus *b) { return; } void __init -pcibios_update_resource(struct pci_dev *dev, struct resource *root, - struct resource *res, int resource) +pcibios_update_resource (struct pci_dev *dev, struct resource *root, + struct resource *res, int resource) { unsigned long where, size; u32 reg; @@ -181,7 +159,7 @@ pcibios_update_resource(struct pci_dev *dev, struct resource *root, } void __init -pcibios_update_irq(struct pci_dev *dev, int irq) +pcibios_update_irq (struct pci_dev *dev, int irq) { pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); @@ -204,18 +182,16 @@ pcibios_enable_device (struct pci_dev *dev) return 0; } +void +pcibios_align_resource (void *data, struct resource *res, unsigned long size) +{ +} + /* * PCI BIOS setup, always defaults to SAL interface */ - char * __init -pcibios_setup(char *str) +pcibios_setup (char *str) { - pci_probe = PCI_NO_CHECKS; return NULL; } - -void -pcibios_align_resource (void *data, struct resource *res, unsigned long size) -{ -} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e5efbc8b5..4c7ba4295 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4,18 +4,20 @@ * * Originaly Written by Ganesh Venkitachalam, IBM Corp. * Modifications by David Mosberger-Tang, Hewlett-Packard Co. + * Modifications by Stephane Eranian, Hewlett-Packard Co. 
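The reflowed PCI_CONFIG_ADDRESS() macro above simply packs the bus number, devfn and register offset into the single 64-bit argument SAL expects. A standalone illustration of that packing (the function name and the sample bus/device values are made up):

#include <stdint.h>
#include <stdio.h>

static uint64_t sal_config_address(uint8_t bus, uint8_t devfn, uint8_t where)
{
    return ((uint64_t) bus << 16) | ((uint64_t) devfn << 8) | where;
}

int main(void)
{
    /* bus 2, device 3 function 0 (devfn 0x18), config register 0x04 */
    printf("0x%llx\n", (unsigned long long) sal_config_address(2, 0x18, 0x04));
    /* prints 0x21804 */
    return 0;
}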
* Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com> * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com> */ #include <linux/config.h> + #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/smp_lock.h> #include <linux/proc_fs.h> -#include <linux/ptrace.h> #include <asm/errno.h> #include <asm/hw_irq.h> @@ -58,19 +60,51 @@ #define MAX_PERF_COUNTER 4 /* true for Itanium, at least */ #define PMU_FIRST_COUNTER 4 /* first generic counter */ -#define WRITE_PMCS_AND_START 0xa0 -#define WRITE_PMCS 0xa1 -#define READ_PMDS 0xa2 -#define STOP_PMCS 0xa3 +#define PFM_WRITE_PMCS 0xa0 +#define PFM_WRITE_PMDS 0xa1 +#define PFM_READ_PMDS 0xa2 +#define PFM_STOP 0xa3 +#define PFM_START 0xa4 +#define PFM_ENABLE 0xa5 /* unfreeze only */ +#define PFM_DISABLE 0xa6 /* freeze only */ +/* + * Those 2 are just meant for debugging. I considered using sysctl() for + * that but it is a little bit too pervasive. This solution is at least + * self-contained. + */ +#define PFM_DEBUG_ON 0xe0 +#define PFM_DEBUG_OFF 0xe1 + +#ifdef CONFIG_SMP +#define cpu_is_online(i) (cpu_online_map & (1UL << i)) +#else +#define cpu_is_online(i) 1 +#endif +#define PMC_IS_IMPL(i) (pmu_conf.impl_regs[i>>6] & (1<< (i&~(64-1)))) +#define PMD_IS_IMPL(i) (pmu_conf.impl_regs[4+(i>>6)] & (1<< (i&~(64-1)))) +#define PMD_IS_COUNTER(i) (i>=PMU_FIRST_COUNTER && i < (PMU_FIRST_COUNTER+pmu_conf.max_counters)) +#define PMC_IS_COUNTER(i) (i>=PMU_FIRST_COUNTER && i < (PMU_FIRST_COUNTER+pmu_conf.max_counters)) /* * this structure needs to be enhanced */ typedef struct { + unsigned long pfr_reg_num; /* which register */ + unsigned long pfr_reg_value; /* configuration (PMC) or initial value (PMD) */ + unsigned long pfr_reg_reset; /* reset value on overflow (PMD) */ + void *pfr_smpl_buf; /* pointer to user buffer for EAR/BTB */ + unsigned long pfr_smpl_size; /* size of user buffer for EAR/BTB */ + pid_t pfr_notify_pid; /* process to notify */ + int pfr_notify_sig; /* signal for notification, 0=no notification */ +} perfmon_req_t; + +#if 0 +typedef struct { unsigned long pmu_reg_data; /* generic PMD register */ unsigned long pmu_reg_num; /* which register number */ } perfmon_reg_t; +#endif /* * This structure is initialize at boot time and contains @@ -78,86 +112,141 @@ typedef struct { * by PAL */ typedef struct { - unsigned long perf_ovfl_val; /* overflow value for generic counters */ - unsigned long max_pmc; /* highest PMC */ - unsigned long max_pmd; /* highest PMD */ - unsigned long max_counters; /* number of generic counter pairs (PMC/PMD) */ + unsigned long perf_ovfl_val; /* overflow value for generic counters */ + unsigned long max_counters; /* upper limit on counter pair (PMC/PMD) */ + unsigned long impl_regs[16]; /* buffer used to hold implememted PMC/PMD mask */ } pmu_config_t; -/* XXX will go static when ptrace() is cleaned */ -unsigned long perf_ovfl_val; /* overflow value for generic counters */ - static pmu_config_t pmu_conf; +/* for debug only */ +static unsigned long pfm_debug=1; /* 0= nodebug, >0= debug output on */ +#define DBprintk(a) {\ + if (pfm_debug >0) { printk a; } \ +} + /* - * could optimize to avoid cache conflicts in SMP + * could optimize to avoid cache line conflicts in SMP */ -unsigned long pmds[NR_CPUS][MAX_PERF_COUNTER]; +static struct task_struct *pmu_owners[NR_CPUS]; -asmlinkage unsigned long -sys_perfmonctl (int cmd, int count, void *ptr, long arg4, long arg5, long arg6, long arg7, 
long arg8, long stack) +static int +do_perfmonctl (struct task_struct *task, int cmd, int flags, perfmon_req_t *req, int count, struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *) &stack; - perfmon_reg_t tmp, *cptr = ptr; - unsigned long cnum; + perfmon_req_t tmp; int i; switch (cmd) { - case WRITE_PMCS: /* Writes to PMC's and clears PMDs */ - case WRITE_PMCS_AND_START: /* Also starts counting */ + case PFM_WRITE_PMCS: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; + + if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; + + for (i = 0; i < count; i++, req++) { + copy_from_user(&tmp, req, sizeof(tmp)); + + /* XXX needs to check validity of the data maybe */ + + if (!PMC_IS_IMPL(tmp.pfr_reg_num)) { + DBprintk((__FUNCTION__ " invalid pmc[%ld]\n", tmp.pfr_reg_num)); + return -EINVAL; + } + + /* XXX: for counters, need to some checks */ + if (PMC_IS_COUNTER(tmp.pfr_reg_num)) { + current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].sig = tmp.pfr_notify_sig; + current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].pid = tmp.pfr_notify_pid; + + DBprintk((__FUNCTION__" setting PMC[%ld] send sig %d to %d\n",tmp.pfr_reg_num, tmp.pfr_notify_sig, tmp.pfr_notify_pid)); + } + ia64_set_pmc(tmp.pfr_reg_num, tmp.pfr_reg_value); + + DBprintk((__FUNCTION__" setting PMC[%ld]=0x%lx\n", tmp.pfr_reg_num, tmp.pfr_reg_value)); + } + /* + * we have to set this here event hough we haven't necessarily started monitoring + * because we may be context switched out + */ + current->thread.flags |= IA64_THREAD_PM_VALID; + break; + + case PFM_WRITE_PMDS: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; + + if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; + + for (i = 0; i < count; i++, req++) { + copy_from_user(&tmp, req, sizeof(tmp)); + + if (!PMD_IS_IMPL(tmp.pfr_reg_num)) return -EINVAL; + + /* update virtualized (64bits) counter */ + if (PMD_IS_COUNTER(tmp.pfr_reg_num)) { + current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].val = tmp.pfr_reg_value & ~pmu_conf.perf_ovfl_val; + current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].rval = tmp.pfr_reg_reset; + } + /* writes to unimplemented part is ignored, so this is safe */ + ia64_set_pmd(tmp.pfr_reg_num, tmp.pfr_reg_value); + /* to go away */ + ia64_srlz_d(); + DBprintk((__FUNCTION__" setting PMD[%ld]: pmod.val=0x%lx pmd=0x%lx rval=0x%lx\n", tmp.pfr_reg_num, current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].val, ia64_get_pmd(tmp.pfr_reg_num),current->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].rval)); + } + /* + * we have to set this here event hough we haven't necessarily started monitoring + * because we may be context switched out + */ + current->thread.flags |= IA64_THREAD_PM_VALID; + break; + + case PFM_START: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; + + pmu_owners[smp_processor_id()] = current; - if (!access_ok(VERIFY_READ, cptr, sizeof(struct perfmon_reg_t)*count)) - return -EFAULT; - - for (i = 0; i < count; i++, cptr++) { - - copy_from_user(&tmp, cptr, sizeof(tmp)); - - /* XXX need to check validity of pmu_reg_num and perhaps data!! */ - - if (tmp.pmu_reg_num > pmu_conf.max_pmc || tmp.pmu_reg_num == 0) return -EFAULT; + /* will start monitoring right after rfi */ + ia64_psr(regs)->up = 1; - ia64_set_pmc(tmp.pmu_reg_num, tmp.pmu_reg_data); + /* + * mark the state as valid. 
+ * this will trigger save/restore at context switch + */ + current->thread.flags |= IA64_THREAD_PM_VALID; - /* to go away */ - if (tmp.pmu_reg_num >= PMU_FIRST_COUNTER && tmp.pmu_reg_num < PMU_FIRST_COUNTER+pmu_conf.max_counters) { - ia64_set_pmd(tmp.pmu_reg_num, 0); - pmds[smp_processor_id()][tmp.pmu_reg_num - PMU_FIRST_COUNTER] = 0; + ia64_set_pmc(0, 0); - printk(__FUNCTION__" setting PMC/PMD[%ld] es=0x%lx pmd[%ld]=%lx\n", tmp.pmu_reg_num, (tmp.pmu_reg_data>>8) & 0x7f, tmp.pmu_reg_num, ia64_get_pmd(tmp.pmu_reg_num)); - } else - printk(__FUNCTION__" setting PMC[%ld]=0x%lx\n", tmp.pmu_reg_num, tmp.pmu_reg_data); - } + break; - if (cmd == WRITE_PMCS_AND_START) { -#if 0 -/* irrelevant with user monitors */ - local_irq_save(flags); + case PFM_ENABLE: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; - dcr = ia64_get_dcr(); - dcr |= IA64_DCR_PP; - ia64_set_dcr(dcr); + pmu_owners[smp_processor_id()] = current; - local_irq_restore(flags); -#endif + /* + * mark the state as valid. + * this will trigger save/restore at context switch + */ + current->thread.flags |= IA64_THREAD_PM_VALID; + /* simply unfreeze */ ia64_set_pmc(0, 0); + break; - /* will start monitoring right after rfi */ - ia64_psr(regs)->up = 1; - } - /* - * mark the state as valid. - * this will trigger save/restore at context switch - */ - current->thread.flags |= IA64_THREAD_PM_VALID; - break; + case PFM_DISABLE: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; + + /* simply unfreeze */ + ia64_set_pmc(0, 1); + ia64_srlz_d(); + break; - case READ_PMDS: - if (count <= 0 || count > MAX_PERF_COUNTER) - return -EINVAL; - if (!access_ok(VERIFY_WRITE, cptr, sizeof(struct perfmon_reg_t)*count)) - return -EFAULT; + case PFM_READ_PMDS: + if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; + if (!access_ok(VERIFY_WRITE, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; /* This looks shady, but IMHO this will work fine. This is * the sequence that I could come up with to avoid races @@ -187,16 +276,31 @@ sys_perfmonctl (int cmd, int count, void *ptr, long arg4, long arg5, long arg6, * is the irq_save/restore needed? 
*/ + for (i = 0; i < count; i++, req++) { + unsigned long val=0; - /* XXX: This needs to change to read more than just the counters */ - for (i = 0, cnum = PMU_FIRST_COUNTER;i < count; i++, cnum++, cptr++) { + copy_from_user(&tmp, req, sizeof(tmp)); - tmp.pmu_reg_data = (pmds[smp_processor_id()][i] - + (ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val)); + if (!PMD_IS_IMPL(tmp.pfr_reg_num)) return -EINVAL; - tmp.pmu_reg_num = cnum; + if (PMD_IS_COUNTER(tmp.pfr_reg_num)) { + if (task == current){ + val = ia64_get_pmd(tmp.pfr_reg_num) & pmu_conf.perf_ovfl_val; + } else { + val = task->thread.pmd[tmp.pfr_reg_num - PMU_FIRST_COUNTER] & pmu_conf.perf_ovfl_val; + } + val += task->thread.pmu_counters[tmp.pfr_reg_num - PMU_FIRST_COUNTER].val; + } else { + /* for now */ + if (task != current) return -EINVAL; - if (copy_to_user(cptr, &tmp, sizeof(tmp))) return -EFAULT; + val = ia64_get_pmd(tmp.pfr_reg_num); + } + tmp.pfr_reg_value = val; + +DBprintk((__FUNCTION__" reading PMD[%ld]=0x%lx\n", tmp.pfr_reg_num, val)); + + if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT; } #if 0 /* irrelevant with user monitors */ @@ -209,11 +313,18 @@ sys_perfmonctl (int cmd, int count, void *ptr, long arg4, long arg5, long arg6, #endif break; - case STOP_PMCS: + case PFM_STOP: + /* we don't quite support this right now */ + if (task != current) return -EINVAL; + ia64_set_pmc(0, 1); ia64_srlz_d(); - for (i = 0; i < MAX_PERF_COUNTER; ++i) - ia64_set_pmc(4+i, 0); + + ia64_psr(regs)->up = 0; + + current->thread.flags &= ~IA64_THREAD_PM_VALID; + + pmu_owners[smp_processor_id()] = NULL; #if 0 /* irrelevant with user monitors */ @@ -225,48 +336,140 @@ sys_perfmonctl (int cmd, int count, void *ptr, long arg4, long arg5, long arg6, ia64_psr(regs)->up = 0; #endif - current->thread.flags &= ~(IA64_THREAD_PM_VALID); - break; + case PFM_DEBUG_ON: + printk(__FUNCTION__" debuggin on\n"); + pfm_debug = 1; + break; + + case PFM_DEBUG_OFF: + printk(__FUNCTION__" debuggin off\n"); + pfm_debug = 0; + break; + default: + DBprintk((__FUNCTION__" UNknown command 0x%x\n", cmd)); return -EINVAL; break; } return 0; } -static inline void -update_counters (void) +asmlinkage int +sys_perfmonctl (int pid, int cmd, int flags, perfmon_req_t *req, int count, long arg6, long arg7, long arg8, long stack) { - unsigned long mask, i, cnum, val; + struct pt_regs *regs = (struct pt_regs *) &stack; + struct task_struct *child = current; + int ret; + + if (pid != current->pid) { + read_lock(&tasklist_lock); + { + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + } + if (!child) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + /* + * XXX: need to do more checking here + */ + if (child->state != TASK_ZOMBIE) { + DBprintk((__FUNCTION__" warning process %d not in stable state %ld\n", pid, child->state)); + } + } + ret = do_perfmonctl(child, cmd, flags, req, count, regs); - mask = ia64_get_pmc(0) >> 4; - for (i = 0, cnum = PMU_FIRST_COUNTER ; i < pmu_conf.max_counters; cnum++, i++, mask >>= 1) { + if (child != current) read_unlock(&tasklist_lock); + return ret; +} - val = mask & 0x1 ? pmu_conf.perf_ovfl_val + 1 : 0; - if (mask & 0x1) - printk(__FUNCTION__ " PMD%ld overflowed pmd=%lx pmod=%lx\n", cnum, ia64_get_pmd(cnum), pmds[smp_processor_id()][i]); +static inline int +update_counters (u64 pmc0) +{ + unsigned long mask, i, cnum; + struct thread_struct *th; + struct task_struct *ta; - /* since we got an interrupt, might as well clear every pmd. 
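By this point the reworked perfmon interface is fully visible: requests travel as arrays of perfmon_req_t, and the command code selects what is done with them. The sketch below shows how a caller might drive it end to end. The perfmonctl() wrapper, the choice of counter 4 and the event encoding are assumptions for illustration only; the command values and the perfmon_req_t layout are the ones from the diff above.

#include <sys/types.h>

typedef struct {
    unsigned long pfr_reg_num;    /* which PMC/PMD */
    unsigned long pfr_reg_value;  /* configuration (PMC) or initial value (PMD) */
    unsigned long pfr_reg_reset;  /* reset value on overflow (PMD) */
    void         *pfr_smpl_buf;
    unsigned long pfr_smpl_size;
    pid_t         pfr_notify_pid;
    int           pfr_notify_sig;
} perfmon_req_t;

/* assumed user-space wrapper around the sys_perfmonctl() entry point added above */
extern int perfmonctl(pid_t pid, int cmd, int flags, perfmon_req_t *req, int count);

#define PFM_WRITE_PMCS 0xa0
#define PFM_WRITE_PMDS 0xa1
#define PFM_READ_PMDS  0xa2
#define PFM_STOP       0xa3
#define PFM_START      0xa4

long count_one_event(pid_t self, unsigned long pmc4_config)
{
    perfmon_req_t pc = { .pfr_reg_num = 4, .pfr_reg_value = pmc4_config };
    perfmon_req_t pd = { .pfr_reg_num = 4, .pfr_reg_value = 0, .pfr_reg_reset = 0 };

    perfmonctl(self, PFM_WRITE_PMCS, 0, &pc, 1);   /* program counter 4 */
    perfmonctl(self, PFM_WRITE_PMDS, 0, &pd, 1);   /* zero its data register */
    perfmonctl(self, PFM_START, 0, NULL, 0);       /* psr.up is set on return to user */

    /* ... workload under measurement ... */

    perfmonctl(self, PFM_STOP, 0, NULL, 0);
    perfmonctl(self, PFM_READ_PMDS, 0, &pd, 1);
    return (long) pd.pfr_reg_value;                /* 64-bit virtualized count */
}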
 */
- val += ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val;
+ if (pmu_owners[smp_processor_id()] == NULL) {
+ DBprintk((__FUNCTION__" Spurious overflow interrupt: PMU not owned\n"));
+ return 0;
+ }
+
+ /*
+ * It is never safe to access the task for which the overflow interrupt is destined
+ * using the current variable as the interrupt may occur in the middle of a context switch
+ * where current does not hold the task that is running yet.
+ *
+ * For monitoring, however, we do need to get access to the task which caused the overflow
+ * to account for overflow on the counters.
+ * We accomplish this by maintaining a current owner of the PMU per CPU. During context
+ * switch the ownership is changed in a way such that the reflected owner is always the
+ * valid one, i.e. the one that caused the interrupt.
+ */
+ ta = pmu_owners[smp_processor_id()];
+ th = &pmu_owners[smp_processor_id()]->thread;
- printk(__FUNCTION__ " adding val=%lx to pmod[%ld]=%lx \n", val, i, pmds[smp_processor_id()][i]);
+ /*
+ * Don't think this could happen given first test. Keep as sanity check
+ */
+ if ((th->flags & IA64_THREAD_PM_VALID) == 0) {
+ DBprintk((__FUNCTION__" Spurious overflow interrupt: process %d not using perfmon\n", ta->pid));
+ return 0;
+ }
+
+ /*
+ * if PMU not frozen: spurious from previous context
+ * if PMC[0] = 0x1 : frozen but no overflow reported: leftover from previous context
+ *
+ * in either case we don't touch the state upon return from handler
+ */
+ if ((pmc0 & 0x1) == 0 || pmc0 == 0x1) {
+ DBprintk((__FUNCTION__" Spurious overflow interrupt: process %d freeze=0\n",ta->pid));
+ return 0;
+ }
- pmds[smp_processor_id()][i] += val;
+ mask = pmc0 >> 4;
- ia64_set_pmd(cnum, 0);
+ for (i = 0, cnum = PMU_FIRST_COUNTER; i < pmu_conf.max_counters; cnum++, i++, mask >>= 1) {
+
+ if (mask & 0x1) {
+ DBprintk((__FUNCTION__ " PMD[%ld] overflowed pmd=0x%lx pmod.val=0x%lx\n", cnum, ia64_get_pmd(cnum), th->pmu_counters[i].val));
+
+ /*
+ * Because we sometimes (EARS/BTB) reset to a specific value, we cannot simply use
+ * val to count the number of times we overflowed. Otherwise we would lose the value
+ * currently in the PMD (which can be >0). So to make sure we don't lose
+ * the residual counts we set val to contain the full 64-bit value of the counter.
+ */
+ th->pmu_counters[i].val += 1+pmu_conf.perf_ovfl_val+(ia64_get_pmd(cnum) &pmu_conf.perf_ovfl_val);
+
+ /* writes to upper part are ignored, so this is safe */
+ ia64_set_pmd(cnum, th->pmu_counters[i].rval);
+
+ DBprintk((__FUNCTION__ " pmod[%ld].val=0x%lx pmd=0x%lx\n", i, th->pmu_counters[i].val, ia64_get_pmd(cnum)&pmu_conf.perf_ovfl_val));
+
+ if (th->pmu_counters[i].pid != 0 && th->pmu_counters[i].sig>0) {
+ DBprintk((__FUNCTION__ " should notify process %d with signal %d\n",th->pmu_counters[i].pid, th->pmu_counters[i].sig));
+ }
+ }
 }
+ return 1;
 }
 
 static void
 perfmon_interrupt (int irq, void *arg, struct pt_regs *regs)
 {
- update_counters();
- ia64_set_pmc(0, 0);
- ia64_srlz_d();
+ /* unfreeze if not spurious */
+ if ( update_counters(ia64_get_pmc(0)) ) {
+ ia64_set_pmc(0, 0);
+ ia64_srlz_d();
+ }
 }
 
 static struct irqaction perfmon_irqaction = {
@@ -280,9 +483,13 @@ perfmon_proc_info(char *page)
 {
 char *p = page;
 u64 pmc0 = ia64_get_pmc(0);
+ int i;
 
- p += sprintf(p, "PMC[0]=%lx\n", pmc0);
-
+ p += sprintf(p, "PMC[0]=%lx\nPerfmon debug: %s\n", pmc0, pfm_debug ? "On" : "Off");
+ for(i=0; i < NR_CPUS; i++) {
+ if (cpu_is_online(i))
+ p += sprintf(p, "CPU%d.PMU %d\n", i, pmu_owners[i] ? pmu_owners[i]->pid: -1);
+ }
 return p - page;
 }
 
@@ -308,7 +515,6 @@ void __init
 perfmon_init (void)
 {
 pal_perf_mon_info_u_t pm_info;
- u64 pm_buffer[16];
 s64 status;
 
 irq_desc[PERFMON_IRQ].status |= IRQ_PER_CPU;
@@ -320,15 +526,13 @@ perfmon_init (void)
 printk("perfmon: Initialized vector to %u\n",PERFMON_IRQ);
 
- if ((status=ia64_pal_perf_mon_info(pm_buffer, &pm_info)) != 0) {
+ if ((status=ia64_pal_perf_mon_info(pmu_conf.impl_regs, &pm_info)) != 0) {
 printk(__FUNCTION__ " pal call failed (%ld)\n", status);
 return;
 }
- pmu_conf.perf_ovfl_val = perf_ovfl_val = (1L << pm_info.pal_perf_mon_info_s.width) - 1;
+ pmu_conf.perf_ovfl_val = (1L << pm_info.pal_perf_mon_info_s.width) - 1;
 
 /* XXX need to use PAL instead */
- pmu_conf.max_pmc = 13;
- pmu_conf.max_pmd = 17;
 pmu_conf.max_counters = pm_info.pal_perf_mon_info_s.generic;
 
 printk("perfmon: Counters are %d bits\n", pm_info.pal_perf_mon_info_s.width);
@@ -347,36 +551,137 @@ perfmon_init_percpu (void)
 ia64_srlz_d();
 }
 
+/*
+ * XXX: for system wide this function MUST never be called
+ */
 void
-ia64_save_pm_regs (struct thread_struct *t)
+ia64_save_pm_regs (struct task_struct *ta)
 {
- int i;
+ struct thread_struct *t = &ta->thread;
+ u64 pmc0, psr;
+ int i,j;
+
+ /*
+ * We must make sure that we don't lose any potential overflow
+ * interrupt while saving PMU context. In this code, external
+ * interrupts are always enabled.
+ */
+
+ /*
+ * save current PSR: needed because we modify it
+ */
+ __asm__ __volatile__ ("mov %0=psr;;": "=r"(psr) :: "memory");
+
+ /*
+ * stop monitoring:
+ * This is the only way to stop monitoring without destroying overflow
+ * information in PMC[0..3].
+ * This is the last instruction which can cause overflow when monitoring
+ * in kernel.
+ * By now, we could still have an overflow interrupt in flight.
+ */
+ __asm__ __volatile__ ("rsm psr.up;;"::: "memory");
+
+ /*
+ * read current overflow status:
+ *
+ * We may be reading stale information at this point, if we got an interrupt
+ * just before the read(pmc0) but that's all right. However, if we did
+ * not get the interrupt before, this read reflects LAST state.
+ *
+ */
+ pmc0 = ia64_get_pmc(0);
+ /*
+ * freeze PMU:
+ *
+ * This destroys the overflow information. This is required to make sure
+ * next process does not start with monitoring on if not requested
+ * (PSR.up may not be enough).
+ *
+ * We could still get an overflow interrupt by now. However the handler
+ * will not do anything if it sees PMC[0].fr=1 but no overflow bits
+ * are set. So PMU will stay in frozen state. This implies that pmc0
+ * will still be holding the correct unprocessed information.
+ *
+ */
 ia64_set_pmc(0, 1);
 ia64_srlz_d();
+
+ /*
+ * check for overflow bits set:
+ *
+ * If pmc0 reports PMU frozen, this means we have a pending overflow,
+ * therefore we invoke the handler. Handler is reentrant with regard
+ * to PMC[0] so it is safe to call it twice.
+ *
+ * IF pmc0 reports overflow, we need to reread current PMC[0] value
+ * in case the handler was invoked right after the first pmc0 read.
+ * If it was not invoked then pmc0==PMC[0], otherwise it's been invoked
+ * and overflow information has been processed, so we don't need to call.
+ *
+ * Test breakdown:
+ * - pmc0 & ~0x1: test if overflow happened
+ * - second part: check if current register reflects this as well.
+ *
+ * NOTE: testing for pmc0 & 0x1 is not enough as it would trigger a call
+ * when PM_VALID and PMU.fr are set, which is common when setting up registers
+ * just before actually starting monitors. 
+ * + */ + if ((pmc0 & ~0x1) && ((pmc0=ia64_get_pmc(0)) &~0x1) ) { + printk(__FUNCTION__" Warning: pmc[0]=0x%lx\n", pmc0); + update_counters(pmc0); + /* + * XXX: not sure that's enough. the next task may still get the + * interrupt. + */ + } + + /* + * restore PSR for context switch to save + */ + __asm__ __volatile__ ("mov psr.l=%0;;"::"r"(psr): "memory"); + /* * XXX: this will need to be extended beyong just counters */ - for (i=0; i< IA64_NUM_PM_REGS; i++) { - t->pmd[i] = ia64_get_pmd(4+i); - t->pmod[i] = pmds[smp_processor_id()][i]; - t->pmc[i] = ia64_get_pmc(4+i); + for (i=0,j=4; i< IA64_NUM_PMD_COUNTERS; i++,j++) { + t->pmd[i] = ia64_get_pmd(j); + t->pmc[i] = ia64_get_pmc(j); } + /* + * PMU is frozen, PMU context is saved: nobody owns the PMU on this CPU + * At this point, we should not receive any pending interrupt from the + * 'switched out' task + */ + pmu_owners[smp_processor_id()] = NULL; } void -ia64_load_pm_regs (struct thread_struct *t) +ia64_load_pm_regs (struct task_struct *ta) { - int i; + struct thread_struct *t = &ta->thread; + int i,j; + + /* + * we first restore ownership of the PMU to the 'soon to be current' + * context. This way, if, as soon as we unfreeze the PMU at the end + * of this function, we get an interrupt, we attribute it to the correct + * task + */ + pmu_owners[smp_processor_id()] = ta; /* * XXX: this will need to be extended beyong just counters */ - for (i=0; i< IA64_NUM_PM_REGS ; i++) { - ia64_set_pmd(4+i, t->pmd[i]); - pmds[smp_processor_id()][i] = t->pmod[i]; - ia64_set_pmc(4+i, t->pmc[i]); + for (i=0,j=4; i< IA64_NUM_PMD_COUNTERS; i++,j++) { + ia64_set_pmd(j, t->pmd[i]); + ia64_set_pmc(j, t->pmc[i]); } + /* + * unfreeze PMU + */ ia64_set_pmc(0, 0); ia64_srlz_d(); } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 41db60a0c..e61843db5 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -137,23 +137,6 @@ cpu_idle (void *unused) check_pgt_cache(); if (pm_idle) (*pm_idle)(); -#ifdef CONFIG_ITANIUM_ASTEP_SPECIFIC - local_irq_disable(); - { - u64 itc, itm; - - itc = ia64_get_itc(); - itm = ia64_get_itm(); - if (time_after(itc, itm + 1000)) { - extern void ia64_reset_itm (void); - - printk("cpu_idle: ITM in past (itc=%lx,itm=%lx:%lums)\n", - itc, itm, (itc - itm)/500000); - ia64_reset_itm(); - } - } - local_irq_enable(); -#endif } } @@ -164,7 +147,7 @@ ia64_save_extra (struct task_struct *task) ia64_save_debug_regs(&task->thread.dbr[0]); #ifdef CONFIG_PERFMON if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - ia64_save_pm_regs(&task->thread); + ia64_save_pm_regs(task); #endif if (IS_IA32_PROCESS(ia64_task_regs(task))) ia32_save_state(&task->thread); @@ -177,7 +160,7 @@ ia64_load_extra (struct task_struct *task) ia64_load_debug_regs(&task->thread.dbr[0]); #ifdef CONFIG_PERFMON if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - ia64_load_pm_regs(&task->thread); + ia64_load_pm_regs(task); #endif if (IS_IA32_PROCESS(ia64_task_regs(task))) ia32_load_state(&task->thread); @@ -299,6 +282,14 @@ copy_thread (int nr, unsigned long clone_flags, # define THREAD_FLAGS_TO_SET 0 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | THREAD_FLAGS_TO_SET); +#ifdef CONFIG_IA32_SUPPORT + /* + * If we're cloning an IA32 task then save the IA32 extra + * state from the current task to the new task + */ + if (IS_IA32_PROCESS(ia64_task_regs(current))) + ia32_save_state(&p->thread); +#endif return 0; } @@ -554,7 +545,7 @@ exit_thread (void) * we garantee no race. 
this call we also stop * monitoring */ - ia64_save_pm_regs(¤t->thread); + ia64_save_pm_regs(current); /* * make sure that switch_to() will not save context again */ diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 820a87854..0b49bdcaa 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -617,7 +617,6 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data struct switch_stack *sw; struct unw_frame_info info; struct pt_regs *pt; - unsigned long pmd_tmp; pt = ia64_task_regs(child); sw = (struct switch_stack *) (child->thread.ksp + 16); @@ -794,11 +793,7 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data addr); return -1; } - } else -#ifdef CONFIG_PERFMON - if (addr < PT_PMD) -#endif - { + } else { /* access debug registers */ if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { @@ -820,33 +815,14 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data } ptr += regnum; - } -#ifdef CONFIG_PERFMON - else { - /* - * XXX: will eventually move back to perfmonctl() - */ - unsigned long pmd = (addr - PT_PMD) >> 3; - extern unsigned long perf_ovfl_val; - - /* we just use ptrace to read */ - if (write_access) return -1; - - if (pmd > 3) { - printk("ptrace: rejecting access to PMD[%ld] address 0x%lx\n", pmd, addr); - return -1; - } - /* - * We always need to mask upper 32bits of pmd because value is random - */ - pmd_tmp = child->thread.pmod[pmd]+(child->thread.pmd[pmd]& perf_ovfl_val); - - /*printk(__FUNCTION__" child=%d reading pmd[%ld]=%lx\n", child->pid, pmd, pmd_tmp);*/ - - ptr = &pmd_tmp; + if (write_access) + /* don't let the user set kernel-level breakpoints... */ + *ptr = *data & ~(7UL << 56); + else + *data = *ptr; + return 0; } -#endif if (write_access) *ptr = *data; else @@ -861,7 +837,6 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data { unsigned long *ptr = NULL, *rbs, *bspstore, ndirty, regnum; struct switch_stack *sw; - unsigned long pmd_tmp; struct pt_regs *pt; if ((addr & 0x7) != 0) @@ -977,11 +952,7 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data /* disallow accessing anything else... */ return -1; } - } else -#ifdef CONFIG_PERFMON - if (addr < PT_PMD) -#endif - { + } else { /* access debug registers */ @@ -1002,34 +973,14 @@ access_uarea (struct task_struct *child, unsigned long addr, unsigned long *data return -1; ptr += regnum; - } -#ifdef CONFIG_PERFMON - else { - /* - * XXX: will eventually move back to perfmonctl() - */ - unsigned long pmd = (addr - PT_PMD) >> 3; - extern unsigned long perf_ovfl_val; - - /* we just use ptrace to read */ - if (write_access) return -1; - - if (pmd > 3) { - printk("ptrace: rejecting access to PMD[%ld] address 0x%lx\n", pmd, addr); - return -1; - } - /* - * We always need to mask upper 32bits of pmd because value is random - */ - pmd_tmp = child->thread.pmod[pmd]+(child->thread.pmd[pmd]& perf_ovfl_val); - - /*printk(__FUNCTION__" child=%d reading pmd[%ld]=%lx\n", child->pid, pmd, pmd_tmp);*/ - - ptr = &pmd_tmp; + if (write_access) + /* don't let the user set kernel-level breakpoints... 
*/ + *ptr = *data & ~(7UL << 56); + else + *data = *ptr; + return 0; } -#endif - if (write_access) *ptr = *data; else @@ -1107,7 +1058,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data, goto out_tsk; if (child->state != TASK_STOPPED) { - if (request != PTRACE_KILL && request != PTRACE_PEEKUSR) + if (request != PTRACE_KILL) goto out_tsk; } diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c index 87c7befea..56059a306 100644 --- a/arch/ia64/kernel/sal.c +++ b/arch/ia64/kernel/sal.c @@ -104,9 +104,11 @@ ia64_sal_init (struct ia64_sal_systab *systab) if (strncmp(systab->signature, "SST_", 4) != 0) printk("bad signature in system table!"); - printk("SAL v%u.%02u: ia32bios=%s, oem=%.32s, product=%.32s\n", + /* + * revisions are coded in BCD, so %x does the job for us + */ + printk("SAL v%x.%02x: oem=%.32s, product=%.32s\n", systab->sal_rev_major, systab->sal_rev_minor, - systab->ia32_bios_present ? "present" : "absent", systab->oem_id, systab->product_id); min = ~0UL; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index ed091d864..83d5643cd 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -235,6 +235,12 @@ setup_arch (char **cmdline_p) machvec_init(acpi_get_sysname()); #endif +#ifdef CONFIG_ACPI20 + if (efi.acpi20) { + /* Parse the ACPI 2.0 tables */ + acpi20_parse(efi.acpi20); + } else +#endif if (efi.acpi) { /* Parse the ACPI tables */ acpi_parse(efi.acpi); @@ -255,13 +261,6 @@ setup_arch (char **cmdline_p) paging_init(); platform_setup(cmdline_p); - -#ifdef CONFIG_SWIOTLB - { - extern void setup_swiotlb (void); - setup_swiotlb(); - } -#endif } /* @@ -271,9 +270,9 @@ int get_cpuinfo (char *buffer) { #ifdef CONFIG_SMP -# define lps c->loops_per_sec +# define lpj c->loops_per_jiffy #else -# define lps loops_per_sec +# define lpj loops_per_jiffy #endif char family[32], model[32], features[128], *cp, *p = buffer; struct cpuinfo_ia64 *c; @@ -325,7 +324,7 @@ get_cpuinfo (char *buffer) features, c->ppn, c->number, c->proc_freq / 1000000, c->proc_freq % 1000000, c->itc_freq / 1000000, c->itc_freq % 1000000, - lps / 500000, (lps / 5000) % 100); + lpj*HZ/500000, (lpj*HZ/5000) % 100); } return p - buffer; } @@ -376,15 +375,7 @@ identify_cpu (struct cpuinfo_ia64 *c) status = ia64_pal_vm_summary(&vm1, &vm2); if (status == PAL_STATUS_SUCCESS) { -#if 1 - /* - * XXX the current PAL code returns IMPL_VA_MSB==60, which is dead-wrong. - * --davidm 00/05/26 - s*/ - impl_va_msb = 50; -#else impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb; -#endif phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size; } printk("CPU %d: %lu virtual and %lu physical address bits\n", @@ -408,6 +399,8 @@ cpu_init (void) { extern void __init ia64_rid_init (void); extern void __init ia64_tlb_init (void); + pal_vm_info_2_u_t vmi; + unsigned int max_ctx; identify_cpu(&my_cpu_data); @@ -415,15 +408,12 @@ cpu_init (void) memset(ia64_task_regs(current), 0, sizeof(struct pt_regs)); /* - * Initialize default control register to defer speculative - * faults. On a speculative load, we want to defer access - * right, key miss, and key permission faults. We currently - * do NOT defer TLB misses, page-not-present, access bit, or - * debug faults but kernel code should not rely on any - * particular setting of these bits. - ia64_set_dcr(IA64_DCR_DR | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_PP); + * Initialize default control register to defer all speculative faults. 
The + * kernel MUST NOT depend on a particular setting of these bits (in other words, + * the kernel must have recovery code for all speculative accesses). */ - ia64_set_dcr(IA64_DCR_DR | IA64_DCR_DK | IA64_DCR_DX ); + ia64_set_dcr( IA64_DCR_DM | IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR + | IA64_DCR_DA | IA64_DCR_DD); #ifndef CONFIG_SMP ia64_set_fpu_owner(0); /* initialize ar.k5 */ #endif @@ -444,4 +434,17 @@ cpu_init (void) #ifdef CONFIG_SMP normal_xtp(); #endif + + /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ + if (ia64_pal_vm_summary(NULL, &vmi) == 0) + max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; + else { + printk("ia64_rid_init: PAL VM summary failed, assuming 18 RID bits\n"); + max_ctx = (1U << 15) - 1; /* use architected minimum */ + } + while (max_ctx < ia64_ctx.max_ctx) { + unsigned int old = ia64_ctx.max_ctx; + if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old) + break; + } } diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index e0adf1981..3ffa201aa 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -91,7 +91,7 @@ ia64_rt_sigsuspend (sigset_t *uset, size_t sigsetsize, struct sigscratch *scr) scr->pt.r10 = -1; } while (1) { - set_current_state(TASK_INTERRUPTIBLE); + current->state = TASK_INTERRUPTIBLE; schedule(); if (ia64_do_signal(&oldset, scr, 1)) return -EINTR; @@ -499,9 +499,10 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) /* Let the debugger run. */ current->exit_code = signr; current->thread.siginfo = &info; - set_current_state(TASK_STOPPED); + current->state = TASK_STOPPED; notify_parent(current, SIGCHLD); schedule(); + signr = current->exit_code; current->thread.siginfo = 0; @@ -557,7 +558,7 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) /* FALLTHRU */ case SIGSTOP: - set_current_state(TASK_STOPPED); + current->state = TASK_STOPPED; current->exit_code = signr; if (!(current->p_pptr->sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 694711507..5093341a5 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -6,11 +6,13 @@ * * Lots of stuff stolen from arch/alpha/kernel/smp.c * - * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_sec calibration on each CPU. + * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy calibration on each CPU. * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id * 00/03/31 Rohit Seth <rohit.seth@intel.com> Fixes for Bootstrap Processor & cpu_online_map * now gets done here (instead of setup.c) * 99/10/05 davidm Update to bring it in sync with new command-line processing scheme. 
+ * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and + * smp_call_function_single to resend IPI on timeouts */ #define __KERNEL_SYSCALLS__ @@ -30,6 +32,7 @@ #include <asm/current.h> #include <asm/delay.h> #include <asm/efi.h> +#include <asm/machvec.h> #include <asm/io.h> #include <asm/irq.h> @@ -78,10 +81,6 @@ struct smp_call_struct { }; static volatile struct smp_call_struct *smp_call_function_data; -#ifdef CONFIG_ITANIUM_A1_SPECIFIC -extern spinlock_t ivr_read_lock; -#endif - #define IPI_RESCHEDULE 0 #define IPI_CALL_FUNC 1 #define IPI_CPU_STOP 2 @@ -269,14 +268,14 @@ handle_IPI(int irq, void *dev_id, struct pt_regs *regs) } static inline void -send_IPI_single(int dest_cpu, int op) +send_IPI_single (int dest_cpu, int op) { if (dest_cpu == -1) return; set_bit(op, &ipi_op[dest_cpu]); - ipi_send(dest_cpu, IPI_IRQ, IA64_IPI_DM_INT, 0); + platform_send_ipi(dest_cpu, IPI_IRQ, IA64_IPI_DM_INT, 0); } static inline void @@ -358,6 +357,7 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int if (pointer_lock(&smp_call_function_data, &data, retry)) return -EBUSY; +resend: /* Send a message to all other CPUs and wait for them to respond */ send_IPI_single(cpuid, IPI_CALL_FUNC); @@ -366,8 +366,12 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int while ((atomic_read(&data.unstarted_count) > 0) && time_before(jiffies, timeout)) barrier(); if (atomic_read(&data.unstarted_count) > 0) { +#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC)) + goto resend; +#else smp_call_function_data = NULL; return -ETIMEDOUT; +#endif } if (wait) while (atomic_read(&data.unfinished_count) > 0) @@ -411,13 +415,23 @@ smp_call_function (void (*func) (void *info), void *info, int retry, int wait) /* Send a message to all other CPUs and wait for them to respond */ send_IPI_allbutself(IPI_CALL_FUNC); +retry: /* Wait for response */ timeout = jiffies + HZ; while ((atomic_read(&data.unstarted_count) > 0) && time_before(jiffies, timeout)) barrier(); if (atomic_read(&data.unstarted_count) > 0) { +#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC)) + int i; + for (i = 0; i < smp_num_cpus; i++) { + if (i != smp_processor_id()) + platform_send_ipi(i, IPI_IRQ, IA64_IPI_DM_INT, 0); + } + goto retry; +#else smp_call_function_data = NULL; return -ETIMEDOUT; +#endif } if (wait) while (atomic_read(&data.unfinished_count) > 0) @@ -430,8 +444,6 @@ smp_call_function (void (*func) (void *info), void *info, int retry, int wait) /* * Flush all other CPU's tlb and then mine. Do this with smp_call_function() as we * want to ensure all TLB's flushed before proceeding. - * - * XXX: Is it OK to use the same ptc.e info on all cpus? 
*/ void smp_flush_tlb_all(void) @@ -502,7 +514,7 @@ smp_callin (void) local_irq_enable(); /* Interrupts have been off until now */ calibrate_delay(); - my_cpu_data.loops_per_sec = loops_per_sec; + my_cpu_data.loops_per_jiffy = loops_per_jiffy; /* allow the master to continue */ set_bit(cpu, &cpu_callin_map); @@ -569,7 +581,7 @@ smp_boot_one_cpu(int cpu) cpu_now_booting = cpu; /* Kick the AP in the butt */ - ipi_send(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0); /* wait up to 10s for the AP to start */ for (timeout = 0; timeout < 100000; timeout++) { @@ -603,7 +615,7 @@ smp_boot_cpus(void) __cpu_physical_id[0] = hard_smp_processor_id(); /* on the BP, the kernel already called calibrate_delay_loop() in init/main.c */ - my_cpu_data.loops_per_sec = loops_per_sec; + my_cpu_data.loops_per_jiffy = loops_per_jiffy; #if 0 smp_tune_scheduling(); #endif @@ -653,13 +665,11 @@ smp_boot_cpus(void) bogosum = 0; for (i = 0; i < NR_CPUS; i++) { if (cpu_online_map & (1L << i)) - bogosum += cpu_data[i].loops_per_sec; + bogosum += cpu_data[i].loops_per_jiffy; } - printk(KERN_INFO "SMP: Total of %d processors activated " - "(%lu.%02lu BogoMIPS).\n", - cpu_count, (bogosum + 2500) / 500000, - ((bogosum + 2500) / 5000) % 100); + printk(KERN_INFO "SMP: Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpu_count, bogosum*HZ/500000, (bogosum*HZ/5000) % 100); smp_num_cpus = cpu_count; } diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f78512229..2713d7fd9 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -16,8 +16,38 @@ #include <linux/smp_lock.h> #include <linux/highuid.h> +#include <asm/shmparam.h> #include <asm/uaccess.h> +#define COLOR_ALIGN(addr) (((addr) + SHMLBA - 1) & ~(SHMLBA - 1)) + +unsigned long +get_unmapped_area (unsigned long addr, unsigned long len) +{ + struct vm_area_struct * vmm; + + if (len > RGN_MAP_LIMIT) + return 0; + if (!addr) + addr = TASK_UNMAPPED_BASE; + + if (current->thread.flags & IA64_THREAD_MAP_SHARED) + addr = COLOR_ALIGN(addr); + else + addr = PAGE_ALIGN(addr); + + for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { + /* At this point: (!vmm || addr < vmm->vm_end). */ + if (TASK_SIZE - len < addr) + return 0; + if (rgn_offset(addr) + len > RGN_MAP_LIMIT) /* no risk of overflow here... */ + return 0; + if (!vmm || addr + len <= vmm->vm_start) + return addr; + addr = vmm->vm_end; + } +} + asmlinkage long ia64_getpriority (int which, int who, long arg2, long arg3, long arg4, long arg5, long arg6, long arg7, long stack) @@ -34,6 +64,7 @@ ia64_getpriority (int which, int who, long arg2, long arg3, long arg4, long arg5 return prio; } +/* XXX obsolete, but leave it here until the old libc is gone... */ asmlinkage unsigned long sys_getpagesize (void) { @@ -58,16 +89,61 @@ ia64_shmat (int shmid, void *shmaddr, int shmflg, long arg3, long arg4, long arg } asmlinkage unsigned long -ia64_brk (long brk, long arg1, long arg2, long arg3, +ia64_brk (unsigned long brk, long arg1, long arg2, long arg3, long arg4, long arg5, long arg6, long arg7, long stack) { - extern unsigned long sys_brk (unsigned long brk); + extern int vm_enough_memory (long pages); struct pt_regs *regs = (struct pt_regs *) &stack; - unsigned long retval; + unsigned long rlim, retval, newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + /* + * Most of this replicates the code in sys_brk() except for an additional safety + * check and the clearing of r8. 
However, we can't call sys_brk() because we need + * to acquire the mmap_sem before we can do the test... + */ + down(&mm->mmap_sem); - retval = sys_brk(brk); + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } - regs->r8 = 0; /* ensure large retval isn't mistaken as error code */ + /* Check against unimplemented/unmapped addresses: */ + if ((newbrk - oldbrk) > RGN_MAP_LIMIT || rgn_offset(newbrk) > RGN_MAP_LIMIT) + goto out; + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Check if we have enough memory.. */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up(&mm->mmap_sem); + regs->r8 = 0; /* ensure large retval isn't mistaken as error code */ return retval; } @@ -95,10 +171,8 @@ sys_pipe (long arg0, long arg1, long arg2, long arg3, static inline unsigned long do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) { - unsigned long loff, hoff; + unsigned long roff; struct file *file = 0; - /* the virtual address space that is mappable in each region: */ -# define OCTANT_SIZE ((PTRS_PER_PGD<<PGDIR_SHIFT)/8) /* * A zero mmap always succeeds in Linux, independent of @@ -107,15 +181,12 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un if (PAGE_ALIGN(len) == 0) return addr; - /* Don't permit mappings into or across the address hole in a region: */ - loff = rgn_offset(addr); - hoff = loff - (RGN_SIZE - OCTANT_SIZE/2); - if ((len | loff | (loff + len)) >= OCTANT_SIZE/2 - && (len | hoff | (hoff + len)) >= OCTANT_SIZE/2) + /* don't permit mappings into unmapped space or the virtual page table of a region: */ + roff = rgn_offset(addr); + if ((len | roff | (roff + len)) >= RGN_MAP_LIMIT) return -EINVAL; - /* Don't permit mappings that would cross a region boundary: */ - + /* don't permit mappings that would cross a region boundary: */ if (rgn_index(addr) != rgn_index(addr + len)) return -EINVAL; @@ -126,10 +197,15 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un return -EBADF; } + if (flags & MAP_SHARED) + current->thread.flags |= IA64_THREAD_MAP_SHARED; + down(¤t->mm->mmap_sem); addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); up(¤t->mm->mmap_sem); + current->thread.flags &= ~IA64_THREAD_MAP_SHARED; + if (file) fput(file); return addr; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 5e54e4f4b..8f65adc2c 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -152,19 +152,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { int cpu = smp_processor_id(); unsigned long new_itm; -#if 0 - static unsigned long last_time; - static unsigned char count; - int printed = 0; -#endif - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. 
- */ - write_lock(&xtime_lock); new_itm = itm.next[cpu].count; if (!time_after(ia64_get_itc(), new_itm)) @@ -173,48 +161,33 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) while (1) { /* - * Do kernel PC profiling here. We multiply the - * instruction number by four so that we can use a - * prof_shift of 2 to get instruction-level instead of - * just bundle-level accuracy. + * Do kernel PC profiling here. We multiply the instruction number by + * four so that we can use a prof_shift of 2 to get instruction-level + * instead of just bundle-level accuracy. */ if (!user_mode(regs)) do_profile(regs->cr_iip + 4*ia64_psr(regs)->ri); #ifdef CONFIG_SMP smp_do_timer(regs); - if (smp_processor_id() == 0) - do_timer(regs); -#else - do_timer(regs); #endif + if (smp_processor_id() == 0) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_lock(&xtime_lock); + do_timer(regs); + write_unlock(&xtime_lock); + } new_itm += itm.delta; itm.next[cpu].count = new_itm; if (time_after(new_itm, ia64_get_itc())) break; - -#if 0 - /* - * SoftSDV in SMP mode is _slow_, so we do "lose" ticks, - * but it's really OK... - */ - if (count > 0 && jiffies - last_time > 5*HZ) - count = 0; - if (count++ == 0) { - last_time = jiffies; - if (!printed) { - printk("Lost clock tick on CPU %d (now=%lx, next=%lx)!!\n", - cpu, ia64_get_itc(), itm.next[cpu].count); - printed = 1; -# ifdef CONFIG_IA64_DEBUG_IRQ - printk("last_cli_ip=%lx\n", last_cli_ip); -# endif - } - } -#endif } - write_unlock(&xtime_lock); /* * If we're too close to the next clock tick for comfort, we @@ -229,7 +202,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) ia64_set_itm(new_itm); } -#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_IA64_SOFTSDV_HACKS) +#ifdef CONFIG_IA64_SOFTSDV_HACKS /* * Interrupts must be disabled before calling this routine. @@ -240,7 +213,7 @@ ia64_reset_itm (void) timer_interrupt(0, 0, ia64_task_regs(current)); } -#endif /* CONFIG_ITANIUM_ASTEP_SPECIFIC */ +#endif /* * Encapsulate access to the itm structure for SMP. diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 43340bf85..fd8369291 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -78,7 +78,7 @@ void die_if_kernel (char *str, struct pt_regs *regs, long err) { if (user_mode(regs)) { -#if 1 +#if 0 /* XXX for debugging only */ printk ("!!die_if_kernel: %s(%d): %s %ld\n", current->comm, current->pid, str, err); @@ -484,6 +484,20 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, sprintf(buf, "Disabled FPL fault---not supposed to happen!"); break; + case 26: /* NaT Consumption */ + case 31: /* Unsupported Data Reference */ + if (user_mode(regs)) { + siginfo.si_signo = SIGILL; + siginfo.si_code = ILL_ILLOPN; + siginfo.si_errno = 0; + siginfo.si_addr = (void *) (regs->cr_iip + ia64_psr(regs)->ri); + siginfo.si_imm = vector; + force_sig_info(SIGILL, &siginfo, current); + return; + } + sprintf(buf, (vector == 26) ? "NaT consumption" : "Unsupported data reference"); + break; + case 29: /* Debug */ case 35: /* Taken Branch Trap */ case 36: /* Single Step Trap */ @@ -522,10 +536,10 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, case 34: /* Unimplemented Instruction Address Trap */ if (user_mode(regs)) { - printk("Woah! 
Unimplemented Instruction Address Trap!\n"); - siginfo.si_code = ILL_BADIADDR; siginfo.si_signo = SIGILL; + siginfo.si_code = ILL_BADIADDR; siginfo.si_errno = 0; + siginfo.si_addr = (void *) (regs->cr_iip + ia64_psr(regs)->ri); force_sig_info(SIGILL, &siginfo, current); return; } @@ -544,7 +558,8 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, case 46: printk("Unexpected IA-32 intercept trap (Trap 46)\n"); - printk(" iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n", regs->cr_iip, ifa, isr); + printk(" iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n", + regs->cr_iip, ifa, isr, iim); force_sig(SIGSEGV, current); return; diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 7cc238a83..a24121a26 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -572,7 +572,8 @@ getreg(unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs) */ if (regnum == 0) { *val = 0; - *nat = 0; + if (nat) + *nat = 0; return; } @@ -1563,9 +1564,13 @@ ia64_handle_unaligned(unsigned long ifa, struct pt_regs *regs) DPRINT(("ret=%d\n", ret)); if (ret) { - lock_kernel(); - force_sig(SIGSEGV, current); - unlock_kernel(); + struct siginfo si; + + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_ADRALN; + si.si_addr = (void *) ifa; + force_sig_info(SIGBUS, &si, current); } else { /* * given today's architecture this case is not likely to happen diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 21a2ead16..f5ae7e497 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -46,16 +46,6 @@ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define p5 5 -/* - * The unwind tables are supposed to be sorted, but the GNU toolchain - * currently fails to produce a sorted table in the presence of - * functions that go into sections other than .text. For example, the - * kernel likes to put initialization code into .text.init, which - * messes up the sort order. Hopefully, this will get fixed sometime - * soon. --davidm 00/05/23 - */ -#define UNWIND_TABLE_SORT_BUG - #define UNW_LOG_CACHE_SIZE 7 /* each unw_script is ~256 bytes in size */ #define UNW_CACHE_SIZE (1 << UNW_LOG_CACHE_SIZE) @@ -531,6 +521,10 @@ push (struct unw_state_record *sr) struct unw_reg_state *rs; rs = alloc_reg_state(); + if (!rs) { + printk("unwind: cannot stack reg state!\n"); + return; + } memcpy(rs, &sr->curr, sizeof(*rs)); rs->next = sr->stack; sr->stack = rs; @@ -1964,23 +1958,6 @@ init_unwind_table (struct unw_table *table, const char *name, unsigned long segm { struct unw_table_entry *start = table_start, *end = table_end; -#ifdef UNWIND_TABLE_SORT_BUG - { - struct unw_table_entry *e1, *e2, tmp; - - /* stupid bubble sort... 
*/ - - for (e1 = start; e1 < end; ++e1) { - for (e2 = e1 + 1; e2 < end; ++e2) { - if (e2->start_offset < e1->start_offset) { - tmp = *e1; - *e1 = *e2; - *e2 = tmp; - } - } - } - } -#endif table->name = name; table->segment_base = segment_base; table->gp = gp; @@ -2023,8 +2000,8 @@ unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned lon void unw_remove_unwind_table (void *handle) { - struct unw_table *table, *prevt; - struct unw_script *tmp, *prev; + struct unw_table *table, *prev; + struct unw_script *tmp; unsigned long flags; long index; @@ -2043,41 +2020,35 @@ unw_remove_unwind_table (void *handle) { /* first, delete the table: */ - for (prevt = (struct unw_table *) &unw.tables; prevt; prevt = prevt->next) - if (prevt->next == table) + for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next) + if (prev->next == table) break; - if (!prevt) { + if (!prev) { dprintk("unwind: failed to find unwind table %p\n", (void *) table); spin_unlock_irqrestore(&unw.lock, flags); return; } - prevt->next = table->next; + prev->next = table->next; + } + spin_unlock_irqrestore(&unw.lock, flags); - /* next, remove hash table entries for this table */ + /* next, remove hash table entries for this table */ - for (index = 0; index <= UNW_HASH_SIZE; ++index) { - if (unw.hash[index] >= UNW_CACHE_SIZE) - continue; + for (index = 0; index <= UNW_HASH_SIZE; ++index) { + tmp = unw.cache + unw.hash[index]; + if (unw.hash[index] >= UNW_CACHE_SIZE + || tmp->ip < table->start || tmp->ip >= table->end) + continue; - tmp = unw.cache + unw.hash[index]; - prev = 0; - while (1) { - write_lock(&tmp->lock); - { - if (tmp->ip >= table->start && tmp->ip < table->end) { - if (prev) - prev->coll_chain = tmp->coll_chain; - else - unw.hash[index] = -1; - tmp->ip = 0; - } else - prev = tmp; - } - write_unlock(&tmp->lock); + write_lock(&tmp->lock); + { + if (tmp->ip >= table->start && tmp->ip < table->end) { + unw.hash[index] = tmp->coll_chain; + tmp->ip = 0; } } + write_unlock(&tmp->lock); } - spin_unlock_irqrestore(&unw.lock, flags); kfree(table); } |
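
The overflow path added to update_counters() in the perfmon.c hunks above effectively maintains a software-extended 64-bit counter on top of the narrower hardware PMD: on each overflow interrupt it folds one full counter period plus the residual hardware count into the per-thread 64-bit value, then reloads the PMD with its restart value. The standalone C sketch below models only that arithmetic; it is not kernel code, and the names (soft_counter, account_overflow) and the 32-bit example width are illustrative assumptions, not part of the patch.

/*
 * Hypothetical standalone model (not kernel code) of the software-extended
 * counters maintained by perfmon.c's update_counters(): the hardware PMD is
 * only `width' bits wide, so on every overflow interrupt the handler adds the
 * full counter period plus the residual hardware count to a 64-bit value.
 */
#include <stdio.h>
#include <stdint.h>

struct soft_counter {
	uint64_t val;	/* 64-bit software-accumulated count (role of pmu_counters[i].val) */
	uint64_t rval;	/* value the hardware counter gets reloaded with (role of .rval) */
};

/* same role as pmu_conf.perf_ovfl_val: mask of the implemented counter bits */
static uint64_t ovfl_mask(unsigned int width)
{
	return ((uint64_t) 1 << width) - 1;
}

/* called for each counter whose overflow bit is set in PMC[0] */
static void account_overflow(struct soft_counter *c, uint64_t hw_pmd, unsigned int width)
{
	uint64_t mask = ovfl_mask(width);

	/* one full period (mask + 1) plus whatever accumulated after the wrap */
	c->val += 1 + mask + (hw_pmd & mask);

	/* the real handler would now write c->rval back into the PMD */
}

int main(void)
{
	struct soft_counter c = { 0, 0 };

	/* e.g. a 32-bit-wide counter that wrapped and has counted 5 more events since */
	account_overflow(&c, 5, 32);
	printf("soft count = %llu\n", (unsigned long long) c.val);	/* prints 4294967301 */
	return 0;
}

For the example input the sketch prints 4294967301, i.e. one full 2^32 period plus the 5 residual events, which is the same quantity the patch accumulates into pmu_counters[i].val before rewriting the PMD.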