summary refs log tree commit diff stats
path: root/arch/alpha
diff options
context:
space:
mode:
author Ralf Baechle <ralf@linux-mips.org> 2001-03-09 20:33:35 +0000
committer Ralf Baechle <ralf@linux-mips.org> 2001-03-09 20:33:35 +0000
commit 116674acc97ba75a720329996877077d988443a2 (patch)
tree 6a3f2ff0b612ae2ee8a3f3509370c9e6333a53b3 /arch/alpha
parent 71118c319fcae4a138f16e35b4f7e0a6d53ce2ca (diff)
Merge with Linux 2.4.2.
Diffstat (limited to 'arch/alpha')
-rw-r--r-- arch/alpha/kernel/Makefile 4
-rw-r--r-- arch/alpha/kernel/alpha_ksyms.c 6
-rw-r--r-- arch/alpha/kernel/irq.c 2
-rw-r--r-- arch/alpha/kernel/irq_alpha.c 2
-rw-r--r-- arch/alpha/kernel/osf_sys.c 4
-rw-r--r-- arch/alpha/kernel/pci-noop.c 104
-rw-r--r-- arch/alpha/kernel/process.c 16
-rw-r--r-- arch/alpha/kernel/ptrace.c 2
-rw-r--r-- arch/alpha/kernel/setup.c 2
-rw-r--r-- arch/alpha/kernel/smc37c669.c 2
-rw-r--r-- arch/alpha/kernel/smc37c93x.c 2
-rw-r--r-- arch/alpha/kernel/smp.c 2
-rw-r--r-- arch/alpha/kernel/sys_ruffian.c 74
-rw-r--r-- arch/alpha/kernel/traps.c 2
-rw-r--r-- arch/alpha/lib/Makefile 2
-rw-r--r-- arch/alpha/lib/clear_page.S 39
-rw-r--r-- arch/alpha/lib/copy_page.S 49
-rw-r--r-- arch/alpha/lib/ev6-clear_page.S 54
-rw-r--r-- arch/alpha/lib/ev6-copy_page.S 203
19 files changed, 547 insertions, 24 deletions
diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile
index abc04cca2..08ec1d613 100644
--- a/arch/alpha/kernel/Makefile
+++ b/arch/alpha/kernel/Makefile
@@ -30,9 +30,7 @@ ifdef CONFIG_VGA_HOSE
obj-y += console.o
endif
-
obj-$(CONFIG_SMP) += smp.o irq_smp.o
-
obj-$(CONFIG_PCI) += pci.o pci_iommu.o
ifdef CONFIG_ALPHA_GENERIC
@@ -76,7 +74,7 @@ obj-y += sys_eb64p.o
endif
obj-$(CONFIG_ALPHA_EIGER) += sys_eiger.o
-obj-$(CONFIG_ALPHA_JENSEN) += sys_jensen.o
+obj-$(CONFIG_ALPHA_JENSEN) += sys_jensen.o pci-noop.o
obj-$(CONFIG_ALPHA_MIATA) += sys_miata.o
obj-$(CONFIG_ALPHA_MIKASA) += sys_mikasa.o
obj-$(CONFIG_ALPHA_NAUTILUS) += sys_nautilus.o
diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index 17285ac26..d7bf13ec3 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -98,9 +98,13 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(__memset);
EXPORT_SYMBOL(__memsetw);
EXPORT_SYMBOL(__constant_c_memset);
+EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(clear_page);
EXPORT_SYMBOL(__direct_map_base);
EXPORT_SYMBOL(__direct_map_size);
+
+#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_alloc_consistent);
EXPORT_SYMBOL(pci_free_consistent);
EXPORT_SYMBOL(pci_map_single);
@@ -108,6 +112,7 @@ EXPORT_SYMBOL(pci_unmap_single);
EXPORT_SYMBOL(pci_map_sg);
EXPORT_SYMBOL(pci_unmap_sg);
EXPORT_SYMBOL(pci_dma_supported);
+#endif
EXPORT_SYMBOL(dump_thread);
EXPORT_SYMBOL(dump_fpu);
@@ -166,6 +171,7 @@ EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__up_wakeup);
EXPORT_SYMBOL(down);
EXPORT_SYMBOL(down_interruptible);
+EXPORT_SYMBOL(down_trylock);
EXPORT_SYMBOL(up);
EXPORT_SYMBOL(__down_read_failed);
EXPORT_SYMBOL(__down_write_failed);
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 080e48e43..825eaf2d5 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -18,7 +18,7 @@
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/irq.h>
diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c
index 59f102496..91e99f573 100644
--- a/arch/alpha/kernel/irq_alpha.c
+++ b/arch/alpha/kernel/irq_alpha.c
@@ -18,7 +18,7 @@
unsigned long __irq_attempt[NR_IRQS];
#endif
-/* Hack minimum IPL during interupt processing for broken hardware. */
+/* Hack minimum IPL during interrupt processing for broken hardware. */
#ifdef CONFIG_ALPHA_BROKEN_IRQ_MASK
int __min_ipl;
#endif
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index cd28b07fc..6159457d9 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -19,7 +19,7 @@
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/utsname.h>
@@ -74,8 +74,10 @@ asmlinkage int osf_set_program_attributes(
mm = current->mm;
mm->end_code = bss_start + bss_len;
mm->brk = bss_start + bss_len;
+#if 0
printk("set_program_attributes(%lx %lx %lx %lx)\n",
text_start, text_len, bss_start, bss_len);
+#endif
unlock_kernel();
return 0;
}
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
new file mode 100644
index 000000000..d530ebb82
--- /dev/null
+++ b/arch/alpha/kernel/pci-noop.c
@@ -0,0 +1,104 @@
+/*
+ * linux/arch/alpha/kernel/pci-noop.c
+ *
+ * Stub PCI interfaces for Jensen-specific kernels.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+
+#include "proto.h"
+
+
+/*
+ * The PCI controler list.
+ */
+
+struct pci_controler *hose_head, **hose_tail = &hose_head;
+struct pci_controler *pci_isa_hose;
+
+
+struct pci_controler * __init
+alloc_pci_controler(void)
+{
+ struct pci_controler *hose;
+
+ hose = alloc_bootmem(sizeof(*hose));
+
+ *hose_tail = hose;
+ hose_tail = &hose->next;
+
+ return hose;
+}
+
+struct resource * __init
+alloc_resource(void)
+{
+ struct resource *res;
+
+ res = alloc_bootmem(sizeof(*res));
+
+ return res;
+}
+
+asmlinkage long
+sys_pciconfig_iobase(long which, unsigned long bus, unsigned long dfn)
+{
+ struct pci_controler *hose;
+ struct pci_dev *dev;
+
+ /* from hose or from bus.devfn */
+ if (which & IOBASE_FROM_HOSE) {
+ for (hose = hose_head; hose; hose = hose->next)
+ if (hose->index == bus)
+ break;
+ if (!hose)
+ return -ENODEV;
+ } else {
+ /* Special hook for ISA access. */
+ if (bus == 0 && dfn == 0)
+ hose = pci_isa_hose;
+ else
+ return -ENODEV;
+ }
+
+ switch (which & ~IOBASE_FROM_HOSE) {
+ case IOBASE_HOSE:
+ return hose->index;
+ case IOBASE_SPARSE_MEM:
+ return hose->sparse_mem_base;
+ case IOBASE_DENSE_MEM:
+ return hose->dense_mem_base;
+ case IOBASE_SPARSE_IO:
+ return hose->sparse_io_base;
+ case IOBASE_DENSE_IO:
+ return hose->dense_io_base;
+ case IOBASE_ROOT_BUS:
+ return hose->bus->number;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+asmlinkage long
+sys_pciconfig_read(unsigned long bus, unsigned long dfn,
+ unsigned long off, unsigned long len, void *buf)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ else
+ return -ENODEV;
+}
+
+asmlinkage long
+sys_pciconfig_write(unsigned long bus, unsigned long dfn,
+ unsigned long off, unsigned long len, void *buf)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ else
+ return -ENODEV;
+}
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 508e278b0..2dd505e10 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -18,7 +18,7 @@
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/utsname.h>
@@ -416,22 +416,20 @@ dump_fpu(struct pt_regs * regs, elf_fpregset_t *r)
* Don't do this at home.
*/
asmlinkage int
-sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
- unsigned long a3, unsigned long a4, unsigned long a5,
- struct pt_regs regs)
+sys_execve(char *ufilename, char **argv, char **envp,
+ unsigned long a3, unsigned long a4, unsigned long a5,
+ struct pt_regs regs)
{
int error;
- char * filename;
+ char *filename;
- lock_kernel();
- filename = getname((char *) a0);
+ filename = getname(ufilename);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
- error = do_execve(filename, (char **) a1, (char **) a2, &regs);
+ error = do_execve(filename, argv, envp, &regs);
putname(filename);
out:
- unlock_kernel();
return error;
}
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 09fcfd787..a919e7c2f 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -12,7 +12,7 @@
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/user.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 2acf56d96..a6443da53 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -16,7 +16,7 @@
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/tty.h>
diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c
index 0b2db18ab..3ffb611f2 100644
--- a/arch/alpha/kernel/smc37c669.c
+++ b/arch/alpha/kernel/smc37c669.c
@@ -3,7 +3,7 @@
*/
#include <linux/kernel.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
diff --git a/arch/alpha/kernel/smc37c93x.c b/arch/alpha/kernel/smc37c93x.c
index 5448305a3..b0e15d307 100644
--- a/arch/alpha/kernel/smc37c93x.c
+++ b/arch/alpha/kernel/smc37c93x.c
@@ -5,7 +5,7 @@
#include <linux/config.h>
#include <linux/kernel.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index e91e77895..4f877b10b 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -837,7 +837,7 @@ smp_call_function (void (*func) (void *info), void *info, int retry, int wait)
atomic_set(&data.unstarted_count, smp_num_cpus - 1);
atomic_set(&data.unfinished_count, smp_num_cpus - 1);
- /* Aquire the smp_call_function_data mutex. */
+ /* Acquire the smp_call_function_data mutex. */
if (pointer_lock(&smp_call_function_data, &data, retry))
return -EBUSY;
diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c
index c9a2e79a4..0d9713377 100644
--- a/arch/alpha/kernel/sys_ruffian.c
+++ b/arch/alpha/kernel/sys_ruffian.c
@@ -92,14 +92,80 @@ ruffian_kill_arch (int mode)
#endif
}
+/*
+ * Interrupt routing:
+ *
+ * Primary bus
+ * IdSel INTA INTB INTC INTD
+ * 21052 13 - - - -
+ * SIO 14 23 - - -
+ * 21143 15 44 - - -
+ * Slot 0 17 43 42 41 40
+ *
+ * Secondary bus
+ * IdSel INTA INTB INTC INTD
+ * Slot 0 8 (18) 19 18 17 16
+ * Slot 1 9 (19) 31 30 29 28
+ * Slot 2 10 (20) 27 26 25 24
+ * Slot 3 11 (21) 39 38 37 36
+ * Slot 4 12 (22) 35 34 33 32
+ * 53c875 13 (23) 20 - - -
+ *
+ */
+
static int __init
ruffian_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
{
- /* We don't know anything about the PCI routing, so leave
- the IRQ unchanged. */
- return dev->irq;
+ static char irq_tab[11][5] __initdata = {
+ /*INT INTA INTB INTC INTD */
+ {-1, -1, -1, -1, -1}, /* IdSel 13, 21052 */
+ {-1, -1, -1, -1, -1}, /* IdSel 14, SIO */
+ {44, 44, 44, 44, 44}, /* IdSel 15, 21143 */
+ {-1, -1, -1, -1, -1}, /* IdSel 16, none */
+ {43, 43, 42, 41, 40}, /* IdSel 17, 64-bit slot */
+ /* the next 6 are actually on PCI bus 1, across the bridge */
+ {19, 19, 18, 17, 16}, /* IdSel 8, slot 0 */
+ {31, 31, 30, 29, 28}, /* IdSel 9, slot 1 */
+ {27, 27, 26, 25, 24}, /* IdSel 10, slot 2 */
+ {39, 39, 38, 37, 36}, /* IdSel 11, slot 3 */
+ {35, 35, 34, 33, 32}, /* IdSel 12, slot 4 */
+ {20, 20, 20, 20, 20}, /* IdSel 13, 53c875 */
+ };
+ const long min_idsel = 13, max_idsel = 23, irqs_per_slot = 5;
+ return COMMON_TABLE_LOOKUP;
}
+static u8 __init
+ruffian_swizzle(struct pci_dev *dev, u8 *pinp)
+{
+ int slot, pin = *pinp;
+
+ if (dev->bus->number == 0) {
+ slot = PCI_SLOT(dev->devfn);
+ }
+ /* Check for the built-in bridge. */
+ else if (PCI_SLOT(dev->bus->self->devfn) == 13) {
+ slot = PCI_SLOT(dev->devfn) + 10;
+ }
+ else
+ {
+ /* Must be a card-based bridge. */
+ do {
+ if (PCI_SLOT(dev->bus->self->devfn) == 13) {
+ slot = PCI_SLOT(dev->devfn) + 10;
+ break;
+ }
+ pin = bridge_swizzle(pin, PCI_SLOT(dev->devfn));
+
+ /* Move up the chain of bridges. */
+ dev = dev->bus->self;
+ /* Slot of the next bridge. */
+ slot = PCI_SLOT(dev->devfn);
+ } while (dev->bus->self);
+ }
+ *pinp = pin;
+ return slot;
+}
#ifdef BUILDING_FOR_MILO
/*
@@ -164,6 +230,6 @@ struct alpha_machine_vector ruffian_mv __initmv = {
init_pci: cia_init_pci,
kill_arch: ruffian_kill_arch,
pci_map_irq: ruffian_map_irq,
- pci_swizzle: common_swizzle,
+ pci_swizzle: ruffian_swizzle,
};
ALIAS_MV(ruffian)
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 7b2f8be03..ed7d2f68e 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -1093,7 +1093,9 @@ alpha_ni_syscall(unsigned long a0, unsigned long a1, unsigned long a2,
{
/* We only get here for OSF system calls, minus #112;
the rest go to sys_ni_syscall. */
+#if 0
printk("<sc %ld(%lx,%lx,%lx)>", regs.r0, a0, a1, a2);
+#endif
return -ENOSYS;
}
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
index 1e3e485b5..d95ace0d7 100644
--- a/arch/alpha/lib/Makefile
+++ b/arch/alpha/lib/Makefile
@@ -42,6 +42,8 @@ OBJS = __divqu.o __remqu.o __divlu.o __remlu.o \
$(ev6)strncpy_from_user.o \
$(ev67)strlen_user.o \
$(ev6)csum_ipv6_magic.o \
+ $(ev6)clear_page.o \
+ $(ev6)copy_page.o \
strcasecmp.o \
fpreg.o \
callback_srm.o srm_puts.o srm_printk.o
diff --git a/arch/alpha/lib/clear_page.S b/arch/alpha/lib/clear_page.S
new file mode 100644
index 000000000..a221ae266
--- /dev/null
+++ b/arch/alpha/lib/clear_page.S
@@ -0,0 +1,39 @@
+/*
+ * arch/alpha/lib/clear_page.S
+ *
+ * Zero an entire page.
+ */
+
+ .text
+ .align 4
+ .global clear_page
+ .ent clear_page
+clear_page:
+ .prologue 0
+
+ lda $0,128
+ nop
+ unop
+ nop
+
+1: stq $31,0($16)
+ stq $31,8($16)
+ stq $31,16($16)
+ stq $31,24($16)
+
+ stq $31,32($16)
+ stq $31,40($16)
+ stq $31,48($16)
+ subq $0,1,$0
+
+ stq $31,56($16)
+ addq $16,64,$16
+ unop
+ bne $0,1b
+
+ ret
+ nop
+ unop
+ nop
+
+ .end clear_page
diff --git a/arch/alpha/lib/copy_page.S b/arch/alpha/lib/copy_page.S
new file mode 100644
index 000000000..9f3b97459
--- /dev/null
+++ b/arch/alpha/lib/copy_page.S
@@ -0,0 +1,49 @@
+/*
+ * arch/alpha/lib/copy_page.S
+ *
+ * Copy an entire page.
+ */
+
+ .text
+ .align 4
+ .global copy_page
+ .ent copy_page
+copy_page:
+ .prologue 0
+
+ lda $18,128
+ nop
+ unop
+ nop
+
+1: ldq $0,0($17)
+ ldq $1,8($17)
+ ldq $2,16($17)
+ ldq $3,24($17)
+
+ ldq $4,32($17)
+ ldq $5,40($17)
+ ldq $6,48($17)
+ ldq $7,56($17)
+
+ stq $0,0($16)
+ subq $18,1,$18
+ stq $1,8($16)
+ addq $17,64,$17
+
+ stq $2,16($16)
+ stq $3,24($16)
+ stq $4,32($16)
+ stq $5,40($16)
+
+ stq $6,48($16)
+ stq $7,56($16)
+ addq $16,64,$16
+ bne $18, 1b
+
+ ret
+ nop
+ unop
+ nop
+
+ .end copy_page
diff --git a/arch/alpha/lib/ev6-clear_page.S b/arch/alpha/lib/ev6-clear_page.S
new file mode 100644
index 000000000..adf4f7be0
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_page.S
@@ -0,0 +1,54 @@
+/*
+ * arch/alpha/lib/ev6-clear_page.S
+ *
+ * Zero an entire page.
+ */
+
+ .text
+ .align 4
+ .global clear_page
+ .ent clear_page
+clear_page:
+ .prologue 0
+
+ lda $0,128
+ lda $1,125
+ addq $16,64,$2
+ addq $16,128,$3
+
+ addq $16,192,$17
+ wh64 ($16)
+ wh64 ($2)
+ wh64 ($3)
+
+1: wh64 ($17)
+ stq $31,0($16)
+ subq $0,1,$0
+ subq $1,1,$1
+
+ stq $31,8($16)
+ stq $31,16($16)
+ addq $17,64,$2
+ nop
+
+ stq $31,24($16)
+ stq $31,32($16)
+ cmovgt $1,$2,$17
+ nop
+
+ stq $31,40($16)
+ stq $31,48($16)
+ nop
+ nop
+
+ stq $31,56($16)
+ addq $16,64,$16
+ nop
+ bne $0,1b
+
+ ret
+ nop
+ nop
+ nop
+
+ .end clear_page
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S
new file mode 100644
index 000000000..b789db192
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_page.S
@@ -0,0 +1,203 @@
+/*
+ * arch/alpha/lib/ev6-copy_page.S
+ *
+ * Copy an entire page.
+ */
+
+/* The following comparison of this routine vs the normal copy_page.S
+ was written by an unnamed ev6 hardware designer and forwarded to me
+ via Steven Hobbs <hobbs@steven.zko.dec.com>.
+
+ First Problem: STQ overflows.
+ -----------------------------
+
+ It would be nice if EV6 handled every resource overflow efficiently,
+ but for some it doesn't. Including store queue overflows. It causes
+ a trap and a restart of the pipe.
+
+ To get around this we sometimes use (to borrow a term from a VSSAD
+ researcher) "aeration". The idea is to slow the rate at which the
+ processor receives valid instructions by inserting nops in the fetch
+ path. In doing so, you can prevent the overflow and actually make
+ the code run faster. You can, of course, take advantage of the fact
+ that the processor can fetch at most 4 aligned instructions per cycle.
+
+ I inserted enough nops to force it to take 10 cycles to fetch the
+ loop code. In theory, EV6 should be able to execute this loop in
+ 9 cycles but I was not able to get it to run that fast -- the initial
+ conditions were such that I could not reach this optimum rate on
+ (chaotic) EV6. I wrote the code such that everything would issue
+ in order.
+
+ Second Problem: Dcache index matches.
+ -------------------------------------
+
+ If you are going to use this routine on random aligned pages, there
+ is a 25% chance that the pages will be at the same dcache indices.
+ This results in many nasty memory traps without care.
+
+ The solution is to schedule the prefetches to avoid the memory
+ conflicts. I schedule the wh64 prefetches farther ahead of the
+ read prefetches to avoid this problem.
+
+ Third Problem: Needs more prefetching.
+ --------------------------------------
+
+ In order to improve the code I added deeper prefetching to take the
+ most advantage of EV6's bandwidth.
+
+ I also prefetched the read stream. Note that adding the read prefetch
+ forced me to add another cycle to the inner-most kernel - up to 11
+ from the original 8 cycles per iteration. We could improve performance
+ further by unrolling the loop and doing multiple prefetches per cycle.
+
+ I think that the code below will be very robust and fast code for the
+ purposes of copying aligned pages. It is slower when both source and
+ destination pages are in the dcache, but it is my guess that this is
+ less important than the dcache miss case. */
+
+
+ .text
+ .align 4
+ .global copy_page
+ .ent copy_page
+copy_page:
+ .prologue 0
+
+ /* Prefetch 5 read cachelines; write-hint 10 cache lines. */
+ wh64 ($16)
+ ldl $31,0($17)
+ ldl $31,64($17)
+ lda $1,1*64($16)
+
+ wh64 ($1)
+ ldl $31,128($17)
+ ldl $31,192($17)
+ lda $1,2*64($16)
+
+ wh64 ($1)
+ ldl $31,256($17)
+ lda $18,118
+ lda $1,3*64($16)
+
+ wh64 ($1)
+ nop
+ lda $1,4*64($16)
+ lda $2,5*64($16)
+
+ wh64 ($1)
+ wh64 ($2)
+ lda $1,6*64($16)
+ lda $2,7*64($16)
+
+ wh64 ($1)
+ wh64 ($2)
+ lda $1,8*64($16)
+ lda $2,9*64($16)
+
+ wh64 ($1)
+ wh64 ($2)
+ lda $19,10*64($16)
+ nop
+
+ /* Main prefetching/write-hinting loop. */
+1: ldq $0,0($17)
+ ldq $1,8($17)
+ unop
+ unop
+
+ unop
+ unop
+ ldq $2,16($17)
+ ldq $3,24($17)
+
+ ldq $4,32($17)
+ ldq $5,40($17)
+ unop
+ unop
+
+ unop
+ unop
+ ldq $6,48($17)
+ ldq $7,56($17)
+
+ ldl $31,320($17)
+ unop
+ unop
+ unop
+
+ /* This gives the extra cycle of aeration above the minimum. */
+ unop
+ unop
+ unop
+ unop
+
+ wh64 ($19)
+ unop
+ unop
+ unop
+
+ stq $0,0($16)
+ subq $18,1,$18
+ stq $1,8($16)
+ unop
+
+ unop
+ stq $2,16($16)
+ addq $17,64,$17
+ stq $3,24($16)
+
+ stq $4,32($16)
+ stq $5,40($16)
+ addq $19,64,$19
+ unop
+
+ stq $6,48($16)
+ stq $7,56($16)
+ addq $16,64,$16
+ bne $18, 1b
+
+ /* Prefetch the final 5 cache lines of the read stream. */
+ lda $18,10
+ ldl $31,320($17)
+ ldl $31,384($17)
+ ldl $31,448($17)
+
+ ldl $31,512($17)
+ ldl $31,576($17)
+ nop
+ nop
+
+ /* Non-prefetching, non-write-hinting cleanup loop for the
+ final 10 cache lines. */
+2: ldq $0,0($17)
+ ldq $1,8($17)
+ ldq $2,16($17)
+ ldq $3,24($17)
+
+ ldq $4,32($17)
+ ldq $5,40($17)
+ ldq $6,48($17)
+ ldq $7,56($17)
+
+ stq $0,0($16)
+ subq $18,1,$18
+ stq $1,8($16)
+ addq $17,64,$17
+
+ stq $2,16($16)
+ stq $3,24($16)
+ stq $4,32($16)
+ stq $5,40($16)
+
+ stq $6,48($16)
+ stq $7,56($16)
+ addq $16,64,$16
+ bne $18, 2b
+
+ ret
+ nop
+ unop
+ nop
+
+ .end copy_page