author     Ralf Baechle <ralf@linux-mips.org>    2000-11-28 03:58:46 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-11-28 03:58:46 +0000
commit     b63ad0882a16a5d28003e57f2b0b81dee3fb322b (patch)
tree       0a343ce219e2b8b38a5d702d66032c57b83d9720 /arch/sparc64
parent     a9d7bff9a84dba79609a0002e5321b74c4d64c64 (diff)

Merge with 2.4.0-test11.
Diffstat (limited to 'arch/sparc64')
-rw-r--r--  arch/sparc64/config.in               |    2
-rw-r--r--  arch/sparc64/kernel/dtlb_base.S      |   26
-rw-r--r--  arch/sparc64/kernel/dtlb_prot.S      |   50
-rw-r--r--  arch/sparc64/kernel/ebus.c           |   29
-rw-r--r--  arch/sparc64/kernel/ioctl32.c        |   22
-rw-r--r--  arch/sparc64/kernel/itlb_base.S      |   30
-rw-r--r--  arch/sparc64/kernel/pci.c            |    8
-rw-r--r--  arch/sparc64/kernel/process.c        |   17
-rw-r--r--  arch/sparc64/kernel/semaphore.c      |    6
-rw-r--r--  arch/sparc64/kernel/sparc64_ksyms.c  |   52
-rw-r--r--  arch/sparc64/kernel/sys_sparc32.c    |    6
-rw-r--r--  arch/sparc64/kernel/sys_sunos32.c    |    4
-rw-r--r--  arch/sparc64/lib/Makefile            |    5
-rw-r--r--  arch/sparc64/lib/U3copy_from_user.S  |  500
-rw-r--r--  arch/sparc64/lib/U3copy_in_user.S    |  531
-rw-r--r--  arch/sparc64/lib/U3copy_to_user.S    |  528
-rw-r--r--  arch/sparc64/lib/U3memcpy.S          |  409
-rw-r--r--  arch/sparc64/lib/VIScopy.S           |   34
-rw-r--r--  arch/sparc64/mm/init.c               |   16
-rw-r--r--  arch/sparc64/mm/ultra.S              |   67
-rw-r--r--  arch/sparc64/solaris/ioctl.c         |    4
-rw-r--r--  arch/sparc64/solaris/socket.c        |    2
-rw-r--r--  arch/sparc64/vmlinux.lds             |    3
23 files changed, 2189 insertions(+), 162 deletions(-)
diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in
index a754b796b..19b05e28f 100644
--- a/arch/sparc64/config.in
+++ b/arch/sparc64/config.in
@@ -29,6 +29,8 @@ bool 'Symmetric multi-processing support' CONFIG_SMP
# Global things across all Sun machines.
define_bool CONFIG_HAVE_DEC_LOCK y
define_bool CONFIG_ISA n
+define_bool CONFIG_EISA n
+define_bool CONFIG_MCA n
define_bool CONFIG_PCMCIA n
define_bool CONFIG_SBUS y
define_bool CONFIG_SBUSCHAR y
diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S
index 72120b563..80c74aa18 100644
--- a/arch/sparc64/kernel/dtlb_base.S
+++ b/arch/sparc64/kernel/dtlb_base.S
@@ -1,4 +1,4 @@
-/* $Id: dtlb_base.S,v 1.7 2000/03/26 09:13:48 davem Exp $
+/* $Id: dtlb_base.S,v 1.8 2000/11/10 08:28:45 davem Exp $
* dtlb_base.S: Front end to DTLB miss replacement strategy.
* This is included directly into the trap table.
*
@@ -57,7 +57,7 @@
srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset
ldxa [%g3 + %g6] ASI_S, %g5 ! Load VPTE
1: brlz,pt %g5, 9f ! Valid, load into TLB
- and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits
+ nop ! Delay-slot
ba,a,pt %xcc, 4f ! Invalid, branch out
/* DTLB ** ICACHE line 2: Quick kernel TLB misses */
@@ -68,27 +68,27 @@
nop
9: stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Reload TLB
retry ! Trap return
- nop
+4: rdpr %pstate, %g5 ! Move into alternate globals
/* DTLB ** ICACHE line 3: winfixups+real_faults */
-4: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page?
- be,pn %xcc, 5f ! Yep, refbit update
- sllx %g1, 60, %g4 ! Get valid bit
- rdpr %pstate, %g5 ! Move into alternate globals
wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate
rdpr %tl, %g4 ! See where we came from.
cmp %g4, 1 ! Is etrap/rtrap window fault?
mov TLB_TAG_ACCESS, %g4 ! Prepare for fault processing
-
-/* DTLB ** ICACHE line 4: padding */
ldxa [%g4] ASI_DMMU, %g5 ! Load faulting VA page
be,pt %xcc, sparc64_realfault_common ! Jump to normal fault handling
mov FAULT_CODE_DTLB, %g4 ! It was read from DTLB
ba,a,pt %xcc, winfix_trampoline ! Call window fixup code
-5: or %g5, _PAGE_ACCESSED, %g5 ! Indicate reference
- or %g5, %g4, %g5 ! Set valid
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table (cant trap)
- ba,a,pt %xcc, 9b ! Complete tlb miss
+
+/* DTLB ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
#undef TAG_CONTEXT_BITS
#undef VPTE_SHIFT
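As a rough C rendering of the miss fast path above (a sketch only: context bits, ASIs and the icache-line budget are ignored, and the helper names are invented), the reason brlz works is that bit 63 of a VPTE is the valid bit, so a valid entry tests negative:

#include <stdint.h>

#define PAGE_SHIFT 13	/* 8K pages; VPTE_SHIFT == PAGE_SHIFT - 3 folds the
			 * 8-byte PTE scaling into the single srax above */

static void tlb_load(int64_t pte)   { (void)pte; /* stxa ASI_DTLB_DATA_IN; retry */ }
static void real_fault(uint64_t va) { (void)va;  /* sparc64_realfault_common path */ }

static void dtlb_miss(const int64_t *vpte_base, uint64_t vaddr)
{
	int64_t pte = vpte_base[vaddr >> PAGE_SHIFT];	/* srax + ldxa */
	if (pte < 0)			/* brlz: valid bit is bit 63 */
		tlb_load(pte);		/* reload the TLB and retry */
	else
		real_fault(vaddr);	/* icache lines 3-4: real fault path */
}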
diff --git a/arch/sparc64/kernel/dtlb_prot.S b/arch/sparc64/kernel/dtlb_prot.S
index 5e99d5d47..1da370c7c 100644
--- a/arch/sparc64/kernel/dtlb_prot.S
+++ b/arch/sparc64/kernel/dtlb_prot.S
@@ -1,4 +1,4 @@
-/* $Id: dtlb_prot.S,v 1.20 2000/03/26 09:13:48 davem Exp $
+/* $Id: dtlb_prot.S,v 1.21 2000/11/10 08:28:45 davem Exp $
* dtlb_prot.S: DTLB protection trap strategy.
* This is included directly into the trap table.
*
@@ -6,10 +6,6 @@
* Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz)
*/
-#define TAG_CONTEXT_BITS 0x3ff
-#define VPTE_SHIFT (PAGE_SHIFT - 3)
-#define MODIFIED_BITS (_PAGE_WRITE | _PAGE_W | _PAGE_MODIFIED | _PAGE_ACCESSED)
-
/* Ways we can get here:
*
* [TL == 0] 1) User stores to readonly pages.
@@ -18,45 +14,41 @@
*/
/* PROT ** ICACHE line 1: User DTLB protection trap */
- ldxa [%g1] ASI_DMMU, %g6 ! Primary or Secondary ctx?
- and %g6, 0x10, %g6 ! Get pri/sec ctx bit
stxa %g0, [%g1] ASI_DMMU ! Clear SFSR FaultValid bit
membar #Sync ! Synchronize ASI stores
- ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Load TAG_ACCESS
- andn %g4, TAG_CONTEXT_BITS, %g4 ! Clear CTX bits
- stxa %g0, [%g4 + %g6] ASI_DMMU_DEMAP ! Perform TLB flush of page
- membar #Sync ! Synchronize ASI stores
-
-/* PROT ** ICACHE line 2: Further normal processing */
- srax %g4, VPTE_SHIFT, %g6 ! Compute VPTE offset
- ldxa [%g3 + %g6] ASI_S, %g5 ! Load PTE entry
- andcc %g5, _PAGE_WRITE, %g0 ! Writable page?
- be,pt %xcc, 1f ! Nope, real fault
- or %g5, (MODIFIED_BITS), %g5 ! Mark as writable/modified
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE entry
- stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Load PTE into TLB
- retry ! Trap return
-
-/* PROT ** ICACHE line 3: Real user faults */
-1: rdpr %pstate, %g5 ! Move into alternate globals
+ rdpr %pstate, %g5 ! Move into alternate globals
wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate
rdpr %tl, %g1 ! Need to do a winfixup?
cmp %g1, 1 ! Trap level >1?
mov TLB_TAG_ACCESS, %g4 ! Prepare reload of vaddr
+ nop
+
+/* PROT ** ICACHE line 2: More real fault processing */
bgu,pn %xcc, winfix_trampoline ! Yes, perform winfixup
ldxa [%g4] ASI_DMMU, %g5 ! Put tagaccess in %g5
ba,pt %xcc, sparc64_realfault_common ! Nope, normal fault
-
-/* PROT ** ICACHE line 4: More real fault processing */
mov FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4
nop
nop
nop
nop
+
+/* PROT ** ICACHE line 3: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
nop
nop
nop
-#undef TAG_CONTEXT_BITS
-#undef VPTE_SHIFT
-#undef MODIFIED_BITS
+/* PROT ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
diff --git a/arch/sparc64/kernel/ebus.c b/arch/sparc64/kernel/ebus.c
index 5872046b1..e175fac27 100644
--- a/arch/sparc64/kernel/ebus.c
+++ b/arch/sparc64/kernel/ebus.c
@@ -1,4 +1,4 @@
-/* $Id: ebus.c,v 1.48 2000/08/02 06:22:35 davem Exp $
+/* $Id: ebus.c,v 1.53 2000/11/08 05:08:23 davem Exp $
* ebus.c: PCI to EBus bridge device.
*
* Copyright (C) 1997 Eddie C. Dost (ecd@skynet.be)
@@ -22,21 +22,9 @@
struct linux_ebus *ebus_chain = 0;
-#ifdef CONFIG_SUN_OPENPROMIO
-extern int openprom_init(void);
-#endif
#ifdef CONFIG_SUN_AUXIO
extern void auxio_probe(void);
#endif
-#ifdef CONFIG_OBP_FLASH
-extern int flash_init(void);
-#endif
-#ifdef CONFIG_ENVCTRL
-extern int envctrl_init(void);
-#endif
-#ifdef CONFIG_DISPLAY7SEG
-extern int d7s_init(void);
-#endif
static inline void *ebus_alloc(size_t size)
{
@@ -372,24 +360,9 @@ void __init ebus_init(void)
++num_ebus;
}
-#ifdef CONFIG_SUN_OPENPROMIO
- openprom_init();
-#endif
-#ifdef CONFIG_SUN_BPP
- bpp_init();
-#endif
#ifdef CONFIG_SUN_AUXIO
auxio_probe();
#endif
-#ifdef CONFIG_ENVCTRL
- envctrl_init();
-#endif
-#ifdef CONFIG_OBP_FLASH
- flash_init();
-#endif
-#ifdef CONFIG_DISPLAY7SEG
- d7s_init();
-#endif
clock_probe();
power_init();
}
diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c
index 9f7cd59e9..a510c2aff 100644
--- a/arch/sparc64/kernel/ioctl32.c
+++ b/arch/sparc64/kernel/ioctl32.c
@@ -1,4 +1,4 @@
-/* $Id: ioctl32.c,v 1.99 2000/10/17 16:20:33 davem Exp $
+/* $Id: ioctl32.c,v 1.103 2000/11/10 05:44:33 davem Exp $
* ioctl32.c: Conversion between 32bit and 64bit native ioctls.
*
* Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
@@ -71,8 +71,9 @@
#include <asm/openpromio.h>
#include <asm/envctrl.h>
#include <asm/audioio.h>
-#include <asm/ethtool.h>
+#include <linux/ethtool.h>
#include <asm/display7seg.h>
+#include <asm/module.h>
#include <linux/soundcard.h>
#include <linux/atm.h>
@@ -3230,6 +3231,7 @@ COMPATIBLE_IOCTL(ENVCTRL_RD_SCSI_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_ETHERNET_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_MTHRBD_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_CPU_VOLTAGE)
+COMPATIBLE_IOCTL(ENVCTRL_RD_GLOBALADDRESS)
/* COMPATIBLE_IOCTL(D7SIOCRD) same value as ENVCTRL_RD_VOLTAGE_STATUS */
COMPATIBLE_IOCTL(D7SIOCWR)
COMPATIBLE_IOCTL(D7SIOCTM)
@@ -3467,6 +3469,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */
/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */
@@ -3492,6 +3502,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */
/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */
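For orientation, a minimal sketch of the table these COMPATIBLE_IOCTL() lines feed (types and field names here are illustrative, not the exact 2.4 declarations): an entry without a handler means the ioctl's argument layout is identical in the 32-bit and 64-bit ABIs and is forwarded untranslated, while layout-changing ioctls register a conversion handler instead.

struct file;	/* opaque for this sketch */

typedef int (*ioctl32_handler_t)(unsigned int fd, unsigned int cmd,
				 unsigned long arg, struct file *filp);

struct ioctl32_entry {
	unsigned int      cmd;		/* ioctl number, same in both ABIs */
	ioctl32_handler_t handler;	/* NULL: pass through unchanged */
};

#define COMPATIBLE_IOCTL(cmd)	{ (cmd), NULL },
#define HANDLE_IOCTL(cmd, fn)	{ (cmd), (fn) },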
diff --git a/arch/sparc64/kernel/itlb_base.S b/arch/sparc64/kernel/itlb_base.S
index 7f0da3d14..bd6a3603d 100644
--- a/arch/sparc64/kernel/itlb_base.S
+++ b/arch/sparc64/kernel/itlb_base.S
@@ -1,4 +1,4 @@
-/* $Id: itlb_base.S,v 1.9 2000/03/26 09:13:48 davem Exp $
+/* $Id: itlb_base.S,v 1.10 2000/11/10 08:28:45 davem Exp $
* itlb_base.S: Front end to ITLB miss replacement strategy.
* This is included directly into the trap table.
*
@@ -23,22 +23,13 @@
srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset
ldxa [%g3 + %g6] ASI_P, %g5 ! Load VPTE
1: brgez,pn %g5, 3f ! Not valid, branch out
- and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits
+ nop ! Delay-slot
2: stxa %g5, [%g0] ASI_ITLB_DATA_IN ! Load PTE into TLB
retry ! Trap return
-3: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page?
+3: rdpr %pstate, %g4 ! Move into alternate globals
-/* ITLB ** ICACHE line 2: Quick user ref updates */
- bne,pn %xcc, 4f ! Nope, real missing page
- sllx %g1, 60, %g4 ! Sliiickkk...
- or %g5, _PAGE_ACCESSED, %g5 ! Mark as touched
- or %g5, %g4, %g5 ! Allow user to see it
- ba,pt %xcc, 2b ! Branch to load TLB
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table
-4: rdpr %pstate, %g4 ! Move into alternate globals
+/* ITLB ** ICACHE line 2: Real faults */
wrpr %g4, PSTATE_AG|PSTATE_MG, %pstate
-
-/* ITLB ** ICACHE line 3: Real faults */
rdpr %tpc, %g5 ! And load faulting VA
mov FAULT_CODE_ITLB, %g4 ! It was read from ITLB
sparc64_realfault_common: ! Called by TL0 dtlb_miss too
@@ -46,10 +37,11 @@ sparc64_realfault_common: ! Called by TL0 dtlb_miss too
stx %g5, [%g6 + AOFF_task_thread + AOFF_thread_fault_address]
ba,pt %xcc, etrap ! Save state
1: rd %pc, %g7 ! ...
+ nop
+
+/* ITLB ** ICACHE line 3: Finish faults + window fixups */
call do_sparc64_fault ! Call fault handler
add %sp, STACK_BIAS + REGWIN_SZ, %o0! Compute pt_regs arg
-
-/* ITLB ** ICACHE line 4: Finish faults + window fixups */
ba,pt %xcc, rtrap_clr_l6 ! Restore cpu state
nop
winfix_trampoline:
@@ -57,6 +49,14 @@ winfix_trampoline:
or %g3, 0x7c, %g3 ! Compute offset to branch
wrpr %g3, %tnpc ! Write it into TNPC
done ! Do it to it
+
+/* ITLB ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
nop
nop
diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index dd153a24e..1abef824f 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -1,4 +1,4 @@
-/* $Id: pci.c,v 1.18 2000/10/03 11:31:42 anton Exp $
+/* $Id: pci.c,v 1.19 2000/11/08 04:49:17 davem Exp $
* pci.c: UltraSparc PCI controller support.
*
* Copyright (C) 1997, 1998, 1999 David S. Miller (davem@redhat.com)
@@ -202,12 +202,6 @@ void pcibios_update_irq(struct pci_dev *pdev, int irq)
{
}
-unsigned long resource_fixup(struct pci_dev *pdev, struct resource *res,
- unsigned long start, unsigned long size)
-{
- return start;
-}
-
void pcibios_fixup_pbus_ranges(struct pci_bus *pbus,
struct pbus_set_ranges_data *pranges)
{
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 1f3386d53..4534ad59b 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -1,4 +1,4 @@
-/* $Id: process.c,v 1.112 2000/09/06 00:45:01 davem Exp $
+/* $Id: process.c,v 1.113 2000/11/08 08:14:58 davem Exp $
* arch/sparc64/kernel/process.c
*
* Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -647,14 +647,21 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
long retval;
- __asm__ __volatile("mov %1, %%g1\n\t"
+ /* If the parent runs before fn(arg) is called by the child,
+ * the input registers of this function can be clobbered.
+ * So we stash 'fn' and 'arg' into global registers which
+ * will not be modified by the parent.
+ */
+ __asm__ __volatile("mov %4, %%g2\n\t" /* Save FN into global */
+ "mov %5, %%g3\n\t" /* Save ARG into global */
+ "mov %1, %%g1\n\t" /* Clone syscall nr. */
"mov %2, %%o0\n\t" /* Clone flags. */
"mov 0, %%o1\n\t" /* usp arg == 0 */
"t 0x6d\n\t" /* Linux/Sparc clone(). */
"brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */
" mov %%o0, %0\n\t"
- "jmpl %4, %%o7\n\t" /* Call the function. */
- " mov %5, %%o0\n\t" /* Set arg in delay. */
+ "jmpl %%g2, %%o7\n\t" /* Call the function. */
+ " mov %%g3, %%o0\n\t" /* Set arg in delay. */
"mov %3, %%g1\n\t"
"t 0x6d\n\t" /* Linux/Sparc exit(). */
/* Notreached by child. */
@@ -662,7 +669,7 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
"=r" (retval) :
"i" (__NR_clone), "r" (flags | CLONE_VM),
"i" (__NR_exit), "r" (fn), "r" (arg) :
- "g1", "o0", "o1", "memory", "cc");
+ "g1", "g2", "g3", "o0", "o1", "memory", "cc");
return retval;
}
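The same hazard exists in userspace: with CLONE_VM the parent may return and reuse its stack frame before the child ever runs, so the child must not fetch its parameters from that frame. A hedged analog using glibc's clone(2) (an illustration of the principle, not the kernel's code) passes fn and arg through clone() itself, which preserves them for the child just as the fix parks them in global registers:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	printf("child sees arg = \"%s\"\n", (const char *)arg);
	return 0;
}

int main(void)
{
	size_t sz = 64 * 1024;
	char *stack = malloc(sz);		/* private child stack */
	if (!stack)
		return 1;
	/* fn and arg travel through clone() itself, never through
	 * memory the parent might clobber after it returns. */
	int pid = clone(child_fn, stack + sz, CLONE_VM | SIGCHLD,
			(void *)"hello");
	if (pid < 0)
		return 1;
	waitpid(pid, NULL, 0);			/* reap the child */
	free(stack);
	return 0;
}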
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index 8cb6e7211..1928d5a0d 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -1,4 +1,4 @@
-/* $Id: semaphore.c,v 1.4 2000/10/14 10:09:00 davem Exp $
+/* $Id: semaphore.c,v 1.5 2000/11/10 04:02:03 davem Exp $
* Generic semaphore code. Buyer beware. Do your own
* specific changes in <asm/semaphore-helper.h>
*/
@@ -223,7 +223,7 @@ void down_write_failed_biased(struct rw_semaphore *sem)
for (;;) {
if (test_and_clear_le_bit(1, &sem->granted))
break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!test_le_bit(1, &sem->granted))
schedule();
}
@@ -273,7 +273,7 @@ void down_write_failed(struct rw_semaphore *sem)
add_wait_queue_exclusive(&sem->wait, &wait);
while (sem->count < 0) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= 0)
break; /* we must attempt to acquire or bias the lock */
schedule();
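The dropped TASK_EXCLUSIVE bit changes who gets woken, not the sleep discipline; what keeps both loops race-free is the ordering above: mark the task sleeping first, then re-check the condition, then schedule(). A pthreads analogy (an analogy only; condition variables are not the 2.4 kernel mechanism, which relies on set_task_state() ordering) shows the same re-check-before-sleeping shape:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int count = -1;			/* sem->count < 0: contended */

static void *waiter(void *unused)
{
	pthread_mutex_lock(&lock);
	while (count < 0)		/* re-check before sleeping */
		pthread_cond_wait(&cond, &lock);  /* sleep atomically */
	pthread_mutex_unlock(&lock);
	puts("acquired");
	return unused;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	count = 0;			/* release the semaphore */
	pthread_cond_signal(&cond);	/* wake one waiter */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}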
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 6e7f59309..e1ae982bf 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -1,4 +1,4 @@
-/* $Id: sparc64_ksyms.c,v 1.95 2000/10/30 21:01:40 davem Exp $
+/* $Id: sparc64_ksyms.c,v 1.98 2000/11/13 10:03:32 davem Exp $
* arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support.
*
* Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -87,7 +87,6 @@ extern long sparc32_open(const char * filename, int flags, int mode);
extern int register_ioctl32_conversion(unsigned int cmd, int (*handler)(unsigned int, unsigned int, unsigned long, struct file *));
extern int unregister_ioctl32_conversion(unsigned int cmd);
extern int io_remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot, int space);
-extern void __flush_dcache_page(void *addr);
extern int __ashrdi3(int, int);
@@ -110,25 +109,14 @@ extern void _do_write_unlock(rwlock_t *rw);
extern unsigned long phys_base;
-/* One thing to note is that the way the symbols of the mul/div
- * support routines are named is a mess, they all start with
- * a '.' which makes it a bitch to export, here is the trick:
- */
-
-#define EXPORT_SYMBOL_PRIVATE(sym) \
-extern int __sparc_priv_ ## sym (int) __asm__("__" #sym); \
-const struct module_symbol __export_priv_##sym \
-__attribute__((section("__ksymtab"))) = \
-{ (unsigned long) &__sparc_priv_ ## sym, "__" #sym }
-
/* used by various drivers */
#ifdef CONFIG_SMP
#ifndef SPIN_LOCK_DEBUG
/* Out of line rw-locking implementation. */
-EXPORT_SYMBOL_PRIVATE(read_lock);
-EXPORT_SYMBOL_PRIVATE(read_unlock);
-EXPORT_SYMBOL_PRIVATE(write_lock);
-EXPORT_SYMBOL_PRIVATE(write_unlock);
+EXPORT_SYMBOL(__read_lock);
+EXPORT_SYMBOL(__read_unlock);
+EXPORT_SYMBOL(__write_lock);
+EXPORT_SYMBOL(__write_unlock);
#endif
/* Kernel wide locking */
@@ -137,10 +125,10 @@ EXPORT_SYMBOL(kernel_flag);
/* Hard IRQ locking */
EXPORT_SYMBOL(global_irq_holder);
EXPORT_SYMBOL(synchronize_irq);
-EXPORT_SYMBOL_PRIVATE(global_cli);
-EXPORT_SYMBOL_PRIVATE(global_sti);
-EXPORT_SYMBOL_PRIVATE(global_save_flags);
-EXPORT_SYMBOL_PRIVATE(global_restore_flags);
+EXPORT_SYMBOL(__global_cli);
+EXPORT_SYMBOL(__global_sti);
+EXPORT_SYMBOL(__global_save_flags);
+EXPORT_SYMBOL(__global_restore_flags);
/* Per-CPU information table */
EXPORT_SYMBOL(cpu_data);
@@ -163,27 +151,33 @@ EXPORT_SYMBOL(_do_write_unlock);
#endif
+/* semaphores */
+EXPORT_SYMBOL(__down);
+EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__down_trylock);
+EXPORT_SYMBOL(__up);
+
/* rw semaphores */
EXPORT_SYMBOL_NOVERS(__down_read_failed);
EXPORT_SYMBOL_NOVERS(__down_write_failed);
EXPORT_SYMBOL_NOVERS(__rwsem_wake);
/* Atomic counter implementation. */
-EXPORT_SYMBOL_PRIVATE(atomic_add);
-EXPORT_SYMBOL_PRIVATE(atomic_sub);
+EXPORT_SYMBOL(__atomic_add);
+EXPORT_SYMBOL(__atomic_sub);
/* Atomic bit operations. */
-EXPORT_SYMBOL_PRIVATE(test_and_set_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_clear_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_change_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_set_le_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_clear_le_bit);
+EXPORT_SYMBOL(__test_and_set_bit);
+EXPORT_SYMBOL(__test_and_clear_bit);
+EXPORT_SYMBOL(__test_and_change_bit);
+EXPORT_SYMBOL(__test_and_set_le_bit);
+EXPORT_SYMBOL(__test_and_clear_le_bit);
EXPORT_SYMBOL(ivector_table);
EXPORT_SYMBOL(enable_irq);
EXPORT_SYMBOL(disable_irq);
-EXPORT_SYMBOL_PRIVATE(flushw_user);
+EXPORT_SYMBOL(__flushw_user);
EXPORT_SYMBOL(__flush_dcache_page);
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 24c8cd593..9b211d86d 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -1,4 +1,4 @@
-/* $Id: sys_sparc32.c,v 1.165 2000/10/10 04:47:31 davem Exp $
+/* $Id: sys_sparc32.c,v 1.166 2000/11/10 04:49:56 davem Exp $
* sys_sparc32.c: Conversion between 32bit and 64bit native syscalls.
*
* Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
@@ -2952,7 +2952,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm)
return -ENOMEM;
new = 1;
}
- kaddr = (char *)kmap(page);
+ kaddr = kmap(page);
if (new && offset)
memset(kaddr, 0, offset);
@@ -2967,7 +2967,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm)
err = copy_from_user(kaddr + offset, (char *)A(str),
bytes_to_copy);
flush_page_to_ram(page);
- kunmap((unsigned long)kaddr);
+ kunmap(page);
if (err)
return -EFAULT;
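Both hunks fix the same API misuse: in this kernel series kmap() takes a struct page and returns the mapped kernel virtual address, and kunmap() takes the struct page back, not the address. A shape-only sketch of the corrected pairing (assuming the 2.4 highmem interface; not linkable stand-alone, since the kernel supplies these functions):

#include <stddef.h>
#include <string.h>

struct page;				/* opaque here */
void *kmap(struct page *page);		/* map, return kernel vaddr */
void  kunmap(struct page *page);	/* unmap by page, not by vaddr */

static void fill_page(struct page *page, size_t offset)
{
	char *kaddr = kmap(page);	/* no cast: kmap returns void * */
	memset(kaddr, 0, offset);
	/* ... copy_from_user(kaddr + offset, ...) ... */
	kunmap(page);			/* pass the page, not kaddr */
}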
diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c
index 75d5c096e..a5f5411f5 100644
--- a/arch/sparc64/kernel/sys_sunos32.c
+++ b/arch/sparc64/kernel/sys_sunos32.c
@@ -601,7 +601,6 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
int try_port;
int ret;
struct socket *socket;
- struct dentry *dentry;
struct inode *inode;
struct file *file;
@@ -609,8 +608,7 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
if(!file)
return 0;
- dentry = file->f_dentry;
- inode = dentry->d_inode;
+ inode = file->f_dentry->d_inode;
socket = &inode->u.socket_i;
local.sin_family = AF_INET;
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index fa057936a..77531321d 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -1,4 +1,4 @@
-# $Id: Makefile,v 1.23 2000/07/10 20:57:34 davem Exp $
+# $Id: Makefile,v 1.24 2000/11/01 07:33:47 davem Exp $
# Makefile for Sparc64 library files..
#
@@ -8,7 +8,8 @@ OBJS = PeeCeeI.o blockops.o debuglocks.o strlen.o strncmp.o \
memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \
VIScsumcopyusr.o VISsave.o atomic.o rwlock.o bitops.o \
- dec_and_lock.o
+ dec_and_lock.o U3memcpy.o U3copy_from_user.o U3copy_to_user.o \
+ U3copy_in_user.o
lib.a: $(OBJS)
$(AR) rcs lib.a $(OBJS)
diff --git a/arch/sparc64/lib/U3copy_from_user.S b/arch/sparc64/lib/U3copy_from_user.S
new file mode 100644
index 000000000..b1003e607
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_from_user.S
@@ -0,0 +1,500 @@
+/* $Id: U3copy_from_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_from_user.S: UltraSparc-III optimized copy from userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ ba,pt %xcc, U3cfu_fixup; \
+ a, b, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ ba,pt %xcc, U3cfu_fixup; \
+ a, b, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ add %o1, %o4, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, 0x1c0, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ sll %g3, 6, %g3; \
+ add %o1, 0x80, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, %g3, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ add %o1, 0x40, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, %g3, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EX2(x,y) x,y;
+#define EX3(x,y) x,y;
+#define EX4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_from_user
+U3copy_from_user: /* %o0=dst, %o1=src, %o2=len */
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_from_user_short_ret! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_from_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_from_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_from_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_from_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_from_user_short ! BR
+ stb %o3, [%o0 + -1] ! MS Group (1-cycle stall)
+
+U3copy_from_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+ /* Here len >= 0x100 (4 * 64) and condition codes reflect execution
+ * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_from_user_enter:
+ /* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_from_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_from_user_begin:
+ prefetcha [%o1 + 0x000] %asi, #one_read ! MS Group1
+ prefetcha [%o1 + 0x040] %asi, #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetcha [%o1 + 0x080] %asi, #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetcha [%o1 + 0x0c0] %asi, #one_read ! MS Group4
+ EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetcha [%o1 + 0x100] %asi, #one_read ! MS Group6
+1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetcha [%o1 + 0x140] %asi, #one_read ! MS Group7
+1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetcha [%o1 + 0x180] %asi, #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_from_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_from_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_from_user_loop1:
+ EX2(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ EX2(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ EX2(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ EX2(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ EX2(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ EX2(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ EX2(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ EX2(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19)
+ prefetcha [%o1 + 0x180] %asi, #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_from_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_from_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+ /* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_from_user_loop2:
+ EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ EX3(ldda [%o1 + 0x040] %asi, %f0) ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_from_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_from_user_loopfini:
+ EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ stda %f16, [%o0] ASI_BLK_P ! MS Group20
+ EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ EX4(ldda [%o1 + 0x040] %asi, %f0) ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+ * load past the end of the src buffer just like similar
+ * code found in U3copy_from_user_toosmall processing.
+ */
+U3copy_from_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_from_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_from_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS
+
+1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_from_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+ * Note that %g1 is (src & 0x3) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_from_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_from_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_from_user_short
+
+ /* If we get here, then 32 <= len < 0x100 (4 * 64) */
+U3copy_from_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+ /* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+ * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_from_user_short_ret! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_from_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_from_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ stb %o3, [%o0 + -1]
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: EXNV(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ stx %o3, [%o0 + -8]
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_from_user_short
+ nop
+ ba,a,pt %xcc, U3copy_from_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
+
+#ifdef __KERNEL__
+ .globl U3cfu_fixup
+U3cfu_fixup:
+ /* Since this is copy_from_user(), zero out the rest of the
+ * kernel buffer.
+ */
+ cmp %o1, 0
+ ble,pn %icc, 2f
+ mov %o1, %g2
+
+1: subcc %g2, 1, %g2
+ stb %g0, [%o0]
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+
+2: retl
+ mov %o1, %o0
+#endif
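The head-alignment pair of subs in this file is easy to restate in C: it computes 0x40 - (dst & 0x3f), the number of bytes to copy one at a time before dst becomes 64-byte aligned. A small self-checking sketch (helper name invented for illustration):

#include <assert.h>
#include <stdint.h>

/* Mirrors "sub %g2, 0x40, %g2 ; sub %g0, %g2, %g2" where %g2 arrived
 * as (dst & 0x3f); only reached when dst is not already aligned.
 */
static unsigned long head_bytes(uintptr_t dst)
{
	unsigned long g2 = dst & 0x3f;
	g2 -= 0x40;		/* sub %g2, 0x40, %g2 */
	g2 = 0 - g2;		/* sub %g0, %g2, %g2: 0x40 - (dst & 0x3f) */
	return g2;
}

int main(void)
{
	assert(head_bytes(0x1001) == 0x3f);
	assert(head_bytes(0x103f) == 0x01);
	assert(head_bytes(0x1020) == 0x20);
	return 0;
}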
diff --git a/arch/sparc64/lib/U3copy_in_user.S b/arch/sparc64/lib/U3copy_in_user.S
new file mode 100644
index 000000000..0fc169b9d
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_in_user.S
@@ -0,0 +1,531 @@
+/* $Id: U3copy_in_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_in_user.S: UltraSparc-III optimized copy within userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV2(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV3(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 8, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK1(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ add %o4, 0x1c0, %o1; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ sll %o3, 6, %o3; \
+ and %o2, (0x40 - 1), %o2; \
+ add %o3, 0x80, %o1; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x80, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x40, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_AIUS 0x80
+#define ASI_BLK_AIUS 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EXNV2(x,y,a,b) x,y;
+#define EXNV3(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EXBLK1(x,y) x,y;
+#define EXBLK2(x,y) x,y;
+#define EXBLK3(x,y) x,y;
+#define EXBLK4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ *
+ * XXX Actually, Cheetah can buffer up to 8 concurrent
+ * XXX prefetches, revisit this...
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_in_user
+U3copy_in_user: /* %o0=dst, %o1=src, %o2=len */
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+ rd %asi, %g1 ! MS Group (4 cycles)
+ cmp %g1, ASI_AIUS ! A0 Group
+ bne U3memcpy ! BR
+ nop ! A1
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_in_user_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_in_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_in_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_in_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_in_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_in_user_short ! BR
+ EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
+
+U3copy_in_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+ /* Here len >= 0x100 (4 * 64) and condition codes reflect execution
+ * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_in_user_enter:
+ /* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_in_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_in_user_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_in_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_in_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_in_user_loop1:
+ EXBLK1(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ EXBLK1(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ EXBLK1(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ EXBLK1(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ EXBLK1(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ EXBLK1(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ EXBLK1(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ EXBLK1(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_in_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_in_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+ /* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_in_user_loop2:
+ EXBLK2(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ EXBLK2(ldda [%o1 + 0x010] %asi, %f4) ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ EXBLK2(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ EXBLK2(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ EXBLK2(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ EXBLK2(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ EXBLK2(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ EXBLK2(ldda [%o1 + 0x040] %asi, %f0) ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_in_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_in_user_loopfini:
+ EXBLK3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ EXBLK3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
+ EXBLK4(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ EXBLK4(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ EXBLK4(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ EXBLK4(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ EXBLK4(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ EXBLK4(ldda [%o1 + 0x040] %asi, %f0) ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+ * load past the end of the src buffer just like similar
+ * code found in U3copy_in_user_toosmall processing.
+ */
+U3copy_in_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_in_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_in_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS
+
+1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_in_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+ * Note that %g1 is (src & 0x3) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_in_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_in_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_in_user_short
+
+ /* If we get here, then 32 <= len < 0x100 (4 * 64) */
+U3copy_in_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+ /* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+ * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_in_user_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_in_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_in_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: EXNV3(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_in_user_short
+ nop
+ ba,a,pt %xcc, U3copy_in_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
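All of the EX*() macros in these files compile down to entries in __ex_table, pairing a possibly-faulting user access (the 98: label) with its recovery code (the 99: label). Conceptually (a sketch: the real sparc64 entry format and lookup differ in detail):

struct exception_table_entry {
	unsigned long insn;	/* PC of the guarded access   (98b) */
	unsigned long fixup;	/* PC of the recovery snippet (99b) */
};

/* On a user-access fault the trap handler searches the table; on a
 * hit it resumes at the fixup, which reports the residual byte count
 * to the caller instead of treating the fault as a kernel bug.
 */
static unsigned long search_extable(const struct exception_table_entry *t,
				    int n, unsigned long fault_pc)
{
	for (int i = 0; i < n; i++)
		if (t[i].insn == fault_pc)
			return t[i].fixup;
	return 0;		/* unguarded: genuine kernel fault */
}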
diff --git a/arch/sparc64/lib/U3copy_to_user.S b/arch/sparc64/lib/U3copy_to_user.S
new file mode 100644
index 000000000..e08b1290b
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_to_user.S
@@ -0,0 +1,528 @@
+/* $Id: U3copy_to_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_to_user.S: UltraSparc-III optimized copy to userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV2(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV3(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 8, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK1(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ add %o4, 0x1c0, %o1; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ sll %o3, 6, %o3; \
+ and %o2, (0x40 - 1), %o2; \
+ add %o3, 0x80, %o1; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x80, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x40, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_AIUS 0x80
+#define ASI_BLK_AIUS 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EXNV2(x,y,a,b) x,y;
+#define EXNV3(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EXBLK1(x,y) x,y;
+#define EXBLK2(x,y) x,y;
+#define EXBLK3(x,y) x,y;
+#define EXBLK4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_to_user
+U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+ rd %asi, %g1 ! MS Group (4 cycles)
+ cmp %g1, ASI_AIUS ! A0 Group
+ bne U3memcpy ! BR
+ nop ! A1
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_to_user_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_to_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_to_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_to_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_to_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ ldub [%o1 + 0x00], %o3 ! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_to_user_short ! BR
+ EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
+
+U3copy_to_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+	/* Here len >= 0x100 and condition codes reflect execution
+	 * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_to_user_enter:
+	/* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_to_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_to_user_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_to_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_to_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_to_user_loop1:
+ ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_to_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_to_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+	/* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_to_user_loop2:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ ldd [%o1 + 0x010], %f4 ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ ldd [%o1 + 0x040], %f0 ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_to_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_to_user_loopfini:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ ldd [%o1 + 0x010], %f4 ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ ldd [%o1 + 0x040], %f0 ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+	 * load past the end of the src buffer, just like the similar
+ * code found in U3copy_to_user_toosmall processing.
+ */
+U3copy_to_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_to_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_to_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ ldd [%o1 + 0x00], %f0 ! MS
+
+1: ldd [%o1 + 0x08], %f2 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_to_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ ldd [%o1 + 0x08], %f0 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+	 * Note that %g1 is (src & 0x7) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_to_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_to_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_to_user_short
+
+	/* If we get here, then 32 <= len < 0x100 */
+U3copy_to_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+	/* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+	 * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: ldd [%g1 + 0x00], %f2 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ ldd [%g1 + 0x00], %f0 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_to_user_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_to_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_to_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: ldub [%o1 + 0x00], %o3
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: ldx [%o1 + 0x00], %o3
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_to_user_short
+ nop
+ ba,a,pt %xcc, U3copy_to_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
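
Both alignment preambles in this file (the 64-byte one at U3copy_to_user_enter and the 8-byte one in the toosmall path) use the same three-instruction idiom to turn (dst & mask) into the distance up to the next boundary. A small self-contained C model of that arithmetic:

	#include <stdint.h>

	/* Mirrors the sub/sub/sub sequence above.  Only reached when
	 * dst & (align - 1) is nonzero, since the aligned case branches
	 * past it; the result is pre-subtracted from len before the
	 * byte-at-a-time head copy.
	 */
	static inline unsigned long head_bytes(uintptr_t dst, unsigned long align)
	{
		unsigned long g2 = dst & (align - 1);	/* andcc %o0, mask, %g2 */

		g2 -= align;				/* sub %g2, align, %g2 */
		g2 = 0 - g2;				/* sub %g0, %g2, %g2   */
		return g2;
	}

For example, with align = 0x40 and a dst ending in 0x08, this yields 0x38 head bytes, after which dst sits on a 64-byte boundary.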
diff --git a/arch/sparc64/lib/U3memcpy.S b/arch/sparc64/lib/U3memcpy.S
new file mode 100644
index 000000000..d38289145
--- /dev/null
+++ b/arch/sparc64/lib/U3memcpy.S
@@ -0,0 +1,409 @@
+/* $Id: U3memcpy.S,v 1.2 2000/11/01 09:29:19 davem Exp $
+ * U3memcpy.S: UltraSparc-III optimized memcpy.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#else
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3memcpy
+U3memcpy: /* %o0=dst, %o1=src, %o2=len */
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3memcpy_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3memcpy_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3memcpy_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3memcpy_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3memcpy_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ ldub [%o1 + 0x00], %o3 ! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3memcpy_short ! BR
+ stb %o3, [%o0 + -1] ! MS Group (1-cycle stall)
+
+U3memcpy_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+	/* Here len >= 0x100 and condition codes reflect execution
+	 * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3memcpy_enter:
+	/* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3memcpy_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3memcpy_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3memcpy_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3memcpy_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3memcpy_loop1:
+ ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3memcpy_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3memcpy_loop2_enter:
+ mov 5, %o3 ! A1
+
+	/* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3memcpy_loop2:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ ldd [%o1 + 0x010], %f4 ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ ldd [%o1 + 0x040], %f0 ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3memcpy_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3memcpy_loopfini:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ ldd [%o1 + 0x010], %f4 ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ stda %f16, [%o0] ASI_BLK_P ! MS Group20
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ ldd [%o1 + 0x040], %f0 ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+	 * load past the end of the src buffer, just like the similar
+ * code found in U3memcpy_toosmall processing.
+ */
+U3memcpy_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3memcpy_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3memcpy_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ ldd [%o1 + 0x00], %f0 ! MS
+
+1: ldd [%o1 + 0x08], %f2 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3memcpy_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ ldd [%o1 + 0x08], %f0 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+	 * Note that %g1 is (src & 0x7) saved above before the
+ * alignaddr was performed.
+ */
+U3memcpy_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3memcpy_short_ret
+ nop
+ ba,a,pt %xcc, U3memcpy_short
+
+	/* If we get here, then 32 <= len < 0x100 */
+U3memcpy_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+	/* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+	 * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: ldd [%g1 + 0x00], %f2 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ ldd [%g1 + 0x00], %f0 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3memcpy_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3memcpy_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3memcpy_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: ldub [%o1 + 0x00], %o3
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ stb %o3, [%o0 + -1]
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: ldx [%o1 + 0x00], %o3
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ stx %o3, [%o0 + -8]
+
+ cmp %o2, 0
+ bne,pn %icc, U3memcpy_short
+ nop
+ ba,a,pt %xcc, U3memcpy_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
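
The heart of both block-copy routines is the faligndata pipeline: alignaddr rounds src down to an 8-byte boundary and latches the misalignment, the ldd stream then touches only aligned doublewords, and each faligndata splices two neighbouring doublewords into one correctly shifted output word. Stripped of the software pipelining and block stores, the data movement is a big-endian double shift over a sliding two-word window. A C sketch under stated assumptions (big-endian order as on sparc64, misalignment off in 1..7, src already rounded down, n a multiple of 8):

	#include <stdint.h>
	#include <stddef.h>

	/* Illustrative model of the faligndata loop, not a drop-in memcpy. */
	static void unaligned_copy64(uint64_t *dst, const uint64_t *src,
				     size_t n, unsigned int off)
	{
		uint64_t hi = *src++;		/* first ldd, like %f0 above */
		unsigned int s = off * 8;	/* GSR.align, in bits */

		while (n) {
			uint64_t lo = *src++;			/* next ldd */
			*dst++ = (hi << s) | (lo >> (64 - s));	/* faligndata */
			hi = lo;		/* slide the two-word window */
			n -= 8;
		}
	}

Note how hi carries one doubleword of lookahead across iterations, which is the C analogue of the way the assembly borrows %f0 between its loops.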
diff --git a/arch/sparc64/lib/VIScopy.S b/arch/sparc64/lib/VIScopy.S
index 56634f83f..b944a0ae7 100644
--- a/arch/sparc64/lib/VIScopy.S
+++ b/arch/sparc64/lib/VIScopy.S
@@ -1,4 +1,4 @@
-/* $Id: VIScopy.S,v 1.23 2000/03/26 09:13:49 davem Exp $
+/* $Id: VIScopy.S,v 1.25 2000/11/01 09:29:19 davem Exp $
* VIScopy.S: High speed copy operations utilizing the UltraSparc
* Visual Instruction Set.
*
@@ -361,6 +361,38 @@ bcopy: or %o0, 0, %g3 ! IEU0 Group
clr %o0 ! IEU0
+#ifdef __KERNEL__
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define ULTRA3_DO_PATCH(OLD, NEW) \
+ sethi %hi(NEW), %g1; \
+ or %g1, %lo(NEW), %g1; \
+ sethi %hi(OLD), %g2; \
+ or %g2, %lo(OLD), %g2; \
+ sub %g1, %g2, %g1; \
+ sethi %hi(BRANCH_ALWAYS), %g3; \
+ srl %g1, 2, %g1; \
+ or %g3, %lo(BRANCH_ALWAYS), %g3; \
+ or %g3, %g1, %g3; \
+ stw %g3, [%g2]; \
+ sethi %hi(NOP), %g3; \
+ or %g3, %lo(NOP), %g3; \
+ stw %g3, [%g2 + 0x4]; \
+ flush %g2;
+
+ .globl cheetah_patch_copyops
+cheetah_patch_copyops:
+ ULTRA3_DO_PATCH(memcpy, U3memcpy)
+ ULTRA3_DO_PATCH(__copy_from_user, U3copy_from_user)
+ ULTRA3_DO_PATCH(__copy_to_user, U3copy_to_user)
+ ULTRA3_DO_PATCH(__copy_in_user, U3copy_in_user)
+ retl
+ nop
+#undef BRANCH_ALWAYS
+#undef NOP
+#undef ULTRA3_DO_PATCH
+#endif /* __KERNEL__ */
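
cheetah_patch_copyops overwrites the first two instructions of each generic routine with "ba,pt %xcc, <U3 routine>; nop". BRANCH_ALWAYS is that branch's opcode template: the word displacement (NEW - OLD) >> 2 is OR'd into its 19-bit displacement field (the macro does no range masking, so the targets are assumed to be within reach), and the final flush keeps the instruction cache coherent with the stores. The same encoding expressed in C, as a sketch:

	/* Hypothetical C rendering of ULTRA3_DO_PATCH. */
	static void ultra3_patch(unsigned int *site, void *target)
	{
		long disp = ((long)target - (long)site) >> 2;

		site[0] = 0x10680000 | (disp & 0x7ffff);	/* ba,pt %xcc, target */
		site[1] = 0x01000000;				/* nop */
		__asm__ __volatile__("flush %0" : : "r" (site));
	}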
+
.align 32
#ifdef __KERNEL__
__memcpy_384plus:
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 65fbd6e37..6da2d0b85 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1,4 +1,4 @@
-/* $Id: init.c,v 1.157 2000/10/19 00:49:52 davem Exp $
+/* $Id: init.c,v 1.159 2000/11/06 06:59:04 davem Exp $
* arch/sparc64/mm/init.c
*
* Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu)
@@ -99,6 +99,20 @@ int do_check_pgt_cache(int low, int high)
return freed;
}
+extern void __update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
+
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+{
+ struct page *page = pte_page(pte);
+
+ if (VALID_PAGE(page) && page->mapping &&
+ test_bit(PG_dcache_dirty, &page->flags)) {
+ __flush_dcache_page(page->virtual, 1);
+ clear_bit(PG_dcache_dirty, &page->flags);
+ }
+ __update_mmu_cache(vma, address, pte);
+}
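
With this wrapper, update_mmu_cache() implements a lazy D-cache flush: pages dirtied by kernel stores are presumably only tagged with PG_dcache_dirty at write time, and the flush is paid once, here, when a user mapping is actually instantiated. The tagging side is not part of this hunk; a hedged sketch of the shape it would take:

	/* Assumed producer side (illustrative only; the real
	 * flush_dcache_page for this port may do more work, e.g. flush
	 * immediately when the page is already mapped into user space).
	 */
	void flush_dcache_page(struct page *page)
	{
		if (page->mapping)
			set_bit(PG_dcache_dirty, &page->flags);
	}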
+
/*
* BAD_PAGE is the page that is used for page faults when linux
* is out-of-memory. Older versions of linux just did a
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index 7940218d2..daaf580a0 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -1,4 +1,4 @@
-/* $Id: ultra.S,v 1.46 2000/08/05 13:30:33 davem Exp $
+/* $Id: ultra.S,v 1.48 2000/11/06 06:59:04 davem Exp $
* ultra.S: Don't expand these all over the place...
*
* Copyright (C) 1997, 2000 David S. Miller (davem@redhat.com)
@@ -208,27 +208,58 @@ iflush2:sub %o1, 0x20, %g3
.align 64
.globl __flush_dcache_page
-__flush_dcache_page:
+__flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */
sub %o0, %g4, %o0
- clr %o1
+ clr %o4
srlx %o0, 11, %o0
sethi %hi(1 << 14), %o2
-1: ldxa [%o1] ASI_DCACHE_TAG, %o3
- andn %o3, 0x3, %o3
- cmp %o0, %o3
- bne,pt %xcc, 2f
- nop
- stxa %g0, [%o1] ASI_DCACHE_TAG
- membar #Sync
-2: add %o1, (1 << 5), %o1
- cmp %o1, %o2
- bne,pt %xcc, 1b
- nop
+1: ldxa [%o4] ASI_DCACHE_TAG, %o3 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ ldxa [%o4] ASI_DCACHE_TAG, %g1 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ ldxa [%o4] ASI_DCACHE_TAG, %g2 ! LSU Group o3 available
+ add %o4, (1 << 5), %o4 ! IEU0
+ andn %o3, 0x3, %o3 ! IEU1
+ ldxa [%o4] ASI_DCACHE_TAG, %g3 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ andn %g1, 0x3, %g1 ! IEU1
+ cmp %o0, %o3 ! IEU1 Group
+ be,a,pn %xcc, dflush1 ! CTI
+ sub %o4, (4 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g1 ! IEU1 Group
+ andn %g2, 0x3, %g2 ! IEU0
+ be,a,pn %xcc, dflush2 ! CTI
+ sub %o4, (3 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g2 ! IEU1 Group
+ andn %g3, 0x3, %g3 ! IEU0
+ be,a,pn %xcc, dflush3 ! CTI
+ sub %o4, (2 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g3 ! IEU1 Group
+ be,a,pn %xcc, dflush4 ! CTI
+ sub %o4, (1 << 5), %o4 ! IEU0
+2: cmp %o4, %o2 ! IEU1 Group
+ bne,pt %xcc, 1b ! CTI
+ nop ! IEU0
+
/* The I-cache does not snoop local stores so we
- * better flush that too.
+ * better flush that too when necessary.
*/
- ba,pt %xcc, __flush_icache_page
+ brnz,pt %o1, __flush_icache_page
sllx %o0, 11, %o0
+ retl
+ nop
+
+dflush1:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush2:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush3:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush4:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+ membar #Sync
+ ba,pt %xcc, 2b
+ nop
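
The rewritten scan walks the 2^14-byte D-cache four 32-byte lines per iteration, overlapping the ldxa tag loads with the compare work. On a tag match it rewinds %o4 to the matching line and falls through dflush1..dflush4, invalidating from the hit to the end of the group (harmless for the non-matching lines, as the D-cache is write-through), then rejoins the scan at label 2. The control flow, modelled in C with hypothetical wrappers for the ldxa/stxa/membar operations:

	#define DCACHE_BYTES	(1 << 14)
	#define LINE_BYTES	(1 << 5)

	extern unsigned long read_dcache_tag(unsigned long off);	/* ldxa ASI_DCACHE_TAG */
	extern void write_dcache_tag(unsigned long off, unsigned long v); /* stxa ASI_DCACHE_TAG */
	extern void membar_sync(void);					/* membar #Sync */

	static void flush_dcache_page_model(unsigned long want_tag)
	{
		unsigned long off, i;

		for (off = 0; off < DCACHE_BYTES; off += 4 * LINE_BYTES) {
			for (i = 0; i < 4; i++) {
				if ((read_dcache_tag(off + i * LINE_BYTES) & ~0x3UL)
				    != want_tag)
					continue;
				/* dflush1..4: zap from the hit through
				 * the end of this group of four.
				 */
				for (; i < 4; i++)
					write_dcache_tag(off + i * LINE_BYTES, 0);
				membar_sync();
				break;
			}
		}
	}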
.align 32
__prefill_dtlb:
@@ -250,8 +281,8 @@ __prefill_itlb:
retl
wrpr %g7, %pstate
- .globl update_mmu_cache
-update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */
+ .globl __update_mmu_cache
+__update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */
ldub [%g6 + AOFF_task_thread + AOFF_thread_fault_code], %o3
srlx %o1, 13, %o1
ldx [%o0 + 0x0], %o4 /* XXX vma->vm_mm */
diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c
index 0e899da18..4a10c1b4c 100644
--- a/arch/sparc64/solaris/ioctl.c
+++ b/arch/sparc64/solaris/ioctl.c
@@ -464,8 +464,8 @@ static inline int solaris_S(struct file *filp, unsigned int fd, unsigned int cmd
struct sol_socket_struct *sock;
struct module_info *mi;
- if (! (ino = filp->f_dentry->d_inode) ||
- ! ino->i_sock)
+ ino = filp->f_dentry->d_inode;
+ if (! ino->i_sock)
return -EBADF;
sock = filp->private_data;
if (! sock) {
diff --git a/arch/sparc64/solaris/socket.c b/arch/sparc64/solaris/socket.c
index 3013d43cf..9b910a633 100644
--- a/arch/sparc64/solaris/socket.c
+++ b/arch/sparc64/solaris/socket.c
@@ -265,7 +265,7 @@ extern __inline__ struct socket *sockfd_lookup(int fd, int *err)
}
inode = file->f_dentry->d_inode;
- if (!inode || !inode->i_sock || !socki_lookup(inode)) {
+ if (!inode->i_sock || !socki_lookup(inode)) {
*err = -ENOTSOCK;
fput(file);
return NULL;
diff --git a/arch/sparc64/vmlinux.lds b/arch/sparc64/vmlinux.lds
index f686decfb..91d4575d0 100644
--- a/arch/sparc64/vmlinux.lds
+++ b/arch/sparc64/vmlinux.lds
@@ -35,6 +35,9 @@ SECTIONS
__ksymtab : { *(__ksymtab) }
__stop___ksymtab = .;
__kstrtab : { *(.kstrtab) }
+ __start___kallsyms = .; /* All kernel symbols */
+ __kallsyms : { *(__kallsyms) }
+ __stop___kallsyms = .;
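
The new __start___kallsyms/__stop___kallsyms pair follows the usual linker-bracketing idiom: every object's __kallsyms records are concatenated here, and consumers see them as one contiguous region delimited by the two symbols. The generic shape of the idiom (the record layout itself comes from the kallsyms tooling and is not shown):

	extern char __start___kallsyms[], __stop___kallsyms[];

	static unsigned long kallsyms_bytes(void)
	{
		return __stop___kallsyms - __start___kallsyms;
	}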
. = ALIGN(8192);
__init_begin = .;
.text.init : { *(.text.init) }