author     Ralf Baechle <ralf@linux-mips.org>    2000-11-28 03:58:46 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-11-28 03:58:46 +0000
commit     b63ad0882a16a5d28003e57f2b0b81dee3fb322b (patch)
tree       0a343ce219e2b8b38a5d702d66032c57b83d9720 /arch/sparc64
parent     a9d7bff9a84dba79609a0002e5321b74c4d64c64 (diff)

Merge with 2.4.0-test11.
Diffstat (limited to 'arch/sparc64')
-rw-r--r--  arch/sparc64/config.in               |    2
-rw-r--r--  arch/sparc64/kernel/dtlb_base.S      |   26
-rw-r--r--  arch/sparc64/kernel/dtlb_prot.S      |   50
-rw-r--r--  arch/sparc64/kernel/ebus.c           |   29
-rw-r--r--  arch/sparc64/kernel/ioctl32.c        |   22
-rw-r--r--  arch/sparc64/kernel/itlb_base.S      |   30
-rw-r--r--  arch/sparc64/kernel/pci.c            |    8
-rw-r--r--  arch/sparc64/kernel/process.c        |   17
-rw-r--r--  arch/sparc64/kernel/semaphore.c      |    6
-rw-r--r--  arch/sparc64/kernel/sparc64_ksyms.c  |   52
-rw-r--r--  arch/sparc64/kernel/sys_sparc32.c    |    6
-rw-r--r--  arch/sparc64/kernel/sys_sunos32.c    |    4
-rw-r--r--  arch/sparc64/lib/Makefile            |    5
-rw-r--r--  arch/sparc64/lib/U3copy_from_user.S  |  500
-rw-r--r--  arch/sparc64/lib/U3copy_in_user.S    |  531
-rw-r--r--  arch/sparc64/lib/U3copy_to_user.S    |  528
-rw-r--r--  arch/sparc64/lib/U3memcpy.S          |  409
-rw-r--r--  arch/sparc64/lib/VIScopy.S           |   34
-rw-r--r--  arch/sparc64/mm/init.c               |   16
-rw-r--r--  arch/sparc64/mm/ultra.S              |   67
-rw-r--r--  arch/sparc64/solaris/ioctl.c         |    4
-rw-r--r--  arch/sparc64/solaris/socket.c        |    2
-rw-r--r--  arch/sparc64/vmlinux.lds             |    3
23 files changed, 2189 insertions(+), 162 deletions(-)
diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in
index a754b796b..19b05e28f 100644
--- a/arch/sparc64/config.in
+++ b/arch/sparc64/config.in
@@ -29,6 +29,8 @@ bool 'Symmetric multi-processing support' CONFIG_SMP
# Global things across all Sun machines.
define_bool CONFIG_HAVE_DEC_LOCK y
define_bool CONFIG_ISA n
+define_bool CONFIG_EISA n
+define_bool CONFIG_MCA n
define_bool CONFIG_PCMCIA n
define_bool CONFIG_SBUS y
define_bool CONFIG_SBUSCHAR y
diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S
index 72120b563..80c74aa18 100644
--- a/arch/sparc64/kernel/dtlb_base.S
+++ b/arch/sparc64/kernel/dtlb_base.S
@@ -1,4 +1,4 @@
-/* $Id: dtlb_base.S,v 1.7 2000/03/26 09:13:48 davem Exp $
+/* $Id: dtlb_base.S,v 1.8 2000/11/10 08:28:45 davem Exp $
* dtlb_base.S: Front end to DTLB miss replacement strategy.
* This is included directly into the trap table.
*
@@ -57,7 +57,7 @@
srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset
ldxa [%g3 + %g6] ASI_S, %g5 ! Load VPTE
1: brlz,pt %g5, 9f ! Valid, load into TLB
- and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits
+ nop ! Delay-slot
ba,a,pt %xcc, 4f ! Invalid, branch out
/* DTLB ** ICACHE line 2: Quick kernel TLB misses */
@@ -68,27 +68,27 @@
nop
9: stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Reload TLB
retry ! Trap return
- nop
+4: rdpr %pstate, %g5 ! Move into alternate globals
/* DTLB ** ICACHE line 3: winfixups+real_faults */
-4: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page?
- be,pn %xcc, 5f ! Yep, refbit update
- sllx %g1, 60, %g4 ! Get valid bit
- rdpr %pstate, %g5 ! Move into alternate globals
wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate
rdpr %tl, %g4 ! See where we came from.
cmp %g4, 1 ! Is etrap/rtrap window fault?
mov TLB_TAG_ACCESS, %g4 ! Prepare for fault processing
-
-/* DTLB ** ICACHE line 4: padding */
ldxa [%g4] ASI_DMMU, %g5 ! Load faulting VA page
be,pt %xcc, sparc64_realfault_common ! Jump to normal fault handling
mov FAULT_CODE_DTLB, %g4 ! It was read from DTLB
ba,a,pt %xcc, winfix_trampoline ! Call window fixup code
-5: or %g5, _PAGE_ACCESSED, %g5 ! Indicate reference
- or %g5, %g4, %g5 ! Set valid
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table (cant trap)
- ba,a,pt %xcc, 9b ! Complete tlb miss
+
+/* DTLB ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
#undef TAG_CONTEXT_BITS
#undef VPTE_SHIFT
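As a rough C rendering of the miss fast path above (a sketch only: context bits, ASIs and the icache-line budget are ignored, and the helper names are invented), the reason brlz works is that bit 63 of a VPTE is the valid bit, so a valid entry tests negative:

#include <stdint.h>

#define PAGE_SHIFT 13	/* 8K pages; VPTE_SHIFT == PAGE_SHIFT - 3 folds the
			 * 8-byte PTE scaling into the single srax above */

static void tlb_load(int64_t pte)   { (void)pte; /* stxa ASI_DTLB_DATA_IN; retry */ }
static void real_fault(uint64_t va) { (void)va;  /* sparc64_realfault_common path */ }

static void dtlb_miss(const int64_t *vpte_base, uint64_t vaddr)
{
	int64_t pte = vpte_base[vaddr >> PAGE_SHIFT];	/* srax + ldxa */
	if (pte < 0)			/* brlz: valid bit is bit 63 */
		tlb_load(pte);		/* reload the TLB and retry */
	else
		real_fault(vaddr);	/* icache lines 3-4: real fault path */
}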
diff --git a/arch/sparc64/kernel/dtlb_prot.S b/arch/sparc64/kernel/dtlb_prot.S
index 5e99d5d47..1da370c7c 100644
--- a/arch/sparc64/kernel/dtlb_prot.S
+++ b/arch/sparc64/kernel/dtlb_prot.S
@@ -1,4 +1,4 @@
-/* $Id: dtlb_prot.S,v 1.20 2000/03/26 09:13:48 davem Exp $
+/* $Id: dtlb_prot.S,v 1.21 2000/11/10 08:28:45 davem Exp $
* dtlb_prot.S: DTLB protection trap strategy.
* This is included directly into the trap table.
*
@@ -6,10 +6,6 @@
* Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz)
*/
-#define TAG_CONTEXT_BITS 0x3ff
-#define VPTE_SHIFT (PAGE_SHIFT - 3)
-#define MODIFIED_BITS (_PAGE_WRITE | _PAGE_W | _PAGE_MODIFIED | _PAGE_ACCESSED)
-
/* Ways we can get here:
*
* [TL == 0] 1) User stores to readonly pages.
@@ -18,45 +14,41 @@
*/
/* PROT ** ICACHE line 1: User DTLB protection trap */
- ldxa [%g1] ASI_DMMU, %g6 ! Primary or Secondary ctx?
- and %g6, 0x10, %g6 ! Get pri/sec ctx bit
stxa %g0, [%g1] ASI_DMMU ! Clear SFSR FaultValid bit
membar #Sync ! Synchronize ASI stores
- ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Load TAG_ACCESS
- andn %g4, TAG_CONTEXT_BITS, %g4 ! Clear CTX bits
- stxa %g0, [%g4 + %g6] ASI_DMMU_DEMAP ! Perform TLB flush of page
- membar #Sync ! Synchronize ASI stores
-
-/* PROT ** ICACHE line 2: Further normal processing */
- srax %g4, VPTE_SHIFT, %g6 ! Compute VPTE offset
- ldxa [%g3 + %g6] ASI_S, %g5 ! Load PTE entry
- andcc %g5, _PAGE_WRITE, %g0 ! Writable page?
- be,pt %xcc, 1f ! Nope, real fault
- or %g5, (MODIFIED_BITS), %g5 ! Mark as writable/modified
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE entry
- stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Load PTE into TLB
- retry ! Trap return
-
-/* PROT ** ICACHE line 3: Real user faults */
-1: rdpr %pstate, %g5 ! Move into alternate globals
+ rdpr %pstate, %g5 ! Move into alternate globals
wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate
rdpr %tl, %g1 ! Need to do a winfixup?
cmp %g1, 1 ! Trap level >1?
mov TLB_TAG_ACCESS, %g4 ! Prepare reload of vaddr
+ nop
+
+/* PROT ** ICACHE line 2: More real fault processing */
bgu,pn %xcc, winfix_trampoline ! Yes, perform winfixup
ldxa [%g4] ASI_DMMU, %g5 ! Put tagaccess in %g5
ba,pt %xcc, sparc64_realfault_common ! Nope, normal fault
-
-/* PROT ** ICACHE line 4: More real fault processing */
mov FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4
nop
nop
nop
nop
+
+/* PROT ** ICACHE line 3: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
nop
nop
nop
-#undef TAG_CONTEXT_BITS
-#undef VPTE_SHIFT
-#undef MODIFIED_BITS
+/* PROT ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
diff --git a/arch/sparc64/kernel/ebus.c b/arch/sparc64/kernel/ebus.c
index 5872046b1..e175fac27 100644
--- a/arch/sparc64/kernel/ebus.c
+++ b/arch/sparc64/kernel/ebus.c
@@ -1,4 +1,4 @@
-/* $Id: ebus.c,v 1.48 2000/08/02 06:22:35 davem Exp $
+/* $Id: ebus.c,v 1.53 2000/11/08 05:08:23 davem Exp $
* ebus.c: PCI to EBus bridge device.
*
* Copyright (C) 1997 Eddie C. Dost (ecd@skynet.be)
@@ -22,21 +22,9 @@
struct linux_ebus *ebus_chain = 0;
-#ifdef CONFIG_SUN_OPENPROMIO
-extern int openprom_init(void);
-#endif
#ifdef CONFIG_SUN_AUXIO
extern void auxio_probe(void);
#endif
-#ifdef CONFIG_OBP_FLASH
-extern int flash_init(void);
-#endif
-#ifdef CONFIG_ENVCTRL
-extern int envctrl_init(void);
-#endif
-#ifdef CONFIG_DISPLAY7SEG
-extern int d7s_init(void);
-#endif
static inline void *ebus_alloc(size_t size)
{
@@ -372,24 +360,9 @@ void __init ebus_init(void)
++num_ebus;
}
-#ifdef CONFIG_SUN_OPENPROMIO
- openprom_init();
-#endif
-#ifdef CONFIG_SUN_BPP
- bpp_init();
-#endif
#ifdef CONFIG_SUN_AUXIO
auxio_probe();
#endif
-#ifdef CONFIG_ENVCTRL
- envctrl_init();
-#endif
-#ifdef CONFIG_OBP_FLASH
- flash_init();
-#endif
-#ifdef CONFIG_DISPLAY7SEG
- d7s_init();
-#endif
clock_probe();
power_init();
}
diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c
index 9f7cd59e9..a510c2aff 100644
--- a/arch/sparc64/kernel/ioctl32.c
+++ b/arch/sparc64/kernel/ioctl32.c
@@ -1,4 +1,4 @@
-/* $Id: ioctl32.c,v 1.99 2000/10/17 16:20:33 davem Exp $
+/* $Id: ioctl32.c,v 1.103 2000/11/10 05:44:33 davem Exp $
* ioctl32.c: Conversion between 32bit and 64bit native ioctls.
*
* Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
@@ -71,8 +71,9 @@
#include <asm/openpromio.h>
#include <asm/envctrl.h>
#include <asm/audioio.h>
-#include <asm/ethtool.h>
+#include <linux/ethtool.h>
#include <asm/display7seg.h>
+#include <asm/module.h>
#include <linux/soundcard.h>
#include <linux/atm.h>
@@ -3230,6 +3231,7 @@ COMPATIBLE_IOCTL(ENVCTRL_RD_SCSI_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_ETHERNET_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_MTHRBD_TEMPERATURE)
COMPATIBLE_IOCTL(ENVCTRL_RD_CPU_VOLTAGE)
+COMPATIBLE_IOCTL(ENVCTRL_RD_GLOBALADDRESS)
/* COMPATIBLE_IOCTL(D7SIOCRD) same value as ENVCTRL_RD_VOLTAGE_STATUS */
COMPATIBLE_IOCTL(D7SIOCWR)
COMPATIBLE_IOCTL(D7SIOCTM)
@@ -3467,6 +3469,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */
/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */
@@ -3492,6 +3502,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */
/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */
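For orientation, a minimal sketch of the table these COMPATIBLE_IOCTL() lines feed (types and field names here are illustrative, not the exact 2.4 declarations): an entry without a handler means the ioctl's argument layout is identical in the 32-bit and 64-bit ABIs and is forwarded untranslated, while layout-changing ioctls register a conversion handler instead.

struct file;	/* opaque for this sketch */

typedef int (*ioctl32_handler_t)(unsigned int fd, unsigned int cmd,
				 unsigned long arg, struct file *filp);

struct ioctl32_entry {
	unsigned int      cmd;		/* ioctl number, same in both ABIs */
	ioctl32_handler_t handler;	/* NULL: pass through unchanged */
};

#define COMPATIBLE_IOCTL(cmd)	{ (cmd), NULL },
#define HANDLE_IOCTL(cmd, fn)	{ (cmd), (fn) },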
diff --git a/arch/sparc64/kernel/itlb_base.S b/arch/sparc64/kernel/itlb_base.S
index 7f0da3d14..bd6a3603d 100644
--- a/arch/sparc64/kernel/itlb_base.S
+++ b/arch/sparc64/kernel/itlb_base.S
@@ -1,4 +1,4 @@
-/* $Id: itlb_base.S,v 1.9 2000/03/26 09:13:48 davem Exp $
+/* $Id: itlb_base.S,v 1.10 2000/11/10 08:28:45 davem Exp $
* itlb_base.S: Front end to ITLB miss replacement strategy.
* This is included directly into the trap table.
*
@@ -23,22 +23,13 @@
srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset
ldxa [%g3 + %g6] ASI_P, %g5 ! Load VPTE
1: brgez,pn %g5, 3f ! Not valid, branch out
- and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits
+ nop ! Delay-slot
2: stxa %g5, [%g0] ASI_ITLB_DATA_IN ! Load PTE into TLB
retry ! Trap return
-3: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page?
+3: rdpr %pstate, %g4 ! Move into alternate globals
-/* ITLB ** ICACHE line 2: Quick user ref updates */
- bne,pn %xcc, 4f ! Nope, real missing page
- sllx %g1, 60, %g4 ! Sliiickkk...
- or %g5, _PAGE_ACCESSED, %g5 ! Mark as touched
- or %g5, %g4, %g5 ! Allow user to see it
- ba,pt %xcc, 2b ! Branch to load TLB
- stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table
-4: rdpr %pstate, %g4 ! Move into alternate globals
+/* ITLB ** ICACHE line 2: Real faults */
wrpr %g4, PSTATE_AG|PSTATE_MG, %pstate
-
-/* ITLB ** ICACHE line 3: Real faults */
rdpr %tpc, %g5 ! And load faulting VA
mov FAULT_CODE_ITLB, %g4 ! It was read from ITLB
sparc64_realfault_common: ! Called by TL0 dtlb_miss too
@@ -46,10 +37,11 @@ sparc64_realfault_common: ! Called by TL0 dtlb_miss too
stx %g5, [%g6 + AOFF_task_thread + AOFF_thread_fault_address]
ba,pt %xcc, etrap ! Save state
1: rd %pc, %g7 ! ...
+ nop
+
+/* ITLB ** ICACHE line 3: Finish faults + window fixups */
call do_sparc64_fault ! Call fault handler
add %sp, STACK_BIAS + REGWIN_SZ, %o0! Compute pt_regs arg
-
-/* ITLB ** ICACHE line 4: Finish faults + window fixups */
ba,pt %xcc, rtrap_clr_l6 ! Restore cpu state
nop
winfix_trampoline:
@@ -57,6 +49,14 @@ winfix_trampoline:
or %g3, 0x7c, %g3 ! Compute offset to branch
wrpr %g3, %tnpc ! Write it into TNPC
done ! Do it to it
+
+/* ITLB ** ICACHE line 4: Unused... */
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
nop
nop
diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index dd153a24e..1abef824f 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -1,4 +1,4 @@
-/* $Id: pci.c,v 1.18 2000/10/03 11:31:42 anton Exp $
+/* $Id: pci.c,v 1.19 2000/11/08 04:49:17 davem Exp $
* pci.c: UltraSparc PCI controller support.
*
* Copyright (C) 1997, 1998, 1999 David S. Miller (davem@redhat.com)
@@ -202,12 +202,6 @@ void pcibios_update_irq(struct pci_dev *pdev, int irq)
{
}
-unsigned long resource_fixup(struct pci_dev *pdev, struct resource *res,
- unsigned long start, unsigned long size)
-{
- return start;
-}
-
void pcibios_fixup_pbus_ranges(struct pci_bus *pbus,
struct pbus_set_ranges_data *pranges)
{
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 1f3386d53..4534ad59b 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -1,4 +1,4 @@
-/* $Id: process.c,v 1.112 2000/09/06 00:45:01 davem Exp $
+/* $Id: process.c,v 1.113 2000/11/08 08:14:58 davem Exp $
* arch/sparc64/kernel/process.c
*
* Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -647,14 +647,21 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
long retval;
- __asm__ __volatile("mov %1, %%g1\n\t"
+ /* If the parent runs before fn(arg) is called by the child,
+ * the input registers of this function can be clobbered.
+ * So we stash 'fn' and 'arg' into global registers which
+ * will not be modified by the parent.
+ */
+ __asm__ __volatile("mov %4, %%g2\n\t" /* Save FN into global */
+ "mov %5, %%g3\n\t" /* Save ARG into global */
+ "mov %1, %%g1\n\t" /* Clone syscall nr. */
"mov %2, %%o0\n\t" /* Clone flags. */
"mov 0, %%o1\n\t" /* usp arg == 0 */
"t 0x6d\n\t" /* Linux/Sparc clone(). */
"brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */
" mov %%o0, %0\n\t"
- "jmpl %4, %%o7\n\t" /* Call the function. */
- " mov %5, %%o0\n\t" /* Set arg in delay. */
+ "jmpl %%g2, %%o7\n\t" /* Call the function. */
+ " mov %%g3, %%o0\n\t" /* Set arg in delay. */
"mov %3, %%g1\n\t"
"t 0x6d\n\t" /* Linux/Sparc exit(). */
/* Notreached by child. */
@@ -662,7 +669,7 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
"=r" (retval) :
"i" (__NR_clone), "r" (flags | CLONE_VM),
"i" (__NR_exit), "r" (fn), "r" (arg) :
- "g1", "o0", "o1", "memory", "cc");
+ "g1", "g2", "g3", "o0", "o1", "memory", "cc");
return retval;
}
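The same hazard exists in userspace: with CLONE_VM the parent may return and reuse its stack frame before the child ever runs, so the child must not fetch its parameters from that frame. A hedged analog using glibc's clone(2) (an illustration of the principle, not the kernel's code) passes fn and arg through clone() itself, which preserves them for the child just as the fix parks them in global registers:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	printf("child sees arg = \"%s\"\n", (const char *)arg);
	return 0;
}

int main(void)
{
	size_t sz = 64 * 1024;
	char *stack = malloc(sz);		/* private child stack */
	if (!stack)
		return 1;
	/* fn and arg travel through clone() itself, never through
	 * memory the parent might clobber after it returns. */
	int pid = clone(child_fn, stack + sz, CLONE_VM | SIGCHLD,
			(void *)"hello");
	if (pid < 0)
		return 1;
	waitpid(pid, NULL, 0);			/* reap the child */
	free(stack);
	return 0;
}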
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index 8cb6e7211..1928d5a0d 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -1,4 +1,4 @@
-/* $Id: semaphore.c,v 1.4 2000/10/14 10:09:00 davem Exp $
+/* $Id: semaphore.c,v 1.5 2000/11/10 04:02:03 davem Exp $
* Generic semaphore code. Buyer beware. Do your own
* specific changes in <asm/semaphore-helper.h>
*/
@@ -223,7 +223,7 @@ void down_write_failed_biased(struct rw_semaphore *sem)
for (;;) {
if (test_and_clear_le_bit(1, &sem->granted))
break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!test_le_bit(1, &sem->granted))
schedule();
}
@@ -273,7 +273,7 @@ void down_write_failed(struct rw_semaphore *sem)
add_wait_queue_exclusive(&sem->wait, &wait);
while (sem->count < 0) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (sem->count >= 0)
break; /* we must attempt to acquire or bias the lock */
schedule();
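The dropped TASK_EXCLUSIVE bit changes who gets woken, not the sleep discipline; what keeps both loops race-free is the ordering above: mark the task sleeping first, then re-check the condition, then schedule(). A pthreads analogy (an analogy only; condition variables are not the 2.4 kernel mechanism, which relies on set_task_state() ordering) shows the same re-check-before-sleeping shape:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int count = -1;			/* sem->count < 0: contended */

static void *waiter(void *unused)
{
	pthread_mutex_lock(&lock);
	while (count < 0)		/* re-check before sleeping */
		pthread_cond_wait(&cond, &lock);  /* sleep atomically */
	pthread_mutex_unlock(&lock);
	puts("acquired");
	return unused;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	count = 0;			/* release the semaphore */
	pthread_cond_signal(&cond);	/* wake one waiter */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}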
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 6e7f59309..e1ae982bf 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -1,4 +1,4 @@
-/* $Id: sparc64_ksyms.c,v 1.95 2000/10/30 21:01:40 davem Exp $
+/* $Id: sparc64_ksyms.c,v 1.98 2000/11/13 10:03:32 davem Exp $
* arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support.
*
* Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -87,7 +87,6 @@ extern long sparc32_open(const char * filename, int flags, int mode);
extern int register_ioctl32_conversion(unsigned int cmd, int (*handler)(unsigned int, unsigned int, unsigned long, struct file *));
extern int unregister_ioctl32_conversion(unsigned int cmd);
extern int io_remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot, int space);
-extern void __flush_dcache_page(void *addr);
extern int __ashrdi3(int, int);
@@ -110,25 +109,14 @@ extern void _do_write_unlock(rwlock_t *rw);
extern unsigned long phys_base;
-/* One thing to note is that the way the symbols of the mul/div
- * support routines are named is a mess, they all start with
- * a '.' which makes it a bitch to export, here is the trick:
- */
-
-#define EXPORT_SYMBOL_PRIVATE(sym) \
-extern int __sparc_priv_ ## sym (int) __asm__("__" #sym); \
-const struct module_symbol __export_priv_##sym \
-__attribute__((section("__ksymtab"))) = \
-{ (unsigned long) &__sparc_priv_ ## sym, "__" #sym }
-
/* used by various drivers */
#ifdef CONFIG_SMP
#ifndef SPIN_LOCK_DEBUG
/* Out of line rw-locking implementation. */
-EXPORT_SYMBOL_PRIVATE(read_lock);
-EXPORT_SYMBOL_PRIVATE(read_unlock);
-EXPORT_SYMBOL_PRIVATE(write_lock);
-EXPORT_SYMBOL_PRIVATE(write_unlock);
+EXPORT_SYMBOL(__read_lock);
+EXPORT_SYMBOL(__read_unlock);
+EXPORT_SYMBOL(__write_lock);
+EXPORT_SYMBOL(__write_unlock);
#endif
/* Kernel wide locking */
@@ -137,10 +125,10 @@ EXPORT_SYMBOL(kernel_flag);
/* Hard IRQ locking */
EXPORT_SYMBOL(global_irq_holder);
EXPORT_SYMBOL(synchronize_irq);
-EXPORT_SYMBOL_PRIVATE(global_cli);
-EXPORT_SYMBOL_PRIVATE(global_sti);
-EXPORT_SYMBOL_PRIVATE(global_save_flags);
-EXPORT_SYMBOL_PRIVATE(global_restore_flags);
+EXPORT_SYMBOL(__global_cli);
+EXPORT_SYMBOL(__global_sti);
+EXPORT_SYMBOL(__global_save_flags);
+EXPORT_SYMBOL(__global_restore_flags);
/* Per-CPU information table */
EXPORT_SYMBOL(cpu_data);
@@ -163,27 +151,33 @@ EXPORT_SYMBOL(_do_write_unlock);
#endif
+/* semaphores */
+EXPORT_SYMBOL(__down);
+EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__down_trylock);
+EXPORT_SYMBOL(__up);
+
/* rw semaphores */
EXPORT_SYMBOL_NOVERS(__down_read_failed);
EXPORT_SYMBOL_NOVERS(__down_write_failed);
EXPORT_SYMBOL_NOVERS(__rwsem_wake);
/* Atomic counter implementation. */
-EXPORT_SYMBOL_PRIVATE(atomic_add);
-EXPORT_SYMBOL_PRIVATE(atomic_sub);
+EXPORT_SYMBOL(__atomic_add);
+EXPORT_SYMBOL(__atomic_sub);
/* Atomic bit operations. */
-EXPORT_SYMBOL_PRIVATE(test_and_set_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_clear_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_change_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_set_le_bit);
-EXPORT_SYMBOL_PRIVATE(test_and_clear_le_bit);
+EXPORT_SYMBOL(__test_and_set_bit);
+EXPORT_SYMBOL(__test_and_clear_bit);
+EXPORT_SYMBOL(__test_and_change_bit);
+EXPORT_SYMBOL(__test_and_set_le_bit);
+EXPORT_SYMBOL(__test_and_clear_le_bit);
EXPORT_SYMBOL(ivector_table);
EXPORT_SYMBOL(enable_irq);
EXPORT_SYMBOL(disable_irq);
-EXPORT_SYMBOL_PRIVATE(flushw_user);
+EXPORT_SYMBOL(__flushw_user);
EXPORT_SYMBOL(__flush_dcache_page);
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 24c8cd593..9b211d86d 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -1,4 +1,4 @@
-/* $Id: sys_sparc32.c,v 1.165 2000/10/10 04:47:31 davem Exp $
+/* $Id: sys_sparc32.c,v 1.166 2000/11/10 04:49:56 davem Exp $
* sys_sparc32.c: Conversion between 32bit and 64bit native syscalls.
*
* Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
@@ -2952,7 +2952,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm)
return -ENOMEM;
new = 1;
}
- kaddr = (char *)kmap(page);
+ kaddr = kmap(page);
if (new && offset)
memset(kaddr, 0, offset);
@@ -2967,7 +2967,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm)
err = copy_from_user(kaddr + offset, (char *)A(str),
bytes_to_copy);
flush_page_to_ram(page);
- kunmap((unsigned long)kaddr);
+ kunmap(page);
if (err)
return -EFAULT;
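Both hunks fix the same API misuse: in this kernel series kmap() takes a struct page and returns the mapped kernel virtual address, and kunmap() takes the struct page back, not the address. A shape-only sketch of the corrected pairing (assuming the 2.4 highmem interface; not linkable stand-alone, since the kernel supplies these functions):

#include <stddef.h>
#include <string.h>

struct page;				/* opaque here */
void *kmap(struct page *page);		/* map, return kernel vaddr */
void  kunmap(struct page *page);	/* unmap by page, not by vaddr */

static void fill_page(struct page *page, size_t offset)
{
	char *kaddr = kmap(page);	/* no cast: kmap returns void * */
	memset(kaddr, 0, offset);
	/* ... copy_from_user(kaddr + offset, ...) ... */
	kunmap(page);			/* pass the page, not kaddr */
}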
diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c
index 75d5c096e..a5f5411f5 100644
--- a/arch/sparc64/kernel/sys_sunos32.c
+++ b/arch/sparc64/kernel/sys_sunos32.c
@@ -601,7 +601,6 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
int try_port;
int ret;
struct socket *socket;
- struct dentry *dentry;
struct inode *inode;
struct file *file;
@@ -609,8 +608,7 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
if(!file)
return 0;
- dentry = file->f_dentry;
- inode = dentry->d_inode;
+ inode = file->f_dentry->d_inode;
socket = &inode->u.socket_i;
local.sin_family = AF_INET;
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index fa057936a..77531321d 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -1,4 +1,4 @@
-# $Id: Makefile,v 1.23 2000/07/10 20:57:34 davem Exp $
+# $Id: Makefile,v 1.24 2000/11/01 07:33:47 davem Exp $
# Makefile for Sparc64 library files..
#
@@ -8,7 +8,8 @@ OBJS = PeeCeeI.o blockops.o debuglocks.o strlen.o strncmp.o \
memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \
VIScsumcopyusr.o VISsave.o atomic.o rwlock.o bitops.o \
- dec_and_lock.o
+ dec_and_lock.o U3memcpy.o U3copy_from_user.o U3copy_to_user.o \
+ U3copy_in_user.o
lib.a: $(OBJS)
$(AR) rcs lib.a $(OBJS)
diff --git a/arch/sparc64/lib/U3copy_from_user.S b/arch/sparc64/lib/U3copy_from_user.S
new file mode 100644
index 000000000..b1003e607
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_from_user.S
@@ -0,0 +1,500 @@
+/* $Id: U3copy_from_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_from_user.S: UltraSparc-III optimized copy from userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ ba,pt %xcc, U3cfu_fixup; \
+ a, b, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ ba,pt %xcc, U3cfu_fixup; \
+ a, b, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ add %o1, %o4, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, 0x1c0, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ sll %g3, 6, %g3; \
+ add %o1, 0x80, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, %g3, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o1; \
+ add %o1, 0x40, %o1; \
+ ba,pt %xcc, U3cfu_fixup; \
+ add %o1, %g3, %o1; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EX2(x,y) x,y;
+#define EX3(x,y) x,y;
+#define EX4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_from_user
+U3copy_from_user: /* %o0=dst, %o1=src, %o2=len */
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_from_user_short_ret! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_from_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_from_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_from_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_from_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_from_user_short ! BR
+ stb %o3, [%o0 + -1] ! MS Group (1-cycle stall)
+
+U3copy_from_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+ /* Here len >= 0x100 (4 * 64) and condition codes reflect execution
+ * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_from_user_enter:
+ /* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_from_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_from_user_begin:
+ prefetcha [%o1 + 0x000] %asi, #one_read ! MS Group1
+ prefetcha [%o1 + 0x040] %asi, #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetcha [%o1 + 0x080] %asi, #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetcha [%o1 + 0x0c0] %asi, #one_read ! MS Group4
+ EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetcha [%o1 + 0x100] %asi, #one_read ! MS Group6
+1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetcha [%o1 + 0x140] %asi, #one_read ! MS Group7
+1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetcha [%o1 + 0x180] %asi, #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_from_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_from_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_from_user_loop1:
+ EX2(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ EX2(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ EX2(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ EX2(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ EX2(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ EX2(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ EX2(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ EX2(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19)
+ prefetcha [%o1 + 0x180] %asi, #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_from_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_from_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+ /* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_from_user_loop2:
+ EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ EX3(ldda [%o1 + 0x040] %asi, %f0) ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_from_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_from_user_loopfini:
+ EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ stda %f16, [%o0] ASI_BLK_P ! MS Group20
+ EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ EX4(ldda [%o1 + 0x040] %asi, %f0) ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+ * load past the end of the src buffer just like similar
+ * code found in U3copy_from_user_toosmall processing.
+ */
+U3copy_from_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_from_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_from_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS
+
+1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_from_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+ * Note that %g1 is (src & 0x3) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_from_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_from_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_from_user_short
+
+ /* If we get here, then 32 <= len < 0x100 (4 * 64) */
+U3copy_from_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+ /* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+ * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_from_user_short_ret! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_from_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_from_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ stb %o3, [%o0 + -1]
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: EXNV(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ stx %o3, [%o0 + -8]
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_from_user_short
+ nop
+ ba,a,pt %xcc, U3copy_from_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
+
+#ifdef __KERNEL__
+ .globl U3cfu_fixup
+U3cfu_fixup:
+ /* Since this is copy_from_user(), zero out the rest of the
+ * kernel buffer.
+ */
+ cmp %o1, 0
+ ble,pn %icc, 2f
+ mov %o1, %g2
+
+1: subcc %g2, 1, %g2
+ stb %g0, [%o0]
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+
+2: retl
+ mov %o1, %o0
+#endif
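The head-alignment pair of subs in this file is easy to restate in C: it computes 0x40 - (dst & 0x3f), the number of bytes to copy one at a time before dst becomes 64-byte aligned. A small self-checking sketch (helper name invented for illustration):

#include <assert.h>
#include <stdint.h>

/* Mirrors "sub %g2, 0x40, %g2 ; sub %g0, %g2, %g2" where %g2 arrived
 * as (dst & 0x3f); only reached when dst is not already aligned.
 */
static unsigned long head_bytes(uintptr_t dst)
{
	unsigned long g2 = dst & 0x3f;
	g2 -= 0x40;		/* sub %g2, 0x40, %g2 */
	g2 = 0 - g2;		/* sub %g0, %g2, %g2: 0x40 - (dst & 0x3f) */
	return g2;
}

int main(void)
{
	assert(head_bytes(0x1001) == 0x3f);
	assert(head_bytes(0x103f) == 0x01);
	assert(head_bytes(0x1020) == 0x20);
	return 0;
}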
diff --git a/arch/sparc64/lib/U3copy_in_user.S b/arch/sparc64/lib/U3copy_in_user.S
new file mode 100644
index 000000000..0fc169b9d
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_in_user.S
@@ -0,0 +1,531 @@
+/* $Id: U3copy_in_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_in_user.S: UltraSparc-III optimized copy within userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV2(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV3(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 8, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK1(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ add %o4, 0x1c0, %o1; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ sll %o3, 6, %o3; \
+ and %o2, (0x40 - 1), %o2; \
+ add %o3, 0x80, %o1; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x80, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x40, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_AIUS 0x80
+#define ASI_BLK_AIUS 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EXNV2(x,y,a,b) x,y;
+#define EXNV3(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EXBLK1(x,y) x,y;
+#define EXBLK2(x,y) x,y;
+#define EXBLK3(x,y) x,y;
+#define EXBLK4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ *
+ * XXX Actually, Cheetah can buffer up to 8 concurrent
+ * XXX prefetches, revisit this...
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_in_user
+U3copy_in_user: /* %o0=dst, %o1=src, %o2=len */
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+ rd %asi, %g1 ! MS Group (4 cycles)
+ cmp %g1, ASI_AIUS ! A0 Group
+ bne U3memcpy ! BR
+ nop ! A1
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_in_user_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_in_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_in_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_in_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_in_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_in_user_short ! BR
+ EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
+
+U3copy_in_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+ /* Here len >= 0x100 (4 * 64) and condition codes reflect execution
+ * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_in_user_enter:
+ /* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_in_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_in_user_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_in_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_in_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_in_user_loop1:
+ EXBLK1(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ EXBLK1(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ EXBLK1(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ EXBLK1(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ EXBLK1(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ EXBLK1(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ EXBLK1(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ EXBLK1(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_in_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_in_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+ /* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_in_user_loop2:
+ EXBLK2(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ EXBLK2(ldda [%o1 + 0x010] %asi, %f4) ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ EXBLK2(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ EXBLK2(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ EXBLK2(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ EXBLK2(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ EXBLK2(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ EXBLK2(ldda [%o1 + 0x040] %asi, %f0) ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_in_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_in_user_loopfini:
+ EXBLK3(ldda [%o1 + 0x008] %asi, %f2) ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ EXBLK3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
+ EXBLK4(ldda [%o1 + 0x018] %asi, %f6) ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ EXBLK4(ldda [%o1 + 0x020] %asi, %f8) ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ EXBLK4(ldda [%o1 + 0x028] %asi, %f10) ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ EXBLK4(ldda [%o1 + 0x030] %asi, %f12) ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ EXBLK4(ldda [%o1 + 0x038] %asi, %f14) ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ EXBLK4(ldda [%o1 + 0x040] %asi, %f0) ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+ * load past the end of the src buffer just like similar
+ * code found in U3copy_in_user_toosmall processing.
+ */
+U3copy_in_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_in_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_in_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS
+
+1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_in_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+ * Note that %g1 is (src & 0x3) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_in_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_in_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_in_user_short
+
+ /* If we get here, then 32 <= len < 0x100 (4 * 64) */
+U3copy_in_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+ /* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+ * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_in_user_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_in_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_in_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: EXNV3(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_in_user_short
+ nop
+ ba,a,pt %xcc, U3copy_in_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
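All of the EX*() macros in these files compile down to entries in __ex_table, pairing a possibly-faulting user access (the 98: label) with its recovery code (the 99: label). Conceptually (a sketch: the real sparc64 entry format and lookup differ in detail):

struct exception_table_entry {
	unsigned long insn;	/* PC of the guarded access   (98b) */
	unsigned long fixup;	/* PC of the recovery snippet (99b) */
};

/* On a user-access fault the trap handler searches the table; on a
 * hit it resumes at the fixup, which reports the residual byte count
 * to the caller instead of treating the fault as a kernel bug.
 */
static unsigned long search_extable(const struct exception_table_entry *t,
				    int n, unsigned long fault_pc)
{
	for (int i = 0; i < n; i++)
		if (t[i].insn == fault_pc)
			return t[i].fixup;
	return 0;		/* unguarded: genuine kernel fault */
}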
diff --git a/arch/sparc64/lib/U3copy_to_user.S b/arch/sparc64/lib/U3copy_to_user.S
new file mode 100644
index 000000000..e08b1290b
--- /dev/null
+++ b/arch/sparc64/lib/U3copy_to_user.S
@@ -0,0 +1,528 @@
+/* $Id: U3copy_to_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
+ * U3copy_to_user.S: UltraSparc-III optimized copy to userspace.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV2(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXNV3(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: a, b, %o0; \
+ retl; \
+ add %o0, 8, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EX(x,y,a,b) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ retl; \
+ a, b, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK1(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ add %o4, 0x1c0, %o1; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK2(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ sll %o3, 6, %o3; \
+ and %o2, (0x40 - 1), %o2; \
+ add %o3, 0x80, %o1; \
+ retl; \
+ add %o1, %o2, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK3(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x80, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#define EXBLK4(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: VISExitHalf; \
+ and %o2, (0x40 - 1), %o2; \
+ retl; \
+ add %o2, 0x40, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+#else
+#define ASI_AIUS 0x80
+#define ASI_BLK_AIUS 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#define EXNV(x,y,a,b) x,y;
+#define EXNV2(x,y,a,b) x,y;
+#define EXNV3(x,y,a,b) x,y;
+#define EX(x,y,a,b) x,y;
+#define EXBLK1(x,y) x,y;
+#define EXBLK2(x,y) x,y;
+#define EXBLK3(x,y) x,y;
+#define EXBLK4(x,y) x,y;
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3copy_to_user
+U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+ rd %asi, %g1 ! MS Group (4 cycles)
+ cmp %g1, ASI_AIUS ! A0 Group
+ bne U3memcpy ! BR
+ nop ! A1
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3copy_to_user_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3copy_to_user_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3copy_to_user_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3copy_to_user_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3copy_to_user_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ ldub [%o1 + 0x00], %o3 ! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3copy_to_user_short ! BR
+ EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
+
+U3copy_to_user_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+	/* Here len >= 0x100 and condition codes reflect execution
+	 * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3copy_to_user_enter:
+	/* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3copy_to_user_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3copy_to_user_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3copy_to_user_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3copy_to_user_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3copy_to_user_loop1:
+ ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_to_user_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3copy_to_user_loop2_enter:
+ mov 5, %o3 ! A1
+
+	/* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3copy_to_user_loop2:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ ldd [%o1 + 0x010], %f4 ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ ldd [%o1 + 0x040], %f0 ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3copy_to_user_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3copy_to_user_loopfini:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ ldd [%o1 + 0x010], %f4 ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ ldd [%o1 + 0x040], %f0 ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+	 * load past the end of the src buffer, just like the similar
+ * code found in U3copy_to_user_toosmall processing.
+ */
+U3copy_to_user_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3copy_to_user_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3copy_to_user_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ ldd [%o1 + 0x00], %f0 ! MS
+
+1: ldd [%o1 + 0x08], %f2 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3copy_to_user_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ ldd [%o1 + 0x08], %f0 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+	 * Note that %g1 is (src & 0x7) saved above before the
+ * alignaddr was performed.
+ */
+U3copy_to_user_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3copy_to_user_short_ret
+ nop
+ ba,a,pt %xcc, U3copy_to_user_short
+
+	/* If we get here, then 32 <= len < 0x100 */
+U3copy_to_user_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+	/* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+	 * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: ldd [%g1 + 0x00], %f2 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ ldd [%g1 + 0x00], %f0 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3copy_to_user_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3copy_to_user_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3copy_to_user_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: ldub [%o1 + 0x00], %o3
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: ldx [%o1 + 0x00], %o3
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
+
+ cmp %o2, 0
+ bne,pn %icc, U3copy_to_user_short
+ nop
+ ba,a,pt %xcc, U3copy_to_user_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
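
Both alignment preambles in this file (the 64-byte one at U3copy_to_user_enter and the 8-byte one in the toosmall path) use the same three-instruction idiom to turn (dst & mask) into the distance up to the next boundary. A small self-contained C model of that arithmetic:

	#include <stdint.h>

	/* Mirrors the sub/sub/sub sequence above.  Only reached when
	 * dst & (align - 1) is nonzero, since the aligned case branches
	 * past it; the result is pre-subtracted from len before the
	 * byte-at-a-time head copy.
	 */
	static inline unsigned long head_bytes(uintptr_t dst, unsigned long align)
	{
		unsigned long g2 = dst & (align - 1);	/* andcc %o0, mask, %g2 */

		g2 -= align;				/* sub %g2, align, %g2 */
		g2 = 0 - g2;				/* sub %g0, %g2, %g2   */
		return g2;
	}

For example, with align = 0x40 and a dst ending in 0x08, this yields 0x38 head bytes, after which dst sits on a 64-byte boundary.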
diff --git a/arch/sparc64/lib/U3memcpy.S b/arch/sparc64/lib/U3memcpy.S
new file mode 100644
index 000000000..d38289145
--- /dev/null
+++ b/arch/sparc64/lib/U3memcpy.S
@@ -0,0 +1,409 @@
+/* $Id: U3memcpy.S,v 1.2 2000/11/01 09:29:19 davem Exp $
+ * U3memcpy.S: UltraSparc-III optimized memcpy.
+ *
+ * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#undef SMALL_COPY_USES_FPU
+#else
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#define SMALL_COPY_USES_FPU
+#endif
+
+ /* Special/non-trivial issues of this code:
+ *
+ * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
+ * 2) Only low 32 FPU registers are used so that only the
+ * lower half of the FPU register set is dirtied by this
+ * code. This is especially important in the kernel.
+ * 3) This code never prefetches cachelines past the end
+ * of the source buffer.
+ */
+
+ .text
+ .align 32
+
+ /* The cheetah's flexible spine, oversized liver, enlarged heart,
+ * slender muscular body, and claws make it the swiftest hunter
+ * in Africa and the fastest animal on land. Can reach speeds
+ * of up to 2.4GB per second.
+ */
+
+ .globl U3memcpy
+U3memcpy: /* %o0=dst, %o1=src, %o2=len */
+#ifndef __KERNEL__
+ /* Save away original 'dst' for memcpy return value. */
+ mov %o0, %g3 ! A0 Group
+#endif
+ /* Anything to copy at all? */
+ cmp %o2, 0 ! A1
+ ble,pn %icc, U3memcpy_short_ret ! BR
+
+ /* Extremely small copy? */
+ cmp %o2, 31 ! A0 Group
+ ble,pn %icc, U3memcpy_short ! BR
+
+ /* Large enough to use unrolled prefetch loops? */
+ cmp %o2, 0x100 ! A1
+ bge,a,pt %icc, U3memcpy_enter ! BR Group
+ andcc %o0, 0x3f, %g2 ! A0
+
+ ba,pt %xcc, U3memcpy_toosmall ! BR Group
+ andcc %o0, 0x7, %g2 ! A0
+
+ .align 32
+U3memcpy_short:
+ /* Copy %o2 bytes from src to dst, one byte at a time. */
+ ldub [%o1 + 0x00], %o3 ! MS Group
+ add %o1, 0x1, %o1 ! A0
+ add %o0, 0x1, %o0 ! A1
+ subcc %o2, 1, %o2 ! A0 Group
+
+ bg,pt %icc, U3memcpy_short ! BR
+ stb %o3, [%o0 + -1] ! MS Group (1-cycle stall)
+
+U3memcpy_short_ret:
+#ifdef __KERNEL__
+ retl ! BR Group (0-4 cycle stall)
+ clr %o0 ! A0
+#else
+ retl ! BR Group (0-4 cycle stall)
+ mov %g3, %o0 ! A0
+#endif
+
+	/* Here len >= 0x100 and condition codes reflect execution
+	 * of "andcc %o0, 0x3f, %g2", done by caller.
+ */
+ .align 64
+U3memcpy_enter:
+	/* Is 'dst' already aligned on a 64-byte boundary? */
+ be,pt %xcc, 2f ! BR
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x40, %g2 ! A0 Group
+ sub %g0, %g2, %g2 ! A0 Group
+ sub %o2, %g2, %o2 ! A0 Group
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+ and %o1, 0x7, %g1 ! A1
+ ba,pt %xcc, U3memcpy_begin ! BR
+ alignaddr %o1, %g0, %o1 ! MS (Break-after)
+
+ .align 64
+U3memcpy_begin:
+ prefetch [%o1 + 0x000], #one_read ! MS Group1
+ prefetch [%o1 + 0x040], #one_read ! MS Group2
+ andn %o2, (0x40 - 1), %o4 ! A0
+ prefetch [%o1 + 0x080], #one_read ! MS Group3
+ cmp %o4, 0x140 ! A0
+ prefetch [%o1 + 0x0c0], #one_read ! MS Group4
+ ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x100], #one_read ! MS Group6
+1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
+ cmp %o4, 0x180 ! A1
+ bge,a,pt %icc, 1f ! BR
+ prefetch [%o1 + 0x140], #one_read ! MS Group7
+1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
+ cmp %o4, 0x1c0 ! A1
+ bge,a,pt %icc, 1f ! BR
+
+ prefetch [%o1 + 0x180], #one_read ! MS Group8
+1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
+ faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
+ faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
+ faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
+
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
+ faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+ faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
+ ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
+
+ /* We only use the first loop if len > (7 * 64). */
+ subcc %o4, 0x1c0, %o4 ! A0 Group17
+ bg,pt %icc, U3memcpy_loop1 ! BR
+ add %o1, 0x40, %o1 ! A1
+
+ add %o4, 0x140, %o4 ! A0 Group18
+ ba,pt %xcc, U3memcpy_loop2 ! BR
+ srl %o4, 6, %o3 ! A0 Group19
+ nop
+ nop
+ nop
+ nop
+ nop
+
+ nop
+ nop
+
+ /* This loop performs the copy and queues new prefetches.
+ * We drop into the second loop when len <= (5 * 64). Note
+ * that this (5 * 64) factor has been subtracted from len
+ * already.
+ */
+U3memcpy_loop1:
+ ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
+ faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
+ ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
+
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
+ faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
+ ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
+ faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
+ ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
+ faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
+ ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
+
+ faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
+ ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
+ prefetch [%o1 + 0x180], #one_read ! MS
+ faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
+ subcc %o4, 0x40, %o4 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3memcpy_loop1 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+U3memcpy_loop2_enter:
+ mov 5, %o3 ! A1
+
+	/* This loop performs the copy; no new prefetches are
+ * queued. We do things this way so that we do not perform
+ * any spurious prefetches past the end of the src buffer.
+ */
+U3memcpy_loop2:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA Group2
+ ldd [%o1 + 0x010], %f4 ! MS
+ faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
+
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group13
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group14
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group15
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group16
+
+ ldd [%o1 + 0x040], %f0 ! AX
+ faligndata %f10, %f12, %f26 ! FGA Group17
+ subcc %o3, 0x01, %o3 ! A0
+ add %o1, 0x40, %o1 ! A1
+ bg,pt %xcc, U3memcpy_loop2 ! BR
+ add %o0, 0x40, %o0 ! A0 Group18
+
+ /* Finally we copy the last full 64-byte block. */
+U3memcpy_loopfini:
+ ldd [%o1 + 0x008], %f2 ! MS
+ faligndata %f12, %f14, %f28 ! FGA
+ ldd [%o1 + 0x010], %f4 ! MS Group19
+ faligndata %f14, %f0, %f30 ! FGA
+ stda %f16, [%o0] ASI_BLK_P ! MS Group20
+ ldd [%o1 + 0x018], %f6 ! AX
+ faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
+ ldd [%o1 + 0x020], %f8 ! MS
+ faligndata %f2, %f4, %f18 ! FGA Group12
+ ldd [%o1 + 0x028], %f10 ! MS
+ faligndata %f4, %f6, %f20 ! FGA Group13
+ ldd [%o1 + 0x030], %f12 ! MS
+ faligndata %f6, %f8, %f22 ! FGA Group14
+ ldd [%o1 + 0x038], %f14 ! MS
+ faligndata %f8, %f10, %f24 ! FGA Group15
+ cmp %g1, 0 ! A0
+ be,pt %icc, 1f ! BR
+ add %o0, 0x40, %o0 ! A1
+ ldd [%o1 + 0x040], %f0 ! MS
+1: faligndata %f10, %f12, %f26 ! FGA Group16
+ faligndata %f12, %f14, %f28 ! FGA Group17
+ faligndata %f14, %f0, %f30 ! FGA Group18
+ stda %f16, [%o0] ASI_BLK_P ! MS
+ add %o0, 0x40, %o0 ! A0
+ add %o1, 0x40, %o1 ! A1
+ membar #Sync ! MS Group26 (7-cycle stall)
+
+ /* Now we copy the (len modulo 64) bytes at the end.
+ * Note how we borrow the %f0 loaded above.
+ *
+ * Also notice how this code is careful not to perform a
+	 * load past the end of the src buffer, just like the similar
+ * code found in U3memcpy_toosmall processing.
+ */
+U3memcpy_loopend:
+ and %o2, 0x3f, %o2 ! A0 Group
+ andcc %o2, 0x38, %g2 ! A0 Group
+ be,pn %icc, U3memcpy_endcruft ! BR
+ subcc %g2, 0x8, %g2 ! A1
+ be,pn %icc, U3memcpy_endcruft ! BR Group
+ cmp %g1, 0 ! A0
+
+ be,a,pt %icc, 1f ! BR Group
+ ldd [%o1 + 0x00], %f0 ! MS
+
+1: ldd [%o1 + 0x08], %f2 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f0, %f2, %f8 ! FGA Group
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ be,pn %icc, U3memcpy_endcruft ! BR
+ add %o0, 0x8, %o0 ! A0
+ ldd [%o1 + 0x08], %f0 ! MS Group
+ add %o1, 0x8, %o1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA
+ std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX)
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A0 Group
+
+ /* If anything is left, we copy it one byte at a time.
+	 * Note that %g1 is (src & 0x7) saved above before the
+ * alignaddr was performed.
+ */
+U3memcpy_endcruft:
+ cmp %o2, 0
+ add %o1, %g1, %o1
+ VISExitHalf
+ be,pn %icc, U3memcpy_short_ret
+ nop
+ ba,a,pt %xcc, U3memcpy_short
+
+	/* If we get here, then 32 <= len < 0x100 */
+U3memcpy_toosmall:
+
+#ifdef SMALL_COPY_USES_FPU
+
+ /* Is 'dst' already aligned on an 8-byte boundary? */
+ be,pt %xcc, 2f ! BR Group
+
+ /* Compute abs((dst & 7) - 8) into %g2. This is the number
+ * of bytes to copy to make 'dst' 8-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %g2, 0x8, %g2 ! A0
+ sub %g0, %g2, %g2 ! A0 Group (reg-dep)
+ sub %o2, %g2, %o2 ! A0 Group (reg-dep)
+
+ /* Copy %g2 bytes from src to dst, one byte at a time. */
+1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
+ add %o1, 0x1, %o1 ! A1
+ add %o0, 0x1, %o0 ! A0 Group
+ subcc %g2, 0x1, %g2 ! A1
+
+ bg,pt %icc, 1b ! BR Group
+ stb %o3, [%o0 + -1] ! MS Group
+
+2: VISEntryHalf ! MS+MS
+
+	/* Compute (len - (len % 8)) into %g2. This is guaranteed
+ * to be nonzero.
+ */
+ andn %o2, 0x7, %g2 ! A0 Group
+
+ /* You may read this and believe that it allows reading
+ * one 8-byte longword past the end of src. It actually
+ * does not, as %g2 is subtracted as loads are done from
+ * src, so we always stop before running off the end.
+	 * Also, we are guaranteed to have at least 0x10 bytes
+ * to move here.
+ */
+ sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
+ alignaddr %o1, %g0, %g1 ! MS (Break-after)
+ ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
+ add %g1, 0x8, %g1 ! A0
+
+1: ldd [%g1 + 0x00], %f2 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+ subcc %g2, 0x8, %g2 ! A0 Group
+
+ faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+ be,pn %icc, 2f ! BR
+
+ add %o0, 0x8, %o0 ! A1
+ ldd [%g1 + 0x00], %f0 ! MS Group
+ add %g1, 0x8, %g1 ! A0
+ sub %o2, 0x8, %o2 ! A1
+
+ subcc %g2, 0x8, %g2 ! A0 Group
+ faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
+ std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall)
+ add %o1, 0x8, %o1 ! A0
+
+ bne,pn %icc, 1b ! BR
+ add %o0, 0x8, %o0 ! A1
+
+ /* Nothing left to copy? */
+2: cmp %o2, 0 ! A0 Group
+ VISExitHalf ! A0+MS
+ be,pn %icc, U3memcpy_short_ret ! BR Group
+ nop ! A0
+ ba,a,pt %xcc, U3memcpy_short ! BR Group
+
+#else /* !(SMALL_COPY_USES_FPU) */
+
+ xor %o1, %o0, %g2
+ andcc %g2, 0x7, %g0
+ bne,pn %icc, U3memcpy_short
+ andcc %o1, 0x7, %g2
+
+ be,pt %xcc, 2f
+ sub %g2, 0x8, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+
+1: ldub [%o1 + 0x00], %o3
+ add %o1, 0x1, %o1
+ add %o0, 0x1, %o0
+ subcc %g2, 0x1, %g2
+ bg,pt %icc, 1b
+ stb %o3, [%o0 + -1]
+
+2: andn %o2, 0x7, %g2
+ sub %o2, %g2, %o2
+
+3: ldx [%o1 + 0x00], %o3
+ add %o1, 0x8, %o1
+ add %o0, 0x8, %o0
+ subcc %g2, 0x8, %g2
+ bg,pt %icc, 3b
+ stx %o3, [%o0 + -8]
+
+ cmp %o2, 0
+ bne,pn %icc, U3memcpy_short
+ nop
+ ba,a,pt %xcc, U3memcpy_short_ret
+
+#endif /* !(SMALL_COPY_USES_FPU) */
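
The heart of both block-copy routines is the faligndata pipeline: alignaddr rounds src down to an 8-byte boundary and latches the misalignment, the ldd stream then touches only aligned doublewords, and each faligndata splices two neighbouring doublewords into one correctly shifted output word. Stripped of the software pipelining and block stores, the data movement is a big-endian double shift over a sliding two-word window. A C sketch under stated assumptions (big-endian order as on sparc64, misalignment off in 1..7, src already rounded down, n a multiple of 8):

	#include <stdint.h>
	#include <stddef.h>

	/* Illustrative model of the faligndata loop, not a drop-in memcpy. */
	static void unaligned_copy64(uint64_t *dst, const uint64_t *src,
				     size_t n, unsigned int off)
	{
		uint64_t hi = *src++;		/* first ldd, like %f0 above */
		unsigned int s = off * 8;	/* GSR.align, in bits */

		while (n) {
			uint64_t lo = *src++;			/* next ldd */
			*dst++ = (hi << s) | (lo >> (64 - s));	/* faligndata */
			hi = lo;		/* slide the two-word window */
			n -= 8;
		}
	}

Note how hi carries one doubleword of lookahead across iterations, which is the C analogue of the way the assembly borrows %f0 between its loops.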
diff --git a/arch/sparc64/lib/VIScopy.S b/arch/sparc64/lib/VIScopy.S
index 56634f83f..b944a0ae7 100644
--- a/arch/sparc64/lib/VIScopy.S
+++ b/arch/sparc64/lib/VIScopy.S
@@ -1,4 +1,4 @@
-/* $Id: VIScopy.S,v 1.23 2000/03/26 09:13:49 davem Exp $
+/* $Id: VIScopy.S,v 1.25 2000/11/01 09:29:19 davem Exp $
* VIScopy.S: High speed copy operations utilizing the UltraSparc
* Visual Instruction Set.
*
@@ -361,6 +361,38 @@ bcopy: or %o0, 0, %g3 ! IEU0 Group
clr %o0 ! IEU0
+#ifdef __KERNEL__
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define ULTRA3_DO_PATCH(OLD, NEW) \
+ sethi %hi(NEW), %g1; \
+ or %g1, %lo(NEW), %g1; \
+ sethi %hi(OLD), %g2; \
+ or %g2, %lo(OLD), %g2; \
+ sub %g1, %g2, %g1; \
+ sethi %hi(BRANCH_ALWAYS), %g3; \
+ srl %g1, 2, %g1; \
+ or %g3, %lo(BRANCH_ALWAYS), %g3; \
+ or %g3, %g1, %g3; \
+ stw %g3, [%g2]; \
+ sethi %hi(NOP), %g3; \
+ or %g3, %lo(NOP), %g3; \
+ stw %g3, [%g2 + 0x4]; \
+ flush %g2;
+
+ .globl cheetah_patch_copyops
+cheetah_patch_copyops:
+ ULTRA3_DO_PATCH(memcpy, U3memcpy)
+ ULTRA3_DO_PATCH(__copy_from_user, U3copy_from_user)
+ ULTRA3_DO_PATCH(__copy_to_user, U3copy_to_user)
+ ULTRA3_DO_PATCH(__copy_in_user, U3copy_in_user)
+ retl
+ nop
+#undef BRANCH_ALWAYS
+#undef NOP
+#undef ULTRA3_DO_PATCH
+#endif /* __KERNEL__ */
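
cheetah_patch_copyops overwrites the first two instructions of each generic routine with "ba,pt %xcc, <U3 routine>; nop". BRANCH_ALWAYS is that branch's opcode template: the word displacement (NEW - OLD) >> 2 is OR'd into its 19-bit displacement field (the macro does no range masking, so the targets are assumed to be within reach), and the final flush keeps the instruction cache coherent with the stores. The same encoding expressed in C, as a sketch:

	/* Hypothetical C rendering of ULTRA3_DO_PATCH. */
	static void ultra3_patch(unsigned int *site, void *target)
	{
		long disp = ((long)target - (long)site) >> 2;

		site[0] = 0x10680000 | (disp & 0x7ffff);	/* ba,pt %xcc, target */
		site[1] = 0x01000000;				/* nop */
		__asm__ __volatile__("flush %0" : : "r" (site));
	}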
+
.align 32
#ifdef __KERNEL__
__memcpy_384plus:
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 65fbd6e37..6da2d0b85 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1,4 +1,4 @@
-/* $Id: init.c,v 1.157 2000/10/19 00:49:52 davem Exp $
+/* $Id: init.c,v 1.159 2000/11/06 06:59:04 davem Exp $
* arch/sparc64/mm/init.c
*
* Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu)
@@ -99,6 +99,20 @@ int do_check_pgt_cache(int low, int high)
return freed;
}
+extern void __update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
+
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+{
+ struct page *page = pte_page(pte);
+
+ if (VALID_PAGE(page) && page->mapping &&
+ test_bit(PG_dcache_dirty, &page->flags)) {
+ __flush_dcache_page(page->virtual, 1);
+ clear_bit(PG_dcache_dirty, &page->flags);
+ }
+ __update_mmu_cache(vma, address, pte);
+}
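
With this wrapper, update_mmu_cache() implements a lazy D-cache flush: pages dirtied by kernel stores are presumably only tagged with PG_dcache_dirty at write time, and the flush is paid once, here, when a user mapping is actually instantiated. The tagging side is not part of this hunk; a hedged sketch of the shape it would take:

	/* Assumed producer side (illustrative only; the real
	 * flush_dcache_page for this port may do more work, e.g. flush
	 * immediately when the page is already mapped into user space).
	 */
	void flush_dcache_page(struct page *page)
	{
		if (page->mapping)
			set_bit(PG_dcache_dirty, &page->flags);
	}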
+
/*
* BAD_PAGE is the page that is used for page faults when linux
* is out-of-memory. Older versions of linux just did a
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index 7940218d2..daaf580a0 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -1,4 +1,4 @@
-/* $Id: ultra.S,v 1.46 2000/08/05 13:30:33 davem Exp $
+/* $Id: ultra.S,v 1.48 2000/11/06 06:59:04 davem Exp $
* ultra.S: Don't expand these all over the place...
*
* Copyright (C) 1997, 2000 David S. Miller (davem@redhat.com)
@@ -208,27 +208,58 @@ iflush2:sub %o1, 0x20, %g3
.align 64
.globl __flush_dcache_page
-__flush_dcache_page:
+__flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */
sub %o0, %g4, %o0
- clr %o1
+ clr %o4
srlx %o0, 11, %o0
sethi %hi(1 << 14), %o2
-1: ldxa [%o1] ASI_DCACHE_TAG, %o3
- andn %o3, 0x3, %o3
- cmp %o0, %o3
- bne,pt %xcc, 2f
- nop
- stxa %g0, [%o1] ASI_DCACHE_TAG
- membar #Sync
-2: add %o1, (1 << 5), %o1
- cmp %o1, %o2
- bne,pt %xcc, 1b
- nop
+1: ldxa [%o4] ASI_DCACHE_TAG, %o3 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ ldxa [%o4] ASI_DCACHE_TAG, %g1 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ ldxa [%o4] ASI_DCACHE_TAG, %g2 ! LSU Group o3 available
+ add %o4, (1 << 5), %o4 ! IEU0
+ andn %o3, 0x3, %o3 ! IEU1
+ ldxa [%o4] ASI_DCACHE_TAG, %g3 ! LSU Group
+ add %o4, (1 << 5), %o4 ! IEU0
+ andn %g1, 0x3, %g1 ! IEU1
+ cmp %o0, %o3 ! IEU1 Group
+ be,a,pn %xcc, dflush1 ! CTI
+ sub %o4, (4 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g1 ! IEU1 Group
+ andn %g2, 0x3, %g2 ! IEU0
+ be,a,pn %xcc, dflush2 ! CTI
+ sub %o4, (3 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g2 ! IEU1 Group
+ andn %g3, 0x3, %g3 ! IEU0
+ be,a,pn %xcc, dflush3 ! CTI
+ sub %o4, (2 << 5), %o4 ! IEU0 (Group)
+ cmp %o0, %g3 ! IEU1 Group
+ be,a,pn %xcc, dflush4 ! CTI
+ sub %o4, (1 << 5), %o4 ! IEU0
+2: cmp %o4, %o2 ! IEU1 Group
+ bne,pt %xcc, 1b ! CTI
+ nop ! IEU0
+
/* The I-cache does not snoop local stores so we
- * better flush that too.
+ * better flush that too when necessary.
*/
- ba,pt %xcc, __flush_icache_page
+ brnz,pt %o1, __flush_icache_page
sllx %o0, 11, %o0
+ retl
+ nop
+
+dflush1:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush2:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush3:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+dflush4:stxa %g0, [%o4] ASI_DCACHE_TAG
+ add %o4, (1 << 5), %o4
+ membar #Sync
+ ba,pt %xcc, 2b
+ nop
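
The rewritten scan walks the 2^14-byte D-cache four 32-byte lines per iteration, overlapping the ldxa tag loads with the compare work. On a tag match it rewinds %o4 to the matching line and falls through dflush1..dflush4, invalidating from the hit to the end of the group (harmless for the non-matching lines, as the D-cache is write-through), then rejoins the scan at label 2. The control flow, modelled in C with hypothetical wrappers for the ldxa/stxa/membar operations:

	#define DCACHE_BYTES	(1 << 14)
	#define LINE_BYTES	(1 << 5)

	extern unsigned long read_dcache_tag(unsigned long off);	/* ldxa ASI_DCACHE_TAG */
	extern void write_dcache_tag(unsigned long off, unsigned long v); /* stxa ASI_DCACHE_TAG */
	extern void membar_sync(void);					/* membar #Sync */

	static void flush_dcache_page_model(unsigned long want_tag)
	{
		unsigned long off, i;

		for (off = 0; off < DCACHE_BYTES; off += 4 * LINE_BYTES) {
			for (i = 0; i < 4; i++) {
				if ((read_dcache_tag(off + i * LINE_BYTES) & ~0x3UL)
				    != want_tag)
					continue;
				/* dflush1..4: zap from the hit through
				 * the end of this group of four.
				 */
				for (; i < 4; i++)
					write_dcache_tag(off + i * LINE_BYTES, 0);
				membar_sync();
				break;
			}
		}
	}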
.align 32
__prefill_dtlb:
@@ -250,8 +281,8 @@ __prefill_itlb:
retl
wrpr %g7, %pstate
- .globl update_mmu_cache
-update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */
+ .globl __update_mmu_cache
+__update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */
ldub [%g6 + AOFF_task_thread + AOFF_thread_fault_code], %o3
srlx %o1, 13, %o1
ldx [%o0 + 0x0], %o4 /* XXX vma->vm_mm */
diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c
index 0e899da18..4a10c1b4c 100644
--- a/arch/sparc64/solaris/ioctl.c
+++ b/arch/sparc64/solaris/ioctl.c
@@ -464,8 +464,8 @@ static inline int solaris_S(struct file *filp, unsigned int fd, unsigned int cmd
struct sol_socket_struct *sock;
struct module_info *mi;
- if (! (ino = filp->f_dentry->d_inode) ||
- ! ino->i_sock)
+ ino = filp->f_dentry->d_inode;
+ if (! ino->i_sock)
return -EBADF;
sock = filp->private_data;
if (! sock) {
diff --git a/arch/sparc64/solaris/socket.c b/arch/sparc64/solaris/socket.c
index 3013d43cf..9b910a633 100644
--- a/arch/sparc64/solaris/socket.c
+++ b/arch/sparc64/solaris/socket.c
@@ -265,7 +265,7 @@ extern __inline__ struct socket *sockfd_lookup(int fd, int *err)
}
inode = file->f_dentry->d_inode;
- if (!inode || !inode->i_sock || !socki_lookup(inode)) {
+ if (!inode->i_sock || !socki_lookup(inode)) {
*err = -ENOTSOCK;
fput(file);
return NULL;
diff --git a/arch/sparc64/vmlinux.lds b/arch/sparc64/vmlinux.lds
index f686decfb..91d4575d0 100644
--- a/arch/sparc64/vmlinux.lds
+++ b/arch/sparc64/vmlinux.lds
@@ -35,6 +35,9 @@ SECTIONS
__ksymtab : { *(__ksymtab) }
__stop___ksymtab = .;
__kstrtab : { *(.kstrtab) }
+ __start___kallsyms = .; /* All kernel symbols */
+ __kallsyms : { *(__kallsyms) }
+ __stop___kallsyms = .;
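
The new __start___kallsyms/__stop___kallsyms pair follows the usual linker-bracketing idiom: every object's __kallsyms records are concatenated here, and consumers see them as one contiguous region delimited by the two symbols. The generic shape of the idiom (the record layout itself comes from the kallsyms tooling and is not shown):

	extern char __start___kallsyms[], __stop___kallsyms[];

	static unsigned long kallsyms_bytes(void)
	{
		return __stop___kallsyms - __start___kallsyms;
	}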
. = ALIGN(8192);
__init_begin = .;
.text.init : { *(.text.init) }