author		Ralf Baechle <ralf@linux-mips.org>	2000-11-28 03:58:46 +0000
committer	Ralf Baechle <ralf@linux-mips.org>	2000-11-28 03:58:46 +0000
commit		b63ad0882a16a5d28003e57f2b0b81dee3fb322b (patch)
tree		0a343ce219e2b8b38a5d702d66032c57b83d9720 /arch/sparc64
parent		a9d7bff9a84dba79609a0002e5321b74c4d64c64 (diff)
Merge with 2.4.0-test11.
Diffstat (limited to 'arch/sparc64')
-rw-r--r--  arch/sparc64/config.in              |    2
-rw-r--r--  arch/sparc64/kernel/dtlb_base.S     |   26
-rw-r--r--  arch/sparc64/kernel/dtlb_prot.S     |   50
-rw-r--r--  arch/sparc64/kernel/ebus.c          |   29
-rw-r--r--  arch/sparc64/kernel/ioctl32.c       |   22
-rw-r--r--  arch/sparc64/kernel/itlb_base.S     |   30
-rw-r--r--  arch/sparc64/kernel/pci.c           |    8
-rw-r--r--  arch/sparc64/kernel/process.c       |   17
-rw-r--r--  arch/sparc64/kernel/semaphore.c     |    6
-rw-r--r--  arch/sparc64/kernel/sparc64_ksyms.c |   52
-rw-r--r--  arch/sparc64/kernel/sys_sparc32.c   |    6
-rw-r--r--  arch/sparc64/kernel/sys_sunos32.c   |    4
-rw-r--r--  arch/sparc64/lib/Makefile           |    5
-rw-r--r--  arch/sparc64/lib/U3copy_from_user.S |  500
-rw-r--r--  arch/sparc64/lib/U3copy_in_user.S   |  531
-rw-r--r--  arch/sparc64/lib/U3copy_to_user.S   |  528
-rw-r--r--  arch/sparc64/lib/U3memcpy.S         |  409
-rw-r--r--  arch/sparc64/lib/VIScopy.S          |   34
-rw-r--r--  arch/sparc64/mm/init.c              |   16
-rw-r--r--  arch/sparc64/mm/ultra.S             |   67
-rw-r--r--  arch/sparc64/solaris/ioctl.c        |    4
-rw-r--r--  arch/sparc64/solaris/socket.c       |    2
-rw-r--r--  arch/sparc64/vmlinux.lds            |    3
23 files changed, 2189 insertions, 162 deletions
diff --git a/arch/sparc64/config.in b/arch/sparc64/config.in index a754b796b..19b05e28f 100644 --- a/arch/sparc64/config.in +++ b/arch/sparc64/config.in @@ -29,6 +29,8 @@ bool 'Symmetric multi-processing support' CONFIG_SMP # Global things across all Sun machines. define_bool CONFIG_HAVE_DEC_LOCK y define_bool CONFIG_ISA n +define_bool CONFIG_EISA n +define_bool CONFIG_MCA n define_bool CONFIG_PCMCIA n define_bool CONFIG_SBUS y define_bool CONFIG_SBUSCHAR y diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S index 72120b563..80c74aa18 100644 --- a/arch/sparc64/kernel/dtlb_base.S +++ b/arch/sparc64/kernel/dtlb_base.S @@ -1,4 +1,4 @@ -/* $Id: dtlb_base.S,v 1.7 2000/03/26 09:13:48 davem Exp $ +/* $Id: dtlb_base.S,v 1.8 2000/11/10 08:28:45 davem Exp $ * dtlb_base.S: Front end to DTLB miss replacement strategy. * This is included directly into the trap table. * @@ -57,7 +57,7 @@ srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset ldxa [%g3 + %g6] ASI_S, %g5 ! Load VPTE 1: brlz,pt %g5, 9f ! Valid, load into TLB - and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits + nop ! Delay-slot ba,a,pt %xcc, 4f ! Invalid, branch out /* DTLB ** ICACHE line 2: Quick kernel TLB misses */ @@ -68,27 +68,27 @@ nop 9: stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Reload TLB retry ! Trap return - nop +4: rdpr %pstate, %g5 ! Move into alternate globals /* DTLB ** ICACHE line 3: winfixups+real_faults */ -4: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page? - be,pn %xcc, 5f ! Yep, refbit update - sllx %g1, 60, %g4 ! Get valid bit - rdpr %pstate, %g5 ! Move into alternate globals wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate rdpr %tl, %g4 ! See where we came from. cmp %g4, 1 ! Is etrap/rtrap window fault? mov TLB_TAG_ACCESS, %g4 ! Prepare for fault processing - -/* DTLB ** ICACHE line 4: padding */ ldxa [%g4] ASI_DMMU, %g5 ! Load faulting VA page be,pt %xcc, sparc64_realfault_common ! Jump to normal fault handling mov FAULT_CODE_DTLB, %g4 ! It was read from DTLB ba,a,pt %xcc, winfix_trampoline ! Call window fixup code -5: or %g5, _PAGE_ACCESSED, %g5 ! Indicate reference - or %g5, %g4, %g5 ! Set valid - stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table (cant trap) - ba,a,pt %xcc, 9b ! Complete tlb miss + +/* DTLB ** ICACHE line 4: Unused... */ + nop + nop + nop + nop + nop + nop + nop + nop #undef TAG_CONTEXT_BITS #undef VPTE_SHIFT diff --git a/arch/sparc64/kernel/dtlb_prot.S b/arch/sparc64/kernel/dtlb_prot.S index 5e99d5d47..1da370c7c 100644 --- a/arch/sparc64/kernel/dtlb_prot.S +++ b/arch/sparc64/kernel/dtlb_prot.S @@ -1,4 +1,4 @@ -/* $Id: dtlb_prot.S,v 1.20 2000/03/26 09:13:48 davem Exp $ +/* $Id: dtlb_prot.S,v 1.21 2000/11/10 08:28:45 davem Exp $ * dtlb_prot.S: DTLB protection trap strategy. * This is included directly into the trap table. * @@ -6,10 +6,6 @@ * Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz) */ -#define TAG_CONTEXT_BITS 0x3ff -#define VPTE_SHIFT (PAGE_SHIFT - 3) -#define MODIFIED_BITS (_PAGE_WRITE | _PAGE_W | _PAGE_MODIFIED | _PAGE_ACCESSED) - /* Ways we can get here: * * [TL == 0] 1) User stores to readonly pages. @@ -18,45 +14,41 @@ */ /* PROT ** ICACHE line 1: User DTLB protection trap */ - ldxa [%g1] ASI_DMMU, %g6 ! Primary or Secondary ctx? - and %g6, 0x10, %g6 ! Get pri/sec ctx bit stxa %g0, [%g1] ASI_DMMU ! Clear SFSR FaultValid bit membar #Sync ! Synchronize ASI stores - ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Load TAG_ACCESS - andn %g4, TAG_CONTEXT_BITS, %g4 ! Clear CTX bits - stxa %g0, [%g4 + %g6] ASI_DMMU_DEMAP ! Perform TLB flush of page - membar #Sync ! 
Synchronize ASI stores - -/* PROT ** ICACHE line 2: Further normal processing */ - srax %g4, VPTE_SHIFT, %g6 ! Compute VPTE offset - ldxa [%g3 + %g6] ASI_S, %g5 ! Load PTE entry - andcc %g5, _PAGE_WRITE, %g0 ! Writable page? - be,pt %xcc, 1f ! Nope, real fault - or %g5, (MODIFIED_BITS), %g5 ! Mark as writable/modified - stxa %g5, [%g3 + %g6] ASI_S ! Update PTE entry - stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Load PTE into TLB - retry ! Trap return - -/* PROT ** ICACHE line 3: Real user faults */ -1: rdpr %pstate, %g5 ! Move into alternate globals + rdpr %pstate, %g5 ! Move into alternate globals wrpr %g5, PSTATE_AG|PSTATE_MG, %pstate rdpr %tl, %g1 ! Need to do a winfixup? cmp %g1, 1 ! Trap level >1? mov TLB_TAG_ACCESS, %g4 ! Prepare reload of vaddr + nop + +/* PROT ** ICACHE line 2: More real fault processing */ bgu,pn %xcc, winfix_trampoline ! Yes, perform winfixup ldxa [%g4] ASI_DMMU, %g5 ! Put tagaccess in %g5 ba,pt %xcc, sparc64_realfault_common ! Nope, normal fault - -/* PROT ** ICACHE line 4: More real fault processing */ mov FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4 nop nop nop nop + +/* PROT ** ICACHE line 3: Unused... */ + nop + nop + nop + nop + nop nop nop nop -#undef TAG_CONTEXT_BITS -#undef VPTE_SHIFT -#undef MODIFIED_BITS +/* PROT ** ICACHE line 3: Unused... */ + nop + nop + nop + nop + nop + nop + nop + nop diff --git a/arch/sparc64/kernel/ebus.c b/arch/sparc64/kernel/ebus.c index 5872046b1..e175fac27 100644 --- a/arch/sparc64/kernel/ebus.c +++ b/arch/sparc64/kernel/ebus.c @@ -1,4 +1,4 @@ -/* $Id: ebus.c,v 1.48 2000/08/02 06:22:35 davem Exp $ +/* $Id: ebus.c,v 1.53 2000/11/08 05:08:23 davem Exp $ * ebus.c: PCI to EBus bridge device. * * Copyright (C) 1997 Eddie C. Dost (ecd@skynet.be) @@ -22,21 +22,9 @@ struct linux_ebus *ebus_chain = 0; -#ifdef CONFIG_SUN_OPENPROMIO -extern int openprom_init(void); -#endif #ifdef CONFIG_SUN_AUXIO extern void auxio_probe(void); #endif -#ifdef CONFIG_OBP_FLASH -extern int flash_init(void); -#endif -#ifdef CONFIG_ENVCTRL -extern int envctrl_init(void); -#endif -#ifdef CONFIG_DISPLAY7SEG -extern int d7s_init(void); -#endif static inline void *ebus_alloc(size_t size) { @@ -372,24 +360,9 @@ void __init ebus_init(void) ++num_ebus; } -#ifdef CONFIG_SUN_OPENPROMIO - openprom_init(); -#endif -#ifdef CONFIG_SUN_BPP - bpp_init(); -#endif #ifdef CONFIG_SUN_AUXIO auxio_probe(); #endif -#ifdef CONFIG_ENVCTRL - envctrl_init(); -#endif -#ifdef CONFIG_OBP_FLASH - flash_init(); -#endif -#ifdef CONFIG_DISPLAY7SEG - d7s_init(); -#endif clock_probe(); power_init(); } diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c index 9f7cd59e9..a510c2aff 100644 --- a/arch/sparc64/kernel/ioctl32.c +++ b/arch/sparc64/kernel/ioctl32.c @@ -1,4 +1,4 @@ -/* $Id: ioctl32.c,v 1.99 2000/10/17 16:20:33 davem Exp $ +/* $Id: ioctl32.c,v 1.103 2000/11/10 05:44:33 davem Exp $ * ioctl32.c: Conversion between 32bit and 64bit native ioctls. 
* * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) @@ -71,8 +71,9 @@ #include <asm/openpromio.h> #include <asm/envctrl.h> #include <asm/audioio.h> -#include <asm/ethtool.h> +#include <linux/ethtool.h> #include <asm/display7seg.h> +#include <asm/module.h> #include <linux/soundcard.h> #include <linux/atm.h> @@ -3230,6 +3231,7 @@ COMPATIBLE_IOCTL(ENVCTRL_RD_SCSI_TEMPERATURE) COMPATIBLE_IOCTL(ENVCTRL_RD_ETHERNET_TEMPERATURE) COMPATIBLE_IOCTL(ENVCTRL_RD_MTHRBD_TEMPERATURE) COMPATIBLE_IOCTL(ENVCTRL_RD_CPU_VOLTAGE) +COMPATIBLE_IOCTL(ENVCTRL_RD_GLOBALADDRESS) /* COMPATIBLE_IOCTL(D7SIOCRD) same value as ENVCTRL_RD_VOLTAGE_STATUS */ COMPATIBLE_IOCTL(D7SIOCWR) COMPATIBLE_IOCTL(D7SIOCTM) @@ -3467,6 +3469,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) /* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ /* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ @@ -3492,6 +3502,14 @@ COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) +COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) /* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ /* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ diff --git a/arch/sparc64/kernel/itlb_base.S b/arch/sparc64/kernel/itlb_base.S index 7f0da3d14..bd6a3603d 100644 --- a/arch/sparc64/kernel/itlb_base.S +++ b/arch/sparc64/kernel/itlb_base.S @@ -1,4 +1,4 @@ -/* $Id: itlb_base.S,v 1.9 2000/03/26 09:13:48 davem Exp $ +/* $Id: itlb_base.S,v 1.10 2000/11/10 08:28:45 davem Exp $ * itlb_base.S: Front end to ITLB miss replacement strategy. * This is included directly into the trap table. * @@ -23,22 +23,13 @@ srax %g4, VPTE_SHIFT, %g6 ! Create VPTE offset ldxa [%g3 + %g6] ASI_P, %g5 ! Load VPTE 1: brgez,pn %g5, 3f ! Not valid, branch out - and %g5, (_PAGE_PRESENT|_PAGE_READ), %g4 ! Mask readable bits + nop ! Delay-slot 2: stxa %g5, [%g0] ASI_ITLB_DATA_IN ! Load PTE into TLB retry ! Trap return -3: cmp %g4, (_PAGE_PRESENT|_PAGE_READ) ! Readable page? +3: rdpr %pstate, %g4 ! Move into alternate globals -/* ITLB ** ICACHE line 2: Quick user ref updates */ - bne,pn %xcc, 4f ! Nope, real missing page - sllx %g1, 60, %g4 ! Sliiickkk... - or %g5, _PAGE_ACCESSED, %g5 ! Mark as touched - or %g5, %g4, %g5 ! Allow user to see it - ba,pt %xcc, 2b ! Branch to load TLB - stxa %g5, [%g3 + %g6] ASI_S ! Update PTE table -4: rdpr %pstate, %g4 ! 
Move into alternate globals +/* ITLB ** ICACHE line 2: Real faults */ wrpr %g4, PSTATE_AG|PSTATE_MG, %pstate - -/* ITLB ** ICACHE line 3: Real faults */ rdpr %tpc, %g5 ! And load faulting VA mov FAULT_CODE_ITLB, %g4 ! It was read from ITLB sparc64_realfault_common: ! Called by TL0 dtlb_miss too @@ -46,10 +37,11 @@ sparc64_realfault_common: ! Called by TL0 dtlb_miss too stx %g5, [%g6 + AOFF_task_thread + AOFF_thread_fault_address] ba,pt %xcc, etrap ! Save state 1: rd %pc, %g7 ! ... + nop + +/* ITLB ** ICACHE line 3: Finish faults + window fixups */ call do_sparc64_fault ! Call fault handler add %sp, STACK_BIAS + REGWIN_SZ, %o0! Compute pt_regs arg - -/* ITLB ** ICACHE line 4: Finish faults + window fixups */ ba,pt %xcc, rtrap_clr_l6 ! Restore cpu state nop winfix_trampoline: @@ -57,6 +49,14 @@ winfix_trampoline: or %g3, 0x7c, %g3 ! Compute offset to branch wrpr %g3, %tnpc ! Write it into TNPC done ! Do it to it + +/* ITLB ** ICACHE line 4: Unused... */ + nop + nop + nop + nop + nop + nop nop nop diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c index dd153a24e..1abef824f 100644 --- a/arch/sparc64/kernel/pci.c +++ b/arch/sparc64/kernel/pci.c @@ -1,4 +1,4 @@ -/* $Id: pci.c,v 1.18 2000/10/03 11:31:42 anton Exp $ +/* $Id: pci.c,v 1.19 2000/11/08 04:49:17 davem Exp $ * pci.c: UltraSparc PCI controller support. * * Copyright (C) 1997, 1998, 1999 David S. Miller (davem@redhat.com) @@ -202,12 +202,6 @@ void pcibios_update_irq(struct pci_dev *pdev, int irq) { } -unsigned long resource_fixup(struct pci_dev *pdev, struct resource *res, - unsigned long start, unsigned long size) -{ - return start; -} - void pcibios_fixup_pbus_ranges(struct pci_bus *pbus, struct pbus_set_ranges_data *pranges) { diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 1f3386d53..4534ad59b 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -1,4 +1,4 @@ -/* $Id: process.c,v 1.112 2000/09/06 00:45:01 davem Exp $ +/* $Id: process.c,v 1.113 2000/11/08 08:14:58 davem Exp $ * arch/sparc64/kernel/process.c * * Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu) @@ -647,14 +647,21 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { long retval; - __asm__ __volatile("mov %1, %%g1\n\t" + /* If the parent runs before fn(arg) is called by the child, + * the input registers of this function can be clobbered. + * So we stash 'fn' and 'arg' into global registers which + * will not be modified by the parent. + */ + __asm__ __volatile("mov %4, %%g2\n\t" /* Save FN into global */ + "mov %5, %%g3\n\t" /* Save ARG into global */ + "mov %1, %%g1\n\t" /* Clone syscall nr. */ "mov %2, %%o0\n\t" /* Clone flags. */ "mov 0, %%o1\n\t" /* usp arg == 0 */ "t 0x6d\n\t" /* Linux/Sparc clone(). */ "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */ " mov %%o0, %0\n\t" - "jmpl %4, %%o7\n\t" /* Call the function. */ - " mov %5, %%o0\n\t" /* Set arg in delay. */ + "jmpl %%g2, %%o7\n\t" /* Call the function. */ + " mov %%g3, %%o0\n\t" /* Set arg in delay. */ "mov %3, %%g1\n\t" "t 0x6d\n\t" /* Linux/Sparc exit(). */ /* Notreached by child. 
*/ @@ -662,7 +669,7 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) "=r" (retval) : "i" (__NR_clone), "r" (flags | CLONE_VM), "i" (__NR_exit), "r" (fn), "r" (arg) : - "g1", "o0", "o1", "memory", "cc"); + "g1", "g2", "g3", "o0", "o1", "memory", "cc"); return retval; } diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index 8cb6e7211..1928d5a0d 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -1,4 +1,4 @@ -/* $Id: semaphore.c,v 1.4 2000/10/14 10:09:00 davem Exp $ +/* $Id: semaphore.c,v 1.5 2000/11/10 04:02:03 davem Exp $ * Generic semaphore code. Buyer beware. Do your own * specific changes in <asm/semaphore-helper.h> */ @@ -223,7 +223,7 @@ void down_write_failed_biased(struct rw_semaphore *sem) for (;;) { if (test_and_clear_le_bit(1, &sem->granted)) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!test_le_bit(1, &sem->granted)) schedule(); } @@ -273,7 +273,7 @@ void down_write_failed(struct rw_semaphore *sem) add_wait_queue_exclusive(&sem->wait, &wait); while (sem->count < 0) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE | TASK_EXCLUSIVE); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (sem->count >= 0) break; /* we must attempt to acquire or bias the lock */ schedule(); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 6e7f59309..e1ae982bf 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -1,4 +1,4 @@ -/* $Id: sparc64_ksyms.c,v 1.95 2000/10/30 21:01:40 davem Exp $ +/* $Id: sparc64_ksyms.c,v 1.98 2000/11/13 10:03:32 davem Exp $ * arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) @@ -87,7 +87,6 @@ extern long sparc32_open(const char * filename, int flags, int mode); extern int register_ioctl32_conversion(unsigned int cmd, int (*handler)(unsigned int, unsigned int, unsigned long, struct file *)); extern int unregister_ioctl32_conversion(unsigned int cmd); extern int io_remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot, int space); -extern void __flush_dcache_page(void *addr); extern int __ashrdi3(int, int); @@ -110,25 +109,14 @@ extern void _do_write_unlock(rwlock_t *rw); extern unsigned long phys_base; -/* One thing to note is that the way the symbols of the mul/div - * support routines are named is a mess, they all start with - * a '.' which makes it a bitch to export, here is the trick: - */ - -#define EXPORT_SYMBOL_PRIVATE(sym) \ -extern int __sparc_priv_ ## sym (int) __asm__("__" #sym); \ -const struct module_symbol __export_priv_##sym \ -__attribute__((section("__ksymtab"))) = \ -{ (unsigned long) &__sparc_priv_ ## sym, "__" #sym } - /* used by various drivers */ #ifdef CONFIG_SMP #ifndef SPIN_LOCK_DEBUG /* Out of line rw-locking implementation. 
*/ -EXPORT_SYMBOL_PRIVATE(read_lock); -EXPORT_SYMBOL_PRIVATE(read_unlock); -EXPORT_SYMBOL_PRIVATE(write_lock); -EXPORT_SYMBOL_PRIVATE(write_unlock); +EXPORT_SYMBOL(__read_lock); +EXPORT_SYMBOL(__read_unlock); +EXPORT_SYMBOL(__write_lock); +EXPORT_SYMBOL(__write_unlock); #endif /* Kernel wide locking */ @@ -137,10 +125,10 @@ EXPORT_SYMBOL(kernel_flag); /* Hard IRQ locking */ EXPORT_SYMBOL(global_irq_holder); EXPORT_SYMBOL(synchronize_irq); -EXPORT_SYMBOL_PRIVATE(global_cli); -EXPORT_SYMBOL_PRIVATE(global_sti); -EXPORT_SYMBOL_PRIVATE(global_save_flags); -EXPORT_SYMBOL_PRIVATE(global_restore_flags); +EXPORT_SYMBOL(__global_cli); +EXPORT_SYMBOL(__global_sti); +EXPORT_SYMBOL(__global_save_flags); +EXPORT_SYMBOL(__global_restore_flags); /* Per-CPU information table */ EXPORT_SYMBOL(cpu_data); @@ -163,27 +151,33 @@ EXPORT_SYMBOL(_do_write_unlock); #endif +/* semaphores */ +EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__down_trylock); +EXPORT_SYMBOL(__up); + /* rw semaphores */ EXPORT_SYMBOL_NOVERS(__down_read_failed); EXPORT_SYMBOL_NOVERS(__down_write_failed); EXPORT_SYMBOL_NOVERS(__rwsem_wake); /* Atomic counter implementation. */ -EXPORT_SYMBOL_PRIVATE(atomic_add); -EXPORT_SYMBOL_PRIVATE(atomic_sub); +EXPORT_SYMBOL(__atomic_add); +EXPORT_SYMBOL(__atomic_sub); /* Atomic bit operations. */ -EXPORT_SYMBOL_PRIVATE(test_and_set_bit); -EXPORT_SYMBOL_PRIVATE(test_and_clear_bit); -EXPORT_SYMBOL_PRIVATE(test_and_change_bit); -EXPORT_SYMBOL_PRIVATE(test_and_set_le_bit); -EXPORT_SYMBOL_PRIVATE(test_and_clear_le_bit); +EXPORT_SYMBOL(__test_and_set_bit); +EXPORT_SYMBOL(__test_and_clear_bit); +EXPORT_SYMBOL(__test_and_change_bit); +EXPORT_SYMBOL(__test_and_set_le_bit); +EXPORT_SYMBOL(__test_and_clear_le_bit); EXPORT_SYMBOL(ivector_table); EXPORT_SYMBOL(enable_irq); EXPORT_SYMBOL(disable_irq); -EXPORT_SYMBOL_PRIVATE(flushw_user); +EXPORT_SYMBOL(__flushw_user); EXPORT_SYMBOL(__flush_dcache_page); diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 24c8cd593..9b211d86d 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -1,4 +1,4 @@ -/* $Id: sys_sparc32.c,v 1.165 2000/10/10 04:47:31 davem Exp $ +/* $Id: sys_sparc32.c,v 1.166 2000/11/10 04:49:56 davem Exp $ * sys_sparc32.c: Conversion between 32bit and 64bit native syscalls. 
* * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) @@ -2952,7 +2952,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) return -ENOMEM; new = 1; } - kaddr = (char *)kmap(page); + kaddr = kmap(page); if (new && offset) memset(kaddr, 0, offset); @@ -2967,7 +2967,7 @@ static int copy_strings32(int argc, u32 * argv, struct linux_binprm *bprm) err = copy_from_user(kaddr + offset, (char *)A(str), bytes_to_copy); flush_page_to_ram(page); - kunmap((unsigned long)kaddr); + kunmap(page); if (err) return -EFAULT; diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index 75d5c096e..a5f5411f5 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -601,7 +601,6 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr) int try_port; int ret; struct socket *socket; - struct dentry *dentry; struct inode *inode; struct file *file; @@ -609,8 +608,7 @@ sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr) if(!file) return 0; - dentry = file->f_dentry; - inode = dentry->d_inode; + inode = file->f_dentry->d_inode; socket = &inode->u.socket_i; local.sin_family = AF_INET; diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index fa057936a..77531321d 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -1,4 +1,4 @@ -# $Id: Makefile,v 1.23 2000/07/10 20:57:34 davem Exp $ +# $Id: Makefile,v 1.24 2000/11/01 07:33:47 davem Exp $ # Makefile for Sparc64 library files.. # @@ -8,7 +8,8 @@ OBJS = PeeCeeI.o blockops.o debuglocks.o strlen.o strncmp.o \ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \ VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \ VIScsumcopyusr.o VISsave.o atomic.o rwlock.o bitops.o \ - dec_and_lock.o + dec_and_lock.o U3memcpy.o U3copy_from_user.o U3copy_to_user.o \ + U3copy_in_user.o lib.a: $(OBJS) $(AR) rcs lib.a $(OBJS) diff --git a/arch/sparc64/lib/U3copy_from_user.S b/arch/sparc64/lib/U3copy_from_user.S new file mode 100644 index 000000000..b1003e607 --- /dev/null +++ b/arch/sparc64/lib/U3copy_from_user.S @@ -0,0 +1,500 @@ +/* $Id: U3copy_from_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $ + * U3memcpy.S: UltraSparc-III optimized copy from userspace. + * + * Copyright (C) 1999, 2000 David S. 
Miller (davem@redhat.com) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#undef SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + ba,pt %xcc, U3cfu_fixup; \ + a, b, %o1; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + ba,pt %xcc, U3cfu_fixup; \ + a, b, %o1; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX2(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o1; \ + add %o1, %o4, %o1; \ + ba,pt %xcc, U3cfu_fixup; \ + add %o1, 0x1c0, %o1; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX3(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o1; \ + sll %g3, 6, %g3; \ + add %o1, 0x80, %o1; \ + ba,pt %xcc, U3cfu_fixup; \ + add %o1, %g3, %o1; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX4(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o1; \ + add %o1, 0x40, %o1; \ + ba,pt %xcc, U3cfu_fixup; \ + add %o1, %g3, %o1; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#else +#define ASI_BLK_P 0xf0 +#define FPRS_FEF 0x04 +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#define SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) x,y; +#define EX(x,y,a,b) x,y; +#define EX2(x,y) x,y; +#define EX3(x,y) x,y; +#define EX4(x,y) x,y; +#endif + + /* Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + */ + + .text + .align 32 + + /* The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. + */ + + .globl U3copy_from_user +U3copy_from_user: /* %o0=dst, %o1=src, %o2=len */ +#ifndef __KERNEL__ + /* Save away original 'dst' for memcpy return value. */ + mov %o0, %g3 ! A0 Group +#endif + /* Anything to copy at all? */ + cmp %o2, 0 ! A1 + ble,pn %icc, U3copy_from_user_short_ret! BR + + /* Extremely small copy? */ + cmp %o2, 31 ! A0 Group + ble,pn %icc, U3copy_from_user_short ! BR + + /* Large enough to use unrolled prefetch loops? */ + cmp %o2, 0x100 ! A1 + bge,a,pt %icc, U3copy_from_user_enter ! BR Group + andcc %o0, 0x3f, %g2 ! A0 + + ba,pt %xcc, U3copy_from_user_toosmall ! BR Group + andcc %o0, 0x7, %g2 ! A0 + + .align 32 +U3copy_from_user_short: + /* Copy %o2 bytes from src to dst, one byte at a time. */ + EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group + add %o1, 0x1, %o1 ! A0 + add %o0, 0x1, %o0 ! A1 + subcc %o2, 1, %o2 ! A0 Group + + bg,pt %icc, U3copy_from_user_short ! BR + stb %o3, [%o0 + -1] ! MS Group (1-cycle stall) + +U3copy_from_user_short_ret: +#ifdef __KERNEL__ + retl ! BR Group (0-4 cycle stall) + clr %o0 ! A0 +#else + retl ! BR Group (0-4 cycle stall) + mov %g3, %o0 ! 
A0 +#endif + + /* Here len >= (6 * 64) and condition codes reflect execution + * of "andcc %o0, 0x7, %g2", done by caller. + */ + .align 64 +U3copy_from_user_enter: + /* Is 'dst' already aligned on an 64-byte boundary? */ + be,pt %xcc, 2f ! BR + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x40, %g2 ! A0 Group + sub %g0, %g2, %g2 ! A0 Group + sub %o2, %g2, %o2 ! A0 Group + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + stb %o3, [%o0 + -1] ! MS Group + +2: VISEntryHalf ! MS+MS + and %o1, 0x7, %g1 ! A1 + ba,pt %xcc, U3copy_from_user_begin ! BR + alignaddr %o1, %g0, %o1 ! MS (Break-after) + + .align 64 +U3copy_from_user_begin: + prefetcha [%o1 + 0x000] %asi, #one_read ! MS Group1 + prefetcha [%o1 + 0x040] %asi, #one_read ! MS Group2 + andn %o2, (0x40 - 1), %o4 ! A0 + prefetcha [%o1 + 0x080] %asi, #one_read ! MS Group3 + cmp %o4, 0x140 ! A0 + prefetcha [%o1 + 0x0c0] %asi, #one_read ! MS Group4 + EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8) + bge,a,pt %icc, 1f ! BR + + prefetcha [%o1 + 0x100] %asi, #one_read ! MS Group6 +1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9) + cmp %o4, 0x180 ! A1 + bge,a,pt %icc, 1f ! BR + prefetcha [%o1 + 0x140] %asi, #one_read ! MS Group7 +1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10) + cmp %o4, 0x1c0 ! A1 + bge,a,pt %icc, 1f ! BR + + prefetcha [%o1 + 0x180] %asi, #one_read ! MS Group8 +1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12) + EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12) + faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13) + EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13) + faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15) + EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15) + faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16) + + EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16) + faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18) + EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18) + faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19) + EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19) + + /* We only use the first loop if len > (7 * 64). */ + subcc %o4, 0x1c0, %o4 ! A0 Group17 + bg,pt %icc, U3copy_from_user_loop1 ! BR + add %o1, 0x40, %o1 ! A1 + + add %o4, 0x140, %o4 ! A0 Group18 + ba,pt %xcc, U3copy_from_user_loop2 ! BR + srl %o4, 6, %o3 ! A0 Group19 + nop + nop + nop + nop + nop + + nop + nop + + /* This loop performs the copy and queues new prefetches. + * We drop into the second loop when len <= (5 * 64). Note + * that this (5 * 64) factor has been subtracted from len + * already. + */ +U3copy_from_user_loop1: + EX2(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5) + faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5) + EX2(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6) + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7) + stda %f16, [%o0] ASI_BLK_P ! MS + EX2(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7) + + faligndata %f0, %f2, %f16 ! 
FGA Group12 (7-cycle stall) + EX2(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15) + faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16) + EX2(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16) + faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17) + EX2(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17) + faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18) + EX2(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18) + + faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19) + EX2(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19) + prefetcha [%o1 + 0x180] %asi, #one_read ! MS + faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20) + subcc %o4, 0x40, %o4 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_from_user_loop1 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + +U3copy_from_user_loop2_enter: + mov 5, %o3 ! A1 + + /* This loop performs on the copy, no new prefetches are + * queued. We do things this way so that we do not perform + * any spurious prefetches past the end of the src buffer. + */ +U3copy_from_user_loop2: + EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS + faligndata %f12, %f14, %f28 ! FGA Group2 + EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall) + stda %f16, [%o0] ASI_BLK_P ! MS + EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + + EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS + faligndata %f2, %f4, %f18 ! FGA Group13 + EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS + faligndata %f4, %f6, %f20 ! FGA Group14 + EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS + faligndata %f6, %f8, %f22 ! FGA Group15 + EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS + faligndata %f8, %f10, %f24 ! FGA Group16 + + EX3(ldda [%o1 + 0x040] %asi, %f0) ! AX + faligndata %f10, %f12, %f26 ! FGA Group17 + subcc %o3, 0x01, %o3 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_from_user_loop2 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + + /* Finally we copy the last full 64-byte block. */ +U3copy_from_user_loopfini: + EX3(ldda [%o1 + 0x008] %asi, %f2) ! MS + faligndata %f12, %f14, %f28 ! FGA + EX3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19 + faligndata %f14, %f0, %f30 ! FGA + stda %f16, [%o0] ASI_BLK_P ! MS Group20 + EX3(ldda [%o1 + 0x018] %asi, %f6) ! AX + faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall) + EX3(ldda [%o1 + 0x020] %asi, %f8) ! MS + faligndata %f2, %f4, %f18 ! FGA Group12 + EX3(ldda [%o1 + 0x028] %asi, %f10) ! MS + faligndata %f4, %f6, %f20 ! FGA Group13 + EX3(ldda [%o1 + 0x030] %asi, %f12) ! MS + faligndata %f6, %f8, %f22 ! FGA Group14 + EX3(ldda [%o1 + 0x038] %asi, %f14) ! MS + faligndata %f8, %f10, %f24 ! FGA Group15 + cmp %g1, 0 ! A0 + be,pt %icc, 1f ! BR + add %o0, 0x40, %o0 ! A1 + EX4(ldda [%o1 + 0x040] %asi, %f0) ! MS +1: faligndata %f10, %f12, %f26 ! FGA Group16 + faligndata %f12, %f14, %f28 ! FGA Group17 + faligndata %f14, %f0, %f30 ! FGA Group18 + stda %f16, [%o0] ASI_BLK_P ! MS + add %o0, 0x40, %o0 ! A0 + add %o1, 0x40, %o1 ! A1 + membar #Sync ! MS Group26 (7-cycle stall) + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer just like similar + * code found in U3copy_from_user_toosmall processing. + */ +U3copy_from_user_loopend: + and %o2, 0x3f, %o2 ! A0 Group + andcc %o2, 0x38, %g2 ! A0 Group + be,pn %icc, U3copy_from_user_endcruft ! BR + subcc %g2, 0x8, %g2 ! 
A1 + be,pn %icc, U3copy_from_user_endcruft ! BR Group + cmp %g1, 0 ! A0 + + be,a,pt %icc, 1f ! BR Group + EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS + +1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f0, %f2, %f8 ! FGA Group + std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX) + be,pn %icc, U3copy_from_user_endcruft ! BR + add %o0, 0x8, %o0 ! A0 + EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA + std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX) + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A0 Group + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. + */ +U3copy_from_user_endcruft: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %icc, U3copy_from_user_short_ret + nop + ba,a,pt %xcc, U3copy_from_user_short + + /* If we get here, then 32 <= len < (6 * 64) */ +U3copy_from_user_toosmall: + +#ifdef SMALL_COPY_USES_FPU + + /* Is 'dst' already aligned on an 8-byte boundary? */ + be,pt %xcc, 2f ! BR Group + + /* Compute abs((dst & 7) - 8) into %g2. This is the number + * of bytes to copy to make 'dst' 8-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x8, %g2 ! A0 + sub %g0, %g2, %g2 ! A0 Group (reg-dep) + sub %o2, %g2, %o2 ! A0 Group (reg-dep) + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + stb %o3, [%o0 + -1] ! MS Group + +2: VISEntryHalf ! MS+MS + + /* Compute (len - (len % 8)) into %g2. This is guarenteed + * to be nonzero. + */ + andn %o2, 0x7, %g2 ! A0 Group + + /* You may read this and believe that it allows reading + * one 8-byte longword past the end of src. It actually + * does not, as %g2 is subtracted as loads are done from + * src, so we always stop before running off the end. + * Also, we are guarenteed to have at least 0x10 bytes + * to move here. + */ + sub %g2, 0x8, %g2 ! A0 Group (reg-dep) + alignaddr %o1, %g0, %g1 ! MS (Break-after) + EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall) + add %g1, 0x8, %g1 ! A0 + +1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + + faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall) + std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + be,pn %icc, 2f ! BR + + add %o0, 0x8, %o0 ! A1 + EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall) + std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A1 + + /* Nothing left to copy? */ +2: cmp %o2, 0 ! A0 Group + VISExitHalf ! A0+MS + be,pn %icc, U3copy_from_user_short_ret! BR Group + nop ! A0 + ba,a,pt %xcc, U3copy_from_user_short ! 
BR Group + +#else /* !(SMALL_COPY_USES_FPU) */ + + xor %o1, %o0, %g2 + andcc %g2, 0x7, %g0 + bne,pn %icc, U3copy_from_user_short + andcc %o1, 0x7, %g2 + + be,pt %xcc, 2f + sub %g2, 0x8, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + +1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2) + add %o1, 0x1, %o1 + add %o0, 0x1, %o0 + subcc %g2, 0x1, %g2 + bg,pt %icc, 1b + stb %o3, [%o0 + -1] + +2: andn %o2, 0x7, %g2 + sub %o2, %g2, %o2 + +3: EXNV(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2) + add %o1, 0x8, %o1 + add %o0, 0x8, %o0 + subcc %g2, 0x8, %g2 + bg,pt %icc, 3b + stx %o3, [%o0 + -8] + + cmp %o2, 0 + bne,pn %icc, U3copy_from_user_short + nop + ba,a,pt %xcc, U3copy_from_user_short_ret + +#endif /* !(SMALL_COPY_USES_FPU) */ + +#ifdef __KERNEL__ + .globl U3cfu_fixup +U3cfu_fixup: + /* Since this is copy_from_user(), zero out the rest of the + * kernel buffer. + */ + cmp %o1, 0 + ble,pn %icc, 2f + mov %o1, %g2 + +1: subcc %g2, 1, %g2 + stb %g0, [%o0] + bne,pt %icc, 1b + add %o0, 1, %o0 + +2: retl + mov %o1, %o0 +#endif diff --git a/arch/sparc64/lib/U3copy_in_user.S b/arch/sparc64/lib/U3copy_in_user.S new file mode 100644 index 000000000..0fc169b9d --- /dev/null +++ b/arch/sparc64/lib/U3copy_in_user.S @@ -0,0 +1,531 @@ +/* $Id: U3copy_in_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $ + * U3memcpy.S: UltraSparc-III optimized copy within userspace. + * + * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#undef SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: retl; \ + a, b, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXNV2(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: a, b, %o0; \ + retl; \ + add %o0, 1, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXNV3(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: a, b, %o0; \ + retl; \ + add %o0, 8, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + retl; \ + a, b, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK1(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + add %o4, 0x1c0, %o1; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o1, %o2, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK2(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + sll %o3, 6, %o3; \ + and %o2, (0x40 - 1), %o2; \ + add %o3, 0x80, %o1; \ + retl; \ + add %o1, %o2, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK3(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o2, 0x80, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK4(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o2, 0x40, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#else +#define ASI_AIUS 0x80 +#define ASI_BLK_AIUS 0xf0 +#define FPRS_FEF 0x04 +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#define 
SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) x,y; +#define EXNV2(x,y,a,b) x,y; +#define EXNV3(x,y,a,b) x,y; +#define EX(x,y,a,b) x,y; +#define EXBLK1(x,y) x,y; +#define EXBLK2(x,y) x,y; +#define EXBLK3(x,y) x,y; +#define EXBLK4(x,y) x,y; +#endif + + /* Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + * + * XXX Actually, Cheetah can buffer up to 8 concurrent + * XXX prefetches, revisit this... + */ + + .text + .align 32 + + /* The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. + */ + + .globl U3copy_in_user +U3copy_in_user: /* %o0=dst, %o1=src, %o2=len */ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ + rd %asi, %g1 ! MS Group (4 cycles) + cmp %g1, ASI_AIUS ! A0 Group + bne U3memcpy ! BR + nop ! A1 +#ifndef __KERNEL__ + /* Save away original 'dst' for memcpy return value. */ + mov %o0, %g3 ! A0 Group +#endif + /* Anything to copy at all? */ + cmp %o2, 0 ! A1 + ble,pn %icc, U3copy_in_user_short_ret ! BR + + /* Extremely small copy? */ + cmp %o2, 31 ! A0 Group + ble,pn %icc, U3copy_in_user_short ! BR + + /* Large enough to use unrolled prefetch loops? */ + cmp %o2, 0x100 ! A1 + bge,a,pt %icc, U3copy_in_user_enter ! BR Group + andcc %o0, 0x3f, %g2 ! A0 + + ba,pt %xcc, U3copy_in_user_toosmall ! BR Group + andcc %o0, 0x7, %g2 ! A0 + + .align 32 +U3copy_in_user_short: + /* Copy %o2 bytes from src to dst, one byte at a time. */ + EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group + add %o1, 0x1, %o1 ! A0 + add %o0, 0x1, %o0 ! A1 + subcc %o2, 1, %o2 ! A0 Group + + bg,pt %icc, U3copy_in_user_short ! BR + EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall) + +U3copy_in_user_short_ret: +#ifdef __KERNEL__ + retl ! BR Group (0-4 cycle stall) + clr %o0 ! A0 +#else + retl ! BR Group (0-4 cycle stall) + mov %g3, %o0 ! A0 +#endif + + /* Here len >= (6 * 64) and condition codes reflect execution + * of "andcc %o0, 0x7, %g2", done by caller. + */ + .align 64 +U3copy_in_user_enter: + /* Is 'dst' already aligned on an 64-byte boundary? */ + be,pt %xcc, 2f ! BR + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x40, %g2 ! A0 Group + sub %g0, %g2, %g2 ! A0 Group + sub %o2, %g2, %o2 ! A0 Group + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group + +2: VISEntryHalf ! MS+MS + and %o1, 0x7, %g1 ! A1 + ba,pt %xcc, U3copy_in_user_begin ! BR + alignaddr %o1, %g0, %o1 ! MS (Break-after) + + .align 64 +U3copy_in_user_begin: + prefetch [%o1 + 0x000], #one_read ! MS Group1 + prefetch [%o1 + 0x040], #one_read ! MS Group2 + andn %o2, (0x40 - 1), %o4 ! A0 + prefetch [%o1 + 0x080], #one_read ! MS Group3 + cmp %o4, 0x140 ! A0 + prefetch [%o1 + 0x0c0], #one_read ! 
MS Group4 + EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8) + bge,a,pt %icc, 1f ! BR + + prefetch [%o1 + 0x100], #one_read ! MS Group6 +1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9) + cmp %o4, 0x180 ! A1 + bge,a,pt %icc, 1f ! BR + prefetch [%o1 + 0x140], #one_read ! MS Group7 +1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10) + cmp %o4, 0x1c0 ! A1 + bge,a,pt %icc, 1f ! BR + + prefetch [%o1 + 0x180], #one_read ! MS Group8 +1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12) + EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12) + faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13) + EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13) + faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15) + EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15) + faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16) + + EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16) + faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18) + EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18) + faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19) + EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19) + + /* We only use the first loop if len > (7 * 64). */ + subcc %o4, 0x1c0, %o4 ! A0 Group17 + bg,pt %icc, U3copy_in_user_loop1 ! BR + add %o1, 0x40, %o1 ! A1 + + add %o4, 0x140, %o4 ! A0 Group18 + ba,pt %xcc, U3copy_in_user_loop2 ! BR + srl %o4, 6, %o3 ! A0 Group19 + nop + nop + nop + nop + nop + + nop + nop + + /* This loop performs the copy and queues new prefetches. + * We drop into the second loop when len <= (5 * 64). Note + * that this (5 * 64) factor has been subtracted from len + * already. + */ +U3copy_in_user_loop1: + EXBLK1(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5) + faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5) + EXBLK1(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6) + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7) + EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + EXBLK1(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7) + + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + EXBLK1(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15) + faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16) + EXBLK1(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16) + faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17) + EXBLK1(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17) + faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18) + EXBLK1(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18) + + faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19) + EXBLK1(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19) + prefetch [%o1 + 0x180], #one_read ! MS + faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20) + subcc %o4, 0x40, %o4 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_in_user_loop1 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + +U3copy_in_user_loop2_enter: + mov 5, %o3 ! A1 + + /* This loop performs on the copy, no new prefetches are + * queued. We do things this way so that we do not perform + * any spurious prefetches past the end of the src buffer. + */ +U3copy_in_user_loop2: + EXBLK2(ldda [%o1 + 0x008] %asi, %f2) ! MS + faligndata %f12, %f14, %f28 ! FGA Group2 + EXBLK2(ldda [%o1 + 0x010] %asi, %f4) ! 
MS + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall) + EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + EXBLK2(ldda [%o1 + 0x018] %asi, %f6) ! AX + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + + EXBLK2(ldda [%o1 + 0x020] %asi, %f8) ! MS + faligndata %f2, %f4, %f18 ! FGA Group13 + EXBLK2(ldda [%o1 + 0x028] %asi, %f10) ! MS + faligndata %f4, %f6, %f20 ! FGA Group14 + EXBLK2(ldda [%o1 + 0x030] %asi, %f12) ! MS + faligndata %f6, %f8, %f22 ! FGA Group15 + EXBLK2(ldda [%o1 + 0x038] %asi, %f14) ! MS + faligndata %f8, %f10, %f24 ! FGA Group16 + + EXBLK2(ldda [%o1 + 0x040] %asi, %f0) ! AX + faligndata %f10, %f12, %f26 ! FGA Group17 + subcc %o3, 0x01, %o3 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_in_user_loop2 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + + /* Finally we copy the last full 64-byte block. */ +U3copy_in_user_loopfini: + EXBLK3(ldda [%o1 + 0x008] %asi, %f2) ! MS + faligndata %f12, %f14, %f28 ! FGA + EXBLK3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19 + faligndata %f14, %f0, %f30 ! FGA + EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20 + EXBLK4(ldda [%o1 + 0x018] %asi, %f6) ! AX + faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall) + EXBLK4(ldda [%o1 + 0x020] %asi, %f8) ! MS + faligndata %f2, %f4, %f18 ! FGA Group12 + EXBLK4(ldda [%o1 + 0x028] %asi, %f10) ! MS + faligndata %f4, %f6, %f20 ! FGA Group13 + EXBLK4(ldda [%o1 + 0x030] %asi, %f12) ! MS + faligndata %f6, %f8, %f22 ! FGA Group14 + EXBLK4(ldda [%o1 + 0x038] %asi, %f14) ! MS + faligndata %f8, %f10, %f24 ! FGA Group15 + cmp %g1, 0 ! A0 + be,pt %icc, 1f ! BR + add %o0, 0x40, %o0 ! A1 + EXBLK4(ldda [%o1 + 0x040] %asi, %f0) ! MS +1: faligndata %f10, %f12, %f26 ! FGA Group16 + faligndata %f12, %f14, %f28 ! FGA Group17 + faligndata %f14, %f0, %f30 ! FGA Group18 + EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + add %o0, 0x40, %o0 ! A0 + add %o1, 0x40, %o1 ! A1 + membar #Sync ! MS Group26 (7-cycle stall) + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer just like similar + * code found in U3copy_in_user_toosmall processing. + */ +U3copy_in_user_loopend: + and %o2, 0x3f, %o2 ! A0 Group + andcc %o2, 0x38, %g2 ! A0 Group + be,pn %icc, U3copy_in_user_endcruft ! BR + subcc %g2, 0x8, %g2 ! A1 + be,pn %icc, U3copy_in_user_endcruft ! BR Group + cmp %g1, 0 ! A0 + + be,a,pt %icc, 1f ! BR Group + EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS + +1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f0, %f2, %f8 ! FGA Group + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX) + be,pn %icc, U3copy_in_user_endcruft ! BR + add %o0, 0x8, %o0 ! A0 + EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX) + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A0 Group + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. 
+ */ +U3copy_in_user_endcruft: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %icc, U3copy_in_user_short_ret + nop + ba,a,pt %xcc, U3copy_in_user_short + + /* If we get here, then 32 <= len < (6 * 64) */ +U3copy_in_user_toosmall: + +#ifdef SMALL_COPY_USES_FPU + + /* Is 'dst' already aligned on an 8-byte boundary? */ + be,pt %xcc, 2f ! BR Group + + /* Compute abs((dst & 7) - 8) into %g2. This is the number + * of bytes to copy to make 'dst' 8-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x8, %g2 ! A0 + sub %g0, %g2, %g2 ! A0 Group (reg-dep) + sub %o2, %g2, %o2 ! A0 Group (reg-dep) + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group + +2: VISEntryHalf ! MS+MS + + /* Compute (len - (len % 8)) into %g2. This is guarenteed + * to be nonzero. + */ + andn %o2, 0x7, %g2 ! A0 Group + + /* You may read this and believe that it allows reading + * one 8-byte longword past the end of src. It actually + * does not, as %g2 is subtracted as loads are done from + * src, so we always stop before running off the end. + * Also, we are guarenteed to have at least 0x10 bytes + * to move here. + */ + sub %g2, 0x8, %g2 ! A0 Group (reg-dep) + alignaddr %o1, %g0, %g1 ! MS (Break-after) + EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall) + add %g1, 0x8, %g1 ! A0 + +1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + + faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall) + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + be,pn %icc, 2f ! BR + + add %o0, 0x8, %o0 ! A1 + EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall) + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A1 + + /* Nothing left to copy? */ +2: cmp %o2, 0 ! A0 Group + VISExitHalf ! A0+MS + be,pn %icc, U3copy_in_user_short_ret ! BR Group + nop ! A0 + ba,a,pt %xcc, U3copy_in_user_short ! 
BR Group + +#else /* !(SMALL_COPY_USES_FPU) */ + + xor %o1, %o0, %g2 + andcc %g2, 0x7, %g0 + bne,pn %icc, U3copy_in_user_short + andcc %o1, 0x7, %g2 + + be,pt %xcc, 2f + sub %g2, 0x8, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + +1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2) + add %o1, 0x1, %o1 + add %o0, 0x1, %o0 + subcc %g2, 0x1, %g2 + bg,pt %icc, 1b + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) + +2: andn %o2, 0x7, %g2 + sub %o2, %g2, %o2 + +3: EXNV3(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2) + add %o1, 0x8, %o1 + add %o0, 0x8, %o0 + subcc %g2, 0x8, %g2 + bg,pt %icc, 3b + EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2) + + cmp %o2, 0 + bne,pn %icc, U3copy_in_user_short + nop + ba,a,pt %xcc, U3copy_in_user_short_ret + +#endif /* !(SMALL_COPY_USES_FPU) */ diff --git a/arch/sparc64/lib/U3copy_to_user.S b/arch/sparc64/lib/U3copy_to_user.S new file mode 100644 index 000000000..e08b1290b --- /dev/null +++ b/arch/sparc64/lib/U3copy_to_user.S @@ -0,0 +1,528 @@ +/* $Id: U3copy_to_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $ + * U3memcpy.S: UltraSparc-III optimized copy to userspace. + * + * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#undef SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: retl; \ + a, b, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXNV2(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: a, b, %o0; \ + retl; \ + add %o0, 1, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXNV3(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: a, b, %o0; \ + retl; \ + add %o0, 8, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EX(x,y,a,b) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + retl; \ + a, b, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK1(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + add %o4, 0x1c0, %o1; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o1, %o2, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK2(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + sll %o3, 6, %o3; \ + and %o2, (0x40 - 1), %o2; \ + add %o3, 0x80, %o1; \ + retl; \ + add %o1, %o2, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK3(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o2, 0x80, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#define EXBLK4(x,y) \ +98: x,y; \ + .section .fixup; \ + .align 4; \ +99: VISExitHalf; \ + and %o2, (0x40 - 1), %o2; \ + retl; \ + add %o2, 0x40, %o0; \ + .section __ex_table; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; +#else +#define ASI_AIUS 0x80 +#define ASI_BLK_AIUS 0xf0 +#define FPRS_FEF 0x04 +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#define SMALL_COPY_USES_FPU +#define EXNV(x,y,a,b) x,y; +#define EXNV2(x,y,a,b) x,y; +#define EXNV3(x,y,a,b) x,y; +#define EX(x,y,a,b) x,y; +#define EXBLK1(x,y) x,y; +#define EXBLK2(x,y) x,y; +#define EXBLK3(x,y) x,y; +#define EXBLK4(x,y) x,y; +#endif + + /* 
Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + */ + + .text + .align 32 + + /* The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. + */ + + .globl U3copy_to_user +U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ + rd %asi, %g1 ! MS Group (4 cycles) + cmp %g1, ASI_AIUS ! A0 Group + bne U3memcpy ! BR + nop ! A1 +#ifndef __KERNEL__ + /* Save away original 'dst' for memcpy return value. */ + mov %o0, %g3 ! A0 Group +#endif + /* Anything to copy at all? */ + cmp %o2, 0 ! A1 + ble,pn %icc, U3copy_to_user_short_ret ! BR + + /* Extremely small copy? */ + cmp %o2, 31 ! A0 Group + ble,pn %icc, U3copy_to_user_short ! BR + + /* Large enough to use unrolled prefetch loops? */ + cmp %o2, 0x100 ! A1 + bge,a,pt %icc, U3copy_to_user_enter ! BR Group + andcc %o0, 0x3f, %g2 ! A0 + + ba,pt %xcc, U3copy_to_user_toosmall ! BR Group + andcc %o0, 0x7, %g2 ! A0 + + .align 32 +U3copy_to_user_short: + /* Copy %o2 bytes from src to dst, one byte at a time. */ + ldub [%o1 + 0x00], %o3 ! MS Group + add %o1, 0x1, %o1 ! A0 + add %o0, 0x1, %o0 ! A1 + subcc %o2, 1, %o2 ! A0 Group + + bg,pt %icc, U3copy_to_user_short ! BR + EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall) + +U3copy_to_user_short_ret: +#ifdef __KERNEL__ + retl ! BR Group (0-4 cycle stall) + clr %o0 ! A0 +#else + retl ! BR Group (0-4 cycle stall) + mov %g3, %o0 ! A0 +#endif + + /* Here len >= (6 * 64) and condition codes reflect execution + * of "andcc %o0, 0x7, %g2", done by caller. + */ + .align 64 +U3copy_to_user_enter: + /* Is 'dst' already aligned on an 64-byte boundary? */ + be,pt %xcc, 2f ! BR + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x40, %g2 ! A0 Group + sub %g0, %g2, %g2 ! A0 Group + sub %o2, %g2, %o2 ! A0 Group + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: ldub [%o1 + 0x00], %o3 ! MS (Group) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group + +2: VISEntryHalf ! MS+MS + and %o1, 0x7, %g1 ! A1 + ba,pt %xcc, U3copy_to_user_begin ! BR + alignaddr %o1, %g0, %o1 ! MS (Break-after) + + .align 64 +U3copy_to_user_begin: + prefetch [%o1 + 0x000], #one_read ! MS Group1 + prefetch [%o1 + 0x040], #one_read ! MS Group2 + andn %o2, (0x40 - 1), %o4 ! A0 + prefetch [%o1 + 0x080], #one_read ! MS Group3 + cmp %o4, 0x140 ! A0 + prefetch [%o1 + 0x0c0], #one_read ! MS Group4 + ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8) + bge,a,pt %icc, 1f ! BR + + prefetch [%o1 + 0x100], #one_read ! MS Group6 +1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9) + cmp %o4, 0x180 ! A1 + bge,a,pt %icc, 1f ! BR + prefetch [%o1 + 0x140], #one_read ! MS Group7 +1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10) + cmp %o4, 0x1c0 ! A1 + bge,a,pt %icc, 1f ! 
BR + + prefetch [%o1 + 0x180], #one_read ! MS Group8 +1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12) + ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12) + faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13) + ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13) + faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15) + ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15) + faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16) + + ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16) + faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18) + ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18) + faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19) + ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19) + + /* We only use the first loop if len > (7 * 64). */ + subcc %o4, 0x1c0, %o4 ! A0 Group17 + bg,pt %icc, U3copy_to_user_loop1 ! BR + add %o1, 0x40, %o1 ! A1 + + add %o4, 0x140, %o4 ! A0 Group18 + ba,pt %xcc, U3copy_to_user_loop2 ! BR + srl %o4, 6, %o3 ! A0 Group19 + nop + nop + nop + nop + nop + + nop + nop + + /* This loop performs the copy and queues new prefetches. + * We drop into the second loop when len <= (5 * 64). Note + * that this (5 * 64) factor has been subtracted from len + * already. + */ +U3copy_to_user_loop1: + ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5) + faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5) + ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6) + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7) + EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7) + + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15) + faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16) + ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16) + faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17) + ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17) + faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18) + ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18) + + faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19) + ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19) + prefetch [%o1 + 0x180], #one_read ! MS + faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20) + subcc %o4, 0x40, %o4 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_to_user_loop1 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + +U3copy_to_user_loop2_enter: + mov 5, %o3 ! A1 + + /* This loop performs on the copy, no new prefetches are + * queued. We do things this way so that we do not perform + * any spurious prefetches past the end of the src buffer. + */ +U3copy_to_user_loop2: + ldd [%o1 + 0x008], %f2 ! MS + faligndata %f12, %f14, %f28 ! FGA Group2 + ldd [%o1 + 0x010], %f4 ! MS + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall) + EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + ldd [%o1 + 0x018], %f6 ! AX + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + + ldd [%o1 + 0x020], %f8 ! MS + faligndata %f2, %f4, %f18 ! FGA Group13 + ldd [%o1 + 0x028], %f10 ! MS + faligndata %f4, %f6, %f20 ! FGA Group14 + ldd [%o1 + 0x030], %f12 ! MS + faligndata %f6, %f8, %f22 ! FGA Group15 + ldd [%o1 + 0x038], %f14 ! MS + faligndata %f8, %f10, %f24 ! FGA Group16 + + ldd [%o1 + 0x040], %f0 ! AX + faligndata %f10, %f12, %f26 ! FGA Group17 + subcc %o3, 0x01, %o3 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3copy_to_user_loop2 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + + /* Finally we copy the last full 64-byte block. 
*/ +U3copy_to_user_loopfini: + ldd [%o1 + 0x008], %f2 ! MS + faligndata %f12, %f14, %f28 ! FGA + ldd [%o1 + 0x010], %f4 ! MS Group19 + faligndata %f14, %f0, %f30 ! FGA + EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20 + ldd [%o1 + 0x018], %f6 ! AX + faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall) + ldd [%o1 + 0x020], %f8 ! MS + faligndata %f2, %f4, %f18 ! FGA Group12 + ldd [%o1 + 0x028], %f10 ! MS + faligndata %f4, %f6, %f20 ! FGA Group13 + ldd [%o1 + 0x030], %f12 ! MS + faligndata %f6, %f8, %f22 ! FGA Group14 + ldd [%o1 + 0x038], %f14 ! MS + faligndata %f8, %f10, %f24 ! FGA Group15 + cmp %g1, 0 ! A0 + be,pt %icc, 1f ! BR + add %o0, 0x40, %o0 ! A1 + ldd [%o1 + 0x040], %f0 ! MS +1: faligndata %f10, %f12, %f26 ! FGA Group16 + faligndata %f12, %f14, %f28 ! FGA Group17 + faligndata %f14, %f0, %f30 ! FGA Group18 + EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS + add %o0, 0x40, %o0 ! A0 + add %o1, 0x40, %o1 ! A1 + membar #Sync ! MS Group26 (7-cycle stall) + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer just like similar + * code found in U3copy_to_user_toosmall processing. + */ +U3copy_to_user_loopend: + and %o2, 0x3f, %o2 ! A0 Group + andcc %o2, 0x38, %g2 ! A0 Group + be,pn %icc, U3copy_to_user_endcruft ! BR + subcc %g2, 0x8, %g2 ! A1 + be,pn %icc, U3copy_to_user_endcruft ! BR Group + cmp %g1, 0 ! A0 + + be,a,pt %icc, 1f ! BR Group + ldd [%o1 + 0x00], %f0 ! MS + +1: ldd [%o1 + 0x08], %f2 ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f0, %f2, %f8 ! FGA Group + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX) + be,pn %icc, U3copy_to_user_endcruft ! BR + add %o0, 0x8, %o0 ! A0 + ldd [%o1 + 0x08], %f0 ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX) + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A0 Group + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. + */ +U3copy_to_user_endcruft: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %icc, U3copy_to_user_short_ret + nop + ba,a,pt %xcc, U3copy_to_user_short + + /* If we get here, then 32 <= len < (6 * 64) */ +U3copy_to_user_toosmall: + +#ifdef SMALL_COPY_USES_FPU + + /* Is 'dst' already aligned on an 8-byte boundary? */ + be,pt %xcc, 2f ! BR Group + + /* Compute abs((dst & 7) - 8) into %g2. This is the number + * of bytes to copy to make 'dst' 8-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x8, %g2 ! A0 + sub %g0, %g2, %g2 ! A0 Group (reg-dep) + sub %o2, %g2, %o2 ! A0 Group (reg-dep) + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group + +2: VISEntryHalf ! MS+MS + + /* Compute (len - (len % 8)) into %g2. This is guarenteed + * to be nonzero. + */ + andn %o2, 0x7, %g2 ! A0 Group + + /* You may read this and believe that it allows reading + * one 8-byte longword past the end of src. 
It actually + * does not, as %g2 is subtracted as loads are done from + * src, so we always stop before running off the end. + * Also, we are guarenteed to have at least 0x10 bytes + * to move here. + */ + sub %g2, 0x8, %g2 ! A0 Group (reg-dep) + alignaddr %o1, %g0, %g1 ! MS (Break-after) + ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall) + add %g1, 0x8, %g1 ! A0 + +1: ldd [%g1 + 0x00], %f2 ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + + faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall) + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + be,pn %icc, 2f ! BR + + add %o0, 0x8, %o0 ! A1 + ldd [%g1 + 0x00], %f0 ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall) + EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A1 + + /* Nothing left to copy? */ +2: cmp %o2, 0 ! A0 Group + VISExitHalf ! A0+MS + be,pn %icc, U3copy_to_user_short_ret ! BR Group + nop ! A0 + ba,a,pt %xcc, U3copy_to_user_short ! BR Group + +#else /* !(SMALL_COPY_USES_FPU) */ + + xor %o1, %o0, %g2 + andcc %g2, 0x7, %g0 + bne,pn %icc, U3copy_to_user_short + andcc %o1, 0x7, %g2 + + be,pt %xcc, 2f + sub %g2, 0x8, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + +1: ldub [%o1 + 0x00], %o3 + add %o1, 0x1, %o1 + add %o0, 0x1, %o0 + subcc %g2, 0x1, %g2 + bg,pt %icc, 1b + EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) + +2: andn %o2, 0x7, %g2 + sub %o2, %g2, %o2 + +3: ldx [%o1 + 0x00], %o3 + add %o1, 0x8, %o1 + add %o0, 0x8, %o0 + subcc %g2, 0x8, %g2 + bg,pt %icc, 3b + EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2) + + cmp %o2, 0 + bne,pn %icc, U3copy_to_user_short + nop + ba,a,pt %xcc, U3copy_to_user_short_ret + +#endif /* !(SMALL_COPY_USES_FPU) */ diff --git a/arch/sparc64/lib/U3memcpy.S b/arch/sparc64/lib/U3memcpy.S new file mode 100644 index 000000000..d38289145 --- /dev/null +++ b/arch/sparc64/lib/U3memcpy.S @@ -0,0 +1,409 @@ +/* $Id: U3memcpy.S,v 1.2 2000/11/01 09:29:19 davem Exp $ + * U3memcpy.S: UltraSparc-III optimized memcpy. + * + * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#undef SMALL_COPY_USES_FPU +#else +#define ASI_BLK_P 0xf0 +#define FPRS_FEF 0x04 +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#define SMALL_COPY_USES_FPU +#endif + + /* Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + */ + + .text + .align 32 + + /* The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. + */ + + .globl U3memcpy +U3memcpy: /* %o0=dst, %o1=src, %o2=len */ +#ifndef __KERNEL__ + /* Save away original 'dst' for memcpy return value. */ + mov %o0, %g3 ! A0 Group +#endif + /* Anything to copy at all? */ + cmp %o2, 0 ! A1 + ble,pn %icc, U3memcpy_short_ret ! BR + + /* Extremely small copy? */ + cmp %o2, 31 ! 
A0 Group + ble,pn %icc, U3memcpy_short ! BR + + /* Large enough to use unrolled prefetch loops? */ + cmp %o2, 0x100 ! A1 + bge,a,pt %icc, U3memcpy_enter ! BR Group + andcc %o0, 0x3f, %g2 ! A0 + + ba,pt %xcc, U3memcpy_toosmall ! BR Group + andcc %o0, 0x7, %g2 ! A0 + + .align 32 +U3memcpy_short: + /* Copy %o2 bytes from src to dst, one byte at a time. */ + ldub [%o1 + 0x00], %o3 ! MS Group + add %o1, 0x1, %o1 ! A0 + add %o0, 0x1, %o0 ! A1 + subcc %o2, 1, %o2 ! A0 Group + + bg,pt %icc, U3memcpy_short ! BR + stb %o3, [%o0 + -1] ! MS Group (1-cycle stall) + +U3memcpy_short_ret: +#ifdef __KERNEL__ + retl ! BR Group (0-4 cycle stall) + clr %o0 ! A0 +#else + retl ! BR Group (0-4 cycle stall) + mov %g3, %o0 ! A0 +#endif + + /* Here len >= (6 * 64) and condition codes reflect execution + * of "andcc %o0, 0x7, %g2", done by caller. + */ + .align 64 +U3memcpy_enter: + /* Is 'dst' already aligned on an 64-byte boundary? */ + be,pt %xcc, 2f ! BR + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x40, %g2 ! A0 Group + sub %g0, %g2, %g2 ! A0 Group + sub %o2, %g2, %o2 ! A0 Group + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: ldub [%o1 + 0x00], %o3 ! MS (Group) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + stb %o3, [%o0 + -1] ! MS Group + +2: VISEntryHalf ! MS+MS + and %o1, 0x7, %g1 ! A1 + ba,pt %xcc, U3memcpy_begin ! BR + alignaddr %o1, %g0, %o1 ! MS (Break-after) + + .align 64 +U3memcpy_begin: + prefetch [%o1 + 0x000], #one_read ! MS Group1 + prefetch [%o1 + 0x040], #one_read ! MS Group2 + andn %o2, (0x40 - 1), %o4 ! A0 + prefetch [%o1 + 0x080], #one_read ! MS Group3 + cmp %o4, 0x140 ! A0 + prefetch [%o1 + 0x0c0], #one_read ! MS Group4 + ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8) + bge,a,pt %icc, 1f ! BR + + prefetch [%o1 + 0x100], #one_read ! MS Group6 +1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9) + cmp %o4, 0x180 ! A1 + bge,a,pt %icc, 1f ! BR + prefetch [%o1 + 0x140], #one_read ! MS Group7 +1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10) + cmp %o4, 0x1c0 ! A1 + bge,a,pt %icc, 1f ! BR + + prefetch [%o1 + 0x180], #one_read ! MS Group8 +1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12) + ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12) + faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13) + ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13) + faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15) + ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15) + faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16) + + ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16) + faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18) + ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18) + faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19) + ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19) + + /* We only use the first loop if len > (7 * 64). */ + subcc %o4, 0x1c0, %o4 ! A0 Group17 + bg,pt %icc, U3memcpy_loop1 ! BR + add %o1, 0x40, %o1 ! A1 + + add %o4, 0x140, %o4 ! A0 Group18 + ba,pt %xcc, U3memcpy_loop2 ! BR + srl %o4, 6, %o3 ! A0 Group19 + nop + nop + nop + nop + nop + + nop + nop + + /* This loop performs the copy and queues new prefetches. + * We drop into the second loop when len <= (5 * 64). Note + * that this (5 * 64) factor has been subtracted from len + * already. + */ +U3memcpy_loop1: + ldd [%o1 + 0x008], %f2 ! 
MS Group2 (%f2 results at G5) + faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5) + ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6) + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7) + stda %f16, [%o0] ASI_BLK_P ! MS + ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7) + + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15) + faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16) + ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16) + faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17) + ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17) + faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18) + ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18) + + faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19) + ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19) + prefetch [%o1 + 0x180], #one_read ! MS + faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20) + subcc %o4, 0x40, %o4 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3memcpy_loop1 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + +U3memcpy_loop2_enter: + mov 5, %o3 ! A1 + + /* This loop performs on the copy, no new prefetches are + * queued. We do things this way so that we do not perform + * any spurious prefetches past the end of the src buffer. + */ +U3memcpy_loop2: + ldd [%o1 + 0x008], %f2 ! MS + faligndata %f12, %f14, %f28 ! FGA Group2 + ldd [%o1 + 0x010], %f4 ! MS + faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall) + stda %f16, [%o0] ASI_BLK_P ! MS + ldd [%o1 + 0x018], %f6 ! AX + faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall) + + ldd [%o1 + 0x020], %f8 ! MS + faligndata %f2, %f4, %f18 ! FGA Group13 + ldd [%o1 + 0x028], %f10 ! MS + faligndata %f4, %f6, %f20 ! FGA Group14 + ldd [%o1 + 0x030], %f12 ! MS + faligndata %f6, %f8, %f22 ! FGA Group15 + ldd [%o1 + 0x038], %f14 ! MS + faligndata %f8, %f10, %f24 ! FGA Group16 + + ldd [%o1 + 0x040], %f0 ! AX + faligndata %f10, %f12, %f26 ! FGA Group17 + subcc %o3, 0x01, %o3 ! A0 + add %o1, 0x40, %o1 ! A1 + bg,pt %xcc, U3memcpy_loop2 ! BR + add %o0, 0x40, %o0 ! A0 Group18 + + /* Finally we copy the last full 64-byte block. */ +U3memcpy_loopfini: + ldd [%o1 + 0x008], %f2 ! MS + faligndata %f12, %f14, %f28 ! FGA + ldd [%o1 + 0x010], %f4 ! MS Group19 + faligndata %f14, %f0, %f30 ! FGA + stda %f16, [%o0] ASI_BLK_P ! MS Group20 + ldd [%o1 + 0x018], %f6 ! AX + faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall) + ldd [%o1 + 0x020], %f8 ! MS + faligndata %f2, %f4, %f18 ! FGA Group12 + ldd [%o1 + 0x028], %f10 ! MS + faligndata %f4, %f6, %f20 ! FGA Group13 + ldd [%o1 + 0x030], %f12 ! MS + faligndata %f6, %f8, %f22 ! FGA Group14 + ldd [%o1 + 0x038], %f14 ! MS + faligndata %f8, %f10, %f24 ! FGA Group15 + cmp %g1, 0 ! A0 + be,pt %icc, 1f ! BR + add %o0, 0x40, %o0 ! A1 + ldd [%o1 + 0x040], %f0 ! MS +1: faligndata %f10, %f12, %f26 ! FGA Group16 + faligndata %f12, %f14, %f28 ! FGA Group17 + faligndata %f14, %f0, %f30 ! FGA Group18 + stda %f16, [%o0] ASI_BLK_P ! MS + add %o0, 0x40, %o0 ! A0 + add %o1, 0x40, %o1 ! A1 + membar #Sync ! MS Group26 (7-cycle stall) + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer just like similar + * code found in U3memcpy_toosmall processing. + */ +U3memcpy_loopend: + and %o2, 0x3f, %o2 ! A0 Group + andcc %o2, 0x38, %g2 ! A0 Group + be,pn %icc, U3memcpy_endcruft ! BR + subcc %g2, 0x8, %g2 ! 
A1 + be,pn %icc, U3memcpy_endcruft ! BR Group + cmp %g1, 0 ! A0 + + be,a,pt %icc, 1f ! BR Group + ldd [%o1 + 0x00], %f0 ! MS + +1: ldd [%o1 + 0x08], %f2 ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f0, %f2, %f8 ! FGA Group + std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX) + be,pn %icc, U3memcpy_endcruft ! BR + add %o0, 0x8, %o0 ! A0 + ldd [%o1 + 0x08], %f0 ! MS Group + add %o1, 0x8, %o1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA + std %f8, [%o0 + 0x00] ! MS (XXX does it stall here? XXX) + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A0 Group + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. + */ +U3memcpy_endcruft: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %icc, U3memcpy_short_ret + nop + ba,a,pt %xcc, U3memcpy_short + + /* If we get here, then 32 <= len < (6 * 64) */ +U3memcpy_toosmall: + +#ifdef SMALL_COPY_USES_FPU + + /* Is 'dst' already aligned on an 8-byte boundary? */ + be,pt %xcc, 2f ! BR Group + + /* Compute abs((dst & 7) - 8) into %g2. This is the number + * of bytes to copy to make 'dst' 8-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %g2, 0x8, %g2 ! A0 + sub %g0, %g2, %g2 ! A0 Group (reg-dep) + sub %o2, %g2, %o2 ! A0 Group (reg-dep) + + /* Copy %g2 bytes from src to dst, one byte at a time. */ +1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles) + add %o1, 0x1, %o1 ! A1 + add %o0, 0x1, %o0 ! A0 Group + subcc %g2, 0x1, %g2 ! A1 + + bg,pt %icc, 1b ! BR Group + stb %o3, [%o0 + -1] ! MS Group + +2: VISEntryHalf ! MS+MS + + /* Compute (len - (len % 8)) into %g2. This is guarenteed + * to be nonzero. + */ + andn %o2, 0x7, %g2 ! A0 Group + + /* You may read this and believe that it allows reading + * one 8-byte longword past the end of src. It actually + * does not, as %g2 is subtracted as loads are done from + * src, so we always stop before running off the end. + * Also, we are guarenteed to have at least 0x10 bytes + * to move here. + */ + sub %g2, 0x8, %g2 ! A0 Group (reg-dep) + alignaddr %o1, %g0, %g1 ! MS (Break-after) + ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall) + add %g1, 0x8, %g1 ! A0 + +1: ldd [%g1 + 0x00], %f2 ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + subcc %g2, 0x8, %g2 ! A0 Group + + faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall) + std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + be,pn %icc, 2f ! BR + + add %o0, 0x8, %o0 ! A1 + ldd [%g1 + 0x00], %f0 ! MS Group + add %g1, 0x8, %g1 ! A0 + sub %o2, 0x8, %o2 ! A1 + + subcc %g2, 0x8, %g2 ! A0 Group + faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall) + std %f8, [%o0 + 0x00] ! MS Group (2-cycle stall) + add %o1, 0x8, %o1 ! A0 + + bne,pn %icc, 1b ! BR + add %o0, 0x8, %o0 ! A1 + + /* Nothing left to copy? */ +2: cmp %o2, 0 ! A0 Group + VISExitHalf ! A0+MS + be,pn %icc, U3memcpy_short_ret ! BR Group + nop ! A0 + ba,a,pt %xcc, U3memcpy_short ! 
BR Group + +#else /* !(SMALL_COPY_USES_FPU) */ + + xor %o1, %o0, %g2 + andcc %g2, 0x7, %g0 + bne,pn %icc, U3memcpy_short + andcc %o1, 0x7, %g2 + + be,pt %xcc, 2f + sub %g2, 0x8, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + +1: ldub [%o1 + 0x00], %o3 + add %o1, 0x1, %o1 + add %o0, 0x1, %o0 + subcc %g2, 0x1, %g2 + bg,pt %icc, 1b + stb %o3, [%o0 + -1] + +2: andn %o2, 0x7, %g2 + sub %o2, %g2, %o2 + +3: ldx [%o1 + 0x00], %o3 + add %o1, 0x8, %o1 + add %o0, 0x8, %o0 + subcc %g2, 0x8, %g2 + bg,pt %icc, 3b + stx %o3, [%o0 + -8] + + cmp %o2, 0 + bne,pn %icc, U3memcpy_short + nop + ba,a,pt %xcc, U3memcpy_short_ret + +#endif /* !(SMALL_COPY_USES_FPU) */ diff --git a/arch/sparc64/lib/VIScopy.S b/arch/sparc64/lib/VIScopy.S index 56634f83f..b944a0ae7 100644 --- a/arch/sparc64/lib/VIScopy.S +++ b/arch/sparc64/lib/VIScopy.S @@ -1,4 +1,4 @@ -/* $Id: VIScopy.S,v 1.23 2000/03/26 09:13:49 davem Exp $ +/* $Id: VIScopy.S,v 1.25 2000/11/01 09:29:19 davem Exp $ * VIScopy.S: High speed copy operations utilizing the UltraSparc * Visual Instruction Set. * @@ -361,6 +361,38 @@ bcopy: or %o0, 0, %g3 ! IEU0 Group clr %o0 ! IEU0 +#ifdef __KERNEL__ +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define ULTRA3_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + srl %g1, 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl cheetah_patch_copyops +cheetah_patch_copyops: + ULTRA3_DO_PATCH(memcpy, U3memcpy) + ULTRA3_DO_PATCH(__copy_from_user, U3copy_from_user) + ULTRA3_DO_PATCH(__copy_to_user, U3copy_to_user) + ULTRA3_DO_PATCH(__copy_in_user, U3copy_in_user) + retl + nop +#undef BRANCH_ALWAYS +#undef NOP +#undef ULTRA3_DO_PATCH +#endif /* __KERNEL__ */ + .align 32 #ifdef __KERNEL__ __memcpy_384plus: diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 65fbd6e37..6da2d0b85 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1,4 +1,4 @@ -/* $Id: init.c,v 1.157 2000/10/19 00:49:52 davem Exp $ +/* $Id: init.c,v 1.159 2000/11/06 06:59:04 davem Exp $ * arch/sparc64/mm/init.c * * Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu) @@ -99,6 +99,20 @@ int do_check_pgt_cache(int low, int high) return freed; } +extern void __update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); + +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) +{ + struct page *page = pte_page(pte); + + if (VALID_PAGE(page) && page->mapping && + test_bit(PG_dcache_dirty, &page->flags)) { + __flush_dcache_page(page->virtual, 1); + clear_bit(PG_dcache_dirty, &page->flags); + } + __update_mmu_cache(vma, address, pte); +} + /* * BAD_PAGE is the page that is used for page faults when linux * is out-of-memory. Older versions of linux just did a diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S index 7940218d2..daaf580a0 100644 --- a/arch/sparc64/mm/ultra.S +++ b/arch/sparc64/mm/ultra.S @@ -1,4 +1,4 @@ -/* $Id: ultra.S,v 1.46 2000/08/05 13:30:33 davem Exp $ +/* $Id: ultra.S,v 1.48 2000/11/06 06:59:04 davem Exp $ * ultra.S: Don't expand these all over the place... * * Copyright (C) 1997, 2000 David S. 
Miller (davem@redhat.com) @@ -208,27 +208,58 @@ iflush2:sub %o1, 0x20, %g3 .align 64 .globl __flush_dcache_page -__flush_dcache_page: +__flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */ sub %o0, %g4, %o0 - clr %o1 + clr %o4 srlx %o0, 11, %o0 sethi %hi(1 << 14), %o2 -1: ldxa [%o1] ASI_DCACHE_TAG, %o3 - andn %o3, 0x3, %o3 - cmp %o0, %o3 - bne,pt %xcc, 2f - nop - stxa %g0, [%o1] ASI_DCACHE_TAG - membar #Sync -2: add %o1, (1 << 5), %o1 - cmp %o1, %o2 - bne,pt %xcc, 1b - nop +1: ldxa [%o4] ASI_DCACHE_TAG, %o3 ! LSU Group + add %o4, (1 << 5), %o4 ! IEU0 + ldxa [%o4] ASI_DCACHE_TAG, %g1 ! LSU Group + add %o4, (1 << 5), %o4 ! IEU0 + ldxa [%o4] ASI_DCACHE_TAG, %g2 ! LSU Group o3 available + add %o4, (1 << 5), %o4 ! IEU0 + andn %o3, 0x3, %o3 ! IEU1 + ldxa [%o4] ASI_DCACHE_TAG, %g3 ! LSU Group + add %o4, (1 << 5), %o4 ! IEU0 + andn %g1, 0x3, %g1 ! IEU1 + cmp %o0, %o3 ! IEU1 Group + be,a,pn %xcc, dflush1 ! CTI + sub %o4, (4 << 5), %o4 ! IEU0 (Group) + cmp %o0, %g1 ! IEU1 Group + andn %g2, 0x3, %g2 ! IEU0 + be,a,pn %xcc, dflush2 ! CTI + sub %o4, (3 << 5), %o4 ! IEU0 (Group) + cmp %o0, %g2 ! IEU1 Group + andn %g3, 0x3, %g3 ! IEU0 + be,a,pn %xcc, dflush3 ! CTI + sub %o4, (2 << 5), %o4 ! IEU0 (Group) + cmp %o0, %g3 ! IEU1 Group + be,a,pn %xcc, dflush4 ! CTI + sub %o4, (1 << 5), %o4 ! IEU0 +2: cmp %o4, %o2 ! IEU1 Group + bne,pt %xcc, 1b ! CTI + nop ! IEU0 + /* The I-cache does not snoop local stores so we - * better flush that too. + * better flush that too when necessary. */ - ba,pt %xcc, __flush_icache_page + brnz,pt %o1, __flush_icache_page sllx %o0, 11, %o0 + retl + nop + +dflush1:stxa %g0, [%o4] ASI_DCACHE_TAG + add %o4, (1 << 5), %o4 +dflush2:stxa %g0, [%o4] ASI_DCACHE_TAG + add %o4, (1 << 5), %o4 +dflush3:stxa %g0, [%o4] ASI_DCACHE_TAG + add %o4, (1 << 5), %o4 +dflush4:stxa %g0, [%o4] ASI_DCACHE_TAG + add %o4, (1 << 5), %o4 + membar #Sync + ba,pt %xcc, 2b + nop .align 32 __prefill_dtlb: @@ -250,8 +281,8 @@ __prefill_itlb: retl wrpr %g7, %pstate - .globl update_mmu_cache -update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */ + .globl __update_mmu_cache +__update_mmu_cache: /* %o0=vma, %o1=address, %o2=pte */ ldub [%g6 + AOFF_task_thread + AOFF_thread_fault_code], %o3 srlx %o1, 13, %o1 ldx [%o0 + 0x0], %o4 /* XXX vma->vm_mm */ diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c index 0e899da18..4a10c1b4c 100644 --- a/arch/sparc64/solaris/ioctl.c +++ b/arch/sparc64/solaris/ioctl.c @@ -464,8 +464,8 @@ static inline int solaris_S(struct file *filp, unsigned int fd, unsigned int cmd struct sol_socket_struct *sock; struct module_info *mi; - if (! (ino = filp->f_dentry->d_inode) || - ! ino->i_sock) + ino = filp->f_dentry->d_inode; + if (! ino->i_sock) return -EBADF; sock = filp->private_data; if (! 
sock) { diff --git a/arch/sparc64/solaris/socket.c b/arch/sparc64/solaris/socket.c index 3013d43cf..9b910a633 100644 --- a/arch/sparc64/solaris/socket.c +++ b/arch/sparc64/solaris/socket.c @@ -265,7 +265,7 @@ extern __inline__ struct socket *sockfd_lookup(int fd, int *err) } inode = file->f_dentry->d_inode; - if (!inode || !inode->i_sock || !socki_lookup(inode)) { + if (!inode->i_sock || !socki_lookup(inode)) { *err = -ENOTSOCK; fput(file); return NULL; diff --git a/arch/sparc64/vmlinux.lds b/arch/sparc64/vmlinux.lds index f686decfb..91d4575d0 100644 --- a/arch/sparc64/vmlinux.lds +++ b/arch/sparc64/vmlinux.lds @@ -35,6 +35,9 @@ SECTIONS __ksymtab : { *(__ksymtab) } __stop___ksymtab = .; __kstrtab : { *(.kstrtab) } + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; . = ALIGN(8192); __init_begin = .; .text.init : { *(.text.init) } |
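
Editor's note on the cheetah_patch_copyops change in VIScopy.S above: ULTRA3_DO_PATCH rewrites the first two instructions of memcpy and the __copy_{from,to,in}_user entry points so they branch straight to the new UltraSPARC-III routines. The sketch below is only an illustration of the arithmetic the macro performs, not code from this patch; the helper name ultra3_patch_words and the C framing are invented here, and it assumes (as the macro itself does) a forward branch whose word offset fits in the 19-bit displacement field of the branch instruction.

    #include <stdint.h>

    #define BRANCH_ALWAYS 0x10680000u   /* "branch always, predict taken" template, disp19 = 0 */
    #define NOP           0x01000000u   /* canonical SPARC nop (sethi 0, %g0) */

    /* Hypothetical helper: compute the two instruction words that
     * ULTRA3_DO_PATCH stores at 'site' (the old entry point, e.g. memcpy)
     * to redirect execution to 'target' (e.g. U3memcpy).  Mirrors the
     * 32-bit sethi/%lo arithmetic used by the macro.
     */
    static void ultra3_patch_words(uint32_t site, uint32_t target,
                                   uint32_t *insn0, uint32_t *insn1)
    {
            uint32_t disp19 = (target - site) >> 2;  /* byte delta -> word delta */

            *insn0 = BRANCH_ALWAYS | disp19;         /* branch always to 'target' */
            *insn1 = NOP;                            /* fill the delay slot */
    }

The real macro finishes with a flush of the patched address so the I-cache observes the rewritten words; the sketch above only models the encoding, under the stated assumption that the displacement is small and positive.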