summaryrefslogtreecommitdiffstats
path: root/arch/ia64/lib/memset.S
blob: 595720a2dfc1f5fbf1ebfa499c0f18befcaa2536 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
 *
 * Optimized version of the standard memset() function
 *
 * Return:
 *	r8:	address of buffer (the value passed in in0), as the
 *		standard memset() contract requires
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	byte value to use for storing (only the low byte is used)
 *	in2:	length of the buffer in bytes (treated as unsigned)
 *
 * Strategy:
 *	len == 0	return immediately
 *	len <= 16	simple byte-at-a-time counted loop
 *	len >  16	align buf to 8 bytes with st1/st2/st4/st8, store
 *			16 bytes per iteration, then finish the remaining
 *			0..15 bytes from the low 4 bits of len
 *
 * ar.lc is saved on entry and restored on every path that modifies it.
 *
 * Copyright (C) 1999 Hewlett-Packard Co
 * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 */


// arguments
//
#define buf		r32
#define val		r33
#define len		r34

//
// return value
//
#define ret0		r8

//
// local registers
//
#define saved_pfs	r14
#define cnt		r18
#define buf2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22

	.text
	.psr	abi64
	.psr	lsb

	.align	16
	.global	memset
	.proc	memset

memset:
	alloc	saved_pfs=ar.pfs,3,0,0,0	// 3 inputs, no locals/outputs; keeps a copy of ar.pfs
	cmp.eq p8,p0=r0,len	// p8 = (len == 0)
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
	mov ret0=buf		// return the original buffer address; must be
				// captured here, before buf is post-incremented
	;;
	adds tmp=-1,len		// br.cloop runs ar.lc+1 times, so count is len-1
	tbit.nz p6,p0=buf,0	// p6 = buffer starts on an odd address
(p8)	br.ret.spnt.few rp	// zero length: nothing stored, ar.lc not yet touched

	cmp.ltu p7,p0=16,len	// len is unsigned: if len > 16 then long memset
	mux1 val=val,@brcst	// replicate the low byte of val into all 8 bytes
(p7)	br.cond.dptk.few long_memset
	;;
	mov ar.lc=tmp		// initialize lc for small count
	;;			// avoid RAW and WAW on ar.lc
1:				// short memset: 1 to 16 single-byte stores
	st1 [buf]=val,1
	br.cloop.dptk.few 1b
	;;				// avoid RAW on ar.lc
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.few rp	// end of short memset

	// at this point we know we have more than 16 bytes to set
	// so we focus on alignment: each step stores 1, 2, 4 then 8 bytes
	// only when the matching address bit of buf is set, leaving buf
	// 8-byte aligned. At most 15 bytes are consumed here, so len
	// (which is > 16) cannot go negative.
long_memset:
(p6)	st1 [buf]=val,1		// 1-byte aligned
(p6)	adds len=-1,len;;	// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
(p6)	st2 [buf]=val,2		// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
(p6)	st4 [buf]=val,4		// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
(p6)	st8 [buf]=val,8		// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt		// br.cloop count for the 16-byte loop
(p6)	br.cond.dpnt.few .dotail // we have less than 16 bytes left
	;;
	adds buf2=8,buf		// setup second base pointer
	mov ar.lc=tmp
	;;
2:				// 16bytes/iteration
	st8 [buf]=val,16	// low 8 bytes of this 16-byte chunk
	st8 [buf2]=val,16	// high 8 bytes of this 16-byte chunk
	br.cloop.dptk.few 2b
	;;
.dotail:			// tail correction based on len only:
				// bits 3..0 of len select the 8/4/2/1-byte
				// stores; buf points at the first unset byte
	tbit.nz p6,p0=len,3
	;;
(p6)	st8 [buf]=val,8		// at least 8 bytes
	tbit.nz p6,p0=len,2
	;;
(p6)	st4 [buf]=val,4		// at least 4 bytes
	tbit.nz p6,p0=len,1
	;;
(p6)	st2 [buf]=val,2		// at least 2 bytes
	tbit.nz p6,p0=len,0
	mov ar.lc=saved_lc	// restore caller's loop counter
	;;
(p6)	st1 [buf]=val		// only 1 byte left
	br.ret.dptk.few rp
	.endp