arch/ia64/lib/clear_user.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226

/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 * 
 * Copyright (C) 1998, 1999 Hewlett-Packard Co
 * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
 */

#include <asm/asmmacro.h>

//
// arguments
//
// in0: address of the buffer; advanced by the post-incrementing stores
#define buf		r32
// in1: length in bytes; counted down as bytes are cleared, so it always
// holds the number of bytes still to be cleared
#define len		r33

//
// local registers
//
// number of 16-byte iterations of the core loop (len >> 4)
#define cnt		r16
// second store pointer of the core loop, set to buf+8
#define buf2		r17
// copy of ar.lc taken on entry, written back before returning
#define saved_lc	r18
// receives the previous ar.pfs from the alloc instruction
#define saved_pfs	r19
// scratch: iteration count minus one, loaded into ar.lc
#define tmp		r20
// len2/len3: copies of the remaining length maintained alternately in the
// tail; the tail fault handler picks one of them based on p6/p7
#define len2		r21
#define len3		r22

//
// Theory of operations:
//	- we check whether or not the buffer is small, i.e., less than 17
//	  in which case we do the byte by byte loop.
//
//	- Otherwise we go progressively from 1 byte store to 8byte store in
//	  the head part, the body is a 16byte store loop and we finish with the
//	  tail for the last 15 bytes.
//	  The good point about this breakdown is that the long buffer handling
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread which runs concurrently to ours. Another, less likely,
//	example is with device drivers where reading an I/O mapped location may
//	have side effects (same thing for writing).
//

// The EX macro wraps an instruction that may fault.
//
// It places the instruction at local label 99 and adds one entry to the
// __ex_table section made of two 4-byte words:
//	- the gp-relative address of the instruction (label 99)
//	- the distance from the instruction to the fixup label
// Presumably the fault handling code uses this entry to resume execution
// at the fixup label; that code is not part of this file.
//
// The fixup label is the first argument because the wrapped store
// instructions contain commas, which would otherwise confuse the
// preprocessor: everything after the first comma is the instruction.
//
#define EX(fixup,insn...)			\
	.section __ex_table,"a";		\
	data4 @gprel(99f);			\
	data4 fixup-99f;			\
	.previous;				\
99:	insn

	// Everything below is emitted into the code section.
	.text
	// NOTE(review): the two .psr lines presumably declare the 64-bit ABI
	// and little-endian mode for this object, and .lsb presumably selects
	// little-endian byte order for emitted data -- confirm against the
	// assembler documentation.
	.psr abi64
	.psr lsb
	.lsb

//
// __do_clear_user: clear to zero a linear memory buffer in user space.
//
// Inputs:
//	in0 (buf):	address of buffer
//	in1 (len):	length of buffer in bytes
// Output:
//	r8 (ret0):	number of bytes that were not cleared because a store
//			faulted, 0 when the whole buffer was cleared
//
// Structure:
//	- len == 0:	return immediately
//	- len <= 16:	byte by byte loop
//	- len > 16:	head (1/2/4/8 byte stores until buf is 16-byte
//			aligned), core loop (16 bytes per iteration), tail
//			(8/4/2/1 byte stores for the last 0 to 15 bytes)
//
// Every store that may fault is wrapped in the EX macro, which names the
// label execution continues at after a fault. ar.lc is saved on entry and
// written back on every exit path that has modified it.
//
GLOBAL_ENTRY(__do_clear_user)
	UNW(.prologue)
	UNW(.save ar.pfs, saved_pfs)
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	UNW(.save ar.lc, saved_lc)
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	UNW(.body)			// wrapped like the other unwind directives
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.few rp		// ar.lc not touched yet, nothing to restore
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count
					// (executed on the long path as well)
(p6)	br.cond.dptk.few long_do_clear
	;;				// WAR on ar.lc
	//
	// Byte by byte loop: 1 to 16 iterations (0 < len <= 16 here).
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of initializing
	// the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk.few 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: end of the byte by byte loop, also reached when its
	//	    store faults. len contains bytes left (0 on success).
	//
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.few rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to clear
	// so we focus on alignment (no branches required).
	// Each step tests one low bit of buf and, when it is set, stores
	// that many bytes so that buf becomes aligned to the next power
	// of two. After the four steps buf is 16-byte aligned.
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the values of r8.
	//
long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// now 2-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// now 4-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// now 8-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// now 16-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt.few .dotail 	// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing : left= 16*ar.lc + 16.  This would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code.
	//

2:

	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk.few 2b
	;;
	//
	// Tail: clear the last 0 to 15 bytes, based on len only.
	// len is below 16 here: either we came straight from the head with
	// cnt == 0, or the core loop has subtracted 16 for every iteration.
	//
	// ar.lc is restored under the .dotail label so that BOTH ways into
	// the tail restore it. The entry code has already written ar.lc
	// before branching to long_do_clear, so the path that skips the core
	// loop must restore it too, otherwise the successful return below
	// would leave the caller's ar.lc clobbered.
	//
	// Bytes left at each store are kept in len2/len3, used alternately
	// and selected by p6/p7 in .Lexit2:
	//	st8 (p6): len2 = len
	//	st4 (p7): len3 = len & 7
	//	st2 (p6): len2 = len & 3
	//	st1 (p7): len3 = len & 1
	// Each value is computed from len directly, so it is correct even
	// when the preceding store was skipped. The register written in a
	// group is never the one read if the store of that group faults.
	// Every tbit sets p6 and p7 to complementary values, so exactly one
	// of them is set whenever .Lexit2 is reached.
	//
.dotail:
	mov ar.lc=saved_lc		// both entries into the tail restore lc
	mov len2=len			// bytes left if the st8 faults
	tbit.nz p6,p7=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
	and len3=7,len			// bytes left if the st4 faults
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
	and len2=3,len			// bytes left if the st2 faults
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
	and len3=1,len			// bytes left if the st1 faults
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0				// success
	br.ret.dptk.few rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit2: comes from the tail
	// 	if p6 -> coming from st8 or st2 : len2 contains bytes left
	// 	if p7 -> coming from st4 or st1 : len3 contains bytes left
	//
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: comes from the head or from the core loop (and falls
	//	    through from .Lexit2). len contains bytes left.
	//	    ar.lc is always written back: the entry code has modified
	//	    it on every path that can get here.
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.dptk.few rp
END(__do_clear_user)