1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
/*
* arch/alpha/lib/ev6-clear_user.S
* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
*
* Zero user space, handling exceptions as we go.
*
* We have to make sure that $0 is always up-to-date and contains the
* right "bytes left to zero" value (and that it is updated only _after_
* a successful store). There is also some rather minor exception setup
* stuff.
*
* NOTE! This is not directly C-callable, because the calling semantics
* are different:
*
* Inputs:
* length in $0
* destination address in $6
* exception pointer in $7
* procedure value in $27 (the entry ldgp computes $29 from it)
* return address in $28 (exceptions expect it there)
*
* Outputs:
* bytes left to zero in $0 (0 on complete success)
*
* Clobbers:
* $1,$2,$3,$4,$5,$6 ($29 is also rewritten by the entry ldgp)
*
* Much of the information about 21264 scheduling/coding comes from:
* Compiler Writer's Guide for the Alpha 21264
* abbreviated as 'CWG' in other comments here
* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
* Scheduling notation:
* E - either cluster
* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
* Try not to change the actual algorithm if possible for consistency.
* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
* From perusing the source code context where this routine is called, it is
* a fair assumption that significant fractions of entire pages are zeroed, so
* it's going to be worth the effort to hand-unroll a big loop, and use wh64.
* ASSUMPTION:
* The believed purpose of only updating $0 after a store is that a signal
* may come along during the execution of this chunk of code, and we don't
* want to leave a hole (and we also want to avoid repeating lots of work)
*/
/*
 * EX(insn): allow an exception for an insn; exit if we get one.
 *
 * Emits the instruction at local label 99 and then appends one entry to
 * the __ex_table section:
 *   - a 32-bit gp-relative reference to the instruction (.gprel32 99b)
 *   - an lda whose displacement is the distance from the instruction to
 *     the $exception label, with $31 in both register fields
 * .previous then switches back to the section that was active before.
 *
 * Any code using EX() must therefore define a $exception label.
 *
 * NOTE(review): presumably the kernel fault handler looks the faulting
 * PC up in __ex_table and decodes the lda to find the resume address --
 * confirm against the exception-table lookup code, which is not in this
 * file.
 */
#define EX(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.gprel32 99b; \
lda $31, $exception-99b($31); \
.previous
/*
 * __do_clear_user - zero $0 bytes of user memory starting at $6.
 *
 * Not C-callable: length in $0, destination in $6, procedure value in $27
 * (consumed by the entry ldgp), return address in $28.  Returns with $0
 * holding the number of bytes that were not cleared (0 on success).
 *
 * Every memory access that may fault is wrapped in EX(); all of them
 * resume at $exception, which simply returns.  $0 is only decremented
 * after the stores it accounts for, so after a fault it never
 * under-reports the bytes left (in the big loop it may over-report by a
 * couple of quadwords because the updates are staggered).
 *
 * Register roles:
 *   $0  bytes left to clear
 *   $1  quadwords left to clear
 *   $2  (len + head misalignment) & 7, then $6 & 0x3f
 *   $3  biased counter for the 0mod64 alignment loop, then wh64 address
 *   $4  head misalignment, then "another big-loop trip?" predicate
 *   $5  head quadword being masked, then wh64 look-ahead predicate
 *   $6  running destination address
 *
 * Phases: misaligned head (masked ldq_u/stq_u) -> single quads up to a
 * 64-byte boundary -> 64-byte blocks with wh64 -> remaining quads ->
 * remaining bytes (stb).
 */
	.set noat
	.set noreorder
	.align 4
	.globl __do_clear_user
	.ent __do_clear_user
	.frame $30, 0, $28
	.prologue 0
	# Pipeline info : Slotting & Comments
__do_clear_user:
	ldgp $29,0($27) # we do exceptions -- we need the gp.
	# Macro instruction becomes ldah/lda
	# .. .. E E :
	and $6, 7, $4 # .. E .. .. : find dest head misalignment
	beq $0, $zerolength # U .. .. .. : U L U L
	addq $0, $4, $1 # .. .. .. E : bias counter
	and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail
	# Note - we never actually use $2, so this is a moot computation
	# and we can rewrite this later...
	srl $1, 3, $1 # .. E .. .. : number of quadwords to clear
	beq $4, $headalign # U .. .. .. : U L U L
/*
 * Head is not aligned. Write (8 - $4) bytes to head of destination
 * This means $6 is known to be misaligned
 */
	EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in
	beq $1, $onebyte # .. .. U .. : sub-word store?
	mskql $5, $6, $5 # .. U .. .. : take care of misaligned head
	addq $6, 8, $6 # E .. .. .. : L U U L
	EX( stq_u $5, -8($6) ) # .. .. .. L :
	subq $1, 1, $1 # .. .. E .. :
	addq $0, $4, $0 # .. E .. .. : bytes left -= 8 - misalignment
	subq $0, 8, $0 # E .. .. .. : U L U L
/*
 * $6 still carries the original low three address bits here.  The stq_u
 * stores below ignore them, but the stb loop at $onebyte does not, so
 * without this rounding the trailing bytes would be written $4 bytes too
 * far along.  (Slotting of this group has not been re-tuned for the extra
 * instruction.)
 */
	bic $6, 7, $6 # E : round down to the quad following the head
	.align 4
/*
 * (The .align directive ought to be a moot point)
 * values upon initial entry to the loop
 * $1 is number of quadwords to clear (zero is a valid value)
 * $2 is number of trailing bytes (0..7) ($2 never used...)
 * $6 is known to be aligned 0mod8
 */
$headalign:
	subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop
	and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop
	subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop)
	blt $4, $trailquad # U .. .. .. : U L U L
/*
 * We know that we're going to do at least 16 quads, which means we are
 * going to be able to use the large block clear loop at least once.
 * Figure out how many quads we need to clear before we are 0mod64 aligned
 * so we can use the wh64 instruction.
 *
 * The skip test is on $2 ($6 & 0x3f): the biased counter $3 is always in
 * -64..-8 at this point and therefore can never be zero.
 */
	nop # .. .. .. E
	nop # .. .. E ..
	nop # .. E .. ..
	beq $2, $bigalign # U .. .. .. : U L U L : Aligned 0mod64
$alignmod64:
	EX( stq_u $31, 0($6) ) # .. .. .. L
	addq $3, 8, $3 # .. .. E ..
	subq $0, 8, $0 # .. E .. ..
	nop # E .. .. .. : U L U L
	nop # .. .. .. E
	subq $1, 1, $1 # .. .. E ..
	addq $6, 8, $6 # .. E .. ..
	blt $3, $alignmod64 # U .. .. .. : U L U L
$bigalign:
/*
 * $0 is the number of bytes left
 * $1 is the number of quads left
 * $6 is aligned 0mod64
 * we know that we'll be taking a minimum of one trip through
 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 * We are _not_ going to update $0 after every single store. That
 * would be silly, because there will be cross-cluster dependencies
 * no matter how the code is scheduled. By doing it in slightly
 * staggered fashion, we can still do this loop in 5 fetches
 * The worst case will be doing two extra quads in some future execution,
 * in the event of an interrupted clear.
 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 * The wh64 is issued for the starting destination address for trip +2
 * through the loop, and if there are less than two trips left, the target
 * address will be for the current trip.
 */
	nop # E :
	nop # E :
	nop # E :
	bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest
	/* This might actually help for the current trip... */
$do_wh64:
	wh64 ($3) # .. .. .. L1 : memory subsystem hint
	subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop?
	EX( stq_u $31, 0($6) ) # .. L .. ..
	subq $0, 8, $0 # E .. .. .. : U L U L
	addq $6, 128, $3 # E : Target address of wh64
	EX( stq_u $31, 8($6) ) # L :
	EX( stq_u $31, 16($6) ) # L :
	subq $0, 16, $0 # E : U L L U
	nop # E :
	EX( stq_u $31, 24($6) ) # L :
	EX( stq_u $31, 32($6) ) # L :
	subq $0, 168, $5 # E : U L L U : two trips through the loop left?
	/* 168 = 192 - 24, since we've already completed some stores */
	subq $0, 16, $0 # E :
	EX( stq_u $31, 40($6) ) # L :
	EX( stq_u $31, 48($6) ) # L :
	cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle
	subq $1, 8, $1 # E :
	subq $0, 16, $0 # E :
	EX( stq_u $31, 56($6) ) # L :
	nop # E : U L U L
	nop # E :
	subq $0, 8, $0 # E :
	addq $6, 64, $6 # E :
	bge $4, $do_wh64 # U : U L U L
$trailquad:
	# zero to 16 quadwords left to store, plus any trailing bytes
	# $1 is the number of quadwords left to go.
	#
	nop # .. .. .. E
	nop # .. .. E ..
	nop # .. E .. ..
	beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go
$onequad:
	EX( stq_u $31, 0($6) ) # .. .. .. L
	subq $1, 1, $1 # .. .. E ..
	subq $0, 8, $0 # .. E .. ..
	nop # E .. .. .. : U L U L
	nop # .. .. .. E
	nop # .. .. E ..
	addq $6, 8, $6 # .. E .. ..
	bgt $1, $onequad # U .. .. .. : U L U L
	# We have an unknown number of bytes left to go.
$trailbytes:
	nop # .. .. .. E
	nop # .. .. E ..
	nop # .. E .. ..
	beq $0, $zerolength # U .. .. .. : U L U L
	# $0 contains the number of bytes left to clear (1..7)
	# so we will use $0 as the loop counter
	# We know for a fact that $0 > 0 due to previous context:
	# either the check just above, or the entry length check when
	# arriving from the sub-quadword head case.
	# $6 must be the exact byte address here (stb honours all address bits).
$onebyte:
	EX( stb $31, 0($6) ) # .. .. .. L
	subq $0, 1, $0 # .. .. E .. :
	addq $6, 1, $6 # .. E .. .. :
	bgt $0, $onebyte # U .. .. .. : U L U L
$zerolength:
$exception: # Target of every EX() fixup: return with $0 = bytes left
	nop # .. .. .. E :
	nop # .. .. E .. :
	nop # .. E .. .. :
	ret $31, ($28), 1 # L0 .. .. .. : L U L U
	.end __do_clear_user
|