summaryrefslogtreecommitdiffstats
path: root/arch/ia64/lib/strlen_user.S
blob: 4f3715d5569430b29deeb8cbe8114337f3b801a1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/*
 * Optimized version of the strlen_user() function
 *
 * Inputs:
 *	in0	address of buffer
 *
 * Outputs:
 *	ret0	0 in case of fault, strlen(buffer)+1 otherwise
 * 
 * Copyright (C) 1998, 1999 Hewlett-Packard Co
 * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
 * Copyright (C) 1998, 1999 Stephane Eranian <eranian@hpl.hp.com>
 *
 * 01/19/99 S.Eranian heavily enhanced version (see details below)
 * 09/24/99 S.Eranian added speculation recovery code
 */

#include <asm/asmmacro.h>

//
// int strlen_user(char *)
// ------------------------
// Returns:
//	- length of string + 1
//	- 0 in case an exception is raised
//
// This is an enhanced version of the basic strlen_user. it includes a 
// combination of compute zero index (czx), parallel comparisons, speculative 
// loads and loop unroll using rotating registers.
//
// General Ideas about the algorithm:
//	  The goal is to look at the string in chunks of 8 bytes.
//	  so we need to do a few extra checks at the beginning because the
//	  string may not be 8-byte aligned. In this case we load the 8byte
//	  quantity which includes the start of the string and mask the unused
//	  bytes with 0xff to avoid confusing czx.
//	  We use speculative loads and software pipelining to hide memory 
//	  latency and do read ahead safely. This way we defer any exception.
//
//	  Because we don't want the kernel to be relying on particular
//	  settings of the DCR register, we provide recovery code in case
//	  speculation fails. The recovery code is going to "redo" the work using
//	  only normal loads. If we still get a fault then we return an
//	  error (ret0=0). Otherwise we return the strlen+1 as usual.
//	  The fact that speculation may fail can be caused, for instance, by
//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
//	  a NaT bit will be set if the translation is not present. The normal
//	  load, on the other hand, will cause the translation to be inserted 
//	  if the mapping exists.
//
//	  It should be noted that we execute recovery code only when we need
//	  to use the data that has been speculatively loaded: we don't execute
//	  recovery code on pure read ahead data.
//
// Remarks:
//	- the cmp r0,r0 is used as a fast way to initialize a predicate 
//	  register to 1. This is required to make sure that we get the parallel
//	  compare correct.
//
//	- we don't use the epilogue counter to exit the loop but we need to set
//	  it to zero beforehand.
//
//	- after the loop we must test for Nat values because neither the 
//	  czx nor cmp instruction raise a NaT consumption fault. We must be
//	  careful not to look too far for a Nat for which we don't care. 
//	  For instance we don't need to look at a NaT in val2 if the zero byte
//	  was in val1.
//
//	- Clearly performance tuning is required.
//
// 
//

#define EX(y,x...)				\
	.section __ex_table,"a";		\
	data4 @gprel(99f);			\
	data4 y-99f;				\
	.previous;				\
99:	x

#define saved_pfs	r11
#define	tmp		r10
#define base		r16
#define orig		r17
#define saved_pr	r18
#define src		r19
#define mask		r20
#define val		r21
#define val1		r22
#define val2		r23


	.text
	.psr abi64
	.psr lsb
	.lsb

GLOBAL_ENTRY(__strlen_user)
	UNW(.prologue)
	UNW(.save ar.pfs, saved_pfs)
	alloc saved_pfs=ar.pfs,11,0,0,8

	.rotr v[2], w[2]	// declares our 4 aliases

	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
	mov orig=in0		// keep trackof initial byte address
	dep src=0,in0,0,3	// src=8byte-aligned in0 address
	UNW(.save pr, saved_pr)
	mov saved_pr=pr		// preserve predicates (rotation)
	;;

	.body

	ld8.s v[1]=[src],8	// load the initial 8bytes (must speculate)
	shl tmp=tmp,3		// multiply by 8bits/byte
	mov mask=-1		// our mask
	;;
	ld8.s w[1]=[src],8	// load next 8 bytes in 2nd pipeline
	cmp.eq p6,p0=r0,r0	// sets p6 (required because of // cmp.and)
	sub tmp=64,tmp		// how many bits to shift our mask on the right
	;;
	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
	;;
	add base=-16,src	// keep track of aligned base
	chk.s v[1], recover	// if already NaT, then directly skip to recover
	or v[1]=v[1],mask	// now we have a safe initial byte pattern
	;;
1:
	ld8.s v[0]=[src],8	// speculatively load next 
	czx1.r val1=v[1]	// search 0 byte from right 
	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
	;;
	ld8.s w[0]=[src],8	// speculatively load next to next
	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
	cmp.eq.and p6,p0=8,val2	// p6 = p6 and mask==8
(p6)	br.wtop.dptk.few 1b	// loop until p6 == 0
	;;
	//
	// We must return try the recovery code iff
	// val1_is_nat || (val1==8 && val2_is_nat)
	//
	// XXX Fixme
	//	- there must be a better way of doing the test
	//
	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
#ifdef notyet
	tnat.nz p6,p7=val1	// test NaT on val1
#else
	tnat.z p7,p6=val1	// test NaT on val1
#endif
(p6)	br.cond.spnt.few recover// jump to recovery if val1 is NaT
	;;
	//
	// if we come here p7 is true, i.e., initialized for // cmp
	//
	cmp.eq.and  p7,p0=8,val1// val1==8?
	tnat.nz.and p7,p0=val2	// test NaT if val2
(p7)	br.cond.spnt.few recover// jump to recovery if val2 is NaT
	;;
(p8)	mov val1=val2		// val2 contains the value
(p8)	adds src=-16,src	// correct position when 3 ahead
(p9)	adds src=-24,src	// correct position when 4 ahead
	;;
	sub ret0=src,orig	// distance from origin
	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// length=now - back -1
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.few rp	// end of normal execution

	//
	// Outlined recovery code when speculation failed
	//
	// This time we don't use speculation and rely on the normal exception
	// mechanism. that's why the loop is not as good as the previous one
	// because read ahead is not possible
	//
	// XXX Fixme
	//	- today we restart from the beginning of the string instead
	//	  of trying to continue where we left off.
	//
recover:
	EX(.Lexit1, ld8 val=[base],8)	// load the initial bytes
	;;
	or val=val,mask			// remask first bytes
	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
	;;
	//
	// ar.ec is still zero here
	//
2:
	EX(.Lexit1, (p6) ld8 val=[base],8)
	;;
	czx1.r val1=val		// search 0 byte from right 
	;;
	cmp.eq p6,p0=8,val1	// val1==8 ?
(p6)	br.wtop.dptk.few 2b	// loop until p6 == 0
	;;
	sub ret0=base,orig	// distance from base
	sub tmp=7,val1		// 7=8-1 because this strlen returns strlen+1
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// length=now - back -1
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.few rp	// end of successful recovery code

	//
	// We failed even on the normal load (called from exception handler)
	//
.Lexit1:
	mov ret0=0
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.few rp
END(__strlen_user)