/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path. This option is not intended for production use. */
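
/* Illustrative build commands (a sketch only; the file and object names are
   assumptions, and any toolchain that runs this file through the C
   preprocessor will do):

     gcc -c strcpy.S -o strcpy.o                   build __strcpy_aarch64
     gcc -c -DBUILD_STPCPY strcpy.S -o stpcpy.o    build __stpcpy_aarch64
     gcc -c -DSTRCPY_TEST_PAGE_CROSS strcpy.S      exercise the slow entry path  */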

/* Arguments and results. */
#define dstin x0
#define srcin x1

/* Locals and temporaries. */
#define src x2
#define dst x3
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
#define len x16
#define to_align x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word. */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
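
/* Illustrative worked example (an added note, not from the original source):
   for a single byte B, (B - 1) & ~B & 0x80 is 0x80 only when B == 0:
     B = 0x00: 0xff & 0xff & 0x80 = 0x80
     B = 0x01: 0x00 & 0xfe & 0x80 = 0x00
     B = 0x80: 0x7f & 0x7f & 0x80 = 0x00
   Word-wide, has_nul = (X - REP8_01) & ~(X | REP8_7f).  Borrows from the
   subtraction can set spurious 0x80 marker bits, but only in bytes that are
   more significant than the first zero byte, so the least significant marker
   always identifies the first NUL (see the big-endian note further down). */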

/* AArch64 systems have a minimum page size of 4k. We can do a quick
   page size check for crossing this boundary on entry and if we
   do not, then we can short-circuit much of the entry code. We
   expect early page-crossing strings to be rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
   predictable, even with random strings.

   We don't bother checking for larger page sizes; the cost of setting
   up the correct page size is just not worth the extra gain from
   a small reduction in the cases taking the slow path. Note that
   we only care about whether the first fetch, which may be
   misaligned, crosses a page boundary - after that we move to aligned
   fetches for the remainder of the string. */

#ifdef STRCPY_TEST_PAGE_CROSS
/* Make everything that isn't Qword aligned look like a page cross. */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
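
/* Illustrative arithmetic (an added note, not from the original source):
   with 4k pages the first, possibly misaligned, 16-byte fetch reads
   srcin..srcin+15.  It can only cross a page boundary when
   (srcin & 4095) > 4080, which is the "cmp tmp2, #(MIN_PAGE_SIZE - 16);
   b.gt" test below: at offset 4080 the last byte fetched is at offset 4095
   (same page), while at offset 4081 it is at offset 4096 (next page). */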

ENTRY (STRCPY)
        /* For moderately short strings, the fastest way to do the copy is to
           calculate the length of the string in the same way as strlen, then
           essentially do a memcpy of the result. This avoids the need for
           multiple byte copies and further means that by the time we
           reach the bulk copy loop we know we can always use DWord
           accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
           with the same source string, so branch prediction is likely to
           always be difficult - we mitigate against this by preferring
           conditional select operations over branches whenever this is
           feasible. */
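        /* Roughly equivalent portable C, added here purely to illustrate the
           strategy (this sketch is an assumption, not part of the original
           code):

               size_t len = strlen (srcin);      find the NUL first
               memcpy (dstin, srcin, len + 1);   then copy, terminator included
               return dstin;                     stpcpy returns dstin + len

           The assembly below fuses the strlen and memcpy steps and handles
           short strings without a separate copy loop. */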
        and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
        mov zeroones, #REP8_01
        and to_align, srcin, #15
        cmp tmp2, #(MIN_PAGE_SIZE - 16)
        neg tmp1, to_align
        /* The first fetch will straddle a (possible) page boundary iff
           srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
           aligned string will never fail the page align check, so will
           always take the fast path. */
        b.gt L(page_cross)

L(page_cross_ok):
        ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
        /* Because we expect the end to be found within 16 characters
           (profiling shows this is the most common case), it's worth
           swapping the bytes now to save having to recalculate the
           termination syndrome later. We preserve data1 and data2
           so that we can re-use the values later on. */
        rev tmp2, data1
        sub tmp1, tmp2, zeroones
        orr tmp2, tmp2, #REP8_7f
        bics has_nul1, tmp1, tmp2
        b.ne L(fp_le8)
        rev tmp4, data2
        sub tmp3, tmp4, zeroones
        orr tmp4, tmp4, #REP8_7f
#else
        sub tmp1, data1, zeroones
        orr tmp2, data1, #REP8_7f
        bics has_nul1, tmp1, tmp2
        b.ne L(fp_le8)
        sub tmp3, data2, zeroones
        orr tmp4, data2, #REP8_7f
#endif
        bics has_nul2, tmp3, tmp4
        b.eq L(bulk_entry)

        /* The string is short (<=16 bytes). We don't know exactly how
           short though, yet. Work out the exact length so that we can
           quickly select the optimal copy strategy. */
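        /* Worked example for the >8-byte case below (added for clarity; the
           numbers are an assumed example): suppose the NUL is at srcin + 11,
           i.e. byte 3 of data2.  Then pos = 24, dst = dstin + 3, and data2 is
           shifted (little-endian: left) by 56 - 24 = 32 bits so that source
           bytes 8..11 sit in its top half.  The first store below writes
           dstin+4..11 (its low half is zero filler), and the second store
           then writes source bytes 0..7 to dstin+0..7, overwriting the
           filler.  Two overlapping 8-byte stores copy the whole 12 bytes,
           NUL included, with no byte loop. */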
L(fp_gt8):
        rev has_nul2, has_nul2
        clz pos, has_nul2
        mov tmp2, #56
        add dst, dstin, pos, lsr #3   /* Bits to bytes. */
        sub pos, tmp2, pos
#ifdef __AARCH64EB__
        lsr data2, data2, pos
#else
        lsl data2, data2, pos
#endif
        str data2, [dst, #1]
        str data1, [dstin]
#ifdef BUILD_STPCPY
        add dstin, dst, #8
#endif
        ret

L(fp_le8):
        rev has_nul1, has_nul1
        clz pos, has_nul1
        add dst, dstin, pos, lsr #3   /* Bits to bytes. */
        subs tmp2, pos, #24           /* Pos in bits. */
        b.lt L(fp_lt4)
#ifdef __AARCH64EB__
        mov tmp2, #56
        sub pos, tmp2, pos
        lsr data2, data1, pos
        lsr data1, data1, #32
#else
        lsr data2, data1, tmp2
#endif
        /* 4->7 bytes to copy. */
        str data2w, [dst, #-3]
        str data1w, [dstin]
#ifdef BUILD_STPCPY
        mov dstin, dst
#endif
        ret
L(fp_lt4):
        cbz pos, L(fp_lt2)
        /* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
        lsr data1, data1, #48
#endif
        strh data1w, [dstin]
        /* Fall-through, one byte (max) to go. */
L(fp_lt2):
        /* Null-terminated string. Last character must be zero! */
        strb wzr, [dst]
#ifdef BUILD_STPCPY
        mov dstin, dst
#endif
        ret

        .p2align 6
        /* Aligning here ensures that the entry code and main loop all lie
           within one 64-byte cache line. */
L(bulk_entry):
        sub to_align, to_align, #16
        stp data1, data2, [dstin]
        sub src, srcin, to_align
        sub dst, dstin, to_align
        b L(entry_no_page_cross)

        /* The inner loop deals with two Dwords at a time. This has a
           slightly higher start-up cost, but we should win quite quickly,
           especially on cores with a high number of issue slots per
           cycle, as we get much better parallelism out of the operations. */
L(main_loop):
        stp data1, data2, [dst], #16
L(entry_no_page_cross):
        ldp data1, data2, [src], #16
        sub tmp1, data1, zeroones
        orr tmp2, data1, #REP8_7f
        sub tmp3, data2, zeroones
        orr tmp4, data2, #REP8_7f
        bic has_nul1, tmp1, tmp2
        bics has_nul2, tmp3, tmp4
        ccmp has_nul1, #0, #0, eq   /* NZCV = 0000 */
        b.eq L(main_loop)

        /* Since we know we are copying at least 16 bytes, the fastest way
           to deal with the tail is to determine the location of the
           trailing NUL, then (re)copy the 16 bytes leading up to that. */
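        /* Illustration (added for clarity; the numbers are an assumed
           example): pos/8 below ends up as the number of bytes in this
           16-byte block up to and including the NUL.  If, say, the NUL is
           byte 3 of data2 (offset 11 in the block), then has_nul1 == 0,
           pos = 8*3 + 72 = 96 and pos/8 = 12, so the ldp/stp pair re-copies
           the 16 source bytes ending at the NUL to the matching destination,
           overlapping bytes that earlier iterations already stored. */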
        cmp has_nul1, #0
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul directly. The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time. */
        csel data1, data1, data2, ne
        rev data1, data1
        sub tmp1, data1, zeroones
        orr tmp2, data1, #REP8_7f
        bic has_nul1, tmp1, tmp2
#else
        csel has_nul1, has_nul1, has_nul2, ne
#endif
        rev has_nul1, has_nul1
        clz pos, has_nul1
        add tmp1, pos, #72
        add pos, pos, #8
        csel pos, pos, tmp1, ne
        add src, src, pos, lsr #3
        add dst, dst, pos, lsr #3
        ldp data1, data2, [src, #-32]
        stp data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
        sub dstin, dst, #1
#endif
        ret

L(page_cross):
        bic src, srcin, #15
        /* Start by loading two words at [srcin & ~15], then forcing the
           bytes that precede srcin to 0xff. This means they never look
           like termination bytes. */
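        /* Worked example (added; assumes the little-endian path): if
           srcin & 15 == 3 then src = srcin - 3, the shift amount is
           (-3*8) & 63 = 40, and the all-ones value becomes
           0x0000000000ffffff, so the three bytes loaded from below srcin
           are forced to 0xff in data1.  When srcin & 15 >= 8 the whole of
           data1 is forced to 0xff by the csinv instead, and the mask
           (now srcin & 7 bytes wide) is applied to data2. */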
        ldp data1, data2, [src]
        lsl tmp1, tmp1, #3   /* Bytes beyond alignment -> bits. */
        tst to_align, #7
        csetm tmp2, ne
#ifdef __AARCH64EB__
        lsl tmp2, tmp2, tmp1   /* Shift (tmp1 & 63). */
#else
        lsr tmp2, tmp2, tmp1   /* Shift (tmp1 & 63). */
#endif
        orr data1, data1, tmp2
        orr data2a, data2, tmp2
        cmp to_align, #8
        csinv data1, data1, xzr, lt
        csel data2, data2, data2a, lt
        sub tmp1, data1, zeroones
        orr tmp2, data1, #REP8_7f
        sub tmp3, data2, zeroones
        orr tmp4, data2, #REP8_7f
        bic has_nul1, tmp1, tmp2
        bics has_nul2, tmp3, tmp4
        ccmp has_nul1, #0, #0, eq   /* NZCV = 0000 */
        b.eq L(page_cross_ok)
        /* We now need to make data1 and data2 look like they've been
           loaded directly from srcin. Do a rotate on the 128-bit value. */
        lsl tmp1, to_align, #3   /* Bytes->bits. */
        neg tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
        lsl data1a, data1, tmp1
        lsr tmp4, data2, tmp2
        lsl data2, data2, tmp1
        orr tmp4, tmp4, data1a
        cmp to_align, #8
        csel data1, tmp4, data2, lt
        rev tmp2, data1
        rev tmp4, data2
        sub tmp1, tmp2, zeroones
        orr tmp2, tmp2, #REP8_7f
        sub tmp3, tmp4, zeroones
        orr tmp4, tmp4, #REP8_7f
#else
        lsr data1a, data1, tmp1
        lsl tmp4, data2, tmp2
        lsr data2, data2, tmp1
        orr tmp4, tmp4, data1a
        cmp to_align, #8
        csel data1, tmp4, data2, lt
        sub tmp1, data1, zeroones
        orr tmp2, data1, #REP8_7f
        sub tmp3, data2, zeroones
        orr tmp4, data2, #REP8_7f
#endif
        bic has_nul1, tmp1, tmp2
        cbnz has_nul1, L(fp_le8)
        bic has_nul2, tmp3, tmp4
        b L(fp_gt8)

END (STRCPY)