/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using NEON)
    ARM state
    Unaligned accesses are enabled

 */

#include "../asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE 32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET 8     /* PC pipeline compensation. */
#define INSN_SIZE 4

/* Call parameters. */
#define dstin r0
#define src r1
#define count r2

/* Locals. */
#define tmp1 r3
#define dst ip
#define tmp2 r10

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l r2          /* Call-clobbered. */
#define A_h r3          /* Call-clobbered. */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
#define D_l r8
#define D_h r9
#endif

/* Number of lines ahead to pre-fetch data. If you change this the code
   below will need adjustment to compensate. */

#define prefetch_lines 5
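/* With 64-byte cache lines this is 320 bytes of lookahead, which the
   VFP/NEON long-copy loop below keeps in registers d3-d7. */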

#ifdef USE_VFP
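/* cpy_line_vfp copies one 64-byte line using \vreg and d0-d2.  Each
   vstr writes data that the matching vldr loaded on the previous line,
   so the loads run ahead of the stores; the extra reload of \vreg from
   prefetch_lines * 64 bytes ahead keeps the read-ahead going.
   cpy_tail_vfp is identical except that it omits that far reload, for
   the final lines when there is nothing left to read ahead. */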
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

ENTRY (__memcpy_arm)

        mov     dst, dstin      /* Preserve dstin, we need to return it. */
        cmp     count, #64
        bge     L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

L(tail63unaligned):
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
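        /* Computed branch: in ARM state, reading PC gives the address of
           the current instruction plus 8 (PC_OFFSET).  Each vld1/vst1
           pair below is two instructions (8 bytes of code) copying 8
           bytes of data; e.g. count & 0x38 == 56 gives tmp1 = -4, so
           execution falls through to the first pair, while
           count & 0x38 == 0 skips all fourteen instructions. */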
        vld1.8  {d0}, [src]!    /* 14 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go. */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data. May not be aligned. */
        /* Cannot use VFP for unaligned data. */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset. */
        add     pc, pc, tmp1, lsl #1
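        /* Same computed-branch trick, but each ldr/str pair is 8 bytes
           of code per 4 bytes copied, so the offset is doubled with
           lsl #1 and PC_OFFSET and INSN_SIZE are halved in the rsb
           above to compensate. */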

        ldr     tmp1, [src, #-60]       /* 15 words to go. */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go. */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go. */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go. */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go. */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go. */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go. */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go. */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif
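        /* Copy the remaining 0-3 bytes.  The lsls shifts bit 1 of count
           into the carry flag and leaves N and Z reflecting bit 0, so
           the CS forms copy a halfword and the NE forms the last byte. */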

        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch. */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
           that the FP pipeline is much better at streaming loads and
           stores. This is outside the critical loop. */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment. */
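        /* After the rsbs below, tmp2 holds the distance to alignment in
           bits 31:29: MI copies a leading word, then lsls #2 moves the
           remaining bits up so that CS copies a halfword and NE a byte. */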
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        blt     L(tail63aligned)

        cmp     tmp2, #512
        bge     L(cpy_body_long)

L(cpy_body_medium):     /* Count in tmp2. */
#ifdef USE_VFP
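        /* Copy 64 bytes per iteration.  Each vstr writes a register
           loaded a few instructions earlier, keeping loads ahead of the
           stores that consume them, presumably so the FP pipeline can
           overlap them. */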
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     L(done)

L(tail63aligned):       /* Count in tmp2. */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go. */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go. */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go. */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go. */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go. */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go. */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go. */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

L(tail63aligned):       /* Count in tmp2. */
        /* Copy up to 7 d-words of data. Similar to L(tail63unaligned), but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that. The bottom
           six bits still tell us how many bytes are left to copy. */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go. */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go. */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go. */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go. */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go. */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go. */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go. */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

L(done):
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

L(cpy_body_long):       /* Count in tmp2. */

        /* Long copy. We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD. Instead, read some data in advance of the current
           copy position into a register. This should act like a PLD
           operation but we won't have to repeat the transfer. */
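        /* d3-d7 each hold a doubleword from the start of one of the next
           five 64-byte lines; reloading each from prefetch_lines * 64
           bytes ahead inside cpy_line_vfp touches every line before the
           bulk copy reaches it. */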

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       L(cpy_body_medium)
#else
        /* Long copy. Use an SMS (software modulo scheduling, i.e.
           software-pipelined) loop to maximize the I/O bandwidth of the
           core. We don't have enough spare registers to synthesise
           prefetching, so use PLD operations. */
        /* Pre-bias src and dst. */
        sub     src, src, #8
        sub     dst, dst, #8
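        /* B, C and D are callee-saved, so the strd instructions below
           spill them into the frame reserved at cpy_not_short (tmp2
           already lives at [sp, #0]); they are reloaded after the loop. */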
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
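        /* Align the steady-state loop on a 64-byte (cache-line) boundary. */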
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     L(tail63aligned)
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

L(cpy_notaligned):
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     L(tail63unaligned)
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
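        /* dst was brought to 64-bit alignment above, so the stores can
           carry the :64 alignment hint via the ALIGN macro; the loads
           stay unhinted because src may be unaligned. */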
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth. */
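        /* The destination is now doubleword aligned but the source may
           not be; ARM unaligned-access support covers LDR but not LDRD,
           so the source is read with pairs of single-word loads while
           the aligned destination is written with STRD. */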
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     L(tail63unaligned)
        bx      lr

END (__memcpy_arm)