/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14
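
/* G_l/G_h, H_l/H_h and tmp1 deliberately alias registers that already hold
   count, dst, src, srcend and E_l: each alias is only written on paths where
   the original value is no longer needed.  */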

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
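
/* All of the small and medium paths below rely on one trick: load the first
   and last chunks of the buffer up front, then store both; for lengths
   between one and two chunks the stores overlap, which is harmless because
   every load completes before the first store.  An illustrative C sketch of
   the idea (an assumption for exposition only, not part of this build):

     // Copy 16..32 bytes: first 16 and last 16, possibly overlapping.
     static void copy16_32 (char *dst, const char *src, size_t count)
     {
       char head[16], tail[16];
       memcpy (head, src, 16);                  // read everything first
       memcpy (tail, src + count - 16, 16);
       memcpy (dst, head, 16);                  // then write
       memcpy (dst + count - 16, tail, 16);
     }
*/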

ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
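        /* For 16..32 bytes the first and last 16-byte chunks cover the whole
           buffer; when count < 32 the two stores overlap, which is safe since
           both loads are done before either store.  */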
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
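        /* With count < 16, bit 3 of count is set exactly when count >= 8; the
           two 8-byte chunks overlap (fully coinciding when count == 8).  */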
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
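        /* Likewise, with count < 8, bit 2 is set exactly when count >= 4; the
           two 4-byte words overlap (fully coinciding when count == 4).  */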
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
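        /* tmp1 = count / 2 indexes a "middle" byte.  For count == 1 all three
           accesses hit byte 0; for count == 2 they hit bytes 0, 1, 1; for
           count == 3 they hit bytes 0, 1, 2.  Every byte is thus written
           without branching on the exact size.  */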
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
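        /* The first 32 and last 32 bytes are loaded unconditionally; for
           counts up to 64 their stores cover the whole buffer, overlapping in
           the middle.  */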
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
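        /* 65..96 bytes are covered by the first 64 (A, B, E, F) plus the last
           32 (C, D); for 97..128 bytes G and H extend the tail so the last 64
           bytes are written as well.  */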
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)
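
        /* The single unsigned comparison above is the whole memmove check: a
           forward copy is unsafe only when dst lies inside [src, src + count).
           Roughly, in C (illustrative only, not part of this build):

             if ((uintptr_t) dstin - (uintptr_t) src < count)
               copy_backwards ();

           When dst < src the subtraction wraps to a large value, so the
           forward path is taken.  */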

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
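        /* src is moved back by the same amount dst is rounded down, so all
           loop offsets remain 16-byte aligned relative to dst; count is
           biased upwards to compensate and readjusted below.  */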

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
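
        /* Software-pipelined: each iteration stores the four pairs loaded on
           the previous one while fetching the next four, hiding load latency.
           The loop exits with A-D still pending and up to 64 bytes left.  */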

L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
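        /* The pending A-D pairs are drained while the final 64 source bytes
           are loaded into E and reloaded A-C; the dstend-relative stores may
           overlap the dst-relative ones when fewer than 64 bytes remained.  */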
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dstend to 16-byte alignment.  */
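        /* Mirror image of the forward path: the last 16 bytes are copied via
           D, dstend is rounded down and srcend moved back by the same amount
           so the loop offsets stay aligned relative to dstend.  */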
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
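        /* Mirrors L(copy64_from_end): drain the pending A-D pairs while
           loading the first 64 source bytes, then store those relative to
           dstin.  */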
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

END (__memcpy_aarch64)