strlen-mte.S source code [libc/AOR_v20.02/string/aarch64/strlen-mte.S]

/*
 * strlen - calculate the length of a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 */

#include "../asmdefs.h"

/* Symbolic names for the registers used by the routine.  Several names
   deliberately share a register because their live ranges do not overlap
   in a way that matters:
     - len shares x0 with srcin (the result replaces the argument);
     - has_nul1 shares x4 with tmp1, has_nul2 shares x5 with tmp2
       (each NUL syndrome is computed in place over its temporary).  */

/* Arguments and results.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8
#define offset		x9

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.  A faster check
   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
   false hits for characters 129..255.  */

/* 0x01 and 0x7f replicated into each of the 8 bytes of a 64-bit word:
   REP8_01 is the per-byte "1" that is subtracted, REP8_7f is the
   per-byte mask that is OR-ed in before the bit-clear.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* This implementation is compatible with Memory Tagging.  All loads
   are 16 bytes in size and 16 bytes aligned.  This also avoids the
   need for page boundary checks.  This implementation is correct
   even without Memory Tagging, but other implementations could be
   more beneficial if Memory Tagging is not enabled.

   First load is aligned down and can contain bytes that are located
   before the string.  This is handled by modifying the "zeroones"
   mask.  The bytes that need to be ignored are set to zero.
   If the string is aligned in such a way that 8 or more bytes from
   the first load should be ignored, there is a special case
   (skip_first_8_bytes) which only compares the second 8 bytes.

   If there is a NUL byte in the first load, we calculate the length
   from the 2 8-byte words using conditional select to reduce branch
   mispredictions.

   If the string is longer than 16 bytes, we check 32 bytes per
   iteration using the fast NUL check (main_loop).  If we encounter
   non-ASCII characters, we fallback to a second loop
   (nonascii_loop) using the full NUL check.  */

/* __strlen_aarch64_mte - length of a NUL-terminated string.

   In:       x0 (srcin)  address of the first byte of the string.
   Out:      x0 (len)    number of bytes that precede the terminating NUL.
   Clobbers: x1-x9 (src, data1, data2, tmp1-tmp4 / has_nul1-2, zeroones,
	     offset) and the NZCV flags.
   Leaf routine: it does not touch the stack and makes no calls.

   Every load is a 16-byte ldp from a 16-byte aligned address.

   Flag protocol used before every length calculation:
	bics	has_nul1, ...		C = 0, Z = (has_nul1 == 0)
	ccmp	has_nul2, 0, 0, eq	has_nul1 != 0: NZCV = 0, so C = 0, Z = 0
					has_nul1 == 0: compare with 0, so C = 1,
					Z = (has_nul2 == 0)
   Afterwards "eq" means no NUL in either word and "cc" means the first
   NUL is in the first word.  */
ENTRY(__strlen_aarch64_mte)
	bic	src, srcin, 15		/* Align down to 16 bytes.  */
	mov	zeroones, REP8_01
	/* (offset & 63) holds number of bits to ignore in a register.  */
	lsl	offset, srcin, 3
	/* Post-decrement src so that the first pre-indexed load in
	   main_loop ([src, 32]!) fetches the next 16-byte block.  */
	ldp	data1, data2, [src], -16
	lsl	tmp1, zeroones, offset	/* Shift (offset & 63).  */
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Bit 3 of the address set: the string starts in the second
	   8-byte word of the aligned block.  */
	tbnz	srcin, 3, L(skip_first_8_bytes)
	/* The shifted mask in tmp1 subtracts 0 from the bytes that lie
	   before the string, so they can never be reported as NUL.  */
	sub	tmp1, data1, tmp1
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If comparison happens, C flag is always set.  */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop)

	/* Enter with C = has_nul1 == 0.  None of the instructions below
	   set flags, so "cc" stays valid for both csel instructions.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	and	tmp2, srcin, 7		/* Bytes to ignore.  */
	rev	has_nul1, has_nul1
	neg	tmp2, tmp2
	clz	tmp1, has_nul1		/* Count bits before NUL.  */
	/* Add 8 if NUL byte is not in first register.  */
	add	tmp3, tmp2, 8
	csel	len, tmp2, tmp3, cc
	add	len, len, tmp1, lsr 3
	ret

L(skip_first_8_bytes):
	/* Only the second word can hold string bytes; tmp1 still holds
	   the shifted zeroones mask.  */
	sub	tmp1, data2, tmp1
	orr	tmp2, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	beq	L(main_loop)

	rev	has_nul1, has_nul1
	lsl	tmp1, has_nul1, offset	/* Ignore bytes before string.  */
	clz	tmp1, tmp1		/* Count bits before NUL.  */
	lsr	len, tmp1, 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop):
	ldp	data1, data2, [src, 32]!
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7	/* Any 0x80 bit set in (X - 1)?  */
	bne	1f
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	add	src, src, 16		/* src = block that was just loaded.  */
1:
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1 and tmp3 still hold data1 - zeroones and data2 - zeroones.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  src addresses the 16-byte block
	   that contains the NUL.  Nothing below sets flags, so "cc" stays
	   valid up to the final csel.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	sub	len, src, srcin		/* Bytes before the current block.  */
	rev	has_nul1, has_nul1
	add	tmp2, len, 8		/* Used if NUL is in the second word.  */
	clz	tmp1, has_nul1		/* Count bits before NUL.  */
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

	/* Accurate NUL check, 16 bytes per load, unrolled twice.  */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

END(__strlen_aarch64_mte)


source code of libc/AOR_v20.02/string/aarch64/strlen-mte.S