strrchr.S source code [libc/AOR_v20.02/string/aarch64/strrchr.S]

1	/*
2	* strrchr - find last position of a character in a string.
3	*
4	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	* See https://llvm.org/LICENSE.txt for license information.
6	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	*/
8
9	/ Assumptions:*
10	*
11	* ARMv8-a, AArch64
12	* Neon Available.
13	*/
14
15	#include "../asmdefs.h"
16
17	/ Arguments and results. /
18	#define srcin x0
19	#define chrin w1
20
21	#define result x0
22
23	#define src x2
24	#define tmp1 x3
25	#define wtmp2 w4
26	#define tmp3 x5
27	#define src_match x6
28	#define src_offset x7
29	#define const_m1 x8
30	#define tmp4 x9
31	#define nul_match x10
32	#define chr_match x11
33
34	#define vrepchr v0
35	#define vdata1 v1
36	#define vdata2 v2
37	#define vhas_nul1 v3
38	#define vhas_nul2 v4
39	#define vhas_chr1 v5
40	#define vhas_chr2 v6
41	#define vrepmask_0 v7
42	#define vrepmask_c v16
43	#define vend1 v17
44	#define vend2 v18
45
46	/ Core algorithm.*
47
48	For each 32-byte hunk we calculate a 64-bit syndrome value, with
49	two bits per byte (LSB is always in bits 0 and 1, for both big
50	and little-endian systems). For each tuple, bit 0 is set iff
51	the relevant byte matched the requested character; bit 1 is set
52	iff the relevant byte matched the NUL end of string (we trigger
53	off bit0 for the special case of looking for NUL). Since the bits
54	in the syndrome reflect exactly the order in which things occur
55	in the original string a count_trailing_zeros() operation will
56	identify exactly which byte is causing the termination, and why. /*
57
58	ENTRY (__strrchr_aarch64)
59	/ Magic constant 0x40100401 to allow us to identify which lane*
60	matches the requested byte. Magic constant 0x80200802 used
61	similarly for NUL termination. /*
62	mov wtmp2, #`0x0401`
63	movk wtmp2, #`0x4010`, lsl #`16`
64	dup vrepchr`.16b`, chrin
65	bic src, srcin, #`31` / Work with aligned 32-byte hunks. /
66	dup vrepmask_c`.4s`, wtmp2
67	mov src_offset, #`0`
68	ands tmp1, srcin, #`31`
69	add vrepmask_0`.4s`, vrepmask_c`.4s`, vrepmask_c`.4s` / equiv: lsl #1 /
70	b.eq L(aligned)
71
72	/ Input string is not 32-byte aligned. Rather than forcing*
73	the padding bytes to a safe value, we calculate the syndrome
74	for all the bytes, but then mask off those bits of the
75	syndrome that are related to the padding. /*
76	ld1 {vdata1`.16b`, vdata2`.16b`}, [src], #`32`
77	neg tmp1, tmp1
78	cmeq vhas_nul1`.16b`, vdata1`.16b`, #`0`
79	cmeq vhas_chr1`.16b`, vdata1`.16b`, vrepchr`.16b`
80	cmeq vhas_nul2`.16b`, vdata2`.16b`, #`0`
81	cmeq vhas_chr2`.16b`, vdata2`.16b`, vrepchr`.16b`
82	and vhas_nul1`.16b`, vhas_nul1`.16b`, vrepmask_0`.16b`
83	and vhas_chr1`.16b`, vhas_chr1`.16b`, vrepmask_c`.16b`
84	and vhas_nul2`.16b`, vhas_nul2`.16b`, vrepmask_0`.16b`
85	and vhas_chr2`.16b`, vhas_chr2`.16b`, vrepmask_c`.16b`
86	addp vhas_nul1`.16b`, vhas_nul1`.16b`, vhas_nul2`.16b` // 256->128
87	addp vhas_chr1`.16b`, vhas_chr1`.16b`, vhas_chr2`.16b` // 256->128
88	addp vhas_nul1`.16b`, vhas_nul1`.16b`, vhas_nul1`.16b` // 128->64
89	addp vhas_chr1`.16b`, vhas_chr1`.16b`, vhas_chr1`.16b` // 128->64
90	mov nul_match, vhas_nul1.d[`0`]
91	lsl tmp1, tmp1, #`1`
92	mov const_m1, #~`0`
93	mov chr_match, vhas_chr1.d[`0`]
94	lsr tmp3, const_m1, tmp1
95
96	bic nul_match, nul_match, tmp3 // Mask padding bits.
97	bic chr_match, chr_match, tmp3 // Mask padding bits.
98	cbnz nul_match, L(tail)
99
100	L(loop):
101	cmp chr_match, #`0`
102	csel src_match, src, src_match, ne
103	csel src_offset, chr_match, src_offset, ne
104	L(aligned):
105	ld1 {vdata1`.16b`, vdata2`.16b`}, [src], #`32`
106	cmeq vhas_nul1`.16b`, vdata1`.16b`, #`0`
107	cmeq vhas_chr1`.16b`, vdata1`.16b`, vrepchr`.16b`
108	cmeq vhas_nul2`.16b`, vdata2`.16b`, #`0`
109	cmeq vhas_chr2`.16b`, vdata2`.16b`, vrepchr`.16b`
110	addp vend1`.16b`, vhas_nul1`.16b`, vhas_nul2`.16b` // 256->128
111	and vhas_chr1`.16b`, vhas_chr1`.16b`, vrepmask_c`.16b`
112	and vhas_chr2`.16b`, vhas_chr2`.16b`, vrepmask_c`.16b`
113	addp vhas_chr1`.16b`, vhas_chr1`.16b`, vhas_chr2`.16b` // 256->128
114	addp vend1`.16b`, vend1`.16b`, vend1`.16b` // 128->64
115	addp vhas_chr1`.16b`, vhas_chr1`.16b`, vhas_chr1`.16b` // 128->64
116	mov nul_match, vend1.d[`0`]
117	mov chr_match, vhas_chr1.d[`0`]
118	cbz nul_match, L(loop)
119
120	and vhas_nul1`.16b`, vhas_nul1`.16b`, vrepmask_0`.16b`
121	and vhas_nul2`.16b`, vhas_nul2`.16b`, vrepmask_0`.16b`
122	addp vhas_nul1`.16b`, vhas_nul1`.16b`, vhas_nul2`.16b`
123	addp vhas_nul1`.16b`, vhas_nul1`.16b`, vhas_nul1`.16b`
124	mov nul_match, vhas_nul1.d[`0`]
125
126	L(tail):
127	/ Work out exactly where the string ends. /
128	sub tmp4, nul_match, #`1`
129	eor tmp4, tmp4, nul_match
130	ands chr_match, chr_match, tmp4
131	/ And pick the values corresponding to the last match. /
132	csel src_match, src, src_match, ne
133	csel src_offset, chr_match, src_offset, ne
134
135	/ Count down from the top of the syndrome to find the last match. /
136	clz tmp3, src_offset
137	/ Src_match points beyond the word containing the match, so we can*
138	simply subtract half the bit-offset into the syndrome. Because
139	we are counting down, we need to go back one more character. /*
140	add tmp3, tmp3, #`2`
141	sub result, src_match, tmp3, lsr #`1`
142	/ But if the syndrome shows no match was found, then return NULL. /
143	cmp src_offset, #`0`
144	csel result, result, xzr, ne
145
146	ret
147
148	END (__strrchr_aarch64)
149

source code of libc/AOR_v20.02/string/aarch64/strrchr.S