/*
 * strcmp for ARMv7
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use the parallel byte add/select
   instructions (uadd8/sel) to optimize the compares.  */
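
/* For orientation only: the word-at-a-time idea used below corresponds
   roughly to the C sketch here.  It is an illustrative sketch, not the
   code's algorithm -- it assumes word-aligned inputs and omits the
   alignment, endianness and 64-bit ldrd handling done in the assembly.
   (strcmp_sketch is a made-up name; uses <stdint.h> and <string.h>.)

     int strcmp_sketch (const char *s1, const char *s2)
     {
       const unsigned char *p1 = (const unsigned char *) s1;
       const unsigned char *p2 = (const unsigned char *) s2;
       // Word loop: assumes both pointers are word aligned.
       for (;;)
         {
           uint32_t d1, d2;
           memcpy (&d1, p1, 4);
           memcpy (&d2, p2, 4);
           // Non-zero iff the words differ or d1 contains a NUL byte
           // (classic bit trick; the assembly uses uadd8/sel instead).
           if ((d1 ^ d2) | ((d1 - 0x01010101u) & ~d1 & 0x80808080u))
             break;
           p1 += 4;
           p2 += 4;
         }
       // Resolve the offending word one byte at a time.
       for (;; p1++, p2++)
         if (*p1 != *p2 || *p1 == 0)
           return *p1 - *p2;
     }
*/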

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0
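
/* When the pre-check is enabled (STRCMP_NO_PRECHECK == 0), the entry code
   is in effect equivalent to this C fragment, returning early on a
   first-byte mismatch or an empty first string:

     if (s1[0] != s2[0] || s1[0] == '\0')
       return (unsigned char) s1[0] - (unsigned char) s2[0];
*/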

/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO		lsl
#define S2LOEQ		lsleq
#define S2HI		lsr
#define MSB		0x000000ff
#define LSB		0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO		lsr
#define S2LOEQ		lsreq
#define S2HI		lsl
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#define MSB		0xff000000
#define LSB		0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
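
/* Notation: S2LO shifts a word towards the string-order low end (the byte
   at the lowest address), S2HI towards the string-order high end, whatever
   the endianness.  LSB is the mask for the first byte of a loaded word in
   string order, MSB the mask for the last byte, and BYTEn_OFFSET is the
   bit position of string byte n within the word.  */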

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


/* Macro to compute and return the result value for word-aligned
   cases.  */
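/* Arguments: \synd is the syndrome word (each non-zero byte marks either a
   difference at that position or a NUL in the first string), \d1 and \d2
   are the data words that produced it, and \restore_r6 is non-zero when
   r6/r7 were saved on the stack and must be reloaded before returning.  */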
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
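	/* rev byte-reverses the syndrome so that clz yields the bit index of
	   the first (lowest-addressed) marked byte; bic ...#7 rounds that
	   index down to a byte boundary so the shifts below move the
	   offending byte of each data word into bits 7:0.  */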
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align 5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
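	/* Exit early if the first bytes differ or src1 starts with a NUL:
	   cmp sets C iff r2 != 0, and only then is r2 compared with r3, so
	   the bne fires when r2 == 0 or r2 != r3.  */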
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
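	/* Shifting the OR of the two pointers left by 29 leaves only their
	   low three bits; if the result is zero both pointers are 8-byte
	   aligned and we can go straight to the double-word loop.  */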
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
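	/* tmp2 is the bit offset of the first wanted byte.  Shifting the
	   all-ones constant by it gives a mask covering the wanted bytes;
	   ORN with that mask forces every byte before the start of the
	   string to 0xff in both data words (non-zero and identical), so
	   those bytes can look neither like a NUL nor like a difference.
	   For example, with a 2-byte misalignment the first two bytes (in
	   string order) of each word are forced to 0xff.  */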
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
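	/* The loop below detects both a difference and a NUL byte with one
	   uadd8/sel pair per word.  uadd8 adds 0xff to each byte of data1
	   and sets that byte's GE flag only when the byte is non-zero; sel
	   then builds the syndrome by picking, per byte, the XOR of the
	   data words where GE is set and 0xff where it is clear.
	   Illustrative C for one 32-bit word (syndrome_of is a made-up
	   name, uint32_t is from <stdint.h>; the hardware does all four
	   byte lanes in parallel):

	     static inline uint32_t syndrome_of (uint32_t d1, uint32_t d2)
	     {
	       uint32_t s = 0;
	       for (int i = 0; i < 4; i++)
	         {
	           uint32_t b1 = (d1 >> (8 * i)) & 0xff;
	           uint32_t b2 = (d2 >> (8 * i)) & 0xff;
	           // GE bit for this byte lane is set iff b1 != 0.
	           s |= ((b1 != 0) ? (b1 ^ b2) : 0xffu) << (8 * i);
	         }
	       return s;   // Zero iff d1 == d2 and d1 has no NUL byte.
	     }

	   A non-zero syndrome therefore flags either the first mismatch or
	   the end of the first string; the epilogue macro then locates the
	   offending byte with clz.  */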
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0.  */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b, 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a, 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

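	/* src1 and src2 have no common word alignment.  If src1 itself is
	   not word aligned, align it downwards, move src2 down by the same
	   amount, and compare the leading one to three bytes of the strings
	   individually (using the bytes of the word just loaded from src1)
	   until src1 reaches a word boundary, exiting early on a mismatch
	   or a NUL.  The lsls #31 puts bit 1 of the misalignment into C and
	   bit 0 into Z to select how many bytes remain.  */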
L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

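	/* tmp1 holds the difference of the last pair of bytes compared
	   (zero when the strings matched up to and including a NUL), which
	   is the final result.  */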
L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

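	/* In L(overlapN), N is the number of useful bytes in each aligned
	   src2 word: src2 was misaligned by 4 - N bytes, so every src1 word
	   is compared against the last N bytes of one src2 word followed by
	   the first 4 - N bytes of the next (in string order).  */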
	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

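	/* Common tail: data1/data2 hold words that are known to differ or to
	   contain a NUL in data1, already shifted and masked so that they
	   line up.  On little-endian they are byte-reversed first so that
	   clz finds the first byte in string order; the result is the
	   difference of the bytes at that position.  */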
L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */