/* memcmp - compare memory
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */

#include "../asmdefs.h"

/* Interface, as established by the code in this file:
 *
 *   In:     x0 = first buffer, x1 = second buffer, x2 = byte count.
 *   Out:    w0 = 0 when all bytes match.  On a mismatch found by the
 *           word/doubleword paths the result is -1 or 1; on a mismatch
 *           found by the byte loop it is the difference of the two
 *           differing bytes (each zero-extended).  In both cases the sign
 *           follows the unsigned order of the first differing byte.
 *   Clobb:  x0-x7 and the NZCV flags.  The stack is not touched.
 *
 * Loads may overlap bytes that were already compared, but every load stays
 * inside the first `count` bytes of its buffer.
 *
 * ENTRY, END and L() come from asmdefs.h, which is not visible here;
 * presumably they open/close the function symbol and form a local label.
 */

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data1h		x4
#define data2		x5
#define data2w		w5
#define data2h		x6
#define tmp1		x7

ENTRY (__memcmp_aarch64)
	/* limit = count - 8.  A borrow (LO) means fewer than 8 bytes.  */
	subs	limit, limit, 8
	b.lo	L(less8)

	/* At least 8 bytes: compare bytes 0-7.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/* limit = count - 16.  For 8-16 bytes it is <= 0, and the last 8 bytes
	   sit at offset `limit` from the already advanced pointers (the load
	   may overlap the 8 bytes just compared).  */
	subs	limit, limit, 8
	b.gt	L(more16)

	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]
	b	L(return)

L(more16):
	/* More than 16 bytes: compare bytes 8-15.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
	   strings.  limit becomes count - 32.  */
	subs	limit, limit, 16
	b.ls	L(last_bytes)

	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
	   try to align, so limit it only to strings larger than 128 bytes
	   (limit > 96 here is equivalent to count > 128).  */
	cmp	limit, 96
	b.ls	L(loop16)

	/* Align src1 and adjust src2 with bytes not yet done.  Both pointers
	   step back by the same amount and limit grows by it, so the bytes
	   stepped over are simply compared again.  */
	and	tmp1, src1, 15
	add	limit, limit, tmp1
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1

	/* Loop performing 16 bytes per iteration using aligned src1.
	   Limit is pre-decremented by 16 and must be larger than zero.
	   Exit if <= 16 bytes left to do or if the data is not equal.  */
	.p2align 4
L(loop16):
	ldp	data1, data1h, [src1], 16
	ldp	data2, data2h, [src2], 16
	subs	limit, limit, 16
	/* Chain: Z stays set only if limit is still > 0 (HI) and both
	   doubleword pairs match.  A failed condition forces NZCV = 0.  */
	ccmp	data1, data2, 0, hi
	ccmp	data1h, data2h, 0, eq
	b.eq	L(loop16)

	/* The loop ended on a mismatch and/or on the count.  Re-test each half
	   so that L(return) receives the differing pair in data1/data2.  */
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2
	b.ne	L(return)

	/* Compare last 1-16 bytes using unaligned access.  limit is in
	   [-15..0] here, so the pointers move back to the final 16 bytes.  */
L(last_bytes):
	add	src1, src1, limit
	add	src2, src2, limit
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Little-endian: reverse the bytes so the lowest-addressed byte is the
	   most significant one and an unsigned compare orders the data by its
	   first differing byte.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	/* L(ret_eq) is also entered from L(less4) with Z set and C set, which
	   yields 0 below (NE is false, LO is false).  */
L(ret_eq):
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
	/* Compare 0-7 bytes.  Limit is [-8..-1].  */
L(less8):
	/* limit = count - 4.  No carry (LO) means fewer than 4 bytes.  */
	adds	limit, limit, 4
	b.lo	L(less4)
	/* The 32-bit loads zero the upper halves of data1/data2, so the 64-bit
	   rev/cmp in L(return) still orders the words correctly.  */
	ldr	data1w, [src1], 4
	ldr	data2w, [src2], 4
	cmp	data1w, data2w
	b.ne	L(return)
	/* Pre-compensate for the add in L(less4), which then leaves the number
	   of bytes still to compare.  */
	sub	limit, limit, 4
L(less4):
	/* limit = bytes left (0-3).  Zero left: return 0 through L(ret_eq);
	   this add also sets the carry that L(ret_eq) depends on.  */
	adds	limit, limit, 4
	b.eq	L(ret_eq)
L(byte_loop):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	/* Bytes left: compare this pair.  None left: force "not equal" so the
	   loop ends and the last pair decides the result.  */
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
	sub	result, data1w, data2w
	ret

END (__memcmp_aarch64)