diff options
author | Connal de Souza <connaldesouza@google.com> | 2023-09-21 12:51:54 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2023-09-21 12:52:45 -0700 |
commit | aa3c949a7f0b4dc6fb4569117b2e3aa50e2cd27a (patch) | |
tree | 4e2459680e6faf54b5f1b4da39e8526499910959 /absl/synchronization | |
parent | 821756c32ee197556905a94910e631721113dbb3 (diff) | |
download | abseil-aa3c949a7f0b4dc6fb4569117b2e3aa50e2cd27a.tar.gz abseil-aa3c949a7f0b4dc6fb4569117b2e3aa50e2cd27a.tar.bz2 abseil-aa3c949a7f0b4dc6fb4569117b2e3aa50e2cd27a.zip |
Optimize CRC32 Extend for large inputs on Arm
This is a temporary workaround for an apparent compiler bug in code generation for the PMULL and PMULL2 instructions. The current hot loop looks like this:
mov w14, #0xef02,
lsl x15, x15, #6,
mov x13, xzr,
movk w14, #0x740e, lsl #16,
sub x15, x15, #0x40,
ldr q4, [x16, #0x4e0],
_LOOP_START:
add x16, x9, x13,
add x17, x12, x13,
fmov d19, x14, <--------- This is loop-invariant and expensive
add x13, x13, #0x40,
cmp x15, x13,
prfm pldl1keep, [x16, #0x140],
prfm pldl1keep, [x17, #0x140],
ldp x18, x0, [x16, #0x40],
crc32cx w10, w10, x18,
ldp x2, x18, [x16, #0x50],
crc32cx w10, w10, x0,
crc32cx w10, w10, x2,
ldp x0, x2, [x16, #0x60],
crc32cx w10, w10, x18,
ldp x18, x16, [x16, #0x70],
pmull2 v5.1q, v1.2d, v4.2d,
pmull2 v6.1q, v0.2d, v4.2d,
pmull2 v7.1q, v2.2d, v4.2d,
pmull2 v16.1q, v3.2d, v4.2d,
ldp q17, q18, [x17, #0x40],
crc32cx w10, w10, x0,
pmull v1.1q, v1.1d, v19.1d,
crc32cx w10, w10, x2,
pmull v0.1q, v0.1d, v19.1d,
crc32cx w10, w10, x18,
pmull v2.1q, v2.1d, v19.1d,
crc32cx w10, w10, x16,
pmull v3.1q, v3.1d, v19.1d,
ldp q20, q21, [x17, #0x60],
eor v1.16b, v17.16b, v1.16b,
eor v0.16b, v18.16b, v0.16b,
eor v1.16b, v1.16b, v5.16b,
eor v2.16b, v20.16b, v2.16b,
eor v0.16b, v0.16b, v6.16b,
eor v3.16b, v21.16b, v3.16b,
eor v2.16b, v2.16b, v7.16b,
eor v3.16b, v3.16b, v16.16b,
b.ne _LOOP_START
There is a redundant fmov that moves the same constant into a Neon register on every loop iteration, for use by the PMULL instructions, even though the PMULL2 instructions already have this constant loaded into Neon registers. After this change, both the PMULL and PMULL2 instructions use the values in q4, and they are not reloaded every iteration. The fmov was expensive because it contends for execution units with the crc32cx instructions. With this change, CRC32 Extend is up to 20% faster for large inputs.
PiperOrigin-RevId: 567391972
Change-Id: I4c8e49750cfa5cc5730c3bb713bd9fd67657804a
Diffstat (limited to 'absl/synchronization')
0 files changed, 0 insertions, 0 deletions