Skip to content

perf: nudge LLVM codegen good to_bytes_be#569

Merged
prestwich merged 1 commit into recmo:main from
pepyakin:pep/to-be-bytes-fast-path
Apr 21, 2026
Merged

perf: nudge LLVM codegen good to_bytes_be#569
prestwich merged 1 commit into recmo:main from
pepyakin:pep/to-be-bytes-fast-path

Conversation

@pepyakin
Copy link
Copy Markdown
Contributor

Special-case BYTES = LIMBS * 8 to compel LLVM to generate the proper machine code.

In the wild, it was observed that LLVM (see footnote 1) generated very silly code on x86-64:

Disassembly on x86-64
;; ── limb3 (rdx) — written BYTE BY BYTE ──
mov    %rdx,%rax                ;  copy limb3
shr    $0x38,%rax               ;  extract byte 0 (bits 63..56)
mov    %al, (%rcx,%rbx)         ;  store byte 0
mov    %rdx,%rax
shr    $0x30,%rax               ;  extract byte 1 (bits 55..48)
mov    %al, 0x1(%rcx,%rbx)      ;  store byte 1
mov    %rdx,%rax
shr    $0x28,%rax               ;  extract byte 2 (bits 47..40)
mov    %al, 0x2(%rcx,%rbx)      ;  store byte 2
mov    %rdx,%rax
shr    $0x20,%rax               ;  extract byte 3 (bits 39..32)
mov    %al, 0x3(%rcx,%rbx)      ;  store byte 3
mov    %edx,%eax
shr    $0x18,%eax               ;  extract byte 4 (bits 31..24)
mov    %al, 0x4(%rcx,%rbx)      ;  store byte 4
mov    %edx,%eax
shr    $0x10,%eax               ;  extract byte 5 (bits 23..16)
mov    %al, 0x5(%rcx,%rbx)      ;  store byte 5
mov    %dh, 0x6(%rcx,%rbx)      ;  store byte 6 (bits 15..8)
mov    %dl, 0x7(%rcx,%rbx)      ;  store byte 7 (bits 7..0)

;; ── limbs 2,1 — bswap+qword ──
bswap  %r15                     ;  byte-swap limb2
mov    %r15, 0x8(%rcx,%rbx)     ;  store 8 bytes at once
bswap  %r14                     ;  byte-swap limb1
mov    %r14, 0x10(%rcx,%rbx)    ;  store 8 bytes at once

;; ── limb0 (r13) — BYTE BY BYTE again ──
mov    %r13,%rax
shr    $0x38,%rax               ;  extract byte 24
mov    %al, 0x18(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x30,%rax               ;  extract byte 25
mov    %al, 0x19(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x28,%rax               ;  extract byte 26
mov    %al, 0x1a(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x20,%rax               ;  extract byte 27
mov    %al, 0x1b(%rcx,%rbx)     ;  store
mov    %eax,%edx
shr    $0x18,%edx               ;  extract byte 28
mov    %dl, 0x1c(%rcx,%rbx)     ;  store
mov    %eax,%edx
shr    $0x10,%edx               ;  extract byte 29
mov    %dl, 0x1d(%rcx,%rbx)     ;  store
mov    %ah, 0x1e(%rcx,%rbx)     ;  store byte 30
mov    %al, 0x1f(%rcx,%rbx)     ;  store byte 31

On M1, the code generated for every limb stored it byte by byte.

This changeset is verified to generate moves + bswap.

Footnotes

  1. rustc 1.94.0-nightly (2026-01-12) with LLVM 21.1.8.

Special-case BYTES = LIMBS * 8 to compel LLVM
to generate the proper machine code.

In the wild, it was observed that LLVM[^1] generated this
on x86-64:

```
;; ── limb3 (rdx) — written BYTE BY BYTE ──
mov    %rdx,%rax                ;  copy limb3
shr    $0x38,%rax               ;  extract byte 0 (bits 63..56)
mov    %al, (%rcx,%rbx)         ;  store byte 0
mov    %rdx,%rax
shr    $0x30,%rax               ;  extract byte 1 (bits 55..48)
mov    %al, 0x1(%rcx,%rbx)      ;  store byte 1
mov    %rdx,%rax
shr    $0x28,%rax               ;  extract byte 2 (bits 47..40)
mov    %al, 0x2(%rcx,%rbx)      ;  store byte 2
mov    %rdx,%rax
shr    $0x20,%rax               ;  extract byte 3 (bits 39..32)
mov    %al, 0x3(%rcx,%rbx)      ;  store byte 3
mov    %edx,%eax
shr    $0x18,%eax               ;  extract byte 4 (bits 31..24)
mov    %al, 0x4(%rcx,%rbx)      ;  store byte 4
mov    %edx,%eax
shr    $0x10,%eax               ;  extract byte 5 (bits 23..16)
mov    %al, 0x5(%rcx,%rbx)      ;  store byte 5
mov    %dh, 0x6(%rcx,%rbx)      ;  store byte 6 (bits 15..8)
mov    %dl, 0x7(%rcx,%rbx)      ;  store byte 7 (bits 7..0)

;; ── limbs 2,1 — bswap+qword ──
bswap  %r15                     ;  byte-swap limb2
mov    %r15, 0x8(%rcx,%rbx)     ;  store 8 bytes at once
bswap  %r14                     ;  byte-swap limb1
mov    %r14, 0x10(%rcx,%rbx)    ;  store 8 bytes at once

;; ── limb0 (r13) — BYTE BY BYTE again ──
mov    %r13,%rax
shr    $0x38,%rax               ;  extract byte 24
mov    %al, 0x18(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x30,%rax               ;  extract byte 25
mov    %al, 0x19(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x28,%rax               ;  extract byte 26
mov    %al, 0x1a(%rcx,%rbx)     ;  store
mov    %r13,%rax
shr    $0x20,%rax               ;  extract byte 27
mov    %al, 0x1b(%rcx,%rbx)     ;  store
mov    %eax,%edx
shr    $0x18,%edx               ;  extract byte 28
mov    %dl, 0x1c(%rcx,%rbx)     ;  store
mov    %eax,%edx
shr    $0x10,%edx               ;  extract byte 29
mov    %dl, 0x1d(%rcx,%rbx)     ;  store
mov    %ah, 0x1e(%rcx,%rbx)     ;  store byte 30
mov    %al, 0x1f(%rcx,%rbx)     ;  store byte 31
```

On M1, the code generated for every limb stored it byte by byte.

This changeset is verified to generate moves + bswap.

[^1]: rustc 1.94.0-nightly (2026-01-12) with LLVM 21.1.8.
@pepyakin pepyakin requested a review from prestwich as a code owner April 21, 2026 15:18
@codspeed-hq
Copy link
Copy Markdown

codspeed-hq Bot commented Apr 21, 2026

Merging this PR will degrade performance by 40.08%

⚡ 26 improved benchmarks
❌ 2 regressed benchmarks
✅ 358 untouched benchmarks

⚠️ Please fix the performance issues or acknowledge them on CodSpeed.

Performance Changes

Benchmark BASE HEAD Efficiency
fmt/binary/128 6.6 ms 5.2 ms +26.99%
fmt/decimal/256 10.3 ms 8.4 ms +21.59%
fmt/decimal/384 15.5 ms 12.7 ms +21.56%
fmt/decimal/192 7.8 ms 6.4 ms +21.3%
fmt/hex/128 2.4 ms 2 ms +17.54%
fmt/decimal/64 1.7 ms 1.2 ms +35.54%
fmt/decimal/512 21.4 ms 17.7 ms +20.77%
fmt/octal/128 2.8 ms 2.4 ms +19.93%
from/f32/128 366 µs 300.1 µs +21.96%
from/f32/192 431.2 µs 362.5 µs +18.96%
from/f32/512 714.4 µs 637.5 µs +12.05%
from/f32/256 482.6 µs 405.6 µs +18.98%
from/f64/384 582.2 µs 528 µs +10.27%
from/f64/128 354 µs 299.3 µs +18.28%
from/f64/256 464.5 µs 402.9 µs +15.28%
from/f32/384 601.9 µs 525.3 µs +14.58%
from/f64/192 418.5 µs 367.6 µs +13.86%
overflowing_pow/64 79.5 µs 29.4 µs ×2.7
overflowing_pow/128 460.4 µs 418.3 µs +10.06%
parse/decimal/256/zero 740.8 µs 340.7 µs ×2.2
... ... ... ... ...

ℹ️ Only the first 20 benchmarks are displayed. Go to the app to view all benchmarks.


Comparing pepyakin:pep/to-be-bytes-fast-path (44419d8) with main (fd26238)

Open in CodSpeed

@prestwich
Copy link
Copy Markdown
Collaborator

decimal parsing regressions seem unimportant to me? cc @DaniPopes

@DaniPopes
Copy link
Copy Markdown
Contributor

DaniPopes commented Apr 21, 2026

looks like due to another compiler version bump? most of these benchmarks don't use this function

1.95 was released on 16 April and last main run was 7 April

@prestwich prestwich merged commit 403227f into recmo:main Apr 21, 2026
18 of 19 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants