Conversation

@tarcieri
Member

In #1332 we ran into LLVM inserting branches in this routine for thumbv6m-none-eabi targets. It was "fixed" by fiddling around with black_box but that seems brittle.

In #1334 we attempted a simple portable asm! optimization barrier approach but it did not work as expected.

This instead implements one of the fiddliest bits, mask generation, directly in ARM assembly. The resulting assembly is actually more efficient than what rustc/LLVM outputs and avoids touching the stack pointer.

It's a simple enough function to implement in assembly on other platforms with stable asm! too, but this is a start.
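
For anyone curious what this looks like at the source level, here's a minimal sketch of the mask-generation routine based on the rsbs/sbcs sequence visible in the inlined output further down the thread. The name, signature, and attributes are illustrative and may not match the crate's actual code:

    /// Sketch only: produce an all-ones mask when `condition` is non-zero and
    /// an all-zeroes mask otherwise, with no data-dependent branches.
    #[cfg(target_arch = "arm")]
    #[inline(always)]
    fn masknz32(condition: u8) -> u32 {
        let mut mask = condition as u32;
        unsafe {
            core::arch::asm!(
                // 0 - mask: leaves the carry flag clear iff mask was non-zero
                "rsbs {m}, {m}, #0",
                // mask - mask - !carry: 0xffff_ffff if carry is clear, else 0
                "sbcs {m}, {m}",
                m = inout(reg) mask,
                options(pure, nomem, nostack),
            );
        }
        mask
    }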

@tarcieri
Member Author

@NicsTr this version should hopefully both avoid the side channel and be slightly more efficient. If you wouldn't mind, could you confirm it still addresses the issue from #1332?

@tarcieri tarcieri requested a review from newpavlov January 15, 2026 03:48
@tarcieri tarcieri force-pushed the cmov/optimized-maskgen32-for-arm32 branch 2 times, most recently from 23c9db5 to 077619c on January 15, 2026 03:54
@tarcieri
Member Author

tarcieri commented Jan 15, 2026

In case it's helpful to anyone looking at this, here's the Godbolt playground I used when developing it: https://godbolt.org/z/vc9Tn94bG

Here are the results after LLVM inlines maskgen32/maskgen64:

cmovnz32

Pure Rust

        push    {r4, r6, r7, lr}
        add     r7, sp, #8
        sub     sp, #4
        uxtb    r2, r2
        subs    r3, r2, #1
        sbcs    r2, r3
        str     r2, [sp]
        mov     r2, sp
        ldr     r2, [sp]
        subs    r3, r2, #1
        ldr     r4, [r0]
        ands    r4, r3
        rsbs    r2, r2, #0
        ldr     r1, [r1]
        ands    r1, r2
        orrs    r1, r4
        str     r1, [r0]
        add     sp, #4
        pop     {r4, r6, r7, pc}

asm! optimized

        push    {r7, lr}
        add     r7, sp, #0
        uxtb    r2, r2
        rsbs    r2, r2, #0
        sbcs    r2, r2
        ldr     r3, [r0]
        ands    r3, r2
        ldr     r1, [r1]
        bics    r1, r2
        adds    r1, r1, r3
        str     r1, [r0]
        pop     {r7, pc}
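
For reference, the branch-free merge that consumes such a mask is just a masked select. A minimal sketch (not the crate's exact source; which operand the mask ends up selecting depends on the crate's convention):

    // Sketch only: merge two words under an all-ones/all-zeroes mask.
    // With mask == 0xffff_ffff the result is `a`; with mask == 0 it is `b`.
    // The two masked terms never overlap, so `|` and `+` are interchangeable
    // here, which is why the asm! version can use adds where the pure Rust
    // version uses orrs.
    fn select32(mask: u32, a: u32, b: u32) -> u32 {
        (a & mask) | (b & !mask)
    }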

cmovnz64

Pure Rust

        push    {r4, r5, r7, lr}
        add     r7, sp, #8
        sub     sp, #8
        mov     r5, r1
        mov     r4, r0
        uxtb    r0, r2
        subs    r1, r0, #1
        sbcs    r0, r1
        str     r0, [sp, #4]
        add     r0, sp, #4
        ldr     r0, [sp, #4]
        subs    r0, r0, #1
        movs    r1, #0
        movs    r2, #1
        mov     r3, r2
        bl      __aeabi_lmul
        ldm     r5!, {r2, r3}
        bics    r2, r0
        ldr     r5, [r4]
        ands    r5, r0
        adds    r0, r5, r2
        str     r0, [r4]
        bics    r3, r1
        ldr     r0, [r4, #4]
        ands    r0, r1
        adds    r0, r0, r3
        str     r0, [r4, #4]
        add     sp, #8
        pop     {r4, r5, r7, pc}

asm! optimized

        push    {r4, r5, r7, lr}
        add     r7, sp, #8
        mov     r5, r1
        mov     r4, r0
        uxtb    r0, r2
        rsbs    r0, r0, #0
        sbcs    r0, r0
        movs    r1, #0
        movs    r2, #1
        mov     r3, r2
        bl      __aeabi_lmul
        ldm     r5!, {r2, r3}
        bics    r2, r0
        ldr     r5, [r4]
        ands    r5, r0
        adds    r0, r5, r2
        str     r0, [r4]
        bics    r3, r1
        ldr     r0, [r4, #4]
        ands    r0, r1
        adds    r0, r0, r3
        str     r0, [r4, #4]
        pop     {r4, r5, r7, pc}
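
Reading the cmovnz64 output, the 32-bit mask appears to be widened to 64 bits by multiplying it by 0x0000_0001_0000_0001 (the movs/__aeabi_lmul sequence), which replicates the mask into both halves. Roughly equivalent to this sketch (not necessarily how the crate expresses it):

    // Sketch only: extend a 32-bit all-ones/all-zeroes mask to 64 bits.
    // 0xffff_ffff * 0x0000_0001_0000_0001 == 0xffff_ffff_ffff_ffff; 0 stays 0.
    fn mask64(mask32: u32) -> u64 {
        (mask32 as u64).wrapping_mul(0x0000_0001_0000_0001)
    }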

@tarcieri tarcieri force-pushed the cmov/optimized-maskgen32-for-arm32 branch 3 times, most recently from 163db99 to ff64bb5 on January 15, 2026 05:30
@tarcieri tarcieri changed the title from "cmov: add asm! optimized maskgen32 for ARM32" to "cmov: add asm! optimized masknz32 for ARM32" on Jan 15, 2026
@tarcieri tarcieri force-pushed the cmov/optimized-maskgen32-for-arm32 branch from ff64bb5 to 256ace6 on January 15, 2026 06:12
@tarcieri tarcieri merged commit 65c6520 into master Jan 15, 2026
22 checks passed
@tarcieri tarcieri deleted the cmov/optimized-maskgen32-for-arm32 branch January 15, 2026 14:22