Skip to content

Commit 9aa226a

Browse files
Avoid going through vst1_lane_u32 during batch_bool store on ARM
Prefer a full-vector store into a local buffer followed by a memcpy. The generated code is very close, and it does not fail validation.
1 parent fc1ca1d commit 9aa226a

1 file changed

Lines changed: 12 additions & 4 deletions

File tree

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -766,28 +766,36 @@ namespace xsimd
766766
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
767767
{
768768
uint8x16_t val = vshrq_n_u8(b.data, 7);
769-
vst1q_u8((uint8_t*)mem, val);
769+
alignas(A::alignment()) uint8_t buffer[batch_bool<T, A>::size];
770+
vst1q_u8(buffer, val);
771+
memcpy(mem, buffer, sizeof(buffer));
770772
}
771773

772774
template <class T, class A, detail::enable_sized_t<T, 2> = 0>
773775
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
774776
{
775777
uint8x8_t val = vshr_n_u8(vqmovn_u16(b.data), 7);
776-
vst1_u8((uint8_t*)mem, val);
778+
alignas(A::alignment()) uint8_t buffer[batch_bool<T, A>::size];
779+
vst1_u8(buffer, val);
780+
memcpy(mem, buffer, sizeof(buffer));
777781
}
778782

779783
template <class T, class A, detail::enable_sized_t<T, 4> = 0>
780784
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
781785
{
782786
uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7);
783-
vst1_lane_u32((uint32_t*)mem, vreinterpret_u32_u8(val), 0);
787+
alignas(A::alignment()) uint8_t buffer[8];
788+
vst1_u8(buffer, val);
789+
memcpy(mem, buffer, batch_bool<T, A>::size);
784790
}
785791

786792
template <class T, class A, detail::enable_sized_t<T, 8> = 0>
787793
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
788794
{
789795
uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))), 7);
790-
vst1_lane_u16((uint16_t*)mem, vreinterpret_u16_u8(val), 0);
796+
alignas(A::alignment()) uint8_t buffer[8];
797+
vst1_u8(buffer, val);
798+
memcpy(mem, buffer, batch_bool<T, A>::size);
791799
}
792800

793801
template <class A>

0 commit comments

Comments
 (0)