@@ -766,28 +766,36 @@ namespace xsimd
766766 XSIMD_INLINE void store (batch_bool<T, A> b, bool * mem, requires_arch<neon>) noexcept
767767 {
768768 uint8x16_t val = vshrq_n_u8 (b.data , 7 );
769- vst1q_u8 ((uint8_t *)mem, val);
769+ alignas (A::alignment ()) uint8_t buffer[batch_bool<T, A>::size];
770+ vst1q_u8 (buffer, val);
771+ memcpy (mem, buffer, sizeof (buffer));
770772 }
771773
template <class T, class A, detail::enable_sized_t<T, 2> = 0>
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
{
    // Narrow the 16-bit lane mask down to one byte per lane, then shift
    // right by 7 so each byte is exactly 0 or 1 — a valid bool value.
    uint8x8_t flags = vshr_n_u8(vqmovn_u16(b.data), 7);
    // `mem` is a plain bool* with no alignment guarantee; casting it for
    // vst1_u8 would be undefined behavior. Store into an aligned scratch
    // buffer instead and copy the bytes out with memcpy.
    alignas(A::alignment()) uint8_t scratch[batch_bool<T, A>::size];
    vst1_u8(scratch, flags);
    memcpy(mem, scratch, sizeof(scratch));
}
778782
template <class T, class A, detail::enable_sized_t<T, 4> = 0>
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
{
    // Narrow 32-bit mask lanes to bytes (padding the upper half with
    // zeros so the narrowing intrinsics have a full vector to work on),
    // then shift so every byte becomes 0 or 1.
    uint8x8_t flags = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7);
    // vst1_u8 always writes 8 bytes, but only the first `size` lanes are
    // meaningful — bounce through an aligned scratch buffer so we neither
    // misalign the store nor write past the caller's bool array.
    alignas(A::alignment()) uint8_t scratch[8];
    vst1_u8(scratch, flags);
    memcpy(mem, scratch, batch_bool<T, A>::size);
}
785791
template <class T, class A, detail::enable_sized_t<T, 8> = 0>
XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
{
    // Two-step narrowing 64 -> 32 -> 16 -> 8 bits per lane, zero-padding
    // the unused upper halves, then shift so each byte is 0 or 1.
    uint8x8_t flags = vshr_n_u8(
        vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))),
        7);
    // Only `size` (= 2) lanes are live, while vst1_u8 writes all 8 bytes.
    // Store to an aligned scratch buffer and memcpy just the live bytes,
    // avoiding both the unaligned store and the out-of-bounds write that
    // a direct casted store would incur.
    alignas(A::alignment()) uint8_t scratch[8];
    vst1_u8(scratch, flags);
    memcpy(mem, scratch, batch_bool<T, A>::size);
}
792800
793801 template <class A >
0 commit comments