diff --git a/vortex-array/src/arrays/decimal/compute/between.rs b/vortex-array/src/arrays/decimal/compute/between.rs index 75c2da923ea..a13c927e278 100644 --- a/vortex-array/src/arrays/decimal/compute/between.rs +++ b/vortex-array/src/arrays/decimal/compute/between.rs @@ -132,8 +132,7 @@ fn between_impl( ) -> ArrayRef { let buffer = arr.buffer::(); BoolArray::new( - BitBuffer::collect_bool(buffer.len(), |idx| { - let value = buffer[idx]; + BitBuffer::collect_bool_slice(buffer.as_slice(), |&value| { lower.is_none_or(|l| lower_op(l, value)) & upper.is_none_or(|u| upper_op(value, u)) }), arr.validity() diff --git a/vortex-array/src/arrays/primitive/compute/between.rs b/vortex-array/src/arrays/primitive/compute/between.rs index 9a6c7f60c1b..815e6b5bd33 100644 --- a/vortex-array/src/arrays/primitive/compute/between.rs +++ b/vortex-array/src/arrays/primitive/compute/between.rs @@ -105,11 +105,7 @@ where { let slice = arr.as_slice::(); BoolArray::new( - BitBuffer::collect_bool(slice.len(), |idx| { - // We only iterate upto arr len and |arr| == |slice|. - let i = unsafe { *slice.get_unchecked(idx) }; - lower_fn(lower, i) & upper_fn(i, upper) - }), + BitBuffer::collect_bool_slice(slice, |&i| lower_fn(lower, i) & upper_fn(i, upper)), arr.validity() .vortex_expect("validity should be derivable") .union_nullability(nullability), diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index ae9d7e6cc05..3a38a85f367 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -48,3 +48,7 @@ harness = false [[bench]] name = "vortex_bitbuffer" harness = false + +[[bench]] +name = "compare_lowering" +harness = false diff --git a/vortex-buffer/benches/compare_lowering.rs b/vortex-buffer/benches/compare_lowering.rs new file mode 100644 index 00000000000..199ec1ea542 --- /dev/null +++ b/vortex-buffer/benches/compare_lowering.rs @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Best-scalar vs best-SIMD for three compare->bitmask lowerings: +//! 1. `u8 != 0` (byte truthiness pack) +//! 2. `i32 > 5` (single comparison) +//! 3. `5 < i32 < 10` (two comparisons / between) +//! +//! Every variant writes the SAME `u64`-word bitmask (`n` a multiple of 64), so +//! the comparison is apples-to-apples. SIMD paths dispatch avx512 -> avx2 -> +//! scalar at runtime (matching the production `pack_nonzero_bytes`), so they +//! always do real vector work under CodSpeed (which builds with `+avx2`). +//! Sizes span L1/L2 (16Ki) to DRAM (1Mi). + +#![allow(clippy::cast_possible_truncation)] +// Bench-local: terse math names and `unwrap` on infallible slice->array conversions. +#![allow(clippy::many_single_char_names)] +#![allow(clippy::unwrap_used)] + +use divan::Bencher; +use vortex_buffer::pack_nonzero_bytes; + +const SIZES: &[usize] = &[16_384, 1_048_576]; + +// ---------------- scalar ---------------- + +/// Slice-iterating shift-OR pack — the idiomatic portable scalar form. +#[inline(always)] +fn pack_pred bool>(out: &mut [u64], v: &[T], f: F) { + for (w, chunk) in out.iter_mut().zip(v.chunks_exact(64)) { + let mut word = 0u64; + for (b, &x) in chunk.iter().enumerate() { + word |= (f(x) as u64) << b; + } + *w = word; + } +} + +/// Carry-free SWAR: pack `u8 != 0` 8 bytes at a time without SIMD intrinsics. +fn nonzero_u8_swar(out: &mut [u64], v: &[u8]) { + for (i, w) in out.iter_mut().enumerate() { + let base = i * 64; + let mut word = 0u64; + for g in 0..8 { + let chunk = u64::from_le_bytes(v[base + g * 8..base + g * 8 + 8].try_into().unwrap()); + let low7 = chunk & 0x7f7f_7f7f_7f7f_7f7f; + let nz = (low7.wrapping_add(0x7f7f_7f7f_7f7f_7f7f) | chunk) & 0x8080_8080_8080_8080; + let bits = nz.wrapping_mul(0x0002_0408_1020_4081) >> 56; + word |= bits << (g * 8); + } + *w = word; + } +} + +// ---------------- SIMD: i32 > k ---------------- + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +unsafe fn gt_i32_avx512(out: &mut [u64], v: &[i32], k: i32) { + use std::arch::x86_64::*; + let vk = _mm512_set1_epi32(k); + let p = v.as_ptr() as *const __m512i; + // SAFETY: word i reads 64 i32 (4x16-lane loads), in bounds for i k); +} + +// ---------------- SIMD: lo < i32 < hi ---------------- + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +unsafe fn between_i32_avx512(out: &mut [u64], v: &[i32], lo: i32, hi: i32) { + use std::arch::x86_64::*; + let vlo = _mm512_set1_epi32(lo); + let vhi = _mm512_set1_epi32(hi); + let p = v.as_ptr() as *const __m512i; + // SAFETY: as above. + let lane = |j: usize| unsafe { + let x = _mm512_loadu_si512(p.add(j)); + (_mm512_cmpgt_epi32_mask(x, vlo) & _mm512_cmplt_epi32_mask(x, vhi)) as u64 + }; + for (i, w) in out.iter_mut().enumerate() { + *w = lane(i * 4) + | (lane(i * 4 + 1) << 16) + | (lane(i * 4 + 2) << 32) + | (lane(i * 4 + 3) << 48); + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn between_i32_avx2(out: &mut [u64], v: &[i32], lo: i32, hi: i32) { + use std::arch::x86_64::*; + let vlo = _mm256_set1_epi32(lo); + let vhi = _mm256_set1_epi32(hi); + let p = v.as_ptr(); + for (i, w) in out.iter_mut().enumerate() { + let mut word = 0u64; + for g in 0..8 { + // SAFETY: reads 8 i32 at i*64+g*8, in bounds for i lo) & (x < hi)); +} + +// ---------------- data ---------------- + +fn bytes(n: usize) -> Vec { + (0..n).map(|i| (i % 7 == 0) as u8).collect() +} +fn ints(n: usize) -> Vec { + (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32) % 16) + .collect() +} + +// ================= u8 != 0 ================= + +#[divan::bench(args = SIZES)] +fn u8_scalar_pack(bencher: Bencher, n: usize) { + let d = bytes(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + pack_pred(divan::black_box(&mut out), divan::black_box(&d), |x: u8| { + x != 0 + }); + divan::black_box(out.as_slice()); + }); +} + +#[divan::bench(args = SIZES)] +fn u8_scalar_swar(bencher: Bencher, n: usize) { + let d = bytes(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + nonzero_u8_swar(divan::black_box(&mut out), divan::black_box(&d)); + divan::black_box(out.as_slice()); + }); +} + +#[divan::bench(args = SIZES)] +fn u8_simd(bencher: Bencher, n: usize) { + let d = bytes(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + pack_nonzero_bytes(divan::black_box(&mut out), divan::black_box(&d)); + divan::black_box(out.as_slice()); + }); +} + +// ================= i32 > 5 ================= + +#[divan::bench(args = SIZES)] +fn gt_scalar_pack(bencher: Bencher, n: usize) { + let d = ints(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + pack_pred( + divan::black_box(&mut out), + divan::black_box(&d), + |x: i32| x > 5, + ); + divan::black_box(out.as_slice()); + }); +} + +#[divan::bench(args = SIZES)] +fn gt_simd_bench(bencher: Bencher, n: usize) { + let d = ints(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + gt_simd(divan::black_box(&mut out), divan::black_box(&d), 5); + divan::black_box(out.as_slice()); + }); +} + +// ================= 5 < i32 < 10 ================= + +#[divan::bench(args = SIZES)] +fn between_scalar_pack(bencher: Bencher, n: usize) { + let d = ints(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + pack_pred( + divan::black_box(&mut out), + divan::black_box(&d), + |x: i32| (x > 5) & (x < 10), + ); + divan::black_box(out.as_slice()); + }); +} + +#[divan::bench(args = SIZES)] +fn between_simd_bench(bencher: Bencher, n: usize) { + let d = ints(n); + let mut out = vec![0u64; n / 64]; + bencher.bench_local(|| { + between_simd(divan::black_box(&mut out), divan::black_box(&d), 5, 10); + divan::black_box(out.as_slice()); + }); +} + +/// Cross-check every variant against the scalar reference before benchmarking, +/// so a miscompiled lowering fails loudly in CI instead of reporting fast-but-wrong. +fn verify() { + let n = 4096usize; + let by = bytes(n); + let it = ints(n); + let mut want = vec![0u64; n / 64]; + let mut got = vec![0u64; n / 64]; + + pack_pred(&mut want, &by, |x: u8| x != 0); + nonzero_u8_swar(&mut got, &by); + assert_eq!(want, got, "u8 swar"); + pack_nonzero_bytes(&mut got, &by); + assert_eq!(want, got, "u8 simd"); + + pack_pred(&mut want, &it, |x: i32| x > 5); + gt_simd(&mut got, &it, 5); + assert_eq!(want, got, "gt simd"); + + pack_pred(&mut want, &it, |x: i32| (x > 5) & (x < 10)); + between_simd(&mut got, &it, 5, 10); + assert_eq!(want, got, "between simd"); +} + +fn main() { + verify(); + divan::main(); +} diff --git a/vortex-buffer/benches/vortex_bitbuffer.rs b/vortex-buffer/benches/vortex_bitbuffer.rs index 67ce88da889..9b080107adf 100644 --- a/vortex-buffer/benches/vortex_bitbuffer.rs +++ b/vortex-buffer/benches/vortex_bitbuffer.rs @@ -1,6 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +// Benchmark data generators cast loop indices (`i as i32`); truncation is intentional +// and harmless for the synthetic inputs. +#![allow(clippy::cast_possible_truncation)] +// Bench-local: terse SIMD/math variable names (lo, hi, v, w, p, ...). +#![allow(clippy::many_single_char_names)] + use std::iter::Iterator; use arrow_buffer::BooleanBuffer; @@ -8,6 +14,239 @@ use arrow_buffer::BooleanBufferBuilder; use divan::Bencher; use vortex_buffer::BitBuffer; use vortex_buffer::BitBufferMut; +use vortex_buffer::collect_bool_words; +use vortex_buffer::pack_slice_predicate; + +// Sizes spanning L1 -> DRAM for the collect-bool / bitmask-pack benchmarks. +const PACK_SIZES: &[usize] = &[1024, 16_384, 262_144, 1_048_576]; + +/// Pure-compute baseline: pack `n` truthy bytes (`b != 0`) into a *reused* word +/// buffer via the real `collect_bool_words` (the scalar `packed |= (f(i)) << i` +/// idiom). No allocation in the measured region. +#[divan::bench(args = PACK_SIZES)] +fn pack_truthy_bytes(bencher: Bencher, n: usize) { + let data: Vec = (0..n).map(|i| i.is_multiple_of(7) as u8).collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + collect_bool_words(divan::black_box(&mut words), n, |i| d[i] > 0); + divan::black_box(words.as_slice()); + }); +} + +/// SIMD fast path: same pack into a *reused* buffer via `pack_nonzero_bytes`. +#[divan::bench(args = PACK_SIZES)] +fn pack_truthy_bytes_simd(bencher: Bencher, n: usize) { + let data: Vec = (0..n).map(|i| i.is_multiple_of(7) as u8).collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + vortex_buffer::pack_nonzero_bytes(divan::black_box(&mut words), d); + divan::black_box(words.as_slice()); + }); +} + +/// Bounds-check-only fix: same scalar shift-OR idiom, but fed the slice directly +/// via `pack_slice_predicate` (`chunks_exact`, no per-element bounds check). No SIMD +/// intrinsics — isolates how much of the gap is the bounds-checked index closure. +#[divan::bench(args = PACK_SIZES)] +fn pack_truthy_bytes_chunked(bencher: Bencher, n: usize) { + let data: Vec = (0..n).map(|i| i.is_multiple_of(7) as u8).collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + pack_slice_predicate(divan::black_box(&mut words), d, |b| *b > 0); + divan::black_box(words.as_slice()); + }); +} + +/// End-to-end real caller: `BitBufferMut::from(&[u8])` (includes allocation). +#[divan::bench(args = PACK_SIZES)] +fn bitbuffer_from_u8(bencher: Bencher, n: usize) { + let data: Vec = (0..n).map(|i| i.is_multiple_of(7) as u8).collect(); + bencher + .with_inputs(|| data.as_slice()) + .bench_refs(|s| BitBufferMut::from(divan::black_box(*s))); +} + +// ---- Typed compare -> bitmask (the `primitive between` shape, i32) ---- + +/// Baseline: exactly what `primitive between` does today — `collect_bool_words` +/// over a contiguous `&[i32]` with the inclusive between predicate. +#[divan::bench(args = PACK_SIZES)] +fn between_i32_scalar(bencher: Bencher, n: usize) { + let data: Vec = (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32)) + .collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + collect_bool_words(divan::black_box(&mut words), n, |i| { + lo <= d[i] && d[i] <= hi + }); + divan::black_box(words.as_slice()); + }); +} + +/// Bounds-check-only fix for the between shape: scalar shift-OR over the slice via +/// `pack_slice_predicate` (no per-element bounds check), no SIMD intrinsics. +#[divan::bench(args = PACK_SIZES)] +fn between_i32_chunked(bencher: Bencher, n: usize) { + let data: Vec = (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32)) + .collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + pack_slice_predicate(divan::black_box(&mut words), d, |v| lo <= *v && *v <= hi); + divan::black_box(words.as_slice()); + }); +} + +/// Current production form of `primitive between`: `collect_bool_words` with an +/// index closure that does `unsafe { *slice.get_unchecked(idx) }`. Baseline to +/// confirm `pack_slice_predicate` is not a regression vs the existing `unsafe`. +#[divan::bench(args = PACK_SIZES)] +fn between_i32_unchecked(bencher: Bencher, n: usize) { + let data: Vec = (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32)) + .collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + collect_bool_words(divan::black_box(&mut words), n, |i| { + // SAFETY: i < n == d.len(). + let v = unsafe { *d.get_unchecked(i) }; + lo <= v && v <= hi + }); + divan::black_box(words.as_slice()); + }); +} + +/// AVX-512 between: vpcmpd (>= lo) & vpcmpd (<= hi) -> kmovw, 16 i32/iter. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +fn between_i32_avx512(out: &mut [u16], value: &[i32], lo: i32, hi: i32) { + use std::arch::x86_64::*; + let vlo = _mm512_set1_epi32(lo); + let vhi = _mm512_set1_epi32(hi); + let p = value.as_ptr() as *const __m512i; + for (i, w) in out.iter_mut().take(value.len() / 16).enumerate() { + // SAFETY: i < len/16 keeps the load in bounds. + let v = unsafe { _mm512_loadu_si512(p.add(i)) }; + let ge = _mm512_cmpge_epi32_mask(v, vlo); + let le = _mm512_cmple_epi32_mask(v, vhi); + *w = ge & le; + } +} + +#[divan::bench(args = PACK_SIZES)] +fn between_i32_simd(bencher: Bencher, n: usize) { + let data: Vec = (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32)) + .collect(); + let mut masks = vec![0u16; n.div_ceil(16)]; + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + #[cfg(target_arch = "x86_64")] + if is_x86_feature_detected!("avx512f") { + // SAFETY: avx512f confirmed present at runtime. + unsafe { between_i32_avx512(divan::black_box(&mut masks), d, lo, hi) }; + } + divan::black_box(masks.as_slice()); + }); +} + +// ---- CodSpeed A/B: production `primitive between` API, original vs new ---- +// +// These call the exact `BitBuffer` constructors the production code uses (incl. +// allocation), so CodSpeed's baseline (x86-64/SSE2) instruction counts measure +// the real change: index-closure `collect_bool` vs slice-iterating +// `collect_bool_slice`. + +fn between_data(n: usize) -> Vec { + (0..n) + .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32)) + .collect() +} + +/// ORIGINAL: `BitBuffer::collect_bool` with an index closure that does +/// `unsafe { *slice.get_unchecked(idx) }` — exactly what `primitive between` did. +#[divan::bench(args = PACK_SIZES)] +fn between_bitbuffer_original(bencher: Bencher, n: usize) { + let data = between_data(n); + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + let bb = BitBuffer::collect_bool(d.len(), |idx| { + // SAFETY: idx < d.len(). + let v = unsafe { *d.get_unchecked(idx) }; + lo <= v && v <= hi + }); + divan::black_box(bb) + }); +} + +/// NEW: `BitBuffer::collect_bool_slice` (safe, slice-iterating) — the rewired path. +#[divan::bench(args = PACK_SIZES)] +fn between_bitbuffer_new(bencher: Bencher, n: usize) { + let data = between_data(n); + let (lo, hi) = (-100_000_000i32, 100_000_000i32); + bencher.bench_local(|| { + let d = divan::black_box(data.as_slice()); + let bb = BitBuffer::collect_bool_slice(d, |&v| lo <= v && v <= hi); + divan::black_box(bb) + }); +} + +// ---- Adjacent-pair equality (the varbin `compare_offsets_to_empty` shape) ---- + +/// Baseline: `collect_bool_words` with the `offsets[i] == offsets[i+1]` predicate. +#[divan::bench(args = PACK_SIZES)] +fn offsets_eq_scalar(bencher: Bencher, n: usize) { + // n+1 offsets -> n comparisons (empty-string detection). + let offsets: Vec = (0..=n as i32).map(|i| i - (i % 3 == 0) as i32).collect(); + let mut words = vec![0u64; n.div_ceil(64)]; + bencher.bench_local(|| { + let o = divan::black_box(offsets.as_slice()); + collect_bool_words(divan::black_box(&mut words), n, |i| o[i] == o[i + 1]); + divan::black_box(words.as_slice()); + }); +} + +/// AVX-512: vpcmpeqd of offsets[i..] vs offsets[i+1..] -> kmovw, 16 pairs/iter. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +fn offsets_eq_avx512(out: &mut [u16], offsets: &[i32]) { + use std::arch::x86_64::*; + let pairs = offsets.len().saturating_sub(1); + let p = offsets.as_ptr() as *const __m512i; + for (i, w) in out.iter_mut().take(pairs / 16).enumerate() { + // SAFETY: reads offsets[i*16 ..= i*16+16], in bounds since 16*(i+1) < len. + let lhs = unsafe { _mm512_loadu_si512(p.byte_add(i * 64)) }; + let rhs = unsafe { _mm512_loadu_si512(p.byte_add(i * 64 + 4)) }; + *w = _mm512_cmpeq_epi32_mask(lhs, rhs); + } +} + +#[divan::bench(args = PACK_SIZES)] +fn offsets_eq_simd(bencher: Bencher, n: usize) { + let offsets: Vec = (0..=n as i32).map(|i| i - (i % 3 == 0) as i32).collect(); + let mut masks = vec![0u16; n.div_ceil(16)]; + bencher.bench_local(|| { + let o = divan::black_box(offsets.as_slice()); + #[cfg(target_arch = "x86_64")] + if is_x86_feature_detected!("avx512f") { + // SAFETY: avx512f confirmed present at runtime. + unsafe { offsets_eq_avx512(divan::black_box(&mut masks), o) }; + } + divan::black_box(masks.as_slice()); + }); +} fn main() { // Pre-warm CPUID feature detection so the one-time probe cost is never diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs index 83b28e24b4a..422822e9116 100644 --- a/vortex-buffer/src/bit/buf.rs +++ b/vortex-buffer/src/bit/buf.rs @@ -165,6 +165,17 @@ impl BitBuffer { BitBufferMut::collect_bool(len, f).freeze() } + /// Build a [`BitBuffer`] of length `values.len()`, setting bit `i` iff + /// `pred(&values[i])`. + /// + /// Prefer this over [`collect_bool`](Self::collect_bool) when packing a + /// contiguous slice: it iterates the slice directly so LLVM can auto-vectorize + /// the compare and bit-pack, and needs no `unsafe` in the caller's predicate. + #[inline] + pub fn collect_bool_slice bool>(values: &[T], pred: F) -> Self { + BitBufferMut::collect_bool_slice(values, pred).freeze() + } + /// Maps over each bit in this buffer, calling `f(index, bit_value)` and collecting results. /// /// This is more efficient than `collect_bool` when you need to read the current bit value, diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index 829b0fcf9cc..79ea435cae7 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -11,6 +11,8 @@ use crate::ByteBufferMut; use crate::bit::collect_bool_words; use crate::bit::get_bit_unchecked; use crate::bit::ops; +use crate::bit::pack_nonzero_bytes; +use crate::bit::pack_slice_predicate; use crate::bit::set_bit_unchecked; use crate::bit::unset_bit_unchecked; use crate::buffer_mut; @@ -204,6 +206,56 @@ impl BitBufferMut { } } + /// Build a `BitBufferMut` of length `values.len()`, setting bit `i` iff + /// `pred(&values[i])`. + /// + /// Prefer this over [`collect_bool`](Self::collect_bool) when the source is a + /// contiguous slice: it iterates the slice directly via [`pack_slice_predicate`] + /// rather than calling an index closure per element, which lets LLVM + /// auto-vectorize the compare and bit-pack. It is also safe — no `get_unchecked` + /// in the caller's predicate. + #[inline] + pub fn collect_bool_slice bool>(values: &[T], pred: F) -> Self { + let len = values.len(); + let num_words = len.div_ceil(64); + let mut buffer: BufferMut = BufferMut::with_capacity(num_words); + // SAFETY: `pack_slice_predicate` writes every word in `0..num_words` below + // before any read; `u64` has no invalid bit patterns. + unsafe { buffer.set_len(num_words) }; + pack_slice_predicate(buffer.as_mut_slice(), values, pred); + + let mut bytes = buffer.into_byte_buffer(); + bytes.truncate(len.div_ceil(8)); + + Self { + buffer: bytes, + offset: 0, + len, + } + } + + /// Build a `BitBufferMut` from a contiguous byte slice, setting bit `i` iff + /// `value[i] != 0`. Uses the SIMD [`pack_nonzero_bytes`] fast path. + #[inline] + pub fn from_nonzero_bytes(value: &[u8]) -> Self { + let len = value.len(); + let num_words = len.div_ceil(64); + let mut buffer: BufferMut = BufferMut::with_capacity(num_words); + // SAFETY: `pack_nonzero_bytes` writes every word in `0..num_words` below + // before any read; `u64` has no invalid bit patterns. + unsafe { buffer.set_len(num_words) }; + pack_nonzero_bytes(buffer.as_mut_slice(), value); + + let mut bytes = buffer.into_byte_buffer(); + bytes.truncate(len.div_ceil(8)); + + Self { + buffer: bytes, + offset: 0, + len, + } + } + /// Return the underlying byte buffer. pub fn inner(&self) -> &ByteBufferMut { &self.buffer @@ -563,14 +615,19 @@ impl Not for BitBufferMut { impl From<&[bool]> for BitBufferMut { fn from(value: &[bool]) -> Self { - BitBufferMut::collect_bool(value.len(), |i| value[i]) + // SAFETY: `bool` is a single byte guaranteed to be 0 or 1, so it is + // sound to view `&[bool]` as `&[u8]`; `pack_nonzero_bytes` sets bit `i` + // iff the byte is non-zero, matching `value[i] == true`. + let bytes = + unsafe { core::slice::from_raw_parts(value.as_ptr().cast::(), value.len()) }; + BitBufferMut::from_nonzero_bytes(bytes) } } // allow building a buffer from a set of truthy byte values. impl From<&[u8]> for BitBufferMut { fn from(value: &[u8]) -> Self { - BitBufferMut::collect_bool(value.len(), |i| value[i] > 0) + BitBufferMut::from_nonzero_bytes(value) } } diff --git a/vortex-buffer/src/bit/mod.rs b/vortex-buffer/src/bit/mod.rs index 41bb5797266..c262b728871 100644 --- a/vortex-buffer/src/bit/mod.rs +++ b/vortex-buffer/src/bit/mod.rs @@ -14,6 +14,7 @@ mod count_ones; mod macros; mod meta; mod ops; +mod pack_simd; mod select; mod view; @@ -27,6 +28,7 @@ pub use arrow_buffer::bit_iterator::BitSliceIterator; pub use buf::*; pub use buf_mut::*; pub use meta::*; +pub use pack_simd::pack_nonzero_bytes; pub use view::*; /// Packs up to 64 boolean values into a little-endian `u64` word. @@ -74,6 +76,61 @@ where } } +/// Pack one chunk of at most 64 elements into a little-endian `u64`, LSB-first, +/// by applying `pred` to each element. Iterates the slice directly so there is no +/// per-element bounds check. +#[inline] +fn pack_chunk(chunk: &[T], pred: &mut F) -> u64 +where + F: FnMut(&T) -> bool, +{ + let mut packed = 0u64; + for (bit_idx, value) in chunk.iter().enumerate() { + packed |= (pred(value) as u64) << bit_idx; + } + packed +} + +/// Pack a per-element predicate over `values` into the prefix of `words`, LSB-first, +/// 64 bits per `u64`. Writes via `=` (not `|=`), so the destination need not be +/// zero-initialised. `words` must have capacity for at least +/// `values.len().div_ceil(64)` entries. +/// +/// Unlike [`collect_bool_words`], whose index closure forces a bounds-checked +/// `values[i]` in the caller's predicate, this iterates the slice directly through +/// `chunks_exact`. With the per-element bounds check elided and a fixed 64-element +/// inner trip count, LLVM can auto-vectorize the scalar shift-OR loop — which the +/// index-closure form prevents. This stays scalar (no SIMD intrinsics), so it does +/// not reach [`pack_nonzero_bytes`] speed, but it recovers the bounds-check overhead. +#[inline] +pub fn pack_slice_predicate(words: &mut [u64], values: &[T], mut pred: F) +where + F: FnMut(&T) -> bool, +{ + let num_words = values.len().div_ceil(64); + assert!( + words.len() >= num_words, + "words slice has {} entries, need at least {num_words}", + words.len(), + ); + + let mut chunks = values.chunks_exact(64); + let mut words_iter = words.iter_mut(); + // `chunks` must lead the zip: if it leads and is empty, zip short-circuits + // without consuming a slot from `words_iter`, leaving it positioned for the + // remainder write below. (The reverse order would discard a `words` slot.) + for (chunk, word) in chunks.by_ref().zip(words_iter.by_ref()) { + *word = pack_chunk(chunk, &mut pred); + } + + let remainder = chunks.remainder(); + if !remainder.is_empty() + && let Some(word) = words_iter.next() + { + *word = pack_chunk(remainder, &mut pred); + } +} + /// Splice a packed word `w` (whose bits above the highest valid bit are zero) into /// `words` at the given bit position. /// @@ -176,8 +233,12 @@ pub unsafe fn unset_bit_unchecked(buf: *mut u8, index: usize) { #[cfg(test)] mod tests { + use rstest::rstest; + use super::collect_bool_word; + use super::collect_bool_words; use super::pack_bools_into_words; + use super::pack_slice_predicate; #[test] fn collect_bool_word_packs_lsb_first() { @@ -222,6 +283,32 @@ mod tests { } } + #[rstest] + #[case(0)] + #[case(1)] + #[case(63)] + #[case(64)] + #[case(65)] + #[case(130)] + #[case(200)] + fn pack_slice_predicate_matches_collect_bool_words(#[case] len: usize) { + #[allow( + clippy::cast_possible_truncation, + reason = "small test indices fit in i32" + )] + let values: Vec = (0..len as i32).map(|i| i.wrapping_mul(7) - 3).collect(); + let pred = |v: i32| v % 5 == 0; + + let num_words = len.div_ceil(64); + let mut expected = vec![0u64; num_words.max(1)]; + collect_bool_words(&mut expected, len, |i| pred(values[i])); + + let mut actual = vec![0u64; num_words.max(1)]; + pack_slice_predicate(&mut actual, &values, |v| pred(*v)); + + assert_eq!(actual[..num_words], expected[..num_words], "len {len}"); + } + #[test] fn pack_bools_preserves_low_bits_of_leading_word() { let mut words = vec![0u64; 2]; diff --git a/vortex-buffer/src/bit/pack_simd.rs b/vortex-buffer/src/bit/pack_simd.rs new file mode 100644 index 00000000000..df8704edb38 --- /dev/null +++ b/vortex-buffer/src/bit/pack_simd.rs @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SIMD packing of "truthy" bytes into an LSB-first bitmask. +//! +//! Packs `value.len()` bytes into bits where bit `i` is set iff `value[i] != 0`. +//! This is the contiguous-slice fast path behind `BitBufferMut::from(&[u8])` and +//! `BitBufferMut::from(&[bool])`. +//! +//! Expressed as a vector compare into an opmask, this lowers to one +//! `vptestmb` + `kmovq` per 64 bytes on AVX-512BW. The scalar +//! `packed |= (b != 0) << i` reduction loop does not: LLVM's SLP vectorizer +//! rewrites it into a `vpsllvq` shift-OR reduction instead, which is ~10-20x +//! slower for cache-resident inputs. + +/// Pack `value.len()` truthy bytes (`b != 0`) into `words`, LSB-first, 64 bits +/// per `u64`. `words` must have at least `value.len().div_ceil(64)` entries. +#[inline] +pub fn pack_nonzero_bytes(words: &mut [u64], value: &[u8]) { + let num_words = value.len().div_ceil(64); + assert!( + words.len() >= num_words, + "words slice has {} entries, need at least {num_words}", + words.len(), + ); + + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw") { + // SAFETY: guarded by the runtime feature checks above. + return unsafe { pack_nonzero_bytes_avx512(words, value) }; + } + if is_x86_feature_detected!("avx2") { + // SAFETY: guarded by the runtime feature check above. + return unsafe { pack_nonzero_bytes_avx2(words, value) }; + } + } + + pack_nonzero_bytes_scalar(words, value); +} + +/// Portable fallback used directly on non-x86 targets and as the tail handler. +#[inline] +fn pack_nonzero_bytes_scalar(words: &mut [u64], value: &[u8]) { + let full = value.len() / 64; + for (word, chunk) in words.iter_mut().zip(value.chunks_exact(64)) { + let mut bits = 0u64; + for (i, &b) in chunk.iter().enumerate() { + bits |= ((b != 0) as u64) << i; + } + *word = bits; + } + if !value.len().is_multiple_of(64) { + let base = full * 64; + let mut bits = 0u64; + for (i, &b) in value[base..].iter().enumerate() { + bits |= ((b != 0) as u64) << i; + } + words[full] = bits; + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f,avx512bw")] +unsafe fn pack_nonzero_bytes_avx512(words: &mut [u64], value: &[u8]) { + use std::arch::x86_64::__m512i; + use std::arch::x86_64::_mm512_loadu_si512; + use std::arch::x86_64::_mm512_test_epi8_mask; + + let full = value.len() / 64; + let ptr = value.as_ptr(); + for (i, word) in words.iter_mut().take(full).enumerate() { + // SAFETY: i < full so the 64-byte load stays in bounds. + let v = unsafe { _mm512_loadu_si512(ptr.add(i * 64) as *const __m512i) }; + // vptestmb: per-byte (v & v) != 0 -> 64-bit opmask; kmovq stores it. + *word = _mm512_test_epi8_mask(v, v); + } + if !value.len().is_multiple_of(64) { + pack_nonzero_bytes_scalar(&mut words[full..], &value[full * 64..]); + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn pack_nonzero_bytes_avx2(words: &mut [u64], value: &[u8]) { + use std::arch::x86_64::__m256i; + use std::arch::x86_64::_mm256_cmpeq_epi8; + use std::arch::x86_64::_mm256_loadu_si256; + use std::arch::x86_64::_mm256_movemask_epi8; + use std::arch::x86_64::_mm256_setzero_si256; + + let full = value.len() / 64; + let ptr = value.as_ptr(); + let zero = _mm256_setzero_si256(); + for (i, word) in words.iter_mut().take(full).enumerate() { + // Two 32-byte halves; movemask gives 1 bit per byte. cmpeq vs zero + // marks zero bytes, so invert to mark non-zero bytes. + // SAFETY: i < full so both 32-byte loads stay in bounds. + let lo = unsafe { _mm256_loadu_si256(ptr.add(i * 64) as *const __m256i) }; + // SAFETY: i < full so the second 32-byte load stays in bounds. + let hi = unsafe { _mm256_loadu_si256(ptr.add(i * 64 + 32) as *const __m256i) }; + let lo_zero = _mm256_movemask_epi8(_mm256_cmpeq_epi8(lo, zero)) as u32; + let hi_zero = _mm256_movemask_epi8(_mm256_cmpeq_epi8(hi, zero)) as u32; + let bits = (!lo_zero as u64) | ((!hi_zero as u64) << 32); + *word = bits; + } + if !value.len().is_multiple_of(64) { + pack_nonzero_bytes_scalar(&mut words[full..], &value[full * 64..]); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn reference(value: &[u8]) -> Vec { + let mut w = vec![0u64; value.len().div_ceil(64)]; + for (i, &b) in value.iter().enumerate() { + if b != 0 { + w[i / 64] |= 1u64 << (i % 64); + } + } + w + } + + #[test] + fn matches_reference() { + for &n in &[0usize, 1, 7, 63, 64, 65, 127, 128, 200, 1000, 4096] { + // Mix of zero and varied non-zero bytes to exercise the != 0 test. + let data: Vec = (0..n) + .map(|i| { + if i.is_multiple_of(5) { + 0 + } else { + u8::try_from(i % 200 + 1).unwrap() + } + }) + .collect(); + let mut got = vec![0u64; n.div_ceil(64)]; + pack_nonzero_bytes(&mut got, &data); + assert_eq!(got, reference(&data), "mismatch at n={n}"); + } + } +}