diff --git a/vortex-array/src/arrays/decimal/compute/between.rs b/vortex-array/src/arrays/decimal/compute/between.rs
index 75c2da923ea..a13c927e278 100644
--- a/vortex-array/src/arrays/decimal/compute/between.rs
+++ b/vortex-array/src/arrays/decimal/compute/between.rs
@@ -132,8 +132,7 @@ fn between_impl<T: NativeDecimalType>(
 ) -> ArrayRef {
     let buffer = arr.buffer::<T>();
     BoolArray::new(
-        BitBuffer::collect_bool(buffer.len(), |idx| {
-            let value = buffer[idx];
+        BitBuffer::collect_bool_slice(buffer.as_slice(), |&value| {
             lower.is_none_or(|l| lower_op(l, value)) & upper.is_none_or(|u| upper_op(value, u))
         }),
         arr.validity()
diff --git a/vortex-array/src/arrays/primitive/compute/between.rs b/vortex-array/src/arrays/primitive/compute/between.rs
index 9a6c7f60c1b..815e6b5bd33 100644
--- a/vortex-array/src/arrays/primitive/compute/between.rs
+++ b/vortex-array/src/arrays/primitive/compute/between.rs
@@ -105,11 +105,7 @@ where
 {
     let slice = arr.as_slice::<T>();
     BoolArray::new(
-        BitBuffer::collect_bool(slice.len(), |idx| {
-            // We only iterate upto arr len and |arr| == |slice|.
-            let i = unsafe { *slice.get_unchecked(idx) };
-            lower_fn(lower, i) & upper_fn(i, upper)
-        }),
+        BitBuffer::collect_bool_slice(slice, |&i| lower_fn(lower, i) & upper_fn(i, upper)),
         arr.validity()
             .vortex_expect("validity should be derivable")
             .union_nullability(nullability),
diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml
index ae9d7e6cc05..3a38a85f367 100644
--- a/vortex-buffer/Cargo.toml
+++ b/vortex-buffer/Cargo.toml
@@ -48,3 +48,7 @@ harness = false
 [[bench]]
 name = "vortex_bitbuffer"
 harness = false
+
+[[bench]]
+name = "compare_lowering"
+harness = false
diff --git a/vortex-buffer/benches/compare_lowering.rs b/vortex-buffer/benches/compare_lowering.rs
new file mode 100644
index 00000000000..199ec1ea542
--- /dev/null
+++ b/vortex-buffer/benches/compare_lowering.rs
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Best-scalar vs best-SIMD for three compare->bitmask lowerings:
+//!   1. `u8 != 0`            (byte truthiness pack)
+//!   2. `i32 > 5`            (single comparison)
+//!   3. `5 < i32 < 10`       (two comparisons / between)
+//!
+//! Every variant writes the SAME `u64`-word bitmask (`n` a multiple of 64), so
+//! the comparison is apples-to-apples. SIMD paths dispatch avx512 -> avx2 ->
+//! scalar at runtime (matching the production `pack_nonzero_bytes`), so they
+//! always do real vector work under CodSpeed (which builds with `+avx2`).
+//! Sizes span L1/L2 (16Ki) to DRAM (1Mi).
+
+#![allow(clippy::cast_possible_truncation)]
+// Bench-local: terse math names and `unwrap` on infallible slice->array conversions.
+#![allow(clippy::many_single_char_names)]
+#![allow(clippy::unwrap_used)]
+
+use divan::Bencher;
+use vortex_buffer::pack_nonzero_bytes;
+
+const SIZES: &[usize] = &[16_384, 1_048_576];
+
+// ---------------- scalar ----------------
+
+/// Slice-iterating shift-OR pack — the idiomatic portable scalar form.
+#[inline(always)]
+fn pack_pred<T: Copy, F: Fn(T) -> bool>(out: &mut [u64], v: &[T], f: F) {
+    for (w, chunk) in out.iter_mut().zip(v.chunks_exact(64)) {
+        let mut word = 0u64;
+        for (b, &x) in chunk.iter().enumerate() {
+            word |= (f(x) as u64) << b;
+        }
+        *w = word;
+    }
+}
+
+/// Carry-free SWAR: pack `u8 != 0` 8 bytes at a time without SIMD intrinsics.
+fn nonzero_u8_swar(out: &mut [u64], v: &[u8]) {
+    for (i, w) in out.iter_mut().enumerate() {
+        let base = i * 64;
+        let mut word = 0u64;
+        for g in 0..8 {
+            let chunk = u64::from_le_bytes(v[base + g * 8..base + g * 8 + 8].try_into().unwrap());
+            let low7 = chunk & 0x7f7f_7f7f_7f7f_7f7f;
+            let nz = (low7.wrapping_add(0x7f7f_7f7f_7f7f_7f7f) | chunk) & 0x8080_8080_8080_8080;
+            let bits = nz.wrapping_mul(0x0002_0408_1020_4081) >> 56;
+            word |= bits << (g * 8);
+        }
+        *w = word;
+    }
+}
+
+// ---------------- SIMD: i32 > k ----------------
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+unsafe fn gt_i32_avx512(out: &mut [u64], v: &[i32], k: i32) {
+    use std::arch::x86_64::*;
+    let vk = _mm512_set1_epi32(k);
+    let p = v.as_ptr() as *const __m512i;
+    // SAFETY: word i reads 64 i32 (4x16-lane loads), in bounds for i<v.len()/64.
+    let lane =
+        |j: usize| unsafe { _mm512_cmpgt_epi32_mask(_mm512_loadu_si512(p.add(j)), vk) as u64 };
+    for (i, w) in out.iter_mut().enumerate() {
+        *w = lane(i * 4)
+            | (lane(i * 4 + 1) << 16)
+            | (lane(i * 4 + 2) << 32)
+            | (lane(i * 4 + 3) << 48);
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn gt_i32_avx2(out: &mut [u64], v: &[i32], k: i32) {
+    use std::arch::x86_64::*;
+    let vk = _mm256_set1_epi32(k);
+    let p = v.as_ptr();
+    for (i, w) in out.iter_mut().enumerate() {
+        let mut word = 0u64;
+        for g in 0..8 {
+            // SAFETY: reads 8 i32 at i*64+g*8, in bounds for i<v.len()/64.
+            let x = unsafe { _mm256_loadu_si256(p.add(i * 64 + g * 8) as *const __m256i) };
+            let m = _mm256_cmpgt_epi32(x, vk);
+            let bits = _mm256_movemask_ps(_mm256_castsi256_ps(m)) as u32 as u64;
+            word |= bits << (g * 8);
+        }
+        *w = word;
+    }
+}
+
+fn gt_simd(out: &mut [u64], v: &[i32], k: i32) {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512f") {
+            // SAFETY: feature confirmed.
+            return unsafe { gt_i32_avx512(out, v, k) };
+        }
+        if is_x86_feature_detected!("avx2") {
+            // SAFETY: feature confirmed.
+            return unsafe { gt_i32_avx2(out, v, k) };
+        }
+    }
+    pack_pred(out, v, |x: i32| x > k);
+}
+
+// ---------------- SIMD: lo < i32 < hi ----------------
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+unsafe fn between_i32_avx512(out: &mut [u64], v: &[i32], lo: i32, hi: i32) {
+    use std::arch::x86_64::*;
+    let vlo = _mm512_set1_epi32(lo);
+    let vhi = _mm512_set1_epi32(hi);
+    let p = v.as_ptr() as *const __m512i;
+    // SAFETY: as above.
+    let lane = |j: usize| unsafe {
+        let x = _mm512_loadu_si512(p.add(j));
+        (_mm512_cmpgt_epi32_mask(x, vlo) & _mm512_cmplt_epi32_mask(x, vhi)) as u64
+    };
+    for (i, w) in out.iter_mut().enumerate() {
+        *w = lane(i * 4)
+            | (lane(i * 4 + 1) << 16)
+            | (lane(i * 4 + 2) << 32)
+            | (lane(i * 4 + 3) << 48);
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn between_i32_avx2(out: &mut [u64], v: &[i32], lo: i32, hi: i32) {
+    use std::arch::x86_64::*;
+    let vlo = _mm256_set1_epi32(lo);
+    let vhi = _mm256_set1_epi32(hi);
+    let p = v.as_ptr();
+    for (i, w) in out.iter_mut().enumerate() {
+        let mut word = 0u64;
+        for g in 0..8 {
+            // SAFETY: reads 8 i32 at i*64+g*8, in bounds for i<v.len()/64.
+            let x = unsafe { _mm256_loadu_si256(p.add(i * 64 + g * 8) as *const __m256i) };
+            let m = _mm256_and_si256(_mm256_cmpgt_epi32(x, vlo), _mm256_cmpgt_epi32(vhi, x));
+            let bits = _mm256_movemask_ps(_mm256_castsi256_ps(m)) as u32 as u64;
+            word |= bits << (g * 8);
+        }
+        *w = word;
+    }
+}
+
+fn between_simd(out: &mut [u64], v: &[i32], lo: i32, hi: i32) {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512f") {
+            // SAFETY: feature confirmed.
+            return unsafe { between_i32_avx512(out, v, lo, hi) };
+        }
+        if is_x86_feature_detected!("avx2") {
+            // SAFETY: feature confirmed.
+            return unsafe { between_i32_avx2(out, v, lo, hi) };
+        }
+    }
+    pack_pred(out, v, |x: i32| (x > lo) & (x < hi));
+}
+
+// ---------------- data ----------------
+
+fn bytes(n: usize) -> Vec<u8> {
+    (0..n).map(|i| (i % 7 == 0) as u8).collect()
+}
+fn ints(n: usize) -> Vec<i32> {
+    (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32) % 16)
+        .collect()
+}
+
+// ================= u8 != 0 =================
+
+#[divan::bench(args = SIZES)]
+fn u8_scalar_pack(bencher: Bencher, n: usize) {
+    let d = bytes(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        pack_pred(divan::black_box(&mut out), divan::black_box(&d), |x: u8| {
+            x != 0
+        });
+        divan::black_box(out.as_slice());
+    });
+}
+
+#[divan::bench(args = SIZES)]
+fn u8_scalar_swar(bencher: Bencher, n: usize) {
+    let d = bytes(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        nonzero_u8_swar(divan::black_box(&mut out), divan::black_box(&d));
+        divan::black_box(out.as_slice());
+    });
+}
+
+#[divan::bench(args = SIZES)]
+fn u8_simd(bencher: Bencher, n: usize) {
+    let d = bytes(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        pack_nonzero_bytes(divan::black_box(&mut out), divan::black_box(&d));
+        divan::black_box(out.as_slice());
+    });
+}
+
+// ================= i32 > 5 =================
+
+#[divan::bench(args = SIZES)]
+fn gt_scalar_pack(bencher: Bencher, n: usize) {
+    let d = ints(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        pack_pred(
+            divan::black_box(&mut out),
+            divan::black_box(&d),
+            |x: i32| x > 5,
+        );
+        divan::black_box(out.as_slice());
+    });
+}
+
+#[divan::bench(args = SIZES)]
+fn gt_simd_bench(bencher: Bencher, n: usize) {
+    let d = ints(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        gt_simd(divan::black_box(&mut out), divan::black_box(&d), 5);
+        divan::black_box(out.as_slice());
+    });
+}
+
+// ================= 5 < i32 < 10 =================
+
+#[divan::bench(args = SIZES)]
+fn between_scalar_pack(bencher: Bencher, n: usize) {
+    let d = ints(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        pack_pred(
+            divan::black_box(&mut out),
+            divan::black_box(&d),
+            |x: i32| (x > 5) & (x < 10),
+        );
+        divan::black_box(out.as_slice());
+    });
+}
+
+#[divan::bench(args = SIZES)]
+fn between_simd_bench(bencher: Bencher, n: usize) {
+    let d = ints(n);
+    let mut out = vec![0u64; n / 64];
+    bencher.bench_local(|| {
+        between_simd(divan::black_box(&mut out), divan::black_box(&d), 5, 10);
+        divan::black_box(out.as_slice());
+    });
+}
+
+/// Cross-check every variant against the scalar reference before benchmarking,
+/// so a miscompiled lowering fails loudly in CI instead of reporting fast-but-wrong.
+fn verify() {
+    let n = 4096usize;
+    let by = bytes(n);
+    let it = ints(n);
+    let mut want = vec![0u64; n / 64];
+    let mut got = vec![0u64; n / 64];
+
+    pack_pred(&mut want, &by, |x: u8| x != 0);
+    nonzero_u8_swar(&mut got, &by);
+    assert_eq!(want, got, "u8 swar");
+    pack_nonzero_bytes(&mut got, &by);
+    assert_eq!(want, got, "u8 simd");
+
+    pack_pred(&mut want, &it, |x: i32| x > 5);
+    gt_simd(&mut got, &it, 5);
+    assert_eq!(want, got, "gt simd");
+
+    pack_pred(&mut want, &it, |x: i32| (x > 5) & (x < 10));
+    between_simd(&mut got, &it, 5, 10);
+    assert_eq!(want, got, "between simd");
+}
+
+fn main() {
+    verify();
+    divan::main();
+}
diff --git a/vortex-buffer/benches/vortex_bitbuffer.rs b/vortex-buffer/benches/vortex_bitbuffer.rs
index 67ce88da889..9b080107adf 100644
--- a/vortex-buffer/benches/vortex_bitbuffer.rs
+++ b/vortex-buffer/benches/vortex_bitbuffer.rs
@@ -1,6 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+// Benchmark data generators cast loop indices (`i as i32`); truncation is intentional
+// and harmless for the synthetic inputs.
+#![allow(clippy::cast_possible_truncation)]
+// Bench-local: terse SIMD/math variable names (lo, hi, v, w, p, ...).
+#![allow(clippy::many_single_char_names)]
+
 use std::iter::Iterator;
 
 use arrow_buffer::BooleanBuffer;
@@ -8,6 +14,239 @@ use arrow_buffer::BooleanBufferBuilder;
 use divan::Bencher;
 use vortex_buffer::BitBuffer;
 use vortex_buffer::BitBufferMut;
+use vortex_buffer::collect_bool_words;
+use vortex_buffer::pack_slice_predicate;
+
+// Sizes spanning L1 -> DRAM for the collect-bool / bitmask-pack benchmarks.
+const PACK_SIZES: &[usize] = &[1024, 16_384, 262_144, 1_048_576];
+
+/// Pure-compute baseline: pack `n` truthy bytes (`b != 0`) into a *reused* word
+/// buffer via the real `collect_bool_words` (the scalar `packed |= (f(i)) << i`
+/// idiom). No allocation in the measured region.
+#[divan::bench(args = PACK_SIZES)]
+fn pack_truthy_bytes(bencher: Bencher, n: usize) {
+    let data: Vec<u8> = (0..n).map(|i| i.is_multiple_of(7) as u8).collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        collect_bool_words(divan::black_box(&mut words), n, |i| d[i] > 0);
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// SIMD fast path: same pack into a *reused* buffer via `pack_nonzero_bytes`.
+#[divan::bench(args = PACK_SIZES)]
+fn pack_truthy_bytes_simd(bencher: Bencher, n: usize) {
+    let data: Vec<u8> = (0..n).map(|i| i.is_multiple_of(7) as u8).collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        vortex_buffer::pack_nonzero_bytes(divan::black_box(&mut words), d);
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// Bounds-check-only fix: same scalar shift-OR idiom, but fed the slice directly
+/// via `pack_slice_predicate` (`chunks_exact`, no per-element bounds check). No SIMD
+/// intrinsics — isolates how much of the gap is the bounds-checked index closure.
+#[divan::bench(args = PACK_SIZES)]
+fn pack_truthy_bytes_chunked(bencher: Bencher, n: usize) {
+    let data: Vec<u8> = (0..n).map(|i| i.is_multiple_of(7) as u8).collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        pack_slice_predicate(divan::black_box(&mut words), d, |b| *b > 0);
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// End-to-end real caller: `BitBufferMut::from(&[u8])` (includes allocation).
+#[divan::bench(args = PACK_SIZES)]
+fn bitbuffer_from_u8(bencher: Bencher, n: usize) {
+    let data: Vec<u8> = (0..n).map(|i| i.is_multiple_of(7) as u8).collect();
+    bencher
+        .with_inputs(|| data.as_slice())
+        .bench_refs(|s| BitBufferMut::from(divan::black_box(*s)));
+}
+
+// ---- Typed compare -> bitmask (the `primitive between` shape, i32) ----
+
+/// Baseline: exactly what `primitive between` does today — `collect_bool_words`
+/// over a contiguous `&[i32]` with the inclusive between predicate.
+#[divan::bench(args = PACK_SIZES)]
+fn between_i32_scalar(bencher: Bencher, n: usize) {
+    let data: Vec<i32> = (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32))
+        .collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        collect_bool_words(divan::black_box(&mut words), n, |i| {
+            lo <= d[i] && d[i] <= hi
+        });
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// Bounds-check-only fix for the between shape: scalar shift-OR over the slice via
+/// `pack_slice_predicate` (no per-element bounds check), no SIMD intrinsics.
+#[divan::bench(args = PACK_SIZES)]
+fn between_i32_chunked(bencher: Bencher, n: usize) {
+    let data: Vec<i32> = (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32))
+        .collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        pack_slice_predicate(divan::black_box(&mut words), d, |v| lo <= *v && *v <= hi);
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// Current production form of `primitive between`: `collect_bool_words` with an
+/// index closure that does `unsafe { *slice.get_unchecked(idx) }`. Baseline to
+/// confirm `pack_slice_predicate` is not a regression vs the existing `unsafe`.
+#[divan::bench(args = PACK_SIZES)]
+fn between_i32_unchecked(bencher: Bencher, n: usize) {
+    let data: Vec<i32> = (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32))
+        .collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        collect_bool_words(divan::black_box(&mut words), n, |i| {
+            // SAFETY: i < n == d.len().
+            let v = unsafe { *d.get_unchecked(i) };
+            lo <= v && v <= hi
+        });
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// AVX-512 between: vpcmpd (>= lo) & vpcmpd (<= hi) -> kmovw, 16 i32/iter.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+fn between_i32_avx512(out: &mut [u16], value: &[i32], lo: i32, hi: i32) {
+    use std::arch::x86_64::*;
+    let vlo = _mm512_set1_epi32(lo);
+    let vhi = _mm512_set1_epi32(hi);
+    let p = value.as_ptr() as *const __m512i;
+    for (i, w) in out.iter_mut().take(value.len() / 16).enumerate() {
+        // SAFETY: i < len/16 keeps the load in bounds.
+        let v = unsafe { _mm512_loadu_si512(p.add(i)) };
+        let ge = _mm512_cmpge_epi32_mask(v, vlo);
+        let le = _mm512_cmple_epi32_mask(v, vhi);
+        *w = ge & le;
+    }
+}
+
+#[divan::bench(args = PACK_SIZES)]
+fn between_i32_simd(bencher: Bencher, n: usize) {
+    let data: Vec<i32> = (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32))
+        .collect();
+    let mut masks = vec![0u16; n.div_ceil(16)];
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        #[cfg(target_arch = "x86_64")]
+        if is_x86_feature_detected!("avx512f") {
+            // SAFETY: avx512f confirmed present at runtime.
+            unsafe { between_i32_avx512(divan::black_box(&mut masks), d, lo, hi) };
+        }
+        divan::black_box(masks.as_slice());
+    });
+}
+
+// ---- CodSpeed A/B: production `primitive between` API, original vs new ----
+//
+// These call the exact `BitBuffer` constructors the production code uses (incl.
+// allocation), so CodSpeed's baseline (x86-64/SSE2) instruction counts measure
+// the real change: index-closure `collect_bool` vs slice-iterating
+// `collect_bool_slice`.
+
+fn between_data(n: usize) -> Vec<i32> {
+    (0..n)
+        .map(|i| (i as i32).wrapping_mul(2_654_435_761u32 as i32))
+        .collect()
+}
+
+/// ORIGINAL: `BitBuffer::collect_bool` with an index closure that does
+/// `unsafe { *slice.get_unchecked(idx) }` — exactly what `primitive between` did.
+#[divan::bench(args = PACK_SIZES)]
+fn between_bitbuffer_original(bencher: Bencher, n: usize) {
+    let data = between_data(n);
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        let bb = BitBuffer::collect_bool(d.len(), |idx| {
+            // SAFETY: idx < d.len().
+            let v = unsafe { *d.get_unchecked(idx) };
+            lo <= v && v <= hi
+        });
+        divan::black_box(bb)
+    });
+}
+
+/// NEW: `BitBuffer::collect_bool_slice` (safe, slice-iterating) — the rewired path.
+#[divan::bench(args = PACK_SIZES)]
+fn between_bitbuffer_new(bencher: Bencher, n: usize) {
+    let data = between_data(n);
+    let (lo, hi) = (-100_000_000i32, 100_000_000i32);
+    bencher.bench_local(|| {
+        let d = divan::black_box(data.as_slice());
+        let bb = BitBuffer::collect_bool_slice(d, |&v| lo <= v && v <= hi);
+        divan::black_box(bb)
+    });
+}
+
+// ---- Adjacent-pair equality (the varbin `compare_offsets_to_empty` shape) ----
+
+/// Baseline: `collect_bool_words` with the `offsets[i] == offsets[i+1]` predicate.
+#[divan::bench(args = PACK_SIZES)]
+fn offsets_eq_scalar(bencher: Bencher, n: usize) {
+    // n+1 offsets -> n comparisons (empty-string detection).
+    let offsets: Vec<i32> = (0..=n as i32).map(|i| i - (i % 3 == 0) as i32).collect();
+    let mut words = vec![0u64; n.div_ceil(64)];
+    bencher.bench_local(|| {
+        let o = divan::black_box(offsets.as_slice());
+        collect_bool_words(divan::black_box(&mut words), n, |i| o[i] == o[i + 1]);
+        divan::black_box(words.as_slice());
+    });
+}
+
+/// AVX-512: vpcmpeqd of offsets[i..] vs offsets[i+1..] -> kmovw, 16 pairs/iter.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+fn offsets_eq_avx512(out: &mut [u16], offsets: &[i32]) {
+    use std::arch::x86_64::*;
+    let pairs = offsets.len().saturating_sub(1);
+    let p = offsets.as_ptr() as *const __m512i;
+    for (i, w) in out.iter_mut().take(pairs / 16).enumerate() {
+        // SAFETY: reads offsets[i*16 ..= i*16+16], in bounds since 16*(i+1) < len.
+        let lhs = unsafe { _mm512_loadu_si512(p.byte_add(i * 64)) };
+        let rhs = unsafe { _mm512_loadu_si512(p.byte_add(i * 64 + 4)) };
+        *w = _mm512_cmpeq_epi32_mask(lhs, rhs);
+    }
+}
+
+#[divan::bench(args = PACK_SIZES)]
+fn offsets_eq_simd(bencher: Bencher, n: usize) {
+    let offsets: Vec<i32> = (0..=n as i32).map(|i| i - (i % 3 == 0) as i32).collect();
+    let mut masks = vec![0u16; n.div_ceil(16)];
+    bencher.bench_local(|| {
+        let o = divan::black_box(offsets.as_slice());
+        #[cfg(target_arch = "x86_64")]
+        if is_x86_feature_detected!("avx512f") {
+            // SAFETY: avx512f confirmed present at runtime.
+            unsafe { offsets_eq_avx512(divan::black_box(&mut masks), o) };
+        }
+        divan::black_box(masks.as_slice());
+    });
+}
 
 fn main() {
     // Pre-warm CPUID feature detection so the one-time probe cost is never
diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs
index 83b28e24b4a..422822e9116 100644
--- a/vortex-buffer/src/bit/buf.rs
+++ b/vortex-buffer/src/bit/buf.rs
@@ -165,6 +165,17 @@ impl BitBuffer {
         BitBufferMut::collect_bool(len, f).freeze()
     }
 
+    /// Build a [`BitBuffer`] of length `values.len()`, setting bit `i` iff
+    /// `pred(&values[i])`.
+    ///
+    /// Prefer this over [`collect_bool`](Self::collect_bool) when packing a
+    /// contiguous slice: it iterates the slice directly so LLVM can auto-vectorize
+    /// the compare and bit-pack, and needs no `unsafe` in the caller's predicate.
+    #[inline]
+    pub fn collect_bool_slice<T, F: FnMut(&T) -> bool>(values: &[T], pred: F) -> Self {
+        BitBufferMut::collect_bool_slice(values, pred).freeze()
+    }
+
     /// Maps over each bit in this buffer, calling `f(index, bit_value)` and collecting results.
     ///
     /// This is more efficient than `collect_bool` when you need to read the current bit value,
diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs
index 829b0fcf9cc..79ea435cae7 100644
--- a/vortex-buffer/src/bit/buf_mut.rs
+++ b/vortex-buffer/src/bit/buf_mut.rs
@@ -11,6 +11,8 @@ use crate::ByteBufferMut;
 use crate::bit::collect_bool_words;
 use crate::bit::get_bit_unchecked;
 use crate::bit::ops;
+use crate::bit::pack_nonzero_bytes;
+use crate::bit::pack_slice_predicate;
 use crate::bit::set_bit_unchecked;
 use crate::bit::unset_bit_unchecked;
 use crate::buffer_mut;
@@ -204,6 +206,56 @@ impl BitBufferMut {
         }
     }
 
+    /// Build a `BitBufferMut` of length `values.len()`, setting bit `i` iff
+    /// `pred(&values[i])`.
+    ///
+    /// Prefer this over [`collect_bool`](Self::collect_bool) when the source is a
+    /// contiguous slice: it iterates the slice directly via [`pack_slice_predicate`]
+    /// rather than calling an index closure per element, which lets LLVM
+    /// auto-vectorize the compare and bit-pack. It is also safe — no `get_unchecked`
+    /// in the caller's predicate.
+    #[inline]
+    pub fn collect_bool_slice<T, F: FnMut(&T) -> bool>(values: &[T], pred: F) -> Self {
+        let len = values.len();
+        let num_words = len.div_ceil(64);
+        let mut buffer: BufferMut<u64> = BufferMut::with_capacity(num_words);
+        // SAFETY: `pack_slice_predicate` writes every word in `0..num_words` below
+        // before any read; `u64` has no invalid bit patterns.
+        unsafe { buffer.set_len(num_words) };
+        pack_slice_predicate(buffer.as_mut_slice(), values, pred);
+
+        let mut bytes = buffer.into_byte_buffer();
+        bytes.truncate(len.div_ceil(8));
+
+        Self {
+            buffer: bytes,
+            offset: 0,
+            len,
+        }
+    }
+
+    /// Build a `BitBufferMut` from a contiguous byte slice, setting bit `i` iff
+    /// `value[i] != 0`. Uses the SIMD [`pack_nonzero_bytes`] fast path.
+    #[inline]
+    pub fn from_nonzero_bytes(value: &[u8]) -> Self {
+        let len = value.len();
+        let num_words = len.div_ceil(64);
+        let mut buffer: BufferMut<u64> = BufferMut::with_capacity(num_words);
+        // SAFETY: `pack_nonzero_bytes` writes every word in `0..num_words` below
+        // before any read; `u64` has no invalid bit patterns.
+        unsafe { buffer.set_len(num_words) };
+        pack_nonzero_bytes(buffer.as_mut_slice(), value);
+
+        let mut bytes = buffer.into_byte_buffer();
+        bytes.truncate(len.div_ceil(8));
+
+        Self {
+            buffer: bytes,
+            offset: 0,
+            len,
+        }
+    }
+
     /// Return the underlying byte buffer.
     pub fn inner(&self) -> &ByteBufferMut {
         &self.buffer
@@ -563,14 +615,19 @@ impl Not for BitBufferMut {
 
 impl From<&[bool]> for BitBufferMut {
     fn from(value: &[bool]) -> Self {
-        BitBufferMut::collect_bool(value.len(), |i| value[i])
+        // SAFETY: `bool` is a single byte guaranteed to be 0 or 1, so it is
+        // sound to view `&[bool]` as `&[u8]`; `pack_nonzero_bytes` sets bit `i`
+        // iff the byte is non-zero, matching `value[i] == true`.
+        let bytes =
+            unsafe { core::slice::from_raw_parts(value.as_ptr().cast::<u8>(), value.len()) };
+        BitBufferMut::from_nonzero_bytes(bytes)
     }
 }
 
 // allow building a buffer from a set of truthy byte values.
 impl From<&[u8]> for BitBufferMut {
     fn from(value: &[u8]) -> Self {
-        BitBufferMut::collect_bool(value.len(), |i| value[i] > 0)
+        BitBufferMut::from_nonzero_bytes(value)
     }
 }
 
diff --git a/vortex-buffer/src/bit/mod.rs b/vortex-buffer/src/bit/mod.rs
index 41bb5797266..c262b728871 100644
--- a/vortex-buffer/src/bit/mod.rs
+++ b/vortex-buffer/src/bit/mod.rs
@@ -14,6 +14,7 @@ mod count_ones;
 mod macros;
 mod meta;
 mod ops;
+mod pack_simd;
 mod select;
 mod view;
 
@@ -27,6 +28,7 @@ pub use arrow_buffer::bit_iterator::BitSliceIterator;
 pub use buf::*;
 pub use buf_mut::*;
 pub use meta::*;
+pub use pack_simd::pack_nonzero_bytes;
 pub use view::*;
 
 /// Packs up to 64 boolean values into a little-endian `u64` word.
@@ -74,6 +76,61 @@ where
     }
 }
 
+/// Pack one chunk of at most 64 elements into a little-endian `u64`, LSB-first,
+/// by applying `pred` to each element. Iterates the slice directly so there is no
+/// per-element bounds check.
+#[inline]
+fn pack_chunk<T, F>(chunk: &[T], pred: &mut F) -> u64
+where
+    F: FnMut(&T) -> bool,
+{
+    let mut packed = 0u64;
+    for (bit_idx, value) in chunk.iter().enumerate() {
+        packed |= (pred(value) as u64) << bit_idx;
+    }
+    packed
+}
+
+/// Pack a per-element predicate over `values` into the prefix of `words`, LSB-first,
+/// 64 bits per `u64`. Writes via `=` (not `|=`), so the destination need not be
+/// zero-initialised. `words` must have capacity for at least
+/// `values.len().div_ceil(64)` entries.
+///
+/// Unlike [`collect_bool_words`], whose index closure forces a bounds-checked
+/// `values[i]` in the caller's predicate, this iterates the slice directly through
+/// `chunks_exact`. With the per-element bounds check elided and a fixed 64-element
+/// inner trip count, LLVM can auto-vectorize the scalar shift-OR loop — which the
+/// index-closure form prevents. This stays scalar (no SIMD intrinsics), so it does
+/// not reach [`pack_nonzero_bytes`] speed, but it recovers the bounds-check overhead.
+#[inline]
+pub fn pack_slice_predicate<T, F>(words: &mut [u64], values: &[T], mut pred: F)
+where
+    F: FnMut(&T) -> bool,
+{
+    let num_words = values.len().div_ceil(64);
+    assert!(
+        words.len() >= num_words,
+        "words slice has {} entries, need at least {num_words}",
+        words.len(),
+    );
+
+    let mut chunks = values.chunks_exact(64);
+    let mut words_iter = words.iter_mut();
+    // `chunks` must lead the zip: if it leads and is empty, zip short-circuits
+    // without consuming a slot from `words_iter`, leaving it positioned for the
+    // remainder write below. (The reverse order would discard a `words` slot.)
+    for (chunk, word) in chunks.by_ref().zip(words_iter.by_ref()) {
+        *word = pack_chunk(chunk, &mut pred);
+    }
+
+    let remainder = chunks.remainder();
+    if !remainder.is_empty()
+        && let Some(word) = words_iter.next()
+    {
+        *word = pack_chunk(remainder, &mut pred);
+    }
+}
+
 /// Splice a packed word `w` (whose bits above the highest valid bit are zero) into
 /// `words` at the given bit position.
 ///
@@ -176,8 +233,12 @@ pub unsafe fn unset_bit_unchecked(buf: *mut u8, index: usize) {
 
 #[cfg(test)]
 mod tests {
+    use rstest::rstest;
+
     use super::collect_bool_word;
+    use super::collect_bool_words;
     use super::pack_bools_into_words;
+    use super::pack_slice_predicate;
 
     #[test]
     fn collect_bool_word_packs_lsb_first() {
@@ -222,6 +283,32 @@ mod tests {
         }
     }
 
+    #[rstest]
+    #[case(0)]
+    #[case(1)]
+    #[case(63)]
+    #[case(64)]
+    #[case(65)]
+    #[case(130)]
+    #[case(200)]
+    fn pack_slice_predicate_matches_collect_bool_words(#[case] len: usize) {
+        #[allow(
+            clippy::cast_possible_truncation,
+            reason = "small test indices fit in i32"
+        )]
+        let values: Vec<i32> = (0..len as i32).map(|i| i.wrapping_mul(7) - 3).collect();
+        let pred = |v: i32| v % 5 == 0;
+
+        let num_words = len.div_ceil(64);
+        let mut expected = vec![0u64; num_words.max(1)];
+        collect_bool_words(&mut expected, len, |i| pred(values[i]));
+
+        let mut actual = vec![0u64; num_words.max(1)];
+        pack_slice_predicate(&mut actual, &values, |v| pred(*v));
+
+        assert_eq!(actual[..num_words], expected[..num_words], "len {len}");
+    }
+
     #[test]
     fn pack_bools_preserves_low_bits_of_leading_word() {
         let mut words = vec![0u64; 2];
diff --git a/vortex-buffer/src/bit/pack_simd.rs b/vortex-buffer/src/bit/pack_simd.rs
new file mode 100644
index 00000000000..df8704edb38
--- /dev/null
+++ b/vortex-buffer/src/bit/pack_simd.rs
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! SIMD packing of "truthy" bytes into an LSB-first bitmask.
+//!
+//! Packs `value.len()` bytes into bits where bit `i` is set iff `value[i] != 0`.
+//! This is the contiguous-slice fast path behind `BitBufferMut::from(&[u8])` and
+//! `BitBufferMut::from(&[bool])`.
+//!
+//! Expressed as a vector compare into an opmask, this lowers to one
+//! `vptestmb` + `kmovq` per 64 bytes on AVX-512BW. The scalar
+//! `packed |= (b != 0) << i` reduction loop does not: LLVM's SLP vectorizer
+//! rewrites it into a `vpsllvq` shift-OR reduction instead, which is ~10-20x
+//! slower for cache-resident inputs.
+
+/// Pack `value.len()` truthy bytes (`b != 0`) into `words`, LSB-first, 64 bits
+/// per `u64`. `words` must have at least `value.len().div_ceil(64)` entries.
+#[inline]
+pub fn pack_nonzero_bytes(words: &mut [u64], value: &[u8]) {
+    let num_words = value.len().div_ceil(64);
+    assert!(
+        words.len() >= num_words,
+        "words slice has {} entries, need at least {num_words}",
+        words.len(),
+    );
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw") {
+            // SAFETY: guarded by the runtime feature checks above.
+            return unsafe { pack_nonzero_bytes_avx512(words, value) };
+        }
+        if is_x86_feature_detected!("avx2") {
+            // SAFETY: guarded by the runtime feature check above.
+            return unsafe { pack_nonzero_bytes_avx2(words, value) };
+        }
+    }
+
+    pack_nonzero_bytes_scalar(words, value);
+}
+
+/// Portable fallback used directly on non-x86 targets and as the tail handler.
+#[inline]
+fn pack_nonzero_bytes_scalar(words: &mut [u64], value: &[u8]) {
+    let full = value.len() / 64;
+    for (word, chunk) in words.iter_mut().zip(value.chunks_exact(64)) {
+        let mut bits = 0u64;
+        for (i, &b) in chunk.iter().enumerate() {
+            bits |= ((b != 0) as u64) << i;
+        }
+        *word = bits;
+    }
+    if !value.len().is_multiple_of(64) {
+        let base = full * 64;
+        let mut bits = 0u64;
+        for (i, &b) in value[base..].iter().enumerate() {
+            bits |= ((b != 0) as u64) << i;
+        }
+        words[full] = bits;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f,avx512bw")]
+unsafe fn pack_nonzero_bytes_avx512(words: &mut [u64], value: &[u8]) {
+    use std::arch::x86_64::__m512i;
+    use std::arch::x86_64::_mm512_loadu_si512;
+    use std::arch::x86_64::_mm512_test_epi8_mask;
+
+    let full = value.len() / 64;
+    let ptr = value.as_ptr();
+    for (i, word) in words.iter_mut().take(full).enumerate() {
+        // SAFETY: i < full so the 64-byte load stays in bounds.
+        let v = unsafe { _mm512_loadu_si512(ptr.add(i * 64) as *const __m512i) };
+        // vptestmb: per-byte (v & v) != 0 -> 64-bit opmask; kmovq stores it.
+        *word = _mm512_test_epi8_mask(v, v);
+    }
+    if !value.len().is_multiple_of(64) {
+        pack_nonzero_bytes_scalar(&mut words[full..], &value[full * 64..]);
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn pack_nonzero_bytes_avx2(words: &mut [u64], value: &[u8]) {
+    use std::arch::x86_64::__m256i;
+    use std::arch::x86_64::_mm256_cmpeq_epi8;
+    use std::arch::x86_64::_mm256_loadu_si256;
+    use std::arch::x86_64::_mm256_movemask_epi8;
+    use std::arch::x86_64::_mm256_setzero_si256;
+
+    let full = value.len() / 64;
+    let ptr = value.as_ptr();
+    let zero = _mm256_setzero_si256();
+    for (i, word) in words.iter_mut().take(full).enumerate() {
+        // Two 32-byte halves; movemask gives 1 bit per byte. cmpeq vs zero
+        // marks zero bytes, so invert to mark non-zero bytes.
+        // SAFETY: i < full so both 32-byte loads stay in bounds.
+        let lo = unsafe { _mm256_loadu_si256(ptr.add(i * 64) as *const __m256i) };
+        // SAFETY: i < full so the second 32-byte load stays in bounds.
+        let hi = unsafe { _mm256_loadu_si256(ptr.add(i * 64 + 32) as *const __m256i) };
+        let lo_zero = _mm256_movemask_epi8(_mm256_cmpeq_epi8(lo, zero)) as u32;
+        let hi_zero = _mm256_movemask_epi8(_mm256_cmpeq_epi8(hi, zero)) as u32;
+        let bits = (!lo_zero as u64) | ((!hi_zero as u64) << 32);
+        *word = bits;
+    }
+    if !value.len().is_multiple_of(64) {
+        pack_nonzero_bytes_scalar(&mut words[full..], &value[full * 64..]);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn reference(value: &[u8]) -> Vec<u64> {
+        let mut w = vec![0u64; value.len().div_ceil(64)];
+        for (i, &b) in value.iter().enumerate() {
+            if b != 0 {
+                w[i / 64] |= 1u64 << (i % 64);
+            }
+        }
+        w
+    }
+
+    #[test]
+    fn matches_reference() {
+        for &n in &[0usize, 1, 7, 63, 64, 65, 127, 128, 200, 1000, 4096] {
+            // Mix of zero and varied non-zero bytes to exercise the != 0 test.
+            let data: Vec<u8> = (0..n)
+                .map(|i| {
+                    if i.is_multiple_of(5) {
+                        0
+                    } else {
+                        u8::try_from(i % 200 + 1).unwrap()
+                    }
+                })
+                .collect();
+            let mut got = vec![0u64; n.div_ceil(64)];
+            pack_nonzero_bytes(&mut got, &data);
+            assert_eq!(got, reference(&data), "mismatch at n={n}");
+        }
+    }
+}