Skip to content
12 changes: 12 additions & 0 deletions vortex-array/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2260,6 +2260,10 @@ impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::List

pub fn vortex_array::arrays::List::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::List>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView

pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::Primitive

pub fn vortex_array::arrays::Primitive::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::Primitive>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
Expand Down Expand Up @@ -3026,6 +3030,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr

pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>

impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView

pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView

pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
Expand Down Expand Up @@ -6034,6 +6042,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr

pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>

impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView

pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView

pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
Expand Down
14 changes: 9 additions & 5 deletions vortex-array/src/arrays/dict/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub fn take_canonical(
Canonical::Primitive(a) => Canonical::Primitive(take_primitive(&a, codes, ctx)),
Canonical::Decimal(a) => Canonical::Decimal(take_decimal(&a, codes, ctx)),
Canonical::VarBinView(a) => Canonical::VarBinView(take_varbinview(&a, codes, ctx)),
Canonical::List(a) => Canonical::List(take_listview(&a, codes)),
Canonical::List(a) => Canonical::List(take_listview(&a, codes, ctx)),
Canonical::FixedSizeList(a) => {
Canonical::FixedSizeList(take_fixed_size_list(&a, codes, ctx))
}
Expand Down Expand Up @@ -123,12 +123,16 @@ fn take_varbinview(
.into_owned()
}

fn take_listview(array: &ListViewArray, codes: &PrimitiveArray) -> ListViewArray {
fn take_listview(
array: &ListViewArray,
codes: &PrimitiveArray,
ctx: &mut ExecutionCtx,
) -> ListViewArray {
let codes_ref = codes.clone().into_array();
let array = array.as_view();
<ListView as TakeReduce>::take(array, &codes_ref)
.vortex_expect("take listview array")
.vortex_expect("take listview should not return None")
<ListView as TakeExecute>::take(array, &codes_ref, ctx)
.vortex_expect("take listview execute")
.vortex_expect("ListView TakeExecute should not return None")
.as_::<ListView>()
.into_owned()
}
Expand Down
28 changes: 9 additions & 19 deletions vortex-array/src/arrays/filter/execute/listview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,10 @@ use vortex_mask::MaskValues;
use crate::arrays::ListViewArray;
use crate::arrays::filter::execute::filter_validity;
use crate::arrays::filter::execute::values_to_mask;
use crate::arrays::listview;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;

// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
/// The threshold for triggering a rebuild of the [`ListViewArray`].
///
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
/// can be potentially expensive to reorganize the array based on what views we have into it.
///
/// However, we also do not want to carry around a large amount of garbage data. Below this
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
/// any garbage data.
#[expect(unused)]
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;

/// [`ListViewArray`] filter implementation.
///
/// This implementation is deliberately simple and read-optimized. We just filter the `offsets` and
Expand Down Expand Up @@ -70,13 +59,14 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc<MaskValues>)
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
};

// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
// right now, so we will just rebuild every time (similar to `ListArray`).

new_array
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32;
if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD {
new_array
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
} else {
new_array
}
}

#[cfg(test)]
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/arrays/listview/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ mod mask;
pub(crate) mod rules;
mod slice;
mod take;
pub(crate) use take::REBUILD_DENSITY_THRESHOLD;
137 changes: 79 additions & 58 deletions vortex-array/src/arrays/listview/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ use num_traits::Zero;
use vortex_error::VortexResult;

use crate::ArrayRef;
use crate::ExecutionCtx;
use crate::IntoArray;
use crate::array::ArrayView;
use crate::arrays::ListView;
use crate::arrays::ListViewArray;
use crate::arrays::dict::TakeExecute;
use crate::arrays::dict::TakeReduce;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;
Expand All @@ -17,72 +19,91 @@ use crate::dtype::Nullability;
use crate::match_each_integer_ptype;
use crate::scalar::Scalar;

// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
/// The threshold for triggering a rebuild of the [`ListViewArray`].
/// The threshold below which we rebuild the elements of a listview.
///
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
/// can be potentially expensive to reorganize the array based on what views we have into it.
/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive.
/// However, we also don't want to drag around a large amount of garbage data when the selection
/// is sparse. Below this fraction of list rows retained, the rebuild is worth it.
/// Rebuilding is needed when exporting the ListView's elements.
///
/// However, we also do not want to carry around a large amount of garbage data. Below this
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
/// any garbage data.
#[expect(unused)]
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
// right now, so we will just rebuild every time (similar to [`ListArray`]).
pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;

/// [`ListViewArray`] take implementation.
///
/// This implementation is deliberately simple and read-optimized. We just take the `offsets` and
/// `sizes` at the requested indices and reuse the original `elements` array. This works because
/// `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
///
/// We don't slice the `elements` array because it would require computing min/max offsets and
/// adjusting all offsets accordingly, which is not really worth the small potential memory we would
/// be able to get back.
///
/// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable since
/// we're optimizing for read performance and the data isn't being copied.
/// Metadata-only take for [`ListViewArray`].
impl TakeReduce for ListView {
fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
let elements = array.elements();
let offsets = array.offsets();
let sizes = array.sizes();
// Approximate element density by the fraction of list rows retained. Assumes roughly
// uniform list sizes; good enough to decide whether dragging along the full `elements`
// buffer is worth avoiding a rebuild.
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
return Ok(None);
}

Ok(Some(apply_take(array, indices)?.into_array()))
}
}

// Compute the new validity by combining the array's validity with the indices' validity.
let new_validity = array.validity()?.take(indices)?;
/// Execution-path take for [`ListViewArray`].
///
/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the
/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`.
impl TakeExecute for ListView {
fn take(
array: ArrayView<'_, ListView>,
indices: &ArrayRef,
_ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>> {
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
let taken = apply_take(array, indices)?;

if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do
// this right now, so we will just rebuild every time (similar to `ListArray`).
Ok(Some(
taken
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
.into_array(),
))
} else {
Ok(Some(taken.into_array()))
}
}
}

// Take the offsets and sizes arrays at the requested indices.
// Take can reorder offsets, create gaps, and may introduce overlaps if the `indices`
// contain duplicates.
let nullable_new_offsets = offsets.take(indices.clone())?;
let nullable_new_sizes = sizes.take(indices.clone())?;
/// Shared metadata-only take: take `offsets`, `sizes` and `validity` at `indices` while reusing
/// the original `elements` buffer as-is.
fn apply_take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<ListViewArray> {
let elements = array.elements();
let offsets = array.offsets();
let sizes = array.sizes();

// Since `take` returns nullable arrays, we simply cast it back to non-nullable (filled with
// zeros to represent null lists).
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
nullable_new_offsets
.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
});
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
});
// SAFETY: Take operation maintains all `ListViewArray` invariants:
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
// - `new_offsets` and `new_sizes` are non-nullable.
// - `new_offsets` and `new_sizes` have the same length (both taken with the same
// `indices`).
// - Validity correctly reflects the combination of array and indices validity.
let new_array = unsafe {
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
};
// Combine the array's validity with the indices' validity.
let new_validity = array.validity()?.take(indices)?;

// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
// right now, so we will just rebuild every time (similar to `ListArray`).
// Take can reorder offsets, create gaps, and may introduce overlaps if `indices` contain
// duplicates.
let nullable_new_offsets = offsets.take(indices.clone())?;
let nullable_new_sizes = sizes.take(indices.clone())?;

Ok(Some(
new_array
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
.into_array(),
))
}
// `take` returns nullable arrays; cast back to non-nullable (filling with zeros to represent
// the null lists — the validity mask tracks nullness separately).
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
nullable_new_offsets.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
});
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
});

// SAFETY: Take operation maintains all `ListViewArray` invariants:
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
// - `new_offsets` and `new_sizes` are non-nullable.
// - `new_offsets` and `new_sizes` have the same length (both taken with the same `indices`).
// - Validity correctly reflects the combination of array and indices validity.
Ok(unsafe {
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
})
}
Loading