diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index d5ab9cc61f8..289f24b7a35 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -2260,6 +2260,10 @@ impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::List pub fn vortex_array::arrays::List::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::List>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> +impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView + +pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::Primitive pub fn vortex_array::arrays::Primitive::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::Primitive>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> @@ -3026,6 +3030,10 @@ impl vortex_array::ValidityVTable for vortex_arr pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult +impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView + +pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult> @@ -6034,6 +6042,10 @@ impl vortex_array::ValidityVTable for vortex_arr pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult +impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView + +pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult> diff --git a/vortex-array/src/arrays/dict/execute.rs b/vortex-array/src/arrays/dict/execute.rs index d95cd239a44..19eb6d56b3d 100644 --- a/vortex-array/src/arrays/dict/execute.rs +++ b/vortex-array/src/arrays/dict/execute.rs @@ -46,7 +46,7 @@ pub fn take_canonical( Canonical::Primitive(a) => Canonical::Primitive(take_primitive(&a, codes, ctx)), Canonical::Decimal(a) => Canonical::Decimal(take_decimal(&a, codes, ctx)), Canonical::VarBinView(a) => Canonical::VarBinView(take_varbinview(&a, codes, ctx)), - Canonical::List(a) => Canonical::List(take_listview(&a, codes)), + Canonical::List(a) => Canonical::List(take_listview(&a, codes, ctx)), Canonical::FixedSizeList(a) => { Canonical::FixedSizeList(take_fixed_size_list(&a, codes, ctx)) } @@ -123,12 +123,16 @@ fn take_varbinview( .into_owned() } -fn take_listview(array: &ListViewArray, codes: &PrimitiveArray) -> ListViewArray { +fn take_listview( + array: &ListViewArray, + codes: &PrimitiveArray, + ctx: &mut ExecutionCtx, +) -> ListViewArray { let codes_ref = codes.clone().into_array(); let array = array.as_view(); - ::take(array, &codes_ref) - .vortex_expect("take listview array") - .vortex_expect("take listview should not return None") + ::take(array, &codes_ref, ctx) + .vortex_expect("take listview execute") + .vortex_expect("ListView TakeExecute should not return None") .as_::() .into_owned() } diff --git a/vortex-array/src/arrays/filter/execute/listview.rs b/vortex-array/src/arrays/filter/execute/listview.rs index 1c0cd000c10..6f782a50531 100644 --- a/vortex-array/src/arrays/filter/execute/listview.rs +++ b/vortex-array/src/arrays/filter/execute/listview.rs @@ -9,21 +9,10 @@ use vortex_mask::MaskValues; use crate::arrays::ListViewArray; use crate::arrays::filter::execute::filter_validity; use crate::arrays::filter::execute::values_to_mask; +use crate::arrays::listview; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewRebuildMode; -// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators. -/// The threshold for triggering a rebuild of the [`ListViewArray`]. -/// -/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it -/// can be potentially expensive to reorganize the array based on what views we have into it. -/// -/// However, we also do not want to carry around a large amount of garbage data. Below this -/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing -/// any garbage data. -#[expect(unused)] -const REBUILD_DENSITY_THRESHOLD: f64 = 0.1; - /// [`ListViewArray`] filter implementation. /// /// This implementation is deliberately simple and read-optimized. We just filter the `offsets` and @@ -70,13 +59,14 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc) ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) }; - // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` - // compute functions have run, at the "top" of the operator tree. However, we cannot do this - // right now, so we will just rebuild every time (similar to `ListArray`). - - new_array - .rebuild(ListViewRebuildMode::MakeZeroCopyToList) - .vortex_expect("ListViewArray rebuild to zero-copy List should always succeed") + let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32; + if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD { + new_array + .rebuild(ListViewRebuildMode::MakeZeroCopyToList) + .vortex_expect("ListViewArray rebuild to zero-copy List should always succeed") + } else { + new_array + } } #[cfg(test)] diff --git a/vortex-array/src/arrays/listview/compute/mod.rs b/vortex-array/src/arrays/listview/compute/mod.rs index 9a43503c4b5..9321e82c769 100644 --- a/vortex-array/src/arrays/listview/compute/mod.rs +++ b/vortex-array/src/arrays/listview/compute/mod.rs @@ -6,3 +6,4 @@ mod mask; pub(crate) mod rules; mod slice; mod take; +pub(crate) use take::REBUILD_DENSITY_THRESHOLD; diff --git a/vortex-array/src/arrays/listview/compute/take.rs b/vortex-array/src/arrays/listview/compute/take.rs index 69f40a400a4..8068c8e674f 100644 --- a/vortex-array/src/arrays/listview/compute/take.rs +++ b/vortex-array/src/arrays/listview/compute/take.rs @@ -5,10 +5,12 @@ use num_traits::Zero; use vortex_error::VortexResult; use crate::ArrayRef; +use crate::ExecutionCtx; use crate::IntoArray; use crate::array::ArrayView; use crate::arrays::ListView; use crate::arrays::ListViewArray; +use crate::arrays::dict::TakeExecute; use crate::arrays::dict::TakeReduce; use crate::arrays::listview::ListViewArrayExt; use crate::arrays::listview::ListViewRebuildMode; @@ -17,72 +19,91 @@ use crate::dtype::Nullability; use crate::match_each_integer_ptype; use crate::scalar::Scalar; -// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators. -/// The threshold for triggering a rebuild of the [`ListViewArray`]. +/// The threshold below which we rebuild the elements of a listview. /// -/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it -/// can be potentially expensive to reorganize the array based on what views we have into it. +/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive. +/// However, we also don't want to drag around a large amount of garbage data when the selection +/// is sparse. Below this fraction of list rows retained, the rebuild is worth it. +/// Rebuilding is needed when exporting the ListView's elements. /// -/// However, we also do not want to carry around a large amount of garbage data. Below this -/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing -/// any garbage data. -#[expect(unused)] -const REBUILD_DENSITY_THRESHOLD: f64 = 0.1; +// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` +// compute functions have run, at the "top" of the operator tree. However, we cannot do this +// right now, so we will just rebuild every time (similar to [`ListArray`]). +pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1; -/// [`ListViewArray`] take implementation. -/// -/// This implementation is deliberately simple and read-optimized. We just take the `offsets` and -/// `sizes` at the requested indices and reuse the original `elements` array. This works because -/// `ListView` (unlike `List`) allows non-contiguous and out-of-order lists. -/// -/// We don't slice the `elements` array because it would require computing min/max offsets and -/// adjusting all offsets accordingly, which is not really worth the small potential memory we would -/// be able to get back. -/// -/// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable since -/// we're optimizing for read performance and the data isn't being copied. +/// Metadata-only take for [`ListViewArray`]. impl TakeReduce for ListView { fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult> { - let elements = array.elements(); - let offsets = array.offsets(); - let sizes = array.sizes(); + // Approximate element density by the fraction of list rows retained. Assumes roughly + // uniform list sizes; good enough to decide whether dragging along the full `elements` + // buffer is worth avoiding a rebuild. + let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; + if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { + return Ok(None); + } + + Ok(Some(apply_take(array, indices)?.into_array())) + } +} - // Compute the new validity by combining the array's validity with the indices' validity. - let new_validity = array.validity()?.take(indices)?; +/// Execution-path take for [`ListViewArray`]. +/// +/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the +/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`. +impl TakeExecute for ListView { + fn take( + array: ArrayView<'_, ListView>, + indices: &ArrayRef, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32; + let taken = apply_take(array, indices)?; + + if kept_row_fraction < REBUILD_DENSITY_THRESHOLD { + // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` + // compute functions have run, at the "top" of the operator tree. However, we cannot do + // this right now, so we will just rebuild every time (similar to `ListArray`). + Ok(Some( + taken + .rebuild(ListViewRebuildMode::MakeZeroCopyToList)? + .into_array(), + )) + } else { + Ok(Some(taken.into_array())) + } + } +} - // Take the offsets and sizes arrays at the requested indices. - // Take can reorder offsets, create gaps, and may introduce overlaps if the `indices` - // contain duplicates. - let nullable_new_offsets = offsets.take(indices.clone())?; - let nullable_new_sizes = sizes.take(indices.clone())?; +/// Shared metadata-only take: take `offsets`, `sizes` and `validity` at `indices` while reusing +/// the original `elements` buffer as-is. +fn apply_take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult { + let elements = array.elements(); + let offsets = array.offsets(); + let sizes = array.sizes(); - // Since `take` returns nullable arrays, we simply cast it back to non-nullable (filled with - // zeros to represent null lists). - let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| { - nullable_new_offsets - .fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))? - }); - let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| { - nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))? - }); - // SAFETY: Take operation maintains all `ListViewArray` invariants: - // - `new_offsets` and `new_sizes` are derived from existing valid child arrays. - // - `new_offsets` and `new_sizes` are non-nullable. - // - `new_offsets` and `new_sizes` have the same length (both taken with the same - // `indices`). - // - Validity correctly reflects the combination of array and indices validity. - let new_array = unsafe { - ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) - }; + // Combine the array's validity with the indices' validity. + let new_validity = array.validity()?.take(indices)?; - // TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter` - // compute functions have run, at the "top" of the operator tree. However, we cannot do this - // right now, so we will just rebuild every time (similar to `ListArray`). + // Take can reorder offsets, create gaps, and may introduce overlaps if `indices` contain + // duplicates. + let nullable_new_offsets = offsets.take(indices.clone())?; + let nullable_new_sizes = sizes.take(indices.clone())?; - Ok(Some( - new_array - .rebuild(ListViewRebuildMode::MakeZeroCopyToList)? - .into_array(), - )) - } + // `take` returns nullable arrays; cast back to non-nullable (filling with zeros to represent + // the null lists — the validity mask tracks nullness separately). + let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| { + nullable_new_offsets.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))? + }); + let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| { + nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))? + }); + + // SAFETY: Take operation maintains all `ListViewArray` invariants: + // - `new_offsets` and `new_sizes` are derived from existing valid child arrays. + // - `new_offsets` and `new_sizes` are non-nullable. + // - `new_offsets` and `new_sizes` have the same length (both taken with the same `indices`). + // - Validity correctly reflects the combination of array and indices validity. + Ok(unsafe { + ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity) + }) }