-
Notifications
You must be signed in to change notification settings - Fork 145
Expand file tree
/
Copy pathmod.rs
More file actions
367 lines (325 loc) · 13.9 KB
/
mod.rs
File metadata and controls
367 lines (325 loc) · 13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
//! This module contains the VTable definitions for a Vortex encoding.
mod dyn_;
mod operations;
mod validity;
use std::fmt::Debug;
use std::hash::Hasher;
use std::ops::Deref;
pub use dyn_::*;
pub use operations::*;
pub use validity::*;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_panic;
use vortex_session::VortexSession;
use crate::ArrayRef;
use crate::Canonical;
use crate::DynArray;
use crate::ExecutionStep;
use crate::IntoArray;
use crate::Precision;
use crate::arrays::ConstantArray;
use crate::buffer::BufferHandle;
use crate::builders::ArrayBuilder;
use crate::dtype::DType;
use crate::executor::ExecutionCtx;
use crate::patches::Patches;
use crate::serde::ArrayChildren;
use crate::stats::StatsSetRef;
use crate::validity::Validity;
/// The array [`VTable`] encapsulates logic for an Array type within Vortex.
///
/// The logic is split across several "VTable" traits to enable easier code organization than
/// simply lumping everything into a single trait.
///
/// From this [`VTable`] trait, we derive implementations for the sealed [`DynArray`] and [`DynVTable`]
/// traits.
///
/// The functions defined in these vtable traits will typically document their pre- and
/// post-conditions. The pre-conditions are validated inside the [`DynArray`] and [`DynVTable`]
/// implementations so do not need to be checked in the vtable implementations (for example, index
/// out of bounds). Post-conditions are validated after invocation of the vtable function and will
/// panic if violated.
pub trait VTable: 'static + Sized + Send + Sync + Debug {
/// The array type that the functions of this vtable operate on.
///
/// It must dereference to `dyn DynArray` and be convertible into an array reference via
/// [`IntoArray`].
type Array: 'static + Send + Sync + Clone + Debug + Deref<Target = dyn DynArray> + IntoArray;
/// The metadata type of the encoding.
///
/// Values of this type are exported by [`VTable::metadata`], turned into bytes by
/// [`VTable::serialize`], read back by [`VTable::deserialize`] and consumed by
/// [`VTable::build`].
type Metadata: Debug;
/// The [`OperationsVTable`] implementation associated with this vtable.
type OperationsVTable: OperationsVTable<Self>;
/// The [`ValidityVTable`] implementation associated with this vtable.
type ValidityVTable: ValidityVTable<Self>;
/// Returns the ID of the array.
fn id(array: &Self::Array) -> ArrayId;
/// Returns the length of the array.
fn len(array: &Self::Array) -> usize;
/// Returns the DType of the array.
fn dtype(array: &Self::Array) -> &DType;
/// Returns the stats set for the array.
fn stats(array: &Self::Array) -> StatsSetRef<'_>;
/// Hashes the array contents into `state`, using the given `precision`.
fn array_hash<H: Hasher>(array: &Self::Array, state: &mut H, precision: Precision);
/// Compares two arrays of the same type for equality, using the given `precision`.
fn array_eq(array: &Self::Array, other: &Self::Array, precision: Precision) -> bool;
/// Returns the number of buffers in the array.
fn nbuffers(array: &Self::Array) -> usize;
/// Returns the buffer at the given index.
///
/// # Panics
/// Panics if `idx >= nbuffers(array)`.
fn buffer(array: &Self::Array, idx: usize) -> BufferHandle;
/// Returns the name of the buffer at the given index, or `None` if unnamed.
fn buffer_name(array: &Self::Array, idx: usize) -> Option<String>;
/// Returns the number of children in the array.
fn nchildren(array: &Self::Array) -> usize;
/// Returns the child at the given index.
///
/// # Panics
/// Panics if `idx >= nchildren(array)`.
fn child(array: &Self::Array, idx: usize) -> ArrayRef;
/// Returns the name of the child at the given index.
///
/// # Panics
/// Panics if `idx >= nchildren(array)`.
fn child_name(array: &Self::Array, idx: usize) -> String;
/// Exports metadata for an array.
///
/// * If the array does not contain metadata, it should return
/// [`crate::metadata::EmptyMetadata`].
fn metadata(array: &Self::Array) -> VortexResult<Self::Metadata>;
/// Serialize metadata into a byte buffer for IPC or file storage.
///
/// Returns `Ok(None)` if the array cannot be serialized.
fn serialize(metadata: Self::Metadata) -> VortexResult<Option<Vec<u8>>>;
/// Deserialize array metadata from a byte buffer.
///
/// To reduce the serialized form, arrays do not store their own DType and length. Instead,
/// this is passed down from the parent array during deserialization. These properties are
/// exposed here for use during deserialization.
fn deserialize(
bytes: &[u8],
_dtype: &DType,
_len: usize,
_buffers: &[BufferHandle],
_session: &VortexSession,
) -> VortexResult<Self::Metadata>;
/// Writes the array into a canonical builder.
///
/// The default implementation executes the array to its [`Canonical`] form and extends the
/// builder with the result.
///
/// ## Post-conditions
/// - The length of the builder is incremented by the length of the input array.
fn append_to_builder(
array: &Self::Array,
builder: &mut dyn ArrayBuilder,
ctx: &mut ExecutionCtx,
) -> VortexResult<()> {
// Canonicalize first, then hand the canonical array to the builder in one call.
let canonical = array.to_array().execute::<Canonical>(ctx)?.into_array();
builder.extend_from_array(&canonical);
Ok(())
}
/// Build an array from components.
///
/// This is called on the file and IPC deserialization pathways, to reconstruct the array from
/// type-erased components.
///
/// Encoding implementers should take note that all validation necessary to ensure the encoding
/// is safe to read should happen inside of this method.
///
/// # Safety and correctness
///
/// This method should *never* panic, it must always return an error or else it returns a
/// valid `Array` that meets all the encoding's preconditions.
///
/// For example, the `build` implementation for a dictionary encoding should ensure that all
/// codes lie in the valid range. For a UTF-8 array, it should check the bytes to ensure they
/// are all valid string data bytes. Any corrupt files or malformed data buffers should be
/// caught here, before returning the deserialized array.
///
/// # Validation
///
/// Validation is mainly meant to ensure that all internal pointers in the encoding reference
/// valid ranges of data, and that all data conforms to its DType constraints. These ensure
/// that no array operations will panic at runtime, or yield undefined behavior when unsafe
/// operations like `get_unchecked` use indices in the array buffer.
///
/// Examples of the kinds of validation that should be part of the `build` step:
///
/// * Checking that any offsets buffers point to valid offsets in some other child array
/// * Checking that any buffers for data or validity have the appropriate size for the
/// encoding
/// * Running UTF-8 validation for any buffers that are expected to hold flat UTF-8 data
// TODO(ngates): take the parts by ownership, since most arrays need them anyway
fn build(
dtype: &DType,
len: usize,
metadata: &Self::Metadata,
buffers: &[BufferHandle],
children: &dyn ArrayChildren,
) -> VortexResult<Self::Array>;
/// Replaces the children in `array` with `children`.
///
/// The number of children must stay the same, and each child must be of the type the
/// encoding expects at that position.
fn with_children(array: &mut Self::Array, children: Vec<ArrayRef>) -> VortexResult<()>;
/// Replaces the buffers in `array` with `buffers`.
///
/// The default implementation rebuilds the array via [`build()`](VTable::build), which
/// re-runs all validation. This is correct for replacing lazy device buffers with
/// materialized host buffers.
fn with_buffers(array: &mut Self::Array, buffers: Vec<BufferHandle>) -> VortexResult<()> {
// Capture everything except the buffers from the existing array so that `build` can
// reconstruct it around the replacement buffers.
let metadata = Self::metadata(array)?;
let dtype = Self::dtype(array).clone();
let len = Self::len(array);
let children: Vec<ArrayRef> = (0..Self::nchildren(array))
.map(|i| Self::child(array, i))
.collect();
// NOTE(review): `&children_slice` is a `&&[ArrayRef]` passed where `&dyn ArrayChildren` is
// expected; presumably `ArrayChildren` is implemented for `&[ArrayRef]` (confirm in
// `crate::serde`).
let children_slice: &[ArrayRef] = &children;
// The array is only overwritten if `build` succeeds; on error `?` returns first.
*array = Self::build(&dtype, len, &metadata, &buffers, &children_slice)?;
Ok(())
}
/// Execute this array by returning an [`ExecutionStep`] that tells the scheduler what to
/// do next.
///
/// Instead of recursively executing children, implementations should return
/// [`ExecutionStep::ExecuteChild`] to request that the scheduler execute a child first,
/// or [`ExecutionStep::Done`] when the encoding can produce a result directly.
///
/// Array execution is designed such that repeated execution of an array will eventually
/// converge to a canonical representation. Implementations of this function should therefore
/// ensure they make progress towards that goal.
///
/// The returned array (in `Done`) must be logically equivalent to the input array. In other
/// words, the recursively canonicalized forms of both arrays must be equal.
///
/// Debug builds will panic if the returned array is of the wrong type, wrong length, or
/// incorrectly contains null values.
fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep>;
/// Attempt to execute the parent of this array.
///
/// This function allows arrays to plug in specialized execution logic for their parent. For
/// example, strings compressed as FSST arrays can implement a custom equality comparison when
/// comparing against a scalar string.
///
/// Returns `Ok(None)` if no specialized execution is possible. The default implementation
/// always returns `Ok(None)`.
fn execute_parent(
array: &Self::Array,
parent: &ArrayRef,
child_idx: usize,
ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>> {
// The default ignores its arguments; the assignment marks them as intentionally unused.
_ = (array, parent, child_idx, ctx);
Ok(None)
}
/// Attempt to reduce the array to a simpler representation.
///
/// Returns `Ok(None)` if no reduction is possible. The default implementation always
/// returns `Ok(None)`.
fn reduce(array: &Self::Array) -> VortexResult<Option<ArrayRef>> {
// The default ignores its argument; the assignment marks it as intentionally unused.
_ = array;
Ok(None)
}
/// Attempt to perform a reduction of the parent of this array.
///
/// This function allows arrays to plug in reduction rules to their parents, for example
/// run-end arrays can pull-down scalar functions and apply them only over their values.
///
/// Returns `Ok(None)` if no reduction is possible. The default implementation always
/// returns `Ok(None)`.
fn reduce_parent(
array: &Self::Array,
parent: &ArrayRef,
child_idx: usize,
) -> VortexResult<Option<ArrayRef>> {
// The default ignores its arguments; the assignment marks them as intentionally unused.
_ = (array, parent, child_idx);
Ok(None)
}
}
/// Placeholder type used to indicate that a particular vtable is not supported by the encoding.
///
/// This is a unit struct: it carries no data and exists only as a type-level marker.
pub struct NotSupported;
/// Converts a [`Validity`] into the child array that represents it, if there is one.
///
/// - `NonNullable` and `AllValid` have no child, so `None` is returned.
/// - `AllInvalid` becomes a `ConstantArray` of `false` with `len` elements.
/// - `Array` yields a clone of the wrapped validity array.
#[inline]
pub fn validity_to_child(validity: &Validity, len: usize) -> Option<ArrayRef> {
    let child = match validity {
        Validity::Array(array) => array.clone(),
        Validity::AllInvalid => ConstantArray::new(false, len).into_array(),
        // Neither of these variants is materialized as a child.
        Validity::NonNullable | Validity::AllValid => return None,
    };
    Some(child)
}
/// Counts the children contributed by a [`Validity`]: 1 if it produces a child, otherwise 0.
///
/// `AllInvalid` and `Array` produce a child; `NonNullable` and `AllValid` do not.
#[inline]
pub fn validity_nchildren(validity: &Validity) -> usize {
    let has_child = match validity {
        Validity::AllInvalid | Validity::Array(_) => true,
        Validity::NonNullable | Validity::AllValid => false,
    };
    usize::from(has_child)
}
/// Returns the number of children produced by patches.
///
/// The result is 2, or 3 when the patches carry chunk offsets.
#[inline]
pub fn patches_nchildren(patches: &Patches) -> usize {
    if patches.chunk_offsets().is_some() {
        3
    } else {
        2
    }
}
/// Looks up a child of a patches component by position.
///
/// Index 0 = patch_indices, 1 = patch_values, 2 = patch_chunk_offsets (if present).
///
/// # Panics
/// Panics if `idx` is 2 and the patches have no chunk offsets, or if `idx` is greater than 2.
#[inline]
pub fn patches_child(patches: &Patches, idx: usize) -> ArrayRef {
    // Pick the borrowed child first, then clone exactly once on the way out.
    let selected = match idx {
        0 => patches.indices(),
        1 => patches.values(),
        2 => patches
            .chunk_offsets()
            .as_ref()
            .vortex_expect("patch_chunk_offsets child out of bounds"),
        _ => vortex_panic!("patches child index {idx} out of bounds"),
    };
    selected.clone()
}
/// Returns the name of the child at the given index within a patches component.
///
/// Index 0 = `"patch_indices"`, 1 = `"patch_values"`, 2 = `"patch_chunk_offsets"`.
///
/// # Panics
/// Panics if `idx` is greater than 2.
#[inline]
pub fn patches_child_name(idx: usize) -> &'static str {
    // Names in child order; the position in this table is the child index.
    const NAMES: [&str; 3] = ["patch_indices", "patch_values", "patch_chunk_offsets"];
    match NAMES.get(idx) {
        Some(&name) => name,
        None => vortex_panic!("patches child name index {idx} out of bounds"),
    }
}
/// Generates the boilerplate conversions between a concrete `<Base>Array` type and the
/// type-erased array representation.
///
/// Two forms are accepted:
///
/// * `vtable!(V)` is shorthand for `vtable!(V, V)`.
/// * `vtable!(Base, VT)` targets the array type named `<Base>Array` (the name is assembled with
///   `paste`) and the vtable type `VT`.
///
/// For the `<Base>Array` type the macro emits:
///
/// * `AsRef<dyn DynArray>` and `Deref<Target = dyn DynArray>`, both implemented by casting
///   `&<Base>Array` to `&ArrayAdapter<VT>`;
/// * `IntoArray`, implemented by transmuting the value into `ArrayAdapter<VT>` and wrapping it
///   in an `Arc`;
/// * `From<<Base>Array> for ArrayRef`, delegating to `into_array()`;
/// * a deprecated inherent `to_array(&self)` that clones and then calls `into_array()`.
///
/// # Safety
///
/// The generated code contains `unsafe` casts and a `transmute` between `<Base>Array` and
/// `ArrayAdapter<VT>`. NOTE(review): this presumably requires `ArrayAdapter<VT>` to have exactly
/// the same layout as `<Base>Array` (for example via `#[repr(transparent)]`) and `VT`'s
/// associated array type to be `<Base>Array`; neither is visible in this file, so confirm at the
/// `ArrayAdapter` definition.
#[macro_export]
macro_rules! vtable {
// Single-identifier form: the same identifier names both the array base and the vtable.
($V:ident) => {
$crate::vtable!($V, $V);
};
($Base:ident, $VT:ident) => {
$crate::aliases::paste::paste! {
impl AsRef<dyn $crate::DynArray> for [<$Base Array>] {
fn as_ref(&self) -> &dyn $crate::DynArray {
// Reinterpret `&Self` as `&ArrayAdapter<$VT>`, which then unsizes to `&dyn DynArray`.
// SAFETY (assumed, see macro docs): relies on `ArrayAdapter<$VT>` and `Self` sharing a layout.
unsafe { &*(self as *const [<$Base Array>] as *const $crate::ArrayAdapter<$VT>) }
}
}
impl std::ops::Deref for [<$Base Array>] {
type Target = dyn $crate::DynArray;
fn deref(&self) -> &Self::Target {
// Same pointer cast as in `as_ref` above.
// SAFETY (assumed, see macro docs): relies on `ArrayAdapter<$VT>` and `Self` sharing a layout.
unsafe { &*(self as *const [<$Base Array>] as *const $crate::ArrayAdapter<$VT>) }
}
}
impl $crate::IntoArray for [<$Base Array>] {
fn into_array(self) -> $crate::ArrayRef {
// Move `self` into an `ArrayAdapter<$VT>` by value and wrap it in an `Arc`.
// SAFETY (assumed, see macro docs): `transmute` additionally requires both types to have the same size.
std::sync::Arc::new(unsafe { std::mem::transmute::<[<$Base Array>], $crate::ArrayAdapter::<$VT>>(self) })
}
}
impl From<[<$Base Array>]> for $crate::ArrayRef {
fn from(value: [<$Base Array>]) -> $crate::ArrayRef {
use $crate::IntoArray;
value.into_array()
}
}
impl [<$Base Array>] {
#[deprecated(note = "use `.into_array()` (owned) or `.clone().into_array()` (ref) to make clones explicit")]
pub fn to_array(&self) -> $crate::ArrayRef {
use $crate::IntoArray;
// Clones the array before converting, which is why callers are steered to the explicit forms.
self.clone().into_array()
}
}
}
};
}