66/// The plan builder walks an encoding tree and emits a linear sequence of
77/// stages. The kernel executes stages in order within a single launch.
88///
9- /// Shared memory: The plan builder bump-allocates shared memory regions for
10- /// each input stage's output. The output stage (last) is placed after all
11- /// input stages. Since all regions must coexist for the output stage to
12- /// reference, the total shared memory is the end of whichever region extends
13- /// furthest, in elements, times `sizeof(T)`.
9+ /// ## Stage plan
1410///
15- /// Example: RunEnd(ends=FoR(BitPacked), values=FoR(BitPacked)) with 100 runs
11+ /// The plan is packed as a variable-length byte buffer.
1612///
17- /// Stage 0 (input): BITUNPACK(7) → FoR(0) → smem[0..100) // run ends
18- /// Stage 1 (input): BITUNPACK(10) → FoR(50) → smem[100..200) // run values
19- /// Stage 2 (output): RUNEND(ends=0, values=100) → smem[200..1224) // resolved
20- ///
21- /// shared_mem_bytes = (200 + 1024) * sizeof(T)
13+ /// Layout (contiguous bytes):
14+ /// [PlanHeader]
15+ /// [PackedStage 0][ScalarOp × N0]
16+ /// [PackedStage 1][ScalarOp × N1]
17+ /// ...
2218
2319#pragma once
2420
2723/// Elements processed per CUDA block.
2824#define ELEMENTS_PER_BLOCK 2048
2925
30- /// Shared memory tile size for the output stage. Each block decompresses
31- /// ELEMENTS_PER_BLOCK elements but only holds SMEM_TILE_SIZE in smem at a
32- /// time — each tile is written to global memory before the next is decoded
33- /// into the same region. Input stages cannot tile because their outputs must
34- /// remain accessible for random access (e.g., dictionary lookup, run-end
35- /// binary search). Smaller tiles reduce smem per block, improving occupancy.
26+ /// Each tile is flushed to global before the next is decoded.
3627#define SMEM_TILE_SIZE 1024
3728
3829#ifdef __cplusplus
@@ -41,14 +32,13 @@ extern "C" {
4132
4233/// Parameters for source ops, which decode data into a stage's shared memory region.
4334union SourceParams {
44- /// Unpack bit-packed data using FastLanes layout .
35+ /// Unpack FastLanes bit-packed data.
4536 struct BitunpackParams {
4637 uint8_t bit_width ;
4738 uint32_t element_offset ; // Sub-byte offset
4839 } bitunpack ;
4940
50- /// Copy elements verbatim from global memory to shared memory.
51- /// The input pointer is pre-adjusted on the host to account for slicing.
41+ /// Copy from global to shared memory.
5242 struct LoadParams {
5343 uint8_t _placeholder ;
5444 } load ;
@@ -58,7 +48,7 @@ union SourceParams {
5848 uint32_t ends_smem_offset ; // element offset to decoded ends in smem
5949 uint32_t values_smem_offset ; // element offset to decoded values in smem
6050 uint64_t num_runs ;
61- uint64_t offset ;
51+ uint64_t offset ; // slice offset into the run-end encoded array
6252 } runend ;
6353
6454 /// Generate a linear sequence: `value[i] = base + i * multiplier`.
@@ -96,38 +86,62 @@ struct ScalarOp {
9686 union ScalarParams params ;
9787};
9888
99- #define MAX_SCALAR_OPS 4
100-
101- /// A single stage in the dispatch plan.
102- ///
103- /// Each stage is a pipeline (source + scalar ops) that writes decoded data
104- /// into a shared memory region at `smem_offset`. Input stage outputs persist
105- /// in smem so the output stage can reference them (via DICT or RUNEND offsets).
106- struct Stage {
/// Packed stage header, followed immediately in the plan buffer by
/// `num_scalar_ops` inline ScalarOps (see the layout diagram at the top of
/// this file). The next PackedStage begins right after that ScalarOp array.
struct PackedStage {
    uint64_t input_ptr;   // global memory pointer to this stage's encoded input
    uint32_t smem_offset; // element offset within dynamic shared memory for output
    uint32_t len;         // number of elements this stage produces

    struct SourceOp source;  // source decode op that feeds this stage
    uint8_t num_scalar_ops;  // count of ScalarOps trailing this struct in the buffer
};
11598
116- #define MAX_STAGES 4
117-
118- /// Dispatch plan: a sequence of stages.
119- ///
120- /// The plan builder walks the encoding tree recursively, emitting an input
121- /// stage each time it encounters a child array that needs to live in shared
122- /// memory (e.g., dictionary values, run-end endpoints). Shared memory
123- /// offsets are assigned with a simple bump allocator.
124- ///
125- /// The last stage is the output pipeline which directly writes to global memory.
126- struct DynamicDispatchPlan {
/// Header for the packed plan byte buffer.
///
/// NOTE(review): aligned(8) rounds sizeof(PlanHeader) up to 8, presumably so
/// the first PackedStage (which holds a uint64_t) starts 8-byte aligned when
/// placed directly after the header — confirm the plan builder relies on this.
/// This also requires the plan buffer itself to be 8-byte aligned.
struct __attribute__((aligned(8))) PlanHeader {
    uint8_t num_stages;       // number of PackedStages following this header
    uint16_t plan_size_bytes; // total size of the packed plan including this
                              // header; uint16_t caps a plan at 64 KiB - 1
};
130104
131105#ifdef __cplusplus
132106}
133107#endif
108+
109+ #ifdef __cplusplus
110+
111+ /// Stage parsed from the packed plan byte buffer.
112+ ///
113+ /// Input stages decode data (e.g. dict values, run-end endpoints) into a
114+ /// shared memory region for the output stage to reference. The output stage
115+ /// decodes the root encoding and writes to global memory.
116+ struct Stage {
117+ uint64_t input_ptr ; // encoded input in global memory
118+ uint32_t smem_offset ; // output offset in shared memory (elements)
119+ uint32_t len ; // elements produced
120+ struct SourceOp source ; // source decode op
121+ uint8_t num_scalar_ops ; // number of scalar ops
122+ const struct ScalarOp * scalar_ops ; // scalar deoode ops
123+ };
124+
/// Parse a single stage from the packed plan byte buffer and advance the cursor.
///
/// @param cursor Pointer into the packed plan buffer, pointing at a PackedStage.
///               On return, advanced past this stage's ScalarOps (i.e. to the
///               next PackedStage, if any).
/// @return A Stage referencing data within the packed plan buffer; it must not
///         outlive that buffer.
__device__ inline Stage parse_stage(const uint8_t*& cursor) {
    // The PackedStage is read in place, so every stage must start at an
    // address aligned for PackedStage (it holds a uint64_t). Stages are laid
    // out back to back with only the ScalarOp array between them, so
    // sizeof(ScalarOp) must preserve that alignment — catch a violation at
    // compile time instead of as a misaligned load on the GPU.
    static_assert(sizeof(struct ScalarOp) % alignof(struct PackedStage) == 0,
                  "ScalarOp size must preserve PackedStage alignment in the packed plan");

    const auto* packed_stage = reinterpret_cast<const struct PackedStage*>(cursor);
    cursor += sizeof(struct PackedStage);

    // ScalarOps are inlined directly after the PackedStage header.
    const auto* ops = reinterpret_cast<const struct ScalarOp*>(cursor);
    cursor += packed_stage->num_scalar_ops * sizeof(struct ScalarOp);

    return Stage{
        .input_ptr = packed_stage->input_ptr,
        .smem_offset = packed_stage->smem_offset,
        .len = packed_stage->len,
        .source = packed_stage->source,
        .num_scalar_ops = packed_stage->num_scalar_ops,
        .scalar_ops = ops,
    };
}
146+
147+ #endif
0 commit comments