@@ -24,13 +24,16 @@ use super::{
2424} ;
2525use crate :: extensions:: Extensions ;
2626use async_trait:: async_trait;
27- use datafusion:: arrow:: datatypes:: DataType ;
28- use datafusion:: catalog:: TableProvider ;
27+ use datafusion:: arrow:: datatypes:: { DataType , SchemaRef } ;
28+ use datafusion:: catalog:: { Session , TableProvider } ;
29+ use datafusion:: common:: stats:: Precision ;
2930use datafusion:: common:: {
30- DFSchema , ScalarValue , TableReference , not_impl_err, substrait_err,
31+ DFSchema , ScalarValue , Statistics , TableReference , not_impl_err, substrait_err,
3132} ;
3233use datafusion:: execution:: { FunctionRegistry , SessionState } ;
34+ use datafusion:: logical_expr:: TableType ;
3335use datafusion:: logical_expr:: { Expr , Extension , LogicalPlan } ;
36+ use datafusion:: physical_plan:: ExecutionPlan ;
3437use std:: sync:: { Arc , RwLock } ;
3538use substrait:: proto;
3639use substrait:: proto:: expression as substrait_expression;
@@ -44,6 +47,26 @@ use substrait::proto::{
4447 FilterRel , JoinRel , ProjectRel , ReadRel , Rel , SetRel , SortRel , r#type,
4548} ;
4649
/// Advisory hints extracted from a Substrait `RelCommon.hint.stats` message.
///
/// The hints are passed to [`SubstraitConsumer::resolve_table_ref`] so that
/// implementors can incorporate them into the returned [`TableProvider`].
/// They are estimates only: an implementation is free to ignore them.
///
/// The struct is `#[non_exhaustive]` so that new fields can be added in future
/// versions without breaking existing implementations. Code outside this crate
/// should therefore start from [`Default::default`] and assign the fields it
/// needs rather than using a struct literal.
#[non_exhaustive]
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SubstraitHints {
    /// Estimated number of rows, from `hint.stats.row_count`.
    ///
    /// `None` means the hint was absent or could not be reliably interpreted
    /// (e.g. proto3 default-zero or a non-finite value).
    pub row_count: Option<f64>,
    /// Estimated average byte size per record, from `hint.stats.record_size`.
    ///
    /// `None` means the hint was absent or non-positive / non-finite.
    pub record_size: Option<f64>,
}
69+
4770#[ async_trait]
4871/// This trait is used to consume Substrait plans, converting them into DataFusion Logical Plans.
4972/// It can be implemented by users to allow for custom handling of relations, expressions, etc.
@@ -67,7 +90,7 @@ use substrait::proto::{
6790/// # use datafusion::logical_expr::expr::ScalarFunction;
6891/// # use datafusion_substrait::extensions::Extensions;
6992/// # use datafusion_substrait::logical_plan::consumer::{
70- /// # from_project_rel, from_substrait_rel, from_substrait_rex, SubstraitConsumer
93+ /// # from_project_rel, from_substrait_rel, from_substrait_rex, SubstraitConsumer, SubstraitHints
7194/// # };
7295///
7396/// struct CustomSubstraitConsumer {
@@ -80,6 +103,7 @@ use substrait::proto::{
80103/// async fn resolve_table_ref(
81104/// &self,
82105/// table_ref: &TableReference,
106+ /// _hints: SubstraitHints,
83107/// ) -> Result<Option<Arc<dyn TableProvider>>> {
84108/// let table = table_ref.table().to_string();
85109/// let schema = self.state.schema_for_ref(table_ref.clone())?;
@@ -162,6 +186,7 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
162186 async fn resolve_table_ref (
163187 & self ,
164188 table_ref : & TableReference ,
189+ hints : SubstraitHints ,
165190 ) -> datafusion:: common:: Result < Option < Arc < dyn TableProvider > > > ;
166191
167192 // TODO: Remove these two methods
@@ -471,6 +496,57 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
471496 }
472497}
473498
499+ /// Wraps an inner [`TableProvider`] and overrides its `statistics()` return value.
500+ ///
501+ /// Used by [`DefaultSubstraitConsumer`] to inject a row-count hint carried in a
502+ /// Substrait `RelCommon.hint.stats` when the resolved provider has no statistics.
503+ #[ derive( Debug ) ]
504+ struct StatisticsOverrideTableProvider {
505+ inner : Arc < dyn TableProvider > ,
506+ statistics : Statistics ,
507+ }
508+
509+ #[ async_trait]
510+ impl TableProvider for StatisticsOverrideTableProvider {
511+ fn as_any ( & self ) -> & dyn std:: any:: Any {
512+ self
513+ }
514+
515+ fn schema ( & self ) -> SchemaRef {
516+ self . inner . schema ( )
517+ }
518+
519+ fn constraints ( & self ) -> Option < & datafusion:: common:: Constraints > {
520+ self . inner . constraints ( )
521+ }
522+
523+ fn table_type ( & self ) -> TableType {
524+ self . inner . table_type ( )
525+ }
526+
527+ fn supports_filters_pushdown (
528+ & self ,
529+ filters : & [ & Expr ] ,
530+ ) -> datafusion:: common:: Result < Vec < datafusion:: logical_expr:: TableProviderFilterPushDown > >
531+ {
532+ self . inner . supports_filters_pushdown ( filters)
533+ }
534+
535+ fn statistics ( & self ) -> Option < Statistics > {
536+ Some ( self . statistics . clone ( ) )
537+ }
538+
539+ async fn scan (
540+ & self ,
541+ state : & dyn Session ,
542+ projection : Option < & Vec < usize > > ,
543+ filters : & [ Expr ] ,
544+ limit : Option < usize > ,
545+ ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
546+ self . inner . scan ( state, projection, filters, limit) . await
547+ }
548+ }
549+
474550/// Default SubstraitConsumer for converting standard Substrait without user-defined extensions.
475551///
476552/// Used as the consumer in [crate::logical_plan::consumer::from_substrait_plan]
@@ -495,11 +571,79 @@ impl SubstraitConsumer for DefaultSubstraitConsumer<'_> {
495571 async fn resolve_table_ref (
496572 & self ,
497573 table_ref : & TableReference ,
574+ hints : SubstraitHints ,
498575 ) -> datafusion:: common:: Result < Option < Arc < dyn TableProvider > > > {
499576 let table = table_ref. table ( ) . to_string ( ) ;
500577 let schema = self . state . schema_for_ref ( table_ref. clone ( ) ) ?;
501- let table_provider = schema. table ( & table) . await ?;
502- Ok ( table_provider)
578+ let provider = schema. table ( & table) . await ?;
579+ // If the Substrait plan provides hints and the provider is missing the
580+ // corresponding statistics fields, wrap it to expose those hints to
581+ // DataFusion (e.g. for downstream optimizer rules).
582+ // We check each field individually so that a provider returning
583+ // Some(Statistics { num_rows: Absent, ... }) also gets hints injected,
584+ // and any already-known values from the provider are preserved.
585+ let has_hints = hints. row_count . is_some ( ) || hints. record_size . is_some ( ) ;
586+ let provider = match provider {
587+ Some ( provider) if has_hints => {
588+ let existing = provider. statistics ( ) ;
589+ let row_count_absent = existing
590+ . as_ref ( )
591+ . map_or ( true , |s| matches ! ( s. num_rows, Precision :: Absent ) ) ;
592+ let byte_size_absent = existing
593+ . as_ref ( )
594+ . map_or ( true , |s| matches ! ( s. total_byte_size, Precision :: Absent ) ) ;
595+ let inject_row_count = hints. row_count . is_some ( ) && row_count_absent;
596+ // total_byte_size = row_count * record_size, so both hints must be
597+ // present to reconstruct it. A record_size-only hint (row_count absent)
598+ // is therefore silently ignored here, since we cannot compute a
599+ // meaningful total_byte_size from record_size alone.
600+ let inject_byte_size = hints. row_count . is_some ( )
601+ && hints. record_size . is_some ( )
602+ && byte_size_absent;
603+ if inject_row_count || inject_byte_size {
604+ let num_rows = if inject_row_count {
605+ Precision :: Inexact ( hints. row_count . unwrap ( ) . round ( ) as usize )
606+ } else {
607+ existing
608+ . as_ref ( )
609+ . map_or ( Precision :: Absent , |s| s. num_rows . clone ( ) )
610+ } ;
611+ let total_byte_size = if inject_byte_size {
612+ // Use the effective row count that was resolved above:
613+ // the provider's own value when present (keeping
614+ // num_rows and total_byte_size internally consistent),
615+ // or the hint's row_count when the provider had none.
616+ let effective_rows = match & num_rows {
617+ Precision :: Exact ( n) | Precision :: Inexact ( n) => * n as f64 ,
618+ Precision :: Absent => hints. row_count . unwrap ( ) ,
619+ } ;
620+ Precision :: Inexact (
621+ ( effective_rows * hints. record_size . unwrap ( ) ) . round ( ) as usize ,
622+ )
623+ } else {
624+ existing
625+ . as_ref ( )
626+ . map_or ( Precision :: Absent , |s| s. total_byte_size . clone ( ) )
627+ } ;
628+ let column_statistics = existing
629+ . map ( |s| s. column_statistics )
630+ . unwrap_or_else ( || Statistics :: unknown_column ( & provider. schema ( ) ) ) ;
631+ let statistics = Statistics {
632+ num_rows,
633+ total_byte_size,
634+ column_statistics,
635+ } ;
636+ Some ( Arc :: new ( StatisticsOverrideTableProvider {
637+ inner : provider,
638+ statistics,
639+ } ) as Arc < dyn TableProvider > )
640+ } else {
641+ Some ( provider)
642+ }
643+ }
644+ provider => provider,
645+ } ;
646+ Ok ( provider)
503647 }
504648
505649 fn get_extensions ( & self ) -> & Extensions {
0 commit comments