@@ -24,13 +24,16 @@ use super::{
2424} ;
2525use crate :: extensions:: Extensions ;
2626use async_trait:: async_trait;
27- use datafusion:: arrow:: datatypes:: DataType ;
28- use datafusion:: catalog:: TableProvider ;
27+ use datafusion:: arrow:: datatypes:: { DataType , SchemaRef } ;
28+ use datafusion:: catalog:: { Session , TableProvider } ;
29+ use datafusion:: common:: stats:: Precision ;
2930use datafusion:: common:: {
30- DFSchema , ScalarValue , TableReference , not_impl_err, substrait_err,
31+ DFSchema , ScalarValue , Statistics , TableReference , not_impl_err, substrait_err,
3132} ;
3233use datafusion:: execution:: { FunctionRegistry , SessionState } ;
34+ use datafusion:: logical_expr:: TableType ;
3335use datafusion:: logical_expr:: { Expr , Extension , LogicalPlan } ;
36+ use datafusion:: physical_plan:: ExecutionPlan ;
3437use std:: sync:: { Arc , RwLock } ;
3538use substrait:: proto;
3639use substrait:: proto:: expression as substrait_expression;
@@ -44,6 +47,26 @@ use substrait::proto::{
4447 FilterRel , JoinRel , ProjectRel , ReadRel , Rel , SetRel , SortRel , r#type,
4548} ;
4649
/// Optional, advisory estimates read from a Substrait `RelCommon.hint.stats`
/// message.
///
/// A value of this type is handed to [`SubstraitConsumer::resolve_table_ref`]
/// so that an implementor may fold the estimates into the [`TableProvider`]
/// it returns.
///
/// The type is marked `#[non_exhaustive]`: further hint fields can be added
/// later without breaking code that already implements the trait.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct SubstraitHints {
    /// Estimated row count, taken from `hint.stats.row_count`.
    ///
    /// `None` when the hint was missing or could not be reliably interpreted
    /// (for example a proto3 default-zero or a non-finite value).
    pub row_count: Option<f64>,
    /// Estimated average size of one record in bytes, taken from
    /// `hint.stats.record_size`.
    ///
    /// `None` when the hint was missing, non-positive, or non-finite.
    pub record_size: Option<f64>,
}
69+
4770#[ async_trait]
4871/// This trait is used to consume Substrait plans, converting them into DataFusion Logical Plans.
4972/// It can be implemented by users to allow for custom handling of relations, expressions, etc.
@@ -67,7 +90,7 @@ use substrait::proto::{
6790/// # use datafusion::logical_expr::expr::ScalarFunction;
6891/// # use datafusion_substrait::extensions::Extensions;
6992/// # use datafusion_substrait::logical_plan::consumer::{
70- /// # from_project_rel, from_substrait_rel, from_substrait_rex, SubstraitConsumer
93+ /// # from_project_rel, from_substrait_rel, from_substrait_rex, SubstraitConsumer, SubstraitHints
7194/// # };
7295///
7396/// struct CustomSubstraitConsumer {
@@ -80,6 +103,7 @@ use substrait::proto::{
80103/// async fn resolve_table_ref(
81104/// &self,
82105/// table_ref: &TableReference,
106+ /// _hints: SubstraitHints,
83107/// ) -> Result<Option<Arc<dyn TableProvider>>> {
84108/// let table = table_ref.table().to_string();
85109/// let schema = self.state.schema_for_ref(table_ref.clone())?;
@@ -162,6 +186,7 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
162186 async fn resolve_table_ref (
163187 & self ,
164188 table_ref : & TableReference ,
189+ hints : SubstraitHints ,
165190 ) -> datafusion:: common:: Result < Option < Arc < dyn TableProvider > > > ;
166191
167192 // TODO: Remove these two methods
@@ -471,6 +496,94 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
471496 }
472497}
473498
499+ /// Wraps an inner [`TableProvider`] and overrides its `statistics()` return value.
500+ ///
501+ /// Used by [`DefaultSubstraitConsumer`] to inject a row-count hint carried in a
502+ /// Substrait `RelCommon.hint.stats` when the resolved provider has no statistics.
503+ ///
504+ /// # Note on `as_any()` behaviour
505+ ///
506+ /// `as_any()` intentionally delegates to the inner provider so that callers can
507+ /// still downcast to the concrete inner type (e.g. `MemTable`) through this
508+ /// wrapper. As a consequence, downcasting to `StatisticsOverrideTableProvider`
509+ /// itself via `as_any()` will not work — but since this struct is private,
510+ /// external code should never need to do that.
511+ #[ derive( Debug ) ]
512+ struct StatisticsOverrideTableProvider {
513+ inner : Arc < dyn TableProvider > ,
514+ statistics : Statistics ,
515+ }
516+
517+ #[ async_trait]
518+ impl TableProvider for StatisticsOverrideTableProvider {
519+ fn as_any ( & self ) -> & dyn std:: any:: Any {
520+ // Delegate to the inner provider so that downcasting to the concrete
521+ // inner type works transparently through this wrapper.
522+ self . inner . as_any ( )
523+ }
524+
525+ fn schema ( & self ) -> SchemaRef {
526+ self . inner . schema ( )
527+ }
528+
529+ fn constraints ( & self ) -> Option < & datafusion:: common:: Constraints > {
530+ self . inner . constraints ( )
531+ }
532+
533+ fn table_type ( & self ) -> TableType {
534+ self . inner . table_type ( )
535+ }
536+
537+ fn supports_filters_pushdown (
538+ & self ,
539+ filters : & [ & Expr ] ,
540+ ) -> datafusion:: common:: Result <
541+ Vec < datafusion:: logical_expr:: TableProviderFilterPushDown > ,
542+ > {
543+ self . inner . supports_filters_pushdown ( filters)
544+ }
545+
546+ fn statistics ( & self ) -> Option < Statistics > {
547+ Some ( self . statistics . clone ( ) )
548+ }
549+
550+ async fn scan (
551+ & self ,
552+ state : & dyn Session ,
553+ projection : Option < & Vec < usize > > ,
554+ filters : & [ Expr ] ,
555+ limit : Option < usize > ,
556+ ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
557+ self . inner . scan ( state, projection, filters, limit) . await
558+ }
559+
560+ async fn insert_into (
561+ & self ,
562+ state : & dyn Session ,
563+ input : Arc < dyn ExecutionPlan > ,
564+ insert_op : datafusion:: logical_expr:: dml:: InsertOp ,
565+ ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
566+ self . inner . insert_into ( state, input, insert_op) . await
567+ }
568+
569+ async fn delete_from (
570+ & self ,
571+ state : & dyn Session ,
572+ filters : Vec < Expr > ,
573+ ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
574+ self . inner . delete_from ( state, filters) . await
575+ }
576+
577+ async fn update (
578+ & self ,
579+ state : & dyn Session ,
580+ assignments : Vec < ( String , Expr ) > ,
581+ filters : Vec < Expr > ,
582+ ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
583+ self . inner . update ( state, assignments, filters) . await
584+ }
585+ }
586+
474587/// Default SubstraitConsumer for converting standard Substrait without user-defined extensions.
475588///
476589/// Used as the consumer in [crate::logical_plan::consumer::from_substrait_plan]
@@ -495,11 +608,72 @@ impl SubstraitConsumer for DefaultSubstraitConsumer<'_> {
495608 async fn resolve_table_ref (
496609 & self ,
497610 table_ref : & TableReference ,
611+ hints : SubstraitHints ,
498612 ) -> datafusion:: common:: Result < Option < Arc < dyn TableProvider > > > {
499613 let table = table_ref. table ( ) . to_string ( ) ;
500614 let schema = self . state . schema_for_ref ( table_ref. clone ( ) ) ?;
501- let table_provider = schema. table ( & table) . await ?;
502- Ok ( table_provider)
615+ let provider = schema. table ( & table) . await ?;
616+ // Wrap the provider to inject hint statistics only for fields the
617+ // provider doesn't already have (checked individually, not as a whole).
618+ let has_hints = hints. row_count . is_some ( ) || hints. record_size . is_some ( ) ;
619+ let provider = match provider {
620+ Some ( provider) if has_hints => {
621+ let existing = provider. statistics ( ) ;
622+ let row_count_absent = existing
623+ . as_ref ( )
624+ . is_none_or ( |s| matches ! ( s. num_rows, Precision :: Absent ) ) ;
625+ let byte_size_absent = existing
626+ . as_ref ( )
627+ . is_none_or ( |s| matches ! ( s. total_byte_size, Precision :: Absent ) ) ;
628+ let inject_row_count = hints. row_count . is_some ( ) && row_count_absent;
629+ // Both hints required: total_byte_size = row_count * record_size.
630+ let inject_byte_size = hints. row_count . is_some ( )
631+ && hints. record_size . is_some ( )
632+ && byte_size_absent;
633+ if inject_row_count || inject_byte_size {
634+ let num_rows = if inject_row_count {
635+ Precision :: Inexact ( hints. row_count . unwrap ( ) . round ( ) as usize )
636+ } else {
637+ existing. as_ref ( ) . map_or ( Precision :: Absent , |s| s. num_rows )
638+ } ;
639+ let total_byte_size = if inject_byte_size {
640+ // Prefer the provider's own row count for consistency.
641+ let effective_rows = match & num_rows {
642+ Precision :: Exact ( n) | Precision :: Inexact ( n) => * n as f64 ,
643+ Precision :: Absent => hints. row_count . unwrap ( ) ,
644+ } ;
645+ let byte_size = effective_rows * hints. record_size . unwrap ( ) ;
646+ // The product of two sub-usize::MAX values can still overflow.
647+ if byte_size. is_finite ( ) && byte_size < usize:: MAX as f64 {
648+ Precision :: Inexact ( byte_size. round ( ) as usize )
649+ } else {
650+ Precision :: Absent
651+ }
652+ } else {
653+ existing
654+ . as_ref ( )
655+ . map_or ( Precision :: Absent , |s| s. total_byte_size )
656+ } ;
657+ let column_statistics =
658+ existing. map ( |s| s. column_statistics ) . unwrap_or_else ( || {
659+ Statistics :: unknown_column ( & provider. schema ( ) )
660+ } ) ;
661+ let statistics = Statistics {
662+ num_rows,
663+ total_byte_size,
664+ column_statistics,
665+ } ;
666+ Some ( Arc :: new ( StatisticsOverrideTableProvider {
667+ inner : provider,
668+ statistics,
669+ } ) as Arc < dyn TableProvider > )
670+ } else {
671+ Some ( provider)
672+ }
673+ }
674+ provider => provider,
675+ } ;
676+ Ok ( provider)
503677 }
504678
505679 fn get_extensions ( & self ) -> & Extensions {
0 commit comments