@@ -107,6 +107,7 @@ impl From<MaterializedView> for DataFusionTable {
107107}
108108
109109#[ derive( Clone , Debug , Builder ) ]
110+ #[ builder( build_fn( error = "Error" ) ) ]
110111pub struct DataFusionTableConfig {
111112 /// With this option, an additional "__data_file_path" column is added to the output of the
112113 /// TableProvider that contains the path of the data-file the row originates from.
@@ -407,30 +408,22 @@ async fn table_scan(
407408
408409 let file_schema: SchemaRef = Arc :: new ( ( schema. fields ( ) ) . try_into ( ) . unwrap ( ) ) ;
409410
410- let projection = projection. cloned ( ) . or_else ( || {
411- Some (
412- arrow_schema
413- . fields ( )
414- . iter ( )
415- . enumerate ( )
416- . map ( |( i, _) | i)
417- . collect ( ) ,
418- )
419- } ) ;
411+ // If no projection was specified, default to projecting all the fields
412+ let projection = projection
413+ . cloned ( )
414+ . unwrap_or ( ( 0 ..arrow_schema. fields ( ) . len ( ) ) . collect_vec ( ) ) ;
420415
421- let projection_expr: Option < Vec < _ > > = projection. as_ref ( ) . map ( |projection| {
422- projection
423- . iter ( )
424- . enumerate ( )
425- . map ( |( i, id) | {
426- let name = arrow_schema. fields [ * id] . name ( ) ;
427- (
428- Arc :: new ( Column :: new ( name, i) ) as Arc < dyn PhysicalExpr > ,
429- name. to_owned ( ) ,
430- )
431- } )
432- . collect ( )
433- } ) ;
416+ let projection_expr: Vec < _ > = projection
417+ . iter ( )
418+ . enumerate ( )
419+ . map ( |( i, id) | {
420+ let name = arrow_schema. fields [ * id] . name ( ) ;
421+ (
422+ Arc :: new ( Column :: new ( name, i) ) as Arc < dyn PhysicalExpr > ,
423+ name. to_owned ( ) ,
424+ )
425+ } )
426+ . collect ( ) ;
434427
435428 if enable_data_file_path_column {
436429 table_partition_cols. push ( Field :: new ( DATA_FILE_PATH_COLUMN , DataType :: Utf8 , false ) ) ;
@@ -615,6 +608,31 @@ async fn table_scan(
615608
616609 let mut data_file_iter = data_files. into_iter ( ) . peekable ( ) ;
617610
611+ // Gather the complete equality projection up-front, since in general the requested
612+ // projection may differ from the equality delete columns. Moreover, in principle
613+ // each equality delete file may have different deletion columns.
614+ // Because all of them are reconciled with the data files using joins and unions,
615+ // their schemas must be fully compatible in all intermediate nodes.
616+ let mut equality_projection = projection. clone ( ) ;
617+ delete_files
618+ . iter ( )
619+ . flat_map ( |delete_manifest| delete_manifest. 1 . data_file ( ) . equality_ids ( ) )
620+ . flatten ( )
621+ . unique ( )
622+ . for_each ( |eq_id| {
623+ // Look up the zero-based index of the column based on its equality id
624+ if let Some ( ( id, _) ) = schema
625+ . fields ( )
626+ . iter ( )
627+ . enumerate ( )
628+ . find ( |( _, f) | f. id == * eq_id)
629+ {
630+ if !equality_projection. contains ( & id) {
631+ equality_projection. push ( id) ;
632+ }
633+ }
634+ } ) ;
635+
618636 let mut plan = stream:: iter ( delete_files. iter ( ) )
619637 . map ( Ok :: < _ , DataFusionError > )
620638 . try_fold ( None , |acc, delete_manifest| {
@@ -626,6 +644,8 @@ async fn table_scan(
626644 let file_schema: Arc < ArrowSchema > = file_schema. clone ( ) ;
627645 let file_source = file_source. clone ( ) ;
628646 let mut data_files = Vec :: new ( ) ;
647+ let equality_projection = equality_projection. clone ( ) ;
648+
629649 while let Some ( data_manifest) = data_file_iter. next_if ( |x| {
630650 x. 1 . sequence_number ( ) . unwrap ( )
631651 < delete_manifest. 1 . sequence_number ( ) . unwrap ( )
@@ -657,26 +677,6 @@ async fn table_scan(
657677 ) ;
658678 let delete_file_schema: SchemaRef =
659679 Arc :: new ( ( delete_schema. fields ( ) ) . try_into ( ) . unwrap ( ) ) ;
660- let equality_projection: Option < Vec < usize > > =
661- match ( & projection, delete_manifest. 1 . data_file ( ) . equality_ids ( ) ) {
662- ( Some ( projection) , Some ( equality_ids) ) => {
663- let collect: Vec < usize > = schema
664- . iter ( )
665- . enumerate ( )
666- . filter_map ( |( id, x) | {
667- if equality_ids. contains ( & x. id )
668- && !projection. contains ( & id)
669- {
670- Some ( id)
671- } else {
672- None
673- }
674- } )
675- . collect ( ) ;
676- Some ( [ projection. as_slice ( ) , & collect] . concat ( ) )
677- }
678- _ => None ,
679- } ;
680680
681681 let last_updated_ms = table. metadata ( ) . last_updated_ms ;
682682 let manifest_path = if enable_manifest_file_path_column {
@@ -724,7 +724,7 @@ async fn table_scan(
724724 )
725725 . with_file_group ( FileGroup :: new ( data_files) )
726726 . with_statistics ( statistics)
727- . with_projection ( equality_projection)
727+ . with_projection ( Some ( equality_projection) )
728728 . with_limit ( limit)
729729 . with_table_partition_cols ( table_partition_cols)
730730 . build ( ) ;
@@ -804,7 +804,7 @@ async fn table_scan(
804804 )
805805 . with_file_group ( FileGroup :: new ( additional_data_files) )
806806 . with_statistics ( statistics)
807- . with_projection ( projection . as_ref ( ) . cloned ( ) )
807+ . with_projection ( Some ( equality_projection ) )
808808 . with_limit ( limit)
809809 . with_table_partition_cols ( table_partition_cols)
810810 . build ( ) ;
@@ -816,14 +816,8 @@ async fn table_scan(
816816 plan = Arc :: new ( UnionExec :: new ( vec ! [ plan, data_files_scan] ) ) ;
817817 }
818818
819- if let Some ( projection_expr) = projection_expr {
820- Ok :: < _ , DataFusionError > ( Arc :: new ( ProjectionExec :: try_new (
821- projection_expr,
822- plan,
823- ) ?) as Arc < dyn ExecutionPlan > )
824- } else {
825- Ok ( plan)
826- }
819+ Ok :: < _ , DataFusionError > ( Arc :: new ( ProjectionExec :: try_new ( projection_expr, plan) ?)
820+ as Arc < dyn ExecutionPlan > )
827821 }
828822 } )
829823 . try_collect :: < Vec < _ > > ( )
@@ -859,7 +853,7 @@ async fn table_scan(
859853 FileScanConfigBuilder :: new ( object_store_url, file_schema, file_source)
860854 . with_file_groups ( file_groups)
861855 . with_statistics ( statistics)
862- . with_projection ( projection. clone ( ) )
856+ . with_projection ( Some ( projection. clone ( ) ) )
863857 . with_limit ( limit)
864858 . with_table_partition_cols ( table_partition_cols)
865859 . build ( ) ;
@@ -873,10 +867,7 @@ async fn table_scan(
873867
874868 match plans. len ( ) {
875869 0 => {
876- let projected_schema = projection
877- . map ( |p| arrow_schema. project ( & p) )
878- . transpose ( ) ?
879- . unwrap_or ( arrow_schema. as_ref ( ) . clone ( ) ) ;
870+ let projected_schema = arrow_schema. project ( & projection) ?;
880871 Ok ( Arc :: new ( EmptyExec :: new ( Arc :: new ( projected_schema) ) ) )
881872 }
882873 1 => Ok ( plans. remove ( 0 ) ) ,
0 commit comments