@@ -666,6 +666,81 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
666666 else :
667667 return []
668668
class _RewriteFiles(_SnapshotProducer["_RewriteFiles"]):
    """A snapshot producer that rewrites data files.

    Entries whose data file is in ``self._deleted_data_files`` are marked as
    deleted, and manifests that contain such entries are rewritten without
    them. ``self._deleted_data_files`` and ``self._added_data_files`` are not
    defined in this class; presumably the base producer maintains them.
    """

    def __init__(self, operation: Operation, transaction: Transaction, io: FileIO, snapshot_properties: dict[str, str]):
        """Initialize the producer by delegating to the base snapshot producer.

        Args:
            operation: The snapshot operation to record.
            transaction: The transaction this producer belongs to.
            io: The FileIO used to read and write manifests.
            snapshot_properties: Properties forwarded to the base producer.
        """
        # Forward ``snapshot_properties`` by keyword, like the factory methods
        # that construct producers do, so it cannot be bound to a different
        # positional parameter of the base initializer.
        # NOTE(review): assumes the base parameter is also named
        # ``snapshot_properties`` - confirm against _SnapshotProducer.__init__.
        super().__init__(operation, transaction, io, snapshot_properties=snapshot_properties)

    def _commit(self) -> UpdatesAndRequirements:
        """Commit the rewrite, or return empty updates when there is no work.

        Returns:
            The result of the base ``_commit`` when at least one data file was
            deleted or added, otherwise a pair of empty tuples.
        """
        # Only produce a commit when there is something to rewrite
        if self._deleted_data_files or self._added_data_files:
            return super()._commit()
        return (), ()

    def _deleted_entries(self) -> list[ManifestEntry]:
        """Build DELETED manifest entries for the data files being removed.

        Returns:
            One DELETED entry per live DATA entry of the parent snapshot whose
            data file is in ``self._deleted_data_files``; an empty list when
            there is no parent snapshot.

        Raises:
            ValueError: If the parent snapshot id cannot be resolved.
        """
        if self._parent_snapshot_id is None:
            return []

        previous_snapshot = self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
        if previous_snapshot is None:
            raise ValueError(f"Could not find the previous snapshot: {self._parent_snapshot_id}")

        executor = ExecutorFactory.get_or_create()

        def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
            """Return DELETED copies of the entries in *manifest* that are being removed."""
            return [
                ManifestEntry.from_args(
                    status=ManifestEntryStatus.DELETED,
                    snapshot_id=entry.snapshot_id,
                    sequence_number=entry.sequence_number,
                    file_sequence_number=entry.file_sequence_number,
                    data_file=entry.data_file,
                )
                for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True)
                if entry.data_file.content == DataFileContent.DATA and entry.data_file in self._deleted_data_files
            ]

        # Each manifest is processed through the shared executor; the
        # per-manifest lists are then flattened into a single list.
        list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io))
        return list(itertools.chain.from_iterable(list_of_entries))

    def _existing_manifests(self) -> list[ManifestFile]:
        """Return the manifests of the target branch to carry into the new snapshot.

        Manifests without any removed data file are reused unchanged. Manifests
        that contain removed data files are rewritten with only the remaining
        entries (marked EXISTING), or dropped when no entry remains.

        Returns:
            The manifests to keep; empty when the target branch has no snapshot.
        """
        existing_files: list[ManifestFile] = []
        if snapshot := self._transaction.table_metadata.snapshot_by_name(name=self._target_branch):
            for manifest_file in snapshot.manifests(io=self._io):
                # A list (rather than a set) keeps the entries in the order
                # they were read and does not require entries to be hashable.
                entries_to_write: list[ManifestEntry] = []
                found_deleted_entry = False

                for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
                    if entry.data_file in self._deleted_data_files:
                        found_deleted_entry = True
                    else:
                        entries_to_write.append(entry)

                if not found_deleted_entry:
                    # Nothing is removed from this manifest: reuse it as is.
                    existing_files.append(manifest_file)
                    continue

                if not entries_to_write:
                    # Every entry is removed: the manifest is not carried over.
                    continue

                with self.new_manifest_writer(self.spec(manifest_file.partition_spec_id)) as writer:
                    for entry in entries_to_write:
                        writer.add_entry(
                            ManifestEntry.from_args(
                                status=ManifestEntryStatus.EXISTING,
                                snapshot_id=entry.snapshot_id,
                                sequence_number=entry.sequence_number,
                                file_sequence_number=entry.file_sequence_number,
                                data_file=entry.data_file,
                            )
                        )
                existing_files.append(writer.to_manifest_file())
        return existing_files
743+
669744
670745class UpdateSnapshot :
671746 _transaction : Transaction
@@ -724,7 +799,13 @@ def delete(self) -> _DeleteFiles:
724799 snapshot_properties = self ._snapshot_properties ,
725800 )
726801
727-
def replace(self) -> _RewriteFiles:
    """Create a snapshot producer that performs a REPLACE operation.

    Returns:
        A ``_RewriteFiles`` producer built from this update's transaction,
        FileIO, and snapshot properties.
    """
    rewrite_producer = _RewriteFiles(
        operation=Operation.REPLACE,
        transaction=self._transaction,
        io=self._io,
        snapshot_properties=self._snapshot_properties,
    )
    return rewrite_producer
728809class _ManifestMergeManager (Generic [U ]):
729810 _target_size_bytes : int
730811 _min_count_to_merge : int
0 commit comments