Commit 0d1e978

joostjager and claude committed
Replace dual-sync-async persistence panic with Watch contract
Commit 0760f99 ("Disallow dual-sync-async persistence without restarting") added a panic in non-test builds when a Persist implementation returns both Completed and InProgress from the same ChannelManager instance. However, this check runs against the status that ChainMonitor returns to ChannelManager, not the raw Persist result. When ChannelMonitor::update_monitor fails (e.g. a counterparty commitment_signed arrives after a funding spend confirms), ChainMonitor persists the full monitor successfully but overrides the return value to InProgress. If the user's Persist impl only ever returns Completed, this override triggers a false mode-mismatch panic.

This replaces the panic with a per-channel contract at the Watch trait level: a Watch implementation must not return Completed for a channel update while prior InProgress updates are still pending. Switching from Completed to InProgress is always allowed, but switching back is impractical because the Watch implementation cannot observe when ChannelManager has finished processing a MonitorEvent::Completed.

The documentation on ChannelMonitorUpdateStatus is updated to describe these rules. The mode tracking and panic checks from 0760f99 are removed and replaced with a panic that validates the new contract directly on the in-flight update state. Legacy tests that switch the persister between modes mid-flight can opt out via Node::disable_monitor_completeness_assertion().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9df659a commit 0d1e978
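The per-channel contract described in the commit message can be sketched as a small standalone model. The types below (`UpdateStatus`, `ChannelUpdateTracker`) are simplified stand-ins for illustration, not the actual LDK API: a `Completed` return is only valid when no prior updates for the channel are still pending as `InProgress`, while switching to `InProgress` is always allowed.

```rust
/// Stand-in for ChannelMonitorUpdateStatus (illustrative, not LDK's type).
#[derive(Clone, Copy, PartialEq, Debug)]
enum UpdateStatus {
    Completed,
    InProgress,
}

/// Tracks, per channel, how many updates are still pending asynchronously.
struct ChannelUpdateTracker {
    in_flight: u32,
}

impl ChannelUpdateTracker {
    fn new() -> Self {
        Self { in_flight: 0 }
    }

    /// Records the status a Watch implementation returned for a new update.
    /// Returns Err if the per-channel contract is violated.
    fn record(&mut self, status: UpdateStatus) -> Result<(), &'static str> {
        match status {
            UpdateStatus::InProgress => {
                // Switching to async persistence is always allowed.
                self.in_flight += 1;
                Ok(())
            },
            UpdateStatus::Completed if self.in_flight > 0 => {
                // Completed while prior updates are still pending violates
                // the contract; in LDK this now triggers a panic.
                Err("Completed returned while prior updates are InProgress")
            },
            UpdateStatus::Completed => Ok(()),
        }
    }

    /// Called when a pending async update finishes persisting.
    fn update_persisted(&mut self) {
        self.in_flight -= 1;
    }
}

fn main() {
    let mut tracker = ChannelUpdateTracker::new();

    // Sync persistence, then switching to async: both fine.
    assert!(tracker.record(UpdateStatus::Completed).is_ok());
    assert!(tracker.record(UpdateStatus::InProgress).is_ok());

    // Returning Completed while an async update is pending is forbidden.
    assert!(tracker.record(UpdateStatus::Completed).is_err());

    // Once the pending update completes, Completed is valid again.
    tracker.update_persisted();
    assert!(tracker.record(UpdateStatus::Completed).is_ok());
}
```

Note that in real usage the "update persisted" transition corresponds to the Watch implementation calling back into ChainMonitor (and ultimately surfacing a MonitorEvent::Completed), which is exactly the signal the implementation cannot wait on synchronously, hence the one-way nature of the Completed-to-InProgress switch.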

File tree

7 files changed: +43 −29 lines


lightning/src/chain/channelmonitor.rs (1 addition, 0 deletions)

```diff
@@ -6817,6 +6817,7 @@ mod tests {
 	let legacy_cfg = test_legacy_channel_config();
 	let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[Some(legacy_cfg.clone()), Some(legacy_cfg.clone()), Some(legacy_cfg)]);
 	let nodes = create_network(3, &node_cfgs, &node_chanmgrs);
+	nodes[1].disable_monitor_completeness_assertion();
 	let channel = create_announced_chan_between_nodes(&nodes, 0, 1);
 	create_announced_chan_between_nodes(&nodes, 1, 2);
```

lightning/src/chain/mod.rs (4 additions, 8 deletions)

```diff
@@ -233,11 +233,10 @@ pub enum ChannelMonitorUpdateStatus {
 	/// This includes performing any `fsync()` calls required to ensure the update is guaranteed to
 	/// be available on restart even if the application crashes.
 	///
-	/// If you return this variant, you cannot later return [`InProgress`] from the same instance of
-	/// [`Persist`]/[`Watch`] without first restarting.
+	/// You cannot switch from [`InProgress`] to this variant for the same channel without first
+	/// restarting. However, switching from this variant to [`InProgress`] is always allowed.
 	///
 	/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
-	/// [`Persist`]: chainmonitor::Persist
 	Completed,
 	/// Indicates that the update will happen asynchronously in the background or that a transient
 	/// failure occurred which is being retried in the background and will eventually complete.
@@ -263,12 +262,7 @@ pub enum ChannelMonitorUpdateStatus {
 	/// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the
 	/// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*.
 	///
-	/// If you return this variant, you cannot later return [`Completed`] from the same instance of
-	/// [`Persist`]/[`Watch`] without first restarting.
-	///
 	/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
-	/// [`Completed`]: ChannelMonitorUpdateStatus::Completed
-	/// [`Persist`]: chainmonitor::Persist
 	InProgress,
 	/// Indicates that an update has failed and will not complete at any point in the future.
 	///
@@ -328,6 +322,8 @@ pub trait Watch<ChannelSigner: EcdsaChannelSigner> {
 	/// cannot be retried, the node should shut down immediately after returning
 	/// [`ChannelMonitorUpdateStatus::UnrecoverableError`], see its documentation for more info.
 	///
+	/// See [`ChannelMonitorUpdateStatus`] for requirements on when each variant may be returned.
+	///
 	/// [`ChannelManager`]: crate::ln::channelmanager::ChannelManager
 	fn update_channel(
 		&self, channel_id: ChannelId, update: &ChannelMonitorUpdate,
```

lightning/src/ln/chanmon_update_fail_tests.rs (6 additions, 0 deletions)

```diff
@@ -175,6 +175,7 @@ fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) {
 	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
 	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
 	let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+	nodes[0].disable_monitor_completeness_assertion();

 	let node_a_id = nodes[0].node.get_our_node_id();
 	let node_b_id = nodes[1].node.get_our_node_id();
@@ -316,6 +317,7 @@ fn do_test_monitor_temporary_update_fail(disconnect_count: usize) {
 	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
 	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
 	let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+	nodes[0].disable_monitor_completeness_assertion();

 	let node_a_id = nodes[0].node.get_our_node_id();
 	let node_b_id = nodes[1].node.get_our_node_id();
@@ -969,6 +971,7 @@ fn do_test_monitor_update_fail_raa(test_ignore_second_cs: bool) {
 	let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
 	let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[None, None, None]);
 	let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
+	nodes[1].disable_monitor_completeness_assertion();

 	let node_a_id = nodes[0].node.get_our_node_id();
 	let node_b_id = nodes[1].node.get_our_node_id();
@@ -1500,6 +1503,7 @@ fn claim_while_disconnected_monitor_update_fail() {
 	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
 	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
 	let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+	nodes[1].disable_monitor_completeness_assertion();

 	let node_a_id = nodes[0].node.get_our_node_id();
 	let node_b_id = nodes[1].node.get_our_node_id();
@@ -1727,6 +1731,7 @@ fn first_message_on_recv_ordering() {
 	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
 	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
 	let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+	nodes[1].disable_monitor_completeness_assertion();

 	let node_a_id = nodes[0].node.get_our_node_id();
 	let node_b_id = nodes[1].node.get_our_node_id();
@@ -3849,6 +3854,7 @@ fn do_test_durable_preimages_on_closed_channel(
 	// Now reload node B
 	let manager_b = nodes[1].node.encode();
 	reload_node!(nodes[1], &manager_b, &[&mon_ab, &mon_bc], persister, chain_mon, node_b_reload);
+	nodes[1].disable_monitor_completeness_assertion();

 	nodes[0].node.peer_disconnected(node_b_id);
 	nodes[2].node.peer_disconnected(node_b_id);
```

lightning/src/ln/channelmanager.rs (20 additions, 21 deletions)

```diff
@@ -2792,12 +2792,12 @@ pub struct ChannelManager<
 	#[cfg(any(test, feature = "_test_utils"))]
 	pub(super) per_peer_state: FairRwLock<HashMap<PublicKey, Mutex<PeerState<SP>>>>,

-	/// We only support using one of [`ChannelMonitorUpdateStatus::InProgress`] and
-	/// [`ChannelMonitorUpdateStatus::Completed`] without restarting. Because the API does not
-	/// otherwise directly enforce this, we enforce it in non-test builds here by storing which one
-	/// is in use.
-	#[cfg(not(any(test, feature = "_externalize_tests")))]
-	monitor_update_type: AtomicUsize,
+	/// When set, disables the panic when `Watch::update_channel` returns `Completed` while
+	/// prior updates are still `InProgress`. Some legacy tests switch the persister between
+	/// `InProgress` and `Completed` mid-flight, which violates this contract but is otherwise
+	/// harmless in a test context.
+	#[cfg(test)]
+	pub(crate) skip_monitor_update_assertion: AtomicBool,

 	/// The set of events which we need to give to the user to handle. In some cases an event may
 	/// require some further action after the user handles it (currently only blocking a monitor
@@ -3540,8 +3540,8 @@ impl<

 			per_peer_state: FairRwLock::new(new_hash_map()),

-			#[cfg(not(any(test, feature = "_externalize_tests")))]
-			monitor_update_type: AtomicUsize::new(0),
+			#[cfg(test)]
+			skip_monitor_update_assertion: AtomicBool::new(false),

 			pending_events: Mutex::new(VecDeque::new()),
 			pending_events_processor: AtomicBool::new(false),
@@ -9965,6 +9965,15 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
 			if update_completed {
 				let _ = in_flight_updates.remove(update_idx);
 			}
+			// A Watch implementation must not return Completed while prior updates are
+			// still InProgress, as this would violate the async persistence contract.
+			#[cfg(test)]
+			let skip_check = self.skip_monitor_update_assertion.load(Ordering::Relaxed);
+			#[cfg(not(test))]
+			let skip_check = false;
+			if !skip_check && update_completed && !in_flight_updates.is_empty() {
+				panic!("Watch::update_channel returned Completed while prior updates are still InProgress");
+			}
 			(update_completed, update_completed && in_flight_updates.is_empty())
 		} else {
 			// We blindly assume that the ChannelMonitorUpdate will be regenerated on startup if we
@@ -10030,23 +10039,13 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
 				panic!("{}", err_str);
 			},
 			ChannelMonitorUpdateStatus::InProgress => {
-				#[cfg(not(any(test, feature = "_externalize_tests")))]
-				if self.monitor_update_type.swap(1, Ordering::Relaxed) == 2 {
-					panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
-				}
 				log_debug!(
 					logger,
 					"ChannelMonitor update in flight, holding messages until the update completes.",
 				);
 				false
 			},
-			ChannelMonitorUpdateStatus::Completed => {
-				#[cfg(not(any(test, feature = "_externalize_tests")))]
-				if self.monitor_update_type.swap(2, Ordering::Relaxed) == 1 {
-					panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
-				}
-				true
-			},
+			ChannelMonitorUpdateStatus::Completed => true,
 		}
 	}
@@ -19553,8 +19552,8 @@ impl<

 			per_peer_state: FairRwLock::new(per_peer_state),

-			#[cfg(not(any(test, feature = "_externalize_tests")))]
-			monitor_update_type: AtomicUsize::new(0),
+			#[cfg(test)]
+			skip_monitor_update_assertion: AtomicBool::new(false),

 			pending_events: Mutex::new(pending_events_read),
 			pending_events_processor: AtomicBool::new(false),
```
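The check added in the channelmanager.rs hunk above can be modeled in isolation. This is a hypothetical standalone sketch, not the actual LDK function: in-flight update IDs stand in for the real `in_flight_updates` vector, the completed update is removed from the list, and returning `Completed` while earlier updates remain pending panics unless the test-only skip flag (mirroring `skip_monitor_update_assertion`) is set.

```rust
/// Models the (update_completed, all_updates_done) result of applying a
/// monitor update status, with the new contract panic. Simplified stand-in
/// for the real ChannelManager logic; `u64` update IDs replace real updates.
fn handle_update_status(
    in_flight_updates: &mut Vec<u64>, // update IDs still pending for this channel
    update_id: u64,
    update_completed: bool, // true iff Watch::update_channel returned Completed
    skip_check: bool,       // test-only escape hatch for legacy mode-switching tests
) -> (bool, bool) {
    if update_completed {
        // A synchronously-completed update is no longer in flight.
        in_flight_updates.retain(|&id| id != update_id);
    } else {
        // InProgress: track the update until its completion is signaled later.
        in_flight_updates.push(update_id);
    }
    // The new contract: Completed must not be returned while prior updates
    // for the same channel are still pending.
    if !skip_check && update_completed && !in_flight_updates.is_empty() {
        panic!("Completed returned while prior updates are still InProgress");
    }
    (update_completed, update_completed && in_flight_updates.is_empty())
}

fn main() {
    let mut in_flight = Vec::new();

    // Update 1 persists asynchronously and stays in flight.
    assert_eq!(handle_update_status(&mut in_flight, 1, false, false), (false, false));

    // Update 2 returning Completed while update 1 is pending would panic;
    // with the test-only flag set the mismatch is tolerated instead.
    assert_eq!(handle_update_status(&mut in_flight, 2, true, true), (true, false));
}
```

This also shows why the second element of the tuple matters: messages held during async persistence are only released once every in-flight update for the channel has completed, not merely the latest one.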

lightning/src/ln/functional_test_utils.rs (8 additions, 0 deletions)

```diff
@@ -598,6 +598,14 @@ impl<'a, 'b, 'c> Node<'a, 'b, 'c> {
 			self.node.init_features() | self.onion_messenger.provided_init_features(peer_node_id)
 		})
 	}
+
+	/// Disables the panic when `Watch::update_channel` returns `Completed` while prior updates
+	/// are still `InProgress`. Some legacy tests switch the persister between modes mid-flight,
+	/// which violates this contract but is otherwise harmless.
+	#[cfg(test)]
+	pub fn disable_monitor_completeness_assertion(&self) {
+		self.node.skip_monitor_update_assertion.store(true, core::sync::atomic::Ordering::Relaxed);
+	}
 }

 impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {}
```

lightning/src/ln/monitor_tests.rs (1 addition, 0 deletions)

```diff
@@ -3384,6 +3384,7 @@ fn test_claim_event_never_handled() {
 	let chan_0_monitor_serialized = get_monitor!(nodes[1], chan.2).encode();
 	let mons = &[&chan_0_monitor_serialized[..]];
 	reload_node!(nodes[1], &init_node_ser, mons, persister, new_chain_mon, nodes_1_reload);
+	nodes[1].disable_monitor_completeness_assertion();

 	expect_payment_claimed!(nodes[1], payment_hash_a, 1_000_000);
 	// The reload logic spuriously generates a redundant payment preimage-containing
```

lightning/src/ln/reload_tests.rs (3 additions, 0 deletions)

```diff
@@ -823,12 +823,14 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool, double_rest

 	// Now restart nodes[3].
 	reload_node!(nodes[3], original_manager.clone(), &[&updated_monitor.0, &original_monitor.0], persist_d_1, chain_d_1, node_d_1);
+	nodes[3].disable_monitor_completeness_assertion();

 	if double_restart {
 		// Previously, we had a bug where we'd fail to reload if we re-persist the `ChannelManager`
 		// without updating any `ChannelMonitor`s as we'd fail to double-initiate the claim replay.
 		// We test that here ensuring that we can reload again.
 		reload_node!(nodes[3], node_d_1.encode(), &[&updated_monitor.0, &original_monitor.0], persist_d_2, chain_d_2, node_d_2);
+		nodes[3].disable_monitor_completeness_assertion();
 	}

 	// Until the startup background events are processed (in `get_and_clear_pending_events`,
@@ -2216,6 +2218,7 @@ fn test_reload_with_mpp_claims_on_same_channel() {
 		nodes_1_deserialized,
 		Some(true)
 	);
+	nodes[1].disable_monitor_completeness_assertion();

 	// When the claims are reconstructed during reload, PaymentForwarded events are regenerated.
 	let events = nodes[1].node.get_and_clear_pending_events();
```
