Skip to content

Commit 93a9d79

Browse files
joostjager and claude committed
Replace dual-sync-async persistence panic with Watch contract
Instead of panicking when a Persist implementation returns both Completed and InProgress from the same ChannelManager instance, define a clearer contract at the Watch trait level: a Watch implementation must not return Completed for a channel update if there are still pending InProgress updates for that channel. This matches the pure-async interface where you can't complete an update until all previous ones have completed. This reverts the monitor_update_type tracking and panic checks from 0760f99, replacing them with a debug_assert that validates the Watch contract. Persist implementors are expected to be consistent (always sync or always async), so ChainMonitor naturally satisfies this contract without code changes. Legacy tests that switch the persister between modes mid-flight can opt out of the assertion via Node::disable_monitor_completeness_assertion(). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eb3980d commit 93a9d79

7 files changed

Lines changed: 42 additions & 29 deletions

File tree

lightning/src/chain/channelmonitor.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6817,6 +6817,7 @@ mod tests {
68176817
let legacy_cfg = test_legacy_channel_config();
68186818
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[Some(legacy_cfg.clone()), Some(legacy_cfg.clone()), Some(legacy_cfg)]);
68196819
let nodes = create_network(3, &node_cfgs, &node_chanmgrs);
6820+
nodes[1].disable_monitor_completeness_assertion();
68206821
let channel = create_announced_chan_between_nodes(&nodes, 0, 1);
68216822
create_announced_chan_between_nodes(&nodes, 1, 2);
68226823

lightning/src/chain/mod.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -233,11 +233,11 @@ pub enum ChannelMonitorUpdateStatus {
233233
/// This includes performing any `fsync()` calls required to ensure the update is guaranteed to
234234
/// be available on restart even if the application crashes.
235235
///
236-
/// If you return this variant, you cannot later return [`InProgress`] from the same instance of
237-
/// [`Persist`]/[`Watch`] without first restarting.
236+
/// A [`Watch`] implementation must not return this for a channel update if there are still
237+
/// pending [`InProgress`] updates for that channel. That is, an update can only be considered
238+
/// complete once all prior updates have also completed.
238239
///
239240
/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
240-
/// [`Persist`]: chainmonitor::Persist
241241
Completed,
242242
/// Indicates that the update will happen asynchronously in the background or that a transient
243243
/// failure occurred which is being retried in the background and will eventually complete.
@@ -263,12 +263,7 @@ pub enum ChannelMonitorUpdateStatus {
263263
/// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the
264264
/// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*.
265265
///
266-
/// If you return this variant, you cannot later return [`Completed`] from the same instance of
267-
/// [`Persist`]/[`Watch`] without first restarting.
268-
///
269266
/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
270-
/// [`Completed`]: ChannelMonitorUpdateStatus::Completed
271-
/// [`Persist`]: chainmonitor::Persist
272267
InProgress,
273268
/// Indicates that an update has failed and will not complete at any point in the future.
274269
///

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) {
175175
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
176176
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
177177
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
178+
nodes[0].disable_monitor_completeness_assertion();
178179

179180
let node_a_id = nodes[0].node.get_our_node_id();
180181
let node_b_id = nodes[1].node.get_our_node_id();
@@ -316,6 +317,7 @@ fn do_test_monitor_temporary_update_fail(disconnect_count: usize) {
316317
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
317318
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
318319
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
320+
nodes[0].disable_monitor_completeness_assertion();
319321

320322
let node_a_id = nodes[0].node.get_our_node_id();
321323
let node_b_id = nodes[1].node.get_our_node_id();
@@ -969,6 +971,7 @@ fn do_test_monitor_update_fail_raa(test_ignore_second_cs: bool) {
969971
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
970972
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[None, None, None]);
971973
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
974+
nodes[1].disable_monitor_completeness_assertion();
972975

973976
let node_a_id = nodes[0].node.get_our_node_id();
974977
let node_b_id = nodes[1].node.get_our_node_id();
@@ -1500,6 +1503,7 @@ fn claim_while_disconnected_monitor_update_fail() {
15001503
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
15011504
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
15021505
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1506+
nodes[1].disable_monitor_completeness_assertion();
15031507

15041508
let node_a_id = nodes[0].node.get_our_node_id();
15051509
let node_b_id = nodes[1].node.get_our_node_id();
@@ -1727,6 +1731,7 @@ fn first_message_on_recv_ordering() {
17271731
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
17281732
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
17291733
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1734+
nodes[1].disable_monitor_completeness_assertion();
17301735

17311736
let node_a_id = nodes[0].node.get_our_node_id();
17321737
let node_b_id = nodes[1].node.get_our_node_id();
@@ -3848,6 +3853,7 @@ fn do_test_durable_preimages_on_closed_channel(
38483853
// Now reload node B
38493854
let manager_b = nodes[1].node.encode();
38503855
reload_node!(nodes[1], &manager_b, &[&mon_ab, &mon_bc], persister, chain_mon, node_b_reload);
3856+
nodes[1].disable_monitor_completeness_assertion();
38513857

38523858
nodes[0].node.peer_disconnected(node_b_id);
38533859
nodes[2].node.peer_disconnected(node_b_id);

lightning/src/ln/channelmanager.rs

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2779,12 +2779,12 @@ pub struct ChannelManager<
27792779
#[cfg(any(test, feature = "_test_utils"))]
27802780
pub(super) per_peer_state: FairRwLock<HashMap<PublicKey, Mutex<PeerState<SP>>>>,
27812781

2782-
/// We only support using one of [`ChannelMonitorUpdateStatus::InProgress`] and
2783-
/// [`ChannelMonitorUpdateStatus::Completed`] without restarting. Because the API does not
2784-
/// otherwise directly enforce this, we enforce it in non-test builds here by storing which one
2785-
/// is in use.
2786-
#[cfg(not(any(test, feature = "_externalize_tests")))]
2787-
monitor_update_type: AtomicUsize,
2782+
/// When set, disables the debug assertion that `Watch::update_channel` must not return
2783+
/// `Completed` while prior updates are still `InProgress`. Some legacy tests switch the
2784+
/// persister between `InProgress` and `Completed` mid-flight, which violates this contract
2785+
/// but is otherwise harmless in a test context.
2786+
#[cfg(test)]
2787+
pub(crate) skip_monitor_update_assertion: AtomicBool,
27882788

27892789
/// The set of events which we need to give to the user to handle. In some cases an event may
27902790
/// require some further action after the user handles it (currently only blocking a monitor
@@ -3527,8 +3527,8 @@ impl<
35273527

35283528
per_peer_state: FairRwLock::new(new_hash_map()),
35293529

3530-
#[cfg(not(any(test, feature = "_externalize_tests")))]
3531-
monitor_update_type: AtomicUsize::new(0),
3530+
#[cfg(test)]
3531+
skip_monitor_update_assertion: AtomicBool::new(false),
35323532

35333533
pending_events: Mutex::new(VecDeque::new()),
35343534
pending_events_processor: AtomicBool::new(false),
@@ -9941,6 +9941,16 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
99419941
if update_completed {
99429942
let _ = in_flight_updates.remove(update_idx);
99439943
}
9944+
// A Watch implementation must not return Completed while prior updates are
9945+
// still InProgress, as this would violate the async persistence contract.
9946+
#[cfg(test)]
9947+
let skip_assert = self.skip_monitor_update_assertion.load(Ordering::Relaxed);
9948+
#[cfg(not(test))]
9949+
let skip_assert = false;
9950+
debug_assert!(
9951+
skip_assert || !update_completed || in_flight_updates.is_empty(),
9952+
"Watch::update_channel returned Completed while prior updates are still InProgress"
9953+
);
99449954
(update_completed, update_completed && in_flight_updates.is_empty())
99459955
} else {
99469956
// We blindly assume that the ChannelMonitorUpdate will be regenerated on startup if we
@@ -10006,23 +10016,13 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
1000610016
panic!("{}", err_str);
1000710017
},
1000810018
ChannelMonitorUpdateStatus::InProgress => {
10009-
#[cfg(not(any(test, feature = "_externalize_tests")))]
10010-
if self.monitor_update_type.swap(1, Ordering::Relaxed) == 2 {
10011-
panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
10012-
}
1001310019
log_debug!(
1001410020
logger,
1001510021
"ChannelMonitor update in flight, holding messages until the update completes.",
1001610022
);
1001710023
false
1001810024
},
10019-
ChannelMonitorUpdateStatus::Completed => {
10020-
#[cfg(not(any(test, feature = "_externalize_tests")))]
10021-
if self.monitor_update_type.swap(2, Ordering::Relaxed) == 1 {
10022-
panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
10023-
}
10024-
true
10025-
},
10025+
ChannelMonitorUpdateStatus::Completed => true,
1002610026
}
1002710027
}
1002810028

@@ -19550,8 +19550,8 @@ impl<
1955019550

1955119551
per_peer_state: FairRwLock::new(per_peer_state),
1955219552

19553-
#[cfg(not(any(test, feature = "_externalize_tests")))]
19554-
monitor_update_type: AtomicUsize::new(0),
19553+
#[cfg(test)]
19554+
skip_monitor_update_assertion: AtomicBool::new(false),
1955519555

1955619556
pending_events: Mutex::new(pending_events_read),
1955719557
pending_events_processor: AtomicBool::new(false),

lightning/src/ln/functional_test_utils.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,13 @@ impl<'a, 'b, 'c> Node<'a, 'b, 'c> {
598598
self.node.init_features() | self.onion_messenger.provided_init_features(peer_node_id)
599599
})
600600
}
601+
602+
/// Disables the debug assertion that `Watch::update_channel` must not return `Completed`
603+
/// while prior updates are still `InProgress`. Some legacy tests switch the persister between
604+
/// modes mid-flight, which violates this contract but is otherwise harmless.
605+
pub fn disable_monitor_completeness_assertion(&self) {
606+
self.node.skip_monitor_update_assertion.store(true, core::sync::atomic::Ordering::Relaxed);
607+
}
601608
}
602609

603610
impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {}

lightning/src/ln/monitor_tests.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3384,6 +3384,7 @@ fn test_claim_event_never_handled() {
33843384
let chan_0_monitor_serialized = get_monitor!(nodes[1], chan.2).encode();
33853385
let mons = &[&chan_0_monitor_serialized[..]];
33863386
reload_node!(nodes[1], &init_node_ser, mons, persister, new_chain_mon, nodes_1_reload);
3387+
nodes[1].disable_monitor_completeness_assertion();
33873388

33883389
expect_payment_claimed!(nodes[1], payment_hash_a, 1_000_000);
33893390
// The reload logic spuriously generates a redundant payment preimage-containing

lightning/src/ln/reload_tests.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,12 +823,14 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool, double_rest
823823

824824
// Now restart nodes[3].
825825
reload_node!(nodes[3], original_manager.clone(), &[&updated_monitor.0, &original_monitor.0], persist_d_1, chain_d_1, node_d_1);
826+
nodes[3].disable_monitor_completeness_assertion();
826827

827828
if double_restart {
828829
// Previously, we had a bug where we'd fail to reload if we re-persist the `ChannelManager`
829830
// without updating any `ChannelMonitor`s as we'd fail to double-initiate the claim replay.
830831
// We test that here ensuring that we can reload again.
831832
reload_node!(nodes[3], node_d_1.encode(), &[&updated_monitor.0, &original_monitor.0], persist_d_2, chain_d_2, node_d_2);
833+
nodes[3].disable_monitor_completeness_assertion();
832834
}
833835

834836
// Until the startup background events are processed (in `get_and_clear_pending_events`,
@@ -2215,6 +2217,7 @@ fn test_reload_with_mpp_claims_on_same_channel() {
22152217
nodes_1_deserialized,
22162218
Some(true)
22172219
);
2220+
nodes[1].disable_monitor_completeness_assertion();
22182221

22192222
// When the claims are reconstructed during reload, PaymentForwarded events are regenerated.
22202223
let events = nodes[1].node.get_and_clear_pending_events();

0 commit comments

Comments (0)