Skip to content

Commit 93a9d79

Browse files
joostjager and claude committed
Replace dual-sync-async persistence panic with Watch contract
Instead of panicking when a Persist implementation returns both Completed and InProgress from the same ChannelManager instance, define a clearer contract at the Watch trait level: a Watch implementation must not return Completed for a channel update if there are still pending InProgress updates for that channel. This matches the pure-async interface where you can't complete an update until all previous ones have completed. This reverts the monitor_update_type tracking and panic checks from 0760f99, replacing them with a debug_assert that validates the Watch contract. Persist implementors are expected to be consistent (always sync or always async), so ChainMonitor naturally satisfies this contract without code changes. Legacy tests that switch the persister between modes mid-flight can opt out of the assertion via Node::disable_monitor_completeness_assertion(). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eb3980d commit 93a9d79

7 files changed

Lines changed: 42 additions & 29 deletions

File tree

lightning/src/chain/channelmonitor.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6817,6 +6817,7 @@ mod tests {
68176817
let legacy_cfg = test_legacy_channel_config();
68186818
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[Some(legacy_cfg.clone()), Some(legacy_cfg.clone()), Some(legacy_cfg)]);
68196819
let nodes = create_network(3, &node_cfgs, &node_chanmgrs);
6820+
nodes[1].disable_monitor_completeness_assertion();
68206821
let channel = create_announced_chan_between_nodes(&nodes, 0, 1);
68216822
create_announced_chan_between_nodes(&nodes, 1, 2);
68226823

lightning/src/chain/mod.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -233,11 +233,11 @@ pub enum ChannelMonitorUpdateStatus {
233233
/// This includes performing any `fsync()` calls required to ensure the update is guaranteed to
234234
/// be available on restart even if the application crashes.
235235
///
236-
/// If you return this variant, you cannot later return [`InProgress`] from the same instance of
237-
/// [`Persist`]/[`Watch`] without first restarting.
236+
/// A [`Watch`] implementation must not return this for a channel update if there are still
237+
/// pending [`InProgress`] updates for that channel. That is, an update can only be considered
238+
/// complete once all prior updates have also completed.
238239
///
239240
/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
240-
/// [`Persist`]: chainmonitor::Persist
241241
Completed,
242242
/// Indicates that the update will happen asynchronously in the background or that a transient
243243
/// failure occurred which is being retried in the background and will eventually complete.
@@ -263,12 +263,7 @@ pub enum ChannelMonitorUpdateStatus {
263263
/// reliable, this feature is considered beta, and a handful of edge-cases remain. Until the
264264
/// remaining cases are fixed, in rare cases, *using this feature may lead to funds loss*.
265265
///
266-
/// If you return this variant, you cannot later return [`Completed`] from the same instance of
267-
/// [`Persist`]/[`Watch`] without first restarting.
268-
///
269266
/// [`InProgress`]: ChannelMonitorUpdateStatus::InProgress
270-
/// [`Completed`]: ChannelMonitorUpdateStatus::Completed
271-
/// [`Persist`]: chainmonitor::Persist
272267
InProgress,
273268
/// Indicates that an update has failed and will not complete at any point in the future.
274269
///

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ fn do_test_simple_monitor_temporary_update_fail(disconnect: bool) {
175175
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
176176
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
177177
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
178+
nodes[0].disable_monitor_completeness_assertion();
178179

179180
let node_a_id = nodes[0].node.get_our_node_id();
180181
let node_b_id = nodes[1].node.get_our_node_id();
@@ -316,6 +317,7 @@ fn do_test_monitor_temporary_update_fail(disconnect_count: usize) {
316317
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
317318
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
318319
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
320+
nodes[0].disable_monitor_completeness_assertion();
319321

320322
let node_a_id = nodes[0].node.get_our_node_id();
321323
let node_b_id = nodes[1].node.get_our_node_id();
@@ -969,6 +971,7 @@ fn do_test_monitor_update_fail_raa(test_ignore_second_cs: bool) {
969971
let node_cfgs = create_node_cfgs(3, &chanmon_cfgs);
970972
let node_chanmgrs = create_node_chanmgrs(3, &node_cfgs, &[None, None, None]);
971973
let mut nodes = create_network(3, &node_cfgs, &node_chanmgrs);
974+
nodes[1].disable_monitor_completeness_assertion();
972975

973976
let node_a_id = nodes[0].node.get_our_node_id();
974977
let node_b_id = nodes[1].node.get_our_node_id();
@@ -1500,6 +1503,7 @@ fn claim_while_disconnected_monitor_update_fail() {
15001503
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
15011504
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
15021505
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1506+
nodes[1].disable_monitor_completeness_assertion();
15031507

15041508
let node_a_id = nodes[0].node.get_our_node_id();
15051509
let node_b_id = nodes[1].node.get_our_node_id();
@@ -1727,6 +1731,7 @@ fn first_message_on_recv_ordering() {
17271731
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
17281732
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
17291733
let mut nodes = create_network(2, &node_cfgs, &node_chanmgrs);
1734+
nodes[1].disable_monitor_completeness_assertion();
17301735

17311736
let node_a_id = nodes[0].node.get_our_node_id();
17321737
let node_b_id = nodes[1].node.get_our_node_id();
@@ -3848,6 +3853,7 @@ fn do_test_durable_preimages_on_closed_channel(
38483853
// Now reload node B
38493854
let manager_b = nodes[1].node.encode();
38503855
reload_node!(nodes[1], &manager_b, &[&mon_ab, &mon_bc], persister, chain_mon, node_b_reload);
3856+
nodes[1].disable_monitor_completeness_assertion();
38513857

38523858
nodes[0].node.peer_disconnected(node_b_id);
38533859
nodes[2].node.peer_disconnected(node_b_id);

lightning/src/ln/channelmanager.rs

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2779,12 +2779,12 @@ pub struct ChannelManager<
27792779
#[cfg(any(test, feature = "_test_utils"))]
27802780
pub(super) per_peer_state: FairRwLock<HashMap<PublicKey, Mutex<PeerState<SP>>>>,
27812781

2782-
/// We only support using one of [`ChannelMonitorUpdateStatus::InProgress`] and
2783-
/// [`ChannelMonitorUpdateStatus::Completed`] without restarting. Because the API does not
2784-
/// otherwise directly enforce this, we enforce it in non-test builds here by storing which one
2785-
/// is in use.
2786-
#[cfg(not(any(test, feature = "_externalize_tests")))]
2787-
monitor_update_type: AtomicUsize,
2782+
/// When set, disables the debug assertion that `Watch::update_channel` must not return
2783+
/// `Completed` while prior updates are still `InProgress`. Some legacy tests switch the
2784+
/// persister between `InProgress` and `Completed` mid-flight, which violates this contract
2785+
/// but is otherwise harmless in a test context.
2786+
#[cfg(test)]
2787+
pub(crate) skip_monitor_update_assertion: AtomicBool,
27882788

27892789
/// The set of events which we need to give to the user to handle. In some cases an event may
27902790
/// require some further action after the user handles it (currently only blocking a monitor
@@ -3527,8 +3527,8 @@ impl<
35273527

35283528
per_peer_state: FairRwLock::new(new_hash_map()),
35293529

3530-
#[cfg(not(any(test, feature = "_externalize_tests")))]
3531-
monitor_update_type: AtomicUsize::new(0),
3530+
#[cfg(test)]
3531+
skip_monitor_update_assertion: AtomicBool::new(false),
35323532

35333533
pending_events: Mutex::new(VecDeque::new()),
35343534
pending_events_processor: AtomicBool::new(false),
@@ -9941,6 +9941,16 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
99419941
if update_completed {
99429942
let _ = in_flight_updates.remove(update_idx);
99439943
}
9944+
// A Watch implementation must not return Completed while prior updates are
9945+
// still InProgress, as this would violate the async persistence contract.
9946+
#[cfg(test)]
9947+
let skip_assert = self.skip_monitor_update_assertion.load(Ordering::Relaxed);
9948+
#[cfg(not(test))]
9949+
let skip_assert = false;
9950+
debug_assert!(
9951+
skip_assert || !update_completed || in_flight_updates.is_empty(),
9952+
"Watch::update_channel returned Completed while prior updates are still InProgress"
9953+
);
99449954
(update_completed, update_completed && in_flight_updates.is_empty())
99459955
} else {
99469956
// We blindly assume that the ChannelMonitorUpdate will be regenerated on startup if we
@@ -10006,23 +10016,13 @@ This indicates a bug inside LDK. Please report this error at https://github.com/
1000610016
panic!("{}", err_str);
1000710017
},
1000810018
ChannelMonitorUpdateStatus::InProgress => {
10009-
#[cfg(not(any(test, feature = "_externalize_tests")))]
10010-
if self.monitor_update_type.swap(1, Ordering::Relaxed) == 2 {
10011-
panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
10012-
}
1001310019
log_debug!(
1001410020
logger,
1001510021
"ChannelMonitor update in flight, holding messages until the update completes.",
1001610022
);
1001710023
false
1001810024
},
10019-
ChannelMonitorUpdateStatus::Completed => {
10020-
#[cfg(not(any(test, feature = "_externalize_tests")))]
10021-
if self.monitor_update_type.swap(2, Ordering::Relaxed) == 1 {
10022-
panic!("Cannot use both ChannelMonitorUpdateStatus modes InProgress and Completed without restart");
10023-
}
10024-
true
10025-
},
10025+
ChannelMonitorUpdateStatus::Completed => true,
1002610026
}
1002710027
}
1002810028

@@ -19550,8 +19550,8 @@ impl<
1955019550

1955119551
per_peer_state: FairRwLock::new(per_peer_state),
1955219552

19553-
#[cfg(not(any(test, feature = "_externalize_tests")))]
19554-
monitor_update_type: AtomicUsize::new(0),
19553+
#[cfg(test)]
19554+
skip_monitor_update_assertion: AtomicBool::new(false),
1955519555

1955619556
pending_events: Mutex::new(pending_events_read),
1955719557
pending_events_processor: AtomicBool::new(false),

lightning/src/ln/functional_test_utils.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,13 @@ impl<'a, 'b, 'c> Node<'a, 'b, 'c> {
598598
self.node.init_features() | self.onion_messenger.provided_init_features(peer_node_id)
599599
})
600600
}
601+
602+
/// Disables the debug assertion that `Watch::update_channel` must not return `Completed`
603+
/// while prior updates are still `InProgress`. Some legacy tests switch the persister between
604+
/// modes mid-flight, which violates this contract but is otherwise harmless.
605+
pub fn disable_monitor_completeness_assertion(&self) {
606+
self.node.skip_monitor_update_assertion.store(true, core::sync::atomic::Ordering::Relaxed);
607+
}
601608
}
602609

603610
impl<'a, 'b, 'c> std::panic::UnwindSafe for Node<'a, 'b, 'c> {}

lightning/src/ln/monitor_tests.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3384,6 +3384,7 @@ fn test_claim_event_never_handled() {
33843384
let chan_0_monitor_serialized = get_monitor!(nodes[1], chan.2).encode();
33853385
let mons = &[&chan_0_monitor_serialized[..]];
33863386
reload_node!(nodes[1], &init_node_ser, mons, persister, new_chain_mon, nodes_1_reload);
3387+
nodes[1].disable_monitor_completeness_assertion();
33873388

33883389
expect_payment_claimed!(nodes[1], payment_hash_a, 1_000_000);
33893390
// The reload logic spuriously generates a redundant payment preimage-containing

lightning/src/ln/reload_tests.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,12 +823,14 @@ fn do_test_partial_claim_before_restart(persist_both_monitors: bool, double_rest
823823

824824
// Now restart nodes[3].
825825
reload_node!(nodes[3], original_manager.clone(), &[&updated_monitor.0, &original_monitor.0], persist_d_1, chain_d_1, node_d_1);
826+
nodes[3].disable_monitor_completeness_assertion();
826827

827828
if double_restart {
828829
// Previously, we had a bug where we'd fail to reload if we re-persist the `ChannelManager`
829830
// without updating any `ChannelMonitor`s as we'd fail to double-initiate the claim replay.
830831
// We test that here ensuring that we can reload again.
831832
reload_node!(nodes[3], node_d_1.encode(), &[&updated_monitor.0, &original_monitor.0], persist_d_2, chain_d_2, node_d_2);
833+
nodes[3].disable_monitor_completeness_assertion();
832834
}
833835

834836
// Until the startup background events are processed (in `get_and_clear_pending_events`,
@@ -2215,6 +2217,7 @@ fn test_reload_with_mpp_claims_on_same_channel() {
22152217
nodes_1_deserialized,
22162218
Some(true)
22172219
);
2220+
nodes[1].disable_monitor_completeness_assertion();
22182221

22192222
// When the claims are reconstructed during reload, PaymentForwarded events are regenerated.
22202223
let events = nodes[1].node.get_and_clear_pending_events();

0 commit comments

Comments (0)