Skip to content

Commit 5e0b636

Browse files
ben-kaufmanclaude
andcommitted
fix: multi-channel healing + TOCTOU race on chain sync spawn
1. Remove peer dedup from healing payments — send one per unhealed channel (not per peer). The previous dedup prevented retries to the same peer even when some of their channels remained unhealed. 2. Fix TOCTOU race: subscribe to stop_sender while holding the is_running read lock before spawning chain sync. This prevents stop() from completing between the check and subscribe, which would orphan the task (missing the already-sent stop signal). Extracted spawn_chain_sync_task_with_receiver() so the normal path (spawn_chain_sync_task) still works unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5b65bf2 commit 5e0b636

1 file changed

Lines changed: 27 additions & 19 deletions

File tree

src/lib.rs

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ mod tx_broadcaster;
102102
mod types;
103103
mod wallet;
104104

105-
use std::collections::{HashMap, HashSet};
105+
use std::collections::HashMap;
106106
use std::default::Default;
107107
use std::net::ToSocketAddrs;
108108
use std::ops::Deref;
@@ -728,14 +728,11 @@ impl Node {
728728
)
729729
};
730730

731-
// Deduplicate by counterparty — one payment per peer is enough to trigger
732-
// a commitment round-trip. If a peer has multiple channels, each retry
733-
// attempt may route through a different channel as outbound capacity shifts.
734-
let mut seen_peers = HashSet::new();
731+
// Send one healing payment per unhealed channel. Note: for multiple
732+
// channels with the same peer, the router may pick the same channel for
733+
// both payments. The retry loop gives multiple chances for the router to
734+
// select different channels as scores and capacity shift between attempts.
735735
for (_, counterparty_node_id, _) in &initial_update_ids {
736-
if !seen_peers.insert(*counterparty_node_id) {
737-
continue;
738-
}
739736
match send_heal_payment(*counterparty_node_id) {
740737
Ok(_) => {
741738
log_info!(
@@ -807,18 +804,16 @@ impl Node {
807804
break;
808805
}
809806

810-
// Retry healing payments for peers that still have unhealed channels.
811-
// Dedup by peer — the router may pick a different channel on each retry.
807+
// Retry healing payments for each unhealed channel.
812808
if last_retry_time.elapsed() >= retry_interval {
813809
last_retry_time = tokio::time::Instant::now();
814-
let mut retried_peers = HashSet::new();
815810
for (ch_id, counterparty_node_id, initial_id) in &initial_update_ids {
816811
let healed = chain_monitor
817812
.get_monitor(*ch_id)
818813
.ok()
819814
.map(|m| m.get_latest_update_id() > *initial_id)
820815
.unwrap_or(true);
821-
if healed || !retried_peers.insert(*counterparty_node_id) {
816+
if healed {
822817
continue;
823818
}
824819

@@ -845,12 +840,19 @@ impl Node {
845840
// Clear the flag so subsequent start()/stop()/start() cycles don't re-trigger.
846841
self.accept_stale_channel_monitors.store(false, Ordering::Relaxed);
847842

848-
// Only start chain sync if the node wasn't stopped during healing.
849-
if *self.is_running.read().unwrap() {
850-
self.spawn_chain_sync_task();
851-
log_info!(self.logger, "Startup complete.");
852-
} else {
853-
log_info!(self.logger, "Node was stopped during stale monitor recovery.");
843+
// Subscribe while holding the is_running read lock to prevent a TOCTOU
844+
// race where stop() completes between our check and the subscribe — which
845+
// would orphan the chain sync task (it would miss the stop signal).
846+
{
847+
let is_running = self.is_running.read().unwrap();
848+
if *is_running {
849+
let stop_receiver = self.stop_sender.subscribe();
850+
drop(is_running);
851+
self.spawn_chain_sync_task_with_receiver(stop_receiver);
852+
log_info!(self.logger, "Startup complete.");
853+
} else {
854+
log_info!(self.logger, "Node was stopped during stale monitor recovery.");
855+
}
854856
}
855857
return Ok(());
856858
}
@@ -861,7 +863,13 @@ impl Node {
861863
}
862864

863865
fn spawn_chain_sync_task(&self) {
864-
let stop_sync_receiver = self.stop_sender.subscribe();
866+
let stop_receiver = self.stop_sender.subscribe();
867+
self.spawn_chain_sync_task_with_receiver(stop_receiver);
868+
}
869+
870+
fn spawn_chain_sync_task_with_receiver(
871+
&self, stop_sync_receiver: tokio::sync::watch::Receiver<()>,
872+
) {
865873
let chain_source = Arc::clone(&self.chain_source);
866874
let sync_wallet = Arc::clone(&self.wallet);
867875
let sync_cman = Arc::clone(&self.channel_manager);

0 commit comments

Comments
 (0)