
Commit 12c37b2

Ensure partial MPP claims continue to block channels on restart

In 9cc6e08, we started using the `RAAMonitorUpdateBlockingAction` logic to block RAAs which may remove a preimage from one `ChannelMonitor` if it isn't durably stored in another that is a part of the same MPP claim.

At the time, when we claimed a payment, if we saw that the HTLC was already claimed in the channel, we'd simply drop the new RAA blocker. This can happen on reload when replaying MPP claims.

However, just because an HTLC is no longer present in `ChannelManager`'s `Channel` doesn't mean that the `ChannelMonitorUpdate` which stored the preimage actually made it durably into the `ChannelMonitor` on disk.

We could begin an MPP payment, have one channel get the preimage durably into its `ChannelMonitor`, then step forward another update with the peer. Then, we could reload, causing the MPP claims to be replayed across all channels, leading to the RAA blocker(s) being dropped and all channels being unlocked. Finally, if the first channel managed to step forward a further update with its peer before the (now-replayed) `ChannelMonitorUpdate`s for all MPP parts make it to disk, we could theoretically lose the preimage.

This is, of course, a somewhat comically unlikely scenario, but I had an old note to expand the test and it turned up the issue, so we might as well fix it.
1 parent 61e5819 commit 12c37b2
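The core of the fix, in the `channelmanager.rs` hunk below, is to re-add the dropped RAA blocker when a claim is replayed during startup and the blocker isn't already tracked, and to defer the post-update action while `ChannelMonitorUpdate`s are still in flight. As a rough illustration of the first half of that invariant, here is a minimal, self-contained sketch; `PeerState`, `RaaBlocker`, and `replay_claim` are hypothetical stand-ins for illustration, not LDK's actual types:

use std::collections::HashMap;

// Hypothetical stand-ins for LDK's real types, for illustration only.
#[derive(Debug, PartialEq)]
struct RaaBlocker {
	channel_id: u64,
}

#[derive(Default)]
struct PeerState {
	// Blockers keeping RAAs from removing a preimage before every MPP part has persisted it.
	actions_blocking_raa_monitor_updates: HashMap<u64, Vec<RaaBlocker>>,
}

// Replaying a claim (e.g. on restart) for a channel whose HTLC is already gone: instead of
// dropping a blocker that isn't tracked, re-add it so the channel stays blocked until the
// pending `ChannelMonitorUpdate` completes.
fn replay_claim(peer: &mut PeerState, channel_id: u64, blocker: RaaBlocker, during_init: bool) {
	let blockers = peer
		.actions_blocking_raa_monitor_updates
		.entry(channel_id)
		.or_insert_with(Vec::new);
	if !blockers.contains(&blocker) {
		// Outside of startup the blocker must already be present.
		debug_assert!(during_init);
		blockers.push(blocker);
	}
}

fn main() {
	let mut peer = PeerState::default();
	// Replay an MPP claim during init; the blocker is restored rather than silently dropped.
	replay_claim(&mut peer, 42, RaaBlocker { channel_id: 42 }, true);
	assert_eq!(peer.actions_blocking_raa_monitor_updates[&42].len(), 1);
}

The real change additionally pushes the completion action onto `monitor_update_blocked_actions` whenever `in_flight_monitor_updates` for the channel is non-empty, so the action only runs once the preimage has been durably persisted.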

2 files changed: +143, -15 lines

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 115 additions & 4 deletions
@@ -4218,13 +4218,17 @@ fn test_glacial_peer_cant_hang() {
 	do_test_glacial_peer_cant_hang(true);
 }
 
-#[test]
-fn test_partial_claim_mon_update_compl_actions() {
+fn do_test_partial_claim_mon_update_compl_actions(reload_a: bool, reload_b: bool) {
 	// Test that if we have an MPP claim that we ensure the preimage for the claim is retained in
 	// all the `ChannelMonitor`s until the preimage reaches every `ChannelMonitor` for a channel
 	// which was a part of the MPP.
 	let chanmon_cfgs = create_chanmon_cfgs(4);
 	let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+
+	let (persister, persister_2, persister_3);
+	let (new_chain_mon, new_chain_mon_2, new_chain_mon_3);
+	let (nodes_3_reload, nodes_3_reload_2, nodes_3_reload_3);
+
 	let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
 	let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
 
@@ -4253,6 +4257,8 @@ fn test_partial_claim_mon_update_compl_actions() {
 	let paths = &[&[&nodes[1], &nodes[3]][..], &[&nodes[2], &nodes[3]][..]];
 	send_along_route_with_secret(&nodes[0], route, paths, 200_000, payment_hash, payment_secret);
 
+	// Store the monitor for channel 4 without the preimage to use on reload
+	let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
 	// Claim along both paths, but only complete one of the two monitor updates.
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
@@ -4264,7 +4270,13 @@ fn test_partial_claim_mon_update_compl_actions() {
 	// Complete the 1<->3 monitor update and play the commitment_signed dance forward until it
 	// blocks.
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_3_id);
-	expect_payment_claimed!(&nodes[3], payment_hash, 200_000);
+	let payment_claimed = nodes[3].node.get_and_clear_pending_events();
+	assert_eq!(payment_claimed.len(), 1, "{payment_claimed:?}");
+	if let Event::PaymentClaimed { payment_hash: ev_hash, .. } = &payment_claimed[0] {
+		assert_eq!(*ev_hash, payment_hash);
+	} else {
+		panic!("{payment_claimed:?}");
+	}
 	let mut updates = get_htlc_update_msgs(&nodes[3], &node_b_id);
 
 	nodes[1].node.handle_update_fulfill_htlc(node_d_id, updates.update_fulfill_htlcs.remove(0));
@@ -4283,15 +4295,41 @@ fn test_partial_claim_mon_update_compl_actions() {
 	check_added_monitors(&nodes[3], 0);
 	assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
 
+	if reload_a {
+		// After a reload (with the monitor not yet fully updated), the RAA should still be blocked
+		// waiting until the monitor update completes.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister, new_chain_mon, nodes_3_reload);
+		// The final update to channel 4 should be replayed.
+		persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+		check_added_monitors(&nodes[3], 1);
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let second_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, second_payment_claimed);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+	}
+
 	// Now double-check that the preimage is still in the 1<->3 channel and complete the pending
 	// monitor update, allowing node 3 to claim the payment on the 2<->3 channel. This also
 	// unblocks the 1<->3 channel, allowing node 3 to release the two blocked monitor updates and
 	// respond to the final commitment_signed.
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
+	assert!(nodes[3].node.get_and_clear_pending_events().is_empty());
 
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_4_id);
 	let mut ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
-	assert_eq!(ds_msgs.len(), 2);
+	assert_eq!(ds_msgs.len(), 2, "{ds_msgs:?}");
 	check_added_monitors(&nodes[3], 2);
 
 	match remove_first_msg_event_to_node(&node_b_id, &mut ds_msgs) {
@@ -4335,13 +4373,86 @@ fn test_partial_claim_mon_update_compl_actions() {
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 	assert!(get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_2, new_chain_mon_2, nodes_3_reload_2);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let third_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, third_payment_claimed);
+	}
+
 	send_payment(&nodes[1], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed, even if only one of the two monitors still knows about the first payment.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_3, new_chain_mon_3, nodes_3_reload_3);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed events for both payments will
+		// be replayed on restart.
+		// Use this as an opportunity to check the payment_ids are unique.
+		let mut events = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(events.len(), 2);
+		events.retain(|ev| *ev != payment_claimed[0]);
+		assert_eq!(events.len(), 1);
+		if let Event::PaymentClaimed { payment_id: original_payment_id, .. } = &payment_claimed[0] {
+			assert!(original_payment_id.is_some());
+			if let Event::PaymentClaimed { amount_msat, payment_id, .. } = &events[0] {
+				assert!(payment_id.is_some());
+				assert_ne!(original_payment_id, payment_id);
+				assert_eq!(*amount_msat, 100_000);
+			} else {
+				panic!("{events:?}");
+			}
+		} else {
+			panic!("{events:?}");
		}
+
+		send_payment(&nodes[1], &[&nodes[3]], 100_000);
+	}
+
 	send_payment(&nodes[2], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 }
 
+#[test]
+fn test_partial_claim_mon_update_compl_actions() {
+	do_test_partial_claim_mon_update_compl_actions(true, true);
+	do_test_partial_claim_mon_update_compl_actions(true, false);
+	do_test_partial_claim_mon_update_compl_actions(false, true);
+	do_test_partial_claim_mon_update_compl_actions(false, false);
+}
+
 #[test]
 fn test_claim_to_closed_channel_blocks_forwarded_preimage_removal() {
 	// One of the last features for async persistence we implemented was the correct blocking of

lightning/src/ln/channelmanager.rs

Lines changed: 28 additions & 11 deletions
@@ -8274,25 +8274,42 @@ where
 							// payment claim from a `ChannelMonitor`. In some cases (MPP or
 							// if the HTLC was only recently removed) we make such claims
 							// after an HTLC has been removed from a channel entirely, and
-							// thus the RAA blocker has long since completed.
+							// thus the RAA blocker may have long since completed.
 							//
-							// In any other case, the RAA blocker must still be present and
-							// blocking RAAs.
-							debug_assert!(
-								during_init
-									|| peer_state
-										.actions_blocking_raa_monitor_updates
-										.get(&chan_id)
-										.unwrap()
-										.contains(&raa_blocker)
-							);
+							// However, it's possible that the `ChannelMonitorUpdate` containing
+							// the preimage never completed and is still pending. In that case,
+							// we need to re-add the RAA blocker, which we do here. Handling
+							// the post-update action, below, will remove it again.
+							//
+							// In any other case (i.e. not during startup), the RAA blocker
+							// must still be present and blocking RAAs.
+							let actions = &mut peer_state.actions_blocking_raa_monitor_updates;
+							let actions_list = actions.entry(chan_id).or_insert_with(Vec::new);
+							if !actions_list.contains(&raa_blocker) {
+								debug_assert!(during_init);
+								actions_list.push(raa_blocker);
+							}
 						}
 						let action = if let Some(action) = action_opt {
 							action
 						} else {
							return;
 						};
 
+						// If there are monitor updates in flight, we may be in the case
+						// described above, replaying a claim on startup which needs an RAA
+						// blocker to remain blocked. Thus, in such a case we simply push the
+						// post-update action to the blocked list and move on.
+						let in_flight_mons = peer_state.in_flight_monitor_updates.get(&chan_id);
+						if in_flight_mons.map(|(_, mons)| !mons.is_empty()).unwrap_or(false) {
+							peer_state
+								.monitor_update_blocked_actions
+								.entry(chan_id)
+								.or_insert_with(Vec::new)
+								.push(action);
+							return;
+						}
+
 						mem::drop(peer_state_opt);
 
 						log_trace!(logger, "Completing monitor update completion action for channel {} as claim was redundant: {:?}",
