
Commit 490d6e4

Ensure partial MPP claims continue to block channels on restart
In 9cc6e08, we started using the `RAAMonitorUpdateBlockingAction` logic to block RAAs which may remove a preimage from one `ChannelMonitor` if it isn't durably stored in another that is a part of the same MPP claim.

At the time, when we claimed a payment, if we saw that the HTLC was already claimed in the channel, we'd simply drop the new RAA blocker. This can happen on reload when replaying MPP claims.

However, just because an HTLC is no longer present in `ChannelManager`'s `Channel` doesn't mean that the `ChannelMonitorUpdate` which stored the preimage actually made it durably into the `ChannelMonitor` on disk. We could begin an MPP payment, have one channel get the preimage durably into its `ChannelMonitor`, then step forward another update with the peer. Then, we could reload, causing the MPP claims to be replayed across all channels, leading to the RAA blocker(s) being dropped and all channels being unlocked. Finally, if the first channel managed to step forward a further update with its peer before the (now-replayed) `ChannelMonitorUpdate`s for all MPP parts made it to disk, we could theoretically lose the preimage.

This is, of course, a somewhat comically unlikely scenario, but I had an old note to expand the test and it turned up the issue, so we might as well fix it.
1 parent f5dd77c · commit 490d6e4
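For illustration, the following is a minimal, self-contained sketch of the invariant the fix restores, in the spirit of the `channelmanager.rs` change below. The `RaaBlocker` and `PeerState` types and the `note_replayed_claim` helper are hypothetical stand-ins for `RAAMonitorUpdateBlockingAction`, LDK's per-peer state, and the claim-replay path; this is not LDK's actual API.

use std::collections::HashMap;

type ChannelId = [u8; 32];

// Hypothetical stand-in for `RAAMonitorUpdateBlockingAction`.
#[derive(Debug, PartialEq)]
struct RaaBlocker {
	mpp_claim_id: u64,
}

#[derive(Default)]
struct PeerState {
	// Mirrors `actions_blocking_raa_monitor_updates`: while a channel has any
	// blockers, its RAA is held back so the counterparty cannot advance the
	// channel state past the point where the preimage could still be lost.
	raa_blockers: HashMap<ChannelId, Vec<RaaBlocker>>,
}

impl PeerState {
	// Called when a replayed MPP claim finds its HTLC already removed from the
	// `Channel`. Before this commit the fresh blocker was simply dropped; now
	// it is re-added if missing, since the preimage-bearing
	// `ChannelMonitorUpdate` may still be in flight.
	fn note_replayed_claim(&mut self, chan_id: ChannelId, blocker: RaaBlocker, during_init: bool) {
		let blockers = self.raa_blockers.entry(chan_id).or_default();
		if !blockers.contains(&blocker) {
			// Outside of startup the blocker must already be present; only a
			// startup replay can legitimately find it missing.
			debug_assert!(during_init);
			blockers.push(blocker);
		}
	}
}

fn main() {
	let mut peer = PeerState::default();
	let chan_id = [0x42; 32];
	// A startup replay re-adds the blocker, keeping the channel's RAA blocked
	// until every MPP part's monitor update has completed.
	peer.note_replayed_claim(chan_id, RaaBlocker { mpp_claim_id: 1 }, true);
	assert_eq!(peer.raa_blockers[&chan_id].len(), 1);
}

The actual change additionally defers the post-update action while monitor updates are still in flight, as the second hunk in `channelmanager.rs` below shows.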

2 files changed: +143 −9 lines

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 115 additions & 4 deletions
@@ -4196,13 +4196,17 @@ fn test_glacial_peer_cant_hang() {
 	do_test_glacial_peer_cant_hang(true);
 }
 
-#[test]
-fn test_partial_claim_mon_update_compl_actions() {
+fn do_test_partial_claim_mon_update_compl_actions(reload_a: bool, reload_b: bool) {
 	// Test that if we have an MPP claim that we ensure the preimage for the claim is retained in
 	// all the `ChannelMonitor`s until the preimage reaches every `ChannelMonitor` for a channel
 	// which was a part of the MPP.
 	let chanmon_cfgs = create_chanmon_cfgs(4);
 	let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+
+	let (persister, persister_2, persister_3);
+	let (new_chain_mon, new_chain_mon_2, new_chain_mon_3);
+	let (nodes_3_reload, nodes_3_reload_2, nodes_3_reload_3);
+
 	let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
 	let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
 
@@ -4231,6 +4235,8 @@ fn test_partial_claim_mon_update_compl_actions() {
 	let paths = &[&[&nodes[1], &nodes[3]][..], &[&nodes[2], &nodes[3]][..]];
 	send_along_route_with_secret(&nodes[0], route, paths, 200_000, payment_hash, payment_secret);
 
+	// Store the monitor for channel 4 without the preimage to use on reload
+	let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
 	// Claim along both paths, but only complete one of the two monitor updates.
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
@@ -4242,7 +4248,13 @@ fn test_partial_claim_mon_update_compl_actions() {
 	// Complete the 1<->3 monitor update and play the commitment_signed dance forward until it
 	// blocks.
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_3_id);
-	expect_payment_claimed!(&nodes[3], payment_hash, 200_000);
+	let payment_claimed = nodes[3].node.get_and_clear_pending_events();
+	assert_eq!(payment_claimed.len(), 1, "{payment_claimed:?}");
+	if let Event::PaymentClaimed { payment_hash: ev_hash, .. } = &payment_claimed[0] {
+		assert_eq!(*ev_hash, payment_hash);
+	} else {
+		panic!("{payment_claimed:?}");
+	}
 	let updates = get_htlc_update_msgs(&nodes[3], &node_b_id);
 
 	nodes[1].node.handle_update_fulfill_htlc(node_d_id, &updates.update_fulfill_htlcs[0]);
@@ -4261,15 +4273,41 @@ fn test_partial_claim_mon_update_compl_actions() {
 	check_added_monitors(&nodes[3], 0);
 	assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
 
+	if reload_a {
+		// After a reload (with the monitor not yet fully updated), the RAA should still be blocked
+		// waiting until the monitor update completes.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister, new_chain_mon, nodes_3_reload);
+		// The final update to channel 4 should be replayed.
+		persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+		check_added_monitors(&nodes[3], 1);
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let second_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, second_payment_claimed);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+	}
+
 	// Now double-check that the preimage is still in the 1<->3 channel and complete the pending
 	// monitor update, allowing node 3 to claim the payment on the 2<->3 channel. This also
 	// unblocks the 1<->3 channel, allowing node 3 to release the two blocked monitor updates and
 	// respond to the final commitment_signed.
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
+	assert!(nodes[3].node.get_and_clear_pending_events().is_empty());
 
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_4_id);
 	let mut ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
-	assert_eq!(ds_msgs.len(), 2);
+	assert_eq!(ds_msgs.len(), 2, "{ds_msgs:?}");
 	check_added_monitors(&nodes[3], 2);
 
 	match remove_first_msg_event_to_node(&node_b_id, &mut ds_msgs) {
@@ -4312,13 +4350,86 @@ fn test_partial_claim_mon_update_compl_actions() {
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 	assert!(get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_2, new_chain_mon_2, nodes_3_reload_2);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let third_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, third_payment_claimed);
+	}
+
 	send_payment(&nodes[1], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed, even if only one of the two monitors still knows about the first payment.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_3, new_chain_mon_3, nodes_3_reload_3);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed events for both payments will
+		// be replayed on restart.
+		// Use this as an opportunity to check the payment_ids are unique.
+		let mut events = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(events.len(), 2);
+		events.retain(|ev| *ev != payment_claimed[0]);
+		assert_eq!(events.len(), 1);
+		if let Event::PaymentClaimed { payment_id: original_payment_id, .. } = &payment_claimed[0] {
+			assert!(original_payment_id.is_some());
+			if let Event::PaymentClaimed { amount_msat, payment_id, .. } = &events[0] {
+				assert!(payment_id.is_some());
+				assert_ne!(original_payment_id, payment_id);
+				assert_eq!(*amount_msat, 100_000);
+			} else {
+				panic!("{events:?}");
+			}
+		} else {
+			panic!("{events:?}");
+		}
+
+		send_payment(&nodes[1], &[&nodes[3]], 100_000);
+	}
+
 	send_payment(&nodes[2], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 }
 
+#[test]
+fn test_partial_claim_mon_update_compl_actions() {
+	do_test_partial_claim_mon_update_compl_actions(true, true);
+	do_test_partial_claim_mon_update_compl_actions(true, false);
+	do_test_partial_claim_mon_update_compl_actions(false, true);
+	do_test_partial_claim_mon_update_compl_actions(false, false);
+}
+
 #[test]
 fn test_claim_to_closed_channel_blocks_forwarded_preimage_removal() {
 	// One of the last features for async persistence we implemented was the correct blocking of

lightning/src/ln/channelmanager.rs

Lines changed: 28 additions & 5 deletions
@@ -7786,19 +7786,42 @@ where
 					// payment claim from a `ChannelMonitor`. In some cases (MPP or
 					// if the HTLC was only recently removed) we make such claims
 					// after an HTLC has been removed from a channel entirely, and
-					// thus the RAA blocker has long since completed.
+					// thus the RAA blocker may have long since completed.
 					//
-					// In any other case, the RAA blocker must still be present and
-					// blocking RAAs.
-					debug_assert!(during_init ||
-						peer_state.actions_blocking_raa_monitor_updates.get(&chan_id).unwrap().contains(&raa_blocker));
+					// However, it's possible that the `ChannelMonitorUpdate` containing
+					// the preimage never completed and is still pending. In that case,
+					// we need to re-add the RAA blocker, which we do here. Handling
+					// the post-update action, below, will remove it again.
+					//
+					// In any other case (i.e. not during startup), the RAA blocker
+					// must still be present and blocking RAAs.
+					let actions = &mut peer_state.actions_blocking_raa_monitor_updates;
+					let actions_list = actions.entry(chan_id).or_insert_with(Vec::new);
+					if !actions_list.contains(&raa_blocker) {
+						debug_assert!(during_init);
+						actions_list.push(raa_blocker);
+					}
 				}
 				let action = if let Some(action) = action_opt {
 					action
 				} else {
 					return;
 				};
 
+				// If there are monitor updates in flight, we may be in the case
+				// described above, replaying a claim on startup which needs an RAA
+				// blocker to remain blocked. Thus, in such a case we simply push the
+				// post-update action to the blocked list and move on.
+				let in_flight_mons = peer_state.in_flight_monitor_updates.get(&chan_id);
+				if in_flight_mons.map(|(_, mons)| !mons.is_empty()).unwrap_or(false) {
+					peer_state
+						.monitor_update_blocked_actions
+						.entry(chan_id)
+						.or_insert_with(Vec::new)
+						.push(action);
+					return;
+				}
+
 				mem::drop(peer_state_opt);
 
 				log_trace!(logger, "Completing monitor update completion action for channel {} as claim was redundant: {:?}",
