From 18ab1b5285076d5d68fbe34687261b3cf0d4c293 Mon Sep 17 00:00:00 2001 From: Daniel Hsu Date: Thu, 30 Oct 2025 11:23:05 +0800 Subject: [PATCH] mctp: Add retry for one-time peer property queries on timeout The function `query_peer_properties()` is called once during peer initialization to query basic information after the EID becomes routable. To improve reliability, this change adds a retry mechanism when the query fails with `-ETIMEDOUT`. Since these queries are one-time initialization steps, a single successful attempt is sufficient, and retrying enhances stability under transient MCTP bus contention or multi-master timing issues. Testing: add stress test for peer initialization under multi-master ``` while true; do echo "Restarting mctpd.service..." systemctl restart mctpd.service # Wait a few seconds to allow service to initialize sleep 20 done ``` After 30 loops, the mctpd.service journal is checked for the expected retry messages to verify robustness under transient MCTP bus contention. ``` root@bmc:~# journalctl -xeu mctpd.service | grep Retrying Oct 29 00:35:21 bmc mctpd[31801]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:00 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:01 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 2 Oct 29 00:45:08 bmc mctpd[32360]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. 
Attempt 1 ``` Signed-off-by: Daniel Hsu --- src/mctpd.c | 60 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/src/mctpd.c b/src/mctpd.c index 36a2372..01517db 100644 --- a/src/mctpd.c +++ b/src/mctpd.c @@ -2835,23 +2835,57 @@ static int method_learn_endpoint(sd_bus_message *call, void *data, // and routable. static int query_peer_properties(struct peer *peer) { + const unsigned int max_retries = 4; int rc; - rc = query_get_peer_msgtypes(peer); - if (rc < 0) { - // Warn here, it's a mandatory command code. - // It might be too noisy if some devices don't implement it. - warnx("Error getting endpoint types for %s. Ignoring error %d %s", - peer_tostr(peer), rc, strerror(-rc)); - rc = 0; + for (unsigned int i = 0; i < max_retries; i++) { + rc = query_get_peer_msgtypes(peer); + + // Success + if (rc == 0) + break; + + // On timeout, retry + if (rc == -ETIMEDOUT) { + if (peer->ctx->verbose) + warnx("Retrying to get endpoint types for %s. Attempt %u", + peer_tostr(peer), i + 1); + continue; + } + + // On other errors, warn and ignore + if (rc < 0) { + if (peer->ctx->verbose) + warnx("Error getting endpoint types for %s. Ignoring error %d %s", + peer_tostr(peer), -rc, strerror(-rc)); + rc = 0; + break; + } } - rc = query_get_peer_uuid(peer); - if (rc < 0) { - if (peer->ctx->verbose) - warnx("Error getting UUID for %s. Ignoring error %d %s", - peer_tostr(peer), rc, strerror(-rc)); - rc = 0; + for (unsigned int i = 0; i < max_retries; i++) { + rc = query_get_peer_uuid(peer); + + // Success + if (rc == 0) + break; + + // On timeout, retry + if (rc == -ETIMEDOUT) { + if (peer->ctx->verbose) + warnx("Retrying to get peer UUID for %s. Attempt %u", + peer_tostr(peer), i + 1); + continue; + } + + // On other errors, warn and ignore + if (rc < 0) { + if (peer->ctx->verbose) + warnx("Error getting UUID for %s. 
Ignoring error %d %s", + peer_tostr(peer), -rc, strerror(-rc)); + rc = 0; + break; + } } // TODO: emit property changed? Though currently they are all const.