Skip to content

Commit 7aa1add

Browse files
authored
net_mana: downgrading invalid OOB traces from ERROR to WARN (#2163)
When `net_mana::tx_poll` receives an CQE_TX_INVALID_OOB it is currently logged as an Error. These events occur when the metadata does not match how HW parsed the packet. They can be a result of a Guest Bug, HW Bug, or OpenHCL Bug. They are common, possibly due to Encap, and only effect the specific packet. Unfortunately, their frequency is causing confusion when triaging other issues as it is not clear that `ty: 35` aka `CQE_TX_INVALID_OOB ` is safe to ignore. Downgrading the tracing to Warning to help folks better understand invalid OOB issues are probably not the cause of whatever they're seeing. * Adding to tracelimit macros to create event_ratelimited where callers may supply the Level. * Modifying net_mana to trace CQE_TX_GDMA_ERR at ERROR and CQE_TX_INVALID_OOB at WARN * Adding two new tests, one for valid packets and one for an invalid LSO segment count (1) * Refactoring test helpers in net_mana to support the new tests * Modifying bnic to enforce LSO segment counts * Adding BackendQueueStats to net_backent so that new tests in net_mana can check Counters: tx_packets, tx_errors, rx_packets, rx_errors
1 parent 478bf67 commit 7aa1add

File tree

7 files changed

+430
-65
lines changed

7 files changed

+430
-65
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2398,6 +2398,7 @@ dependencies = [
23982398
"slab",
23992399
"task_control",
24002400
"thiserror 2.0.16",
2401+
"tracelimit",
24012402
"tracing",
24022403
"vm_resource",
24032404
"vmcore",
@@ -4415,6 +4416,7 @@ dependencies = [
44154416
"futures-concurrency",
44164417
"guestmem",
44174418
"inspect",
4419+
"inspect_counters",
44184420
"memory_range",
44194421
"mesh",
44204422
"net_backend_resources",

support/tracelimit/src/lib.rs

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,3 +295,156 @@ macro_rules! info_ratelimited {
295295
}
296296
};
297297
}
298+
299+
/// As [`tracing::event!`], but rate limited.
300+
///
301+
/// Can be called with optional parameters to customize rate limiting:
302+
/// - `period: <ms>` - rate limiting period in milliseconds
303+
/// - `limit: <count>` - maximum events per period
304+
///
305+
/// `level` is required and must be a compile-time literal identifier (ERROR, WARN, INFO, DEBUG, TRACE).
306+
#[macro_export]
307+
macro_rules! event_ratelimited_static {
308+
// With both period and limit and level
309+
(level: $level:ident, period: $period:expr, limit: $limit:expr, $($rest:tt)*) => {{
310+
static RATE_LIMITER: $crate::RateLimiter = $crate::RateLimiter::new_default();
311+
if let Ok(missed_events) = RATE_LIMITER.event_with_config(Some($period), Some($limit)) {
312+
$crate::tracing::event!(
313+
$crate::tracing::Level::$level,
314+
dropped_ratelimited = missed_events,
315+
$($rest)*
316+
);
317+
}
318+
}};
319+
// With only period and level
320+
(level: $level:ident, period: $period:expr, $($rest:tt)*) => {{
321+
static RATE_LIMITER: $crate::RateLimiter = $crate::RateLimiter::new_default();
322+
if let Ok(missed_events) = RATE_LIMITER.event_with_config(Some($period), None) {
323+
$crate::tracing::event!(
324+
$crate::tracing::Level::$level,
325+
dropped_ratelimited = missed_events,
326+
$($rest)*
327+
);
328+
}
329+
}};
330+
// With only limit and level
331+
(level: $level:ident, limit: $limit:expr, $($rest:tt)*) => {{
332+
static RATE_LIMITER: $crate::RateLimiter = $crate::RateLimiter::new_default();
333+
if let Ok(missed_events) = RATE_LIMITER.event_with_config(None, Some($limit)) {
334+
$crate::tracing::event!(
335+
$crate::tracing::Level::$level,
336+
dropped_ratelimited = missed_events,
337+
$($rest)*
338+
);
339+
}
340+
}};
341+
// Default case (only level provided)
342+
(level: $level:ident, $($rest:tt)*) => {{
343+
static RATE_LIMITER: $crate::RateLimiter = $crate::RateLimiter::new_default();
344+
if let Ok(missed_events) = RATE_LIMITER.event() {
345+
$crate::tracing::event!(
346+
$crate::tracing::Level::$level,
347+
dropped_ratelimited = missed_events,
348+
$($rest)*
349+
);
350+
}
351+
}};
352+
}
353+
354+
/// Helper macro for dynamically dispatching to [`event_ratelimited_static!`] based on a runtime level.
355+
///
356+
/// This macro accepts a runtime `tracing::Level` expression and dispatches to the appropriate
357+
/// compile-time level identifier. Allows the log level to be determined at runtime.
358+
///
359+
/// Examples:
360+
/// ```
361+
/// use tracing::Level;
362+
/// use tracelimit::event_ratelimited;
363+
/// event_ratelimited!(Level::ERROR, period: 1000, limit: 5, "custome period and limit");
364+
/// event_ratelimited!(Level::WARN, period: 10000, "custom period only");
365+
/// event_ratelimited!(Level::INFO, limit: 50, "custom limit only");
366+
/// event_ratelimited!(Level::TRACE, "simple message");
367+
/// ```
368+
#[macro_export]
369+
macro_rules! event_ratelimited {
370+
// With period and limit and level
371+
($level:expr, period: $period:expr, limit: $limit:expr, $($rest:tt)*) => {
372+
match $level {
373+
$crate::tracing::Level::ERROR => {
374+
$crate::event_ratelimited_static!(level: ERROR, period: $period, limit: $limit, $($rest)*);
375+
}
376+
$crate::tracing::Level::WARN => {
377+
$crate::event_ratelimited_static!(level: WARN, period: $period, limit: $limit, $($rest)*);
378+
}
379+
$crate::tracing::Level::INFO => {
380+
$crate::event_ratelimited_static!(level: INFO, period: $period, limit: $limit, $($rest)*);
381+
}
382+
$crate::tracing::Level::DEBUG => {
383+
$crate::event_ratelimited_static!(level: DEBUG, period: $period, limit: $limit, $($rest)*);
384+
}
385+
$crate::tracing::Level::TRACE => {
386+
$crate::event_ratelimited_static!(level: TRACE, period: $period, limit: $limit, $($rest)*);
387+
}
388+
}
389+
};
390+
// With period and level
391+
($level:expr, period: $period:expr, $($rest:tt)*) => {
392+
match $level {
393+
$crate::tracing::Level::ERROR => {
394+
$crate::event_ratelimited_static!(level: ERROR, period: $period, $($rest)*);
395+
}
396+
$crate::tracing::Level::WARN => {
397+
$crate::event_ratelimited_static!(level: WARN, period: $period, $($rest)*);
398+
}
399+
$crate::tracing::Level::INFO => {
400+
$crate::event_ratelimited_static!(level: INFO, period: $period, $($rest)*);
401+
}
402+
$crate::tracing::Level::DEBUG => {
403+
$crate::event_ratelimited_static!(level: DEBUG, period: $period, $($rest)*);
404+
}
405+
$crate::tracing::Level::TRACE => {
406+
$crate::event_ratelimited_static!(level: TRACE, period: $period, $($rest)*);
407+
}
408+
}
409+
};
410+
// With limit and level
411+
($level:expr, limit: $limit:expr, $($rest:tt)*) => {
412+
match $level {
413+
$crate::tracing::Level::ERROR => {
414+
$crate::event_ratelimited_static!(level: ERROR, limit: $limit, $($rest)*);
415+
}
416+
$crate::tracing::Level::WARN => {
417+
$crate::event_ratelimited_static!(level: WARN, limit: $limit, $($rest)*);
418+
}
419+
$crate::tracing::Level::INFO => {
420+
$crate::event_ratelimited_static!(level: INFO, limit: $limit, $($rest)*);
421+
}
422+
$crate::tracing::Level::DEBUG => {
423+
$crate::event_ratelimited_static!(level: DEBUG, limit: $limit, $($rest)*);
424+
}
425+
$crate::tracing::Level::TRACE => {
426+
$crate::event_ratelimited_static!(level: TRACE, limit: $limit, $($rest)*);
427+
}
428+
}
429+
};
430+
// Default case (only level provided)
431+
($level:expr, $($rest:tt)*) => {
432+
match $level {
433+
$crate::tracing::Level::ERROR => {
434+
$crate::event_ratelimited_static!(level: ERROR, $($rest)*);
435+
}
436+
$crate::tracing::Level::WARN => {
437+
$crate::event_ratelimited_static!(level: WARN, $($rest)*);
438+
}
439+
$crate::tracing::Level::INFO => {
440+
$crate::event_ratelimited_static!(level: INFO, $($rest)*);
441+
}
442+
$crate::tracing::Level::DEBUG => {
443+
$crate::event_ratelimited_static!(level: DEBUG, $($rest)*);
444+
}
445+
$crate::tracing::Level::TRACE => {
446+
$crate::event_ratelimited_static!(level: TRACE, $($rest)*);
447+
}
448+
}
449+
};
450+
}

vm/devices/net/gdma/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ vm_resource.workspace = true
2323

2424
inspect.workspace = true
2525
task_control.workspace = true
26+
tracelimit.workspace = true
2627

2728
anyhow.workspace = true
2829
async-trait.workspace = true

vm/devices/net/gdma/src/bnic.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Microsoft Corporation.
22
// Licensed under the MIT License.
33

4+
use self::bnic_defs::CQE_TX_GDMA_ERR;
45
use self::bnic_defs::CQE_TX_OKAY;
56
use self::bnic_defs::MANA_CQE_COMPLETION;
67
use self::bnic_defs::ManaCommandCode;
@@ -560,6 +561,30 @@ impl TxRxTask {
560561
meta.offload_tcp_segmentation = true;
561562
}
562563

564+
// With LSO, the first SGE is the header and the rest are the payload.
565+
// For LSO, the requirements by the GDMA hardware are:
566+
// - The first SGE must be the header and must be <= 256 bytes.
567+
// - There should be at least two SGEs.
568+
// Possible test improvement: Disable the Queue to mimick the hardware behavior.
569+
if meta.offload_tcp_segmentation {
570+
if sqe.sgl().len() < 2 {
571+
tracelimit::error_ratelimited!(
572+
sgl_count = sqe.sgl().len(),
573+
"LSO enabled, but only one SGE"
574+
);
575+
self.post_tx_completion_error();
576+
return Ok(());
577+
}
578+
if sge0.size > 256 {
579+
tracelimit::error_ratelimited!(
580+
sge0_size = sge0.size,
581+
"LSO enabled and SGE[0] size > 256 bytes"
582+
);
583+
self.post_tx_completion_error();
584+
return Ok(());
585+
}
586+
}
587+
563588
let tx_segments = &mut self.tx_segment_buffer;
564589
tx_segments.clear();
565590
tx_segments.push(TxSegment {
@@ -582,6 +607,20 @@ impl TxRxTask {
582607
Ok(())
583608
}
584609

610+
// Possible test improvement: provide proper OOB data for the GDMA error.
611+
fn post_tx_completion_error(&mut self) {
612+
let tx_oob = ManaTxCompOob {
613+
cqe_hdr: ManaCqeHeader::new()
614+
.with_client_type(MANA_CQE_COMPLETION)
615+
.with_cqe_type(CQE_TX_GDMA_ERR),
616+
tx_data_offset: 0,
617+
offsets: ManaTxCompOobOffsets::new(),
618+
reserved: [0; 12],
619+
};
620+
self.queues
621+
.post_cq(self.sq_cq_id, tx_oob.as_bytes(), self.sq_id, true);
622+
}
623+
585624
fn post_tx_completion(&mut self) {
586625
let tx_oob = ManaTxCompOob {
587626
cqe_hdr: ManaCqeHeader::new()

vm/devices/net/net_backend/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ futures-concurrency.workspace = true
2424
parking_lot.workspace = true
2525
tracing.workspace = true
2626
thiserror.workspace = true
27+
inspect_counters.workspace = true
2728

2829
[lints]
2930
workspace = true

vm/devices/net/net_backend/src/lib.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use futures_concurrency::future::Race;
2121
use guestmem::GuestMemory;
2222
use guestmem::GuestMemoryError;
2323
use inspect::InspectMut;
24+
use inspect_counters::Counter;
2425
use mesh::rpc::Rpc;
2526
use mesh::rpc::RpcSend;
2627
use null::NullEndpoint;
@@ -144,6 +145,12 @@ pub enum TxError {
144145
#[error("unrecoverable error. {0}")]
145146
Fatal(#[source] anyhow::Error),
146147
}
148+
pub trait BackendQueueStats {
149+
fn rx_errors(&self) -> Counter;
150+
fn tx_errors(&self) -> Counter;
151+
fn rx_packets(&self) -> Counter;
152+
fn tx_packets(&self) -> Counter;
153+
}
147154

148155
/// A trait for sending and receiving network packets.
149156
#[async_trait]
@@ -172,6 +179,11 @@ pub trait Queue: Send + InspectMut {
172179

173180
/// Get the buffer access.
174181
fn buffer_access(&mut self) -> Option<&mut dyn BufferAccess>;
182+
183+
/// Get queue statistics
184+
fn queue_stats(&self) -> Option<&dyn BackendQueueStats> {
185+
None // Default implementation - not all queues implement stats
186+
}
175187
}
176188

177189
/// A trait for providing access to guest memory buffers.

0 commit comments

Comments
 (0)