diff --git a/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go b/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go index 9eb3a3304..a0f77a588 100644 --- a/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go +++ b/4.validation_and_observability/3.efa-node-exporter/amazon_efa_linux.go @@ -48,41 +48,41 @@ func NewAmazonEfaCollector(logger *slog.Logger) (Collector, error) { // Detailed description for all metrics. descriptions := map[string]string{ - "alloc_pd_err": "Number of allocations PD errors", - "alloc_ucontext_err": "Number of allocations UContext errors", - "cmds_err": "Number of commands errors", - "completed_cmds": "Number of completed commands", - "create_ah_err": "Number of create AH errors", - "create_cq_err": "Number of create CQ errors", - "create_qp_err": "Number of create qp errors", - "impaired_remote_conn_events": "Number of EFA SRD connections entered an impaired state, resulting in a reduced throughput rate limit.", - "keep_alive_rcvd": "Number of keep-alive packets received", - "lifespan": "Lifespan of the port", - "mmap_err": "Number of mmap errors", - "no_completion_cmds": "Number of commands with no completion", - "rdma_read_bytes": "Number of bytes read with RDMA", - "rdma_read_resp_bytes": "Number of read reponses bytes with RDMA", - "rdma_read_wr_err": "Number of read write errors with RDMA", - "rdma_read_wrs": "Number of read rs with RDMA", - "rdma_write_bytes": "Number of bytes wrote with RDMA", - "rdma_write_recv_bytes": "Number of bytes wrote and received with RDMA", - "rdma_write_wr_err": "Number of bytes wrote wr with error RDMA", - "rdma_write_wrs": "Number of bytes wrote wrs RDMA", - "recv_bytes": "Number of bytes recv bytes", - "recv_wrs": "Number of bytes recv wrs", - "reg_mr_err": "Number of reg_mr errors", - "retrans_bytes": "Number of efa_srd bytes retransmitted", - "retrans_pkts": "Number of efa_srd packets retransmitted", - "retrans_timeout_events": "Number of times SRD traffic reached timeout and required network path change", - "rx_bytes": "Number of bytes received", - "rx_drops": "Number of packets droped", - "rx_pkts": "Number of packets received", - "send_bytes": "Number of bytes send", - "send_wrs": "Number of wrs send", - "submitted_cmds": "Number of submitted commands", - "tx_bytes": "Number of bytes transmitted", - "tx_pkts": "Number of packets transmitted", - "unresponsive_remote_events": "Number of times SRD connection remote was unresponsive", + "alloc_pd_err": "Number of allocations PD errors", + "alloc_ucontext_err": "Number of allocations UContext errors", + "cmds_err": "Number of commands errors", + "completed_cmds": "Number of completed commands", + "create_ah_err": "Number of create AH errors", + "create_cq_err": "Number of create CQ errors", + "create_qp_err": "Number of create qp errors", + "impaired_remote_conn_events": "Number of EFA SRD connections entered an impaired state, resulting in a reduced throughput rate limit.", + "keep_alive_rcvd": "Number of keep-alive packets received", + "lifespan": "Lifespan of the port", + "mmap_err": "Number of mmap errors", + "no_completion_cmds": "Number of commands with no completion", + "rdma_read_bytes": "Number of bytes read with RDMA", + "rdma_read_resp_bytes": "Number of read responses bytes with RDMA", + "rdma_read_wr_err": "Number of read write errors with RDMA", + "rdma_read_wrs": "Number of read rs with RDMA", + "rdma_write_bytes": "Number of bytes wrote with RDMA", + "rdma_write_recv_bytes": "Number of bytes wrote and received with RDMA", + "rdma_write_wr_err": "Number of bytes wrote wr with error RDMA", + "rdma_write_wrs": "Number of bytes wrote wrs RDMA", + "recv_bytes": "Number of bytes recv bytes", + "recv_wrs": "Number of bytes recv wrs", + "reg_mr_err": "Number of reg_mr errors", + "retrans_bytes": "Number of efa_srd bytes retransmitted", + "retrans_pkts": "Number of efa_srd packets retransmitted", + "retrans_timeout_events": "Number of times SRD traffic reached timeout and required network path change", + "rx_bytes": "Number of bytes received", + "rx_drops": "Number of packets dropped", + "rx_pkts": "Number of packets received", + "send_bytes": "Number of bytes send", + "send_wrs": "Number of wrs send", + "submitted_cmds": "Number of submitted commands", + "tx_bytes": "Number of bytes transmitted", + "tx_pkts": "Number of packets transmitted", + "unresponsive_remote_events": "Number of times SRD connection remote was unresponsive", } i.metricDescs = make(map[string]*prometheus.Desc) diff --git a/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go b/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go index a9aa3a3b0..811bf164e 100644 --- a/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go +++ b/4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go @@ -257,64 +257,88 @@ func parseAmazonEfaCounters(portPath string) (*AmazonEfaCounters, error) { //vp := util.NewValueParser(value) switch f.Name() { - case "impaired_remote_conn_events": - counters.ImpairedRemoteConnEvents, err = parseUInt64(value) - case "lifespan": - counters.Lifespan, err = parseUInt64(value) - case "rdma_read_bytes": - counters.RdmaReadBytes, err = parseUInt64(value) - case "rdma_read_resp_bytes": - counters.RdmaReadRespBytes, err = parseUInt64(value) - case "rdma_read_wr_err": - counters.RdmaReadWrErr, err = parseUInt64(value) - case "rdma_read_wrs": - counters.RdmaReadWrs, err = parseUInt64(value) - case "rdma_write_bytes": - counters.RdmaWriteBytes, err = parseUInt64(value) - case "rdma_write_recv_bytes": - counters.RdmaWriteRecvBytes, err = parseUInt64(value) - case "rdma_write_wr_err": - counters.RdmaWriteWrErr, err = parseUInt64(value) - case "rdma_write_wrs": - counters.RdmaWriteWrs, err = parseUInt64(value) - case "recv_bytes": - counters.RecvBytes, err = parseUInt64(value) - case "recv_wrs": - counters.RecvWrs, err = parseUInt64(value) - case "retrans_bytes": - counters.RetransBytes, err = parseUInt64(value) - case "retrans_pkts": - counters.RetransPkts, err = parseUInt64(value) - case "retrans_timeout_events": - counters.RetransTimeoutEvents, err = parseUInt64(value) - case "rx_bytes": - counters.RxBytes, err = parseUInt64(value) - case "rx_drops": - counters.RxDrops, err = parseUInt64(value) - case "rx_pkts": - counters.RxPkts, err = parseUInt64(value) - case "send_bytes": - counters.SendBytes, err = parseUInt64(value) - case "send_wrs": - counters.SendWrs, err = parseUInt64(value) - case "tx_bytes": - counters.TxBytes, err = parseUInt64(value) - case "tx_pkts": - counters.TxPkts, err = parseUInt64(value) - case "unresponsive_remote_events": - counters.UnresponsiveRemoteEvents, err = parseUInt64(value) - - if err != nil { - // Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966 - // when counters are `N/A (not available)`. - // This was already patched and submitted, see - // https://www.spinics.net/lists/linux-rdma/msg68596.html - // Remove this as soon as the fix lands in the enterprise distros. - if strings.Contains(value, "N/A (no PMA)") { - continue - } - return nil, err + case "alloc_pd_err": + counters.AllocPdErr, err = parseUInt64(value) + case "alloc_ucontext_err": + counters.AllocUcontextErr, err = parseUInt64(value) + case "cmds_err": + counters.CmdsErr, err = parseUInt64(value) + case "completed_cmds": + counters.CompletedCmds, err = parseUInt64(value) + case "create_ah_err": + counters.CreateAhErr, err = parseUInt64(value) + case "create_cq_err": + counters.CreateCqErr, err = parseUInt64(value) + case "create_qp_err": + counters.CreateQpErr, err = parseUInt64(value) + case "impaired_remote_conn_events": + counters.ImpairedRemoteConnEvents, err = parseUInt64(value) + case "keep_alive_rcvd": + counters.KeepAliveRcvd, err = parseUInt64(value) + case "lifespan": + counters.Lifespan, err = parseUInt64(value) + case "mmap_err": + counters.MmapErr, err = parseUInt64(value) + case "no_completion_cmds": + counters.NoCompletionCmds, err = parseUInt64(value) + case "rdma_read_bytes": + counters.RdmaReadBytes, err = parseUInt64(value) + case "rdma_read_resp_bytes": + counters.RdmaReadRespBytes, err = parseUInt64(value) + case "rdma_read_wr_err": + counters.RdmaReadWrErr, err = parseUInt64(value) + case "rdma_read_wrs": + counters.RdmaReadWrs, err = parseUInt64(value) + case "rdma_write_bytes": + counters.RdmaWriteBytes, err = parseUInt64(value) + case "rdma_write_recv_bytes": + counters.RdmaWriteRecvBytes, err = parseUInt64(value) + case "rdma_write_wr_err": + counters.RdmaWriteWrErr, err = parseUInt64(value) + case "rdma_write_wrs": + counters.RdmaWriteWrs, err = parseUInt64(value) + case "recv_bytes": + counters.RecvBytes, err = parseUInt64(value) + case "recv_wrs": + counters.RecvWrs, err = parseUInt64(value) + case "reg_mr_err": + counters.RegMrErr, err = parseUInt64(value) + case "retrans_bytes": + counters.RetransBytes, err = parseUInt64(value) + case "retrans_pkts": + counters.RetransPkts, err = parseUInt64(value) + case "retrans_timeout_events": + counters.RetransTimeoutEvents, err = parseUInt64(value) + case "rx_bytes": + counters.RxBytes, err = parseUInt64(value) + case "rx_drops": + counters.RxDrops, err = parseUInt64(value) + case "rx_pkts": + counters.RxPkts, err = parseUInt64(value) + case "send_bytes": + counters.SendBytes, err = parseUInt64(value) + case "send_wrs": + counters.SendWrs, err = parseUInt64(value) + case "submitted_cmds": + counters.SubmittedCmds, err = parseUInt64(value) + case "tx_bytes": + counters.TxBytes, err = parseUInt64(value) + case "tx_pkts": + counters.TxPkts, err = parseUInt64(value) + case "unresponsive_remote_events": + counters.UnresponsiveRemoteEvents, err = parseUInt64(value) + } + + if err != nil { + // Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966 + // when counters are `N/A (not available)`. + // This was already patched and submitted, see + // https://www.spinics.net/lists/linux-rdma/msg68596.html + // Remove this as soon as the fix lands in the enterprise distros. + if strings.Contains(value, "N/A (no PMA)") { + continue } + return nil, fmt.Errorf("failed to parse counter %s with value %q: %w", f.Name(), value, err) } }