diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c
new file mode 100644
index 00000000..17c69d2c
--- /dev/null
+++ b/examples/netstacklat.bpf.c
@@ -0,0 +1,490 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <vmlinux.h>
+//#include
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "netstacklat.h"
+#include "bits.bpf.h"
+
+char LICENSE[] SEC("license") = "GPL";
+
+volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S);
+volatile const struct netstacklat_bpf_config user_config = {
+        .network_ns = 0,
+        .filter_pid = false,
+        .filter_ifindex = true,
+        .filter_cgroup = true,
+        .filter_nonempty_sockqueue = true,
+#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1
+        .groupby_ifindex = true,
+        .groupby_cgroup = true,
+};
+
+/* This provides an easy way to disable some hooks at compile time */
+//#define CONFIG_HOOKS_EARLY_RCV 1
+#undef CONFIG_HOOKS_EARLY_RCV
+//#define CONFIG_HOOKS_ENQUEUE 1
+#undef CONFIG_HOOKS_ENQUEUE
+#define CONFIG_HOOKS_DEQUEUE 1
+
+/* Allows compile-time disabling of the ifindex filter map, as the YAML config cannot control it */
+//#define CONFIG_IFINDEX_FILTER_MAP 1
+#undef CONFIG_IFINDEX_FILTER_MAP
+
+/* Allows compile-time disabling of the PID filter map, as it is very large */
+//#define CONFIG_PID_FILTER_MAP 1
+#undef CONFIG_PID_FILTER_MAP
+
+/*
+ * Alternative definition of sk_buff to handle renaming of the field
+ * mono_delivery_time to tstamp_type. See
+ * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
+ */
+struct sk_buff___old {
+        union {
+                ktime_t tstamp;
+                u64 skb_mstamp_ns;
+        };
+        __u8 mono_delivery_time: 1;
+} __attribute__((preserve_access_index));
+
+/* NOTICE: max_entries needs to be adjusted based on the maximum
+ * number of cgroups and ifindexes (that we "groupby" when collecting)
+ * and the "enabled" hooks (as we want to disable some)
+ */
+#define N_CGROUPS 2 /* depends on cgroup_id_map matches in YAML config */
+#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it the same until we disable some */
+#define N_IFACES 64 /* In production, only interested in ext0 and vlan100@ext0 */
+struct {
+        __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+        __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64);
+        __type(key, struct hist_key);
+        __type(value, u64);
+} netstack_latency_seconds SEC(".maps");
+
+#ifdef CONFIG_PID_FILTER_MAP
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, PID_MAX_LIMIT);
+        __type(key, u32);
+        __type(value, u64);
+} netstack_pidfilter SEC(".maps");
+#endif
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, IFINDEX_MAX);
+        __type(key, u32);
+        __type(value, u64);
+} netstack_ifindexfilter SEC(".maps");
+#endif
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, MAX_TRACKED_CGROUPS);
+        __type(key, u64);
+        __type(value, u64);
+} netstack_cgroupfilter SEC(".maps");
+
+static ktime_t time_since(ktime_t tstamp)
+{
+        ktime_t now;
+
+        if (tstamp <= 0)
+                return -1;
+
+        now = bpf_ktime_get_tai_ns() - TAI_OFFSET;
+        if (tstamp > now)
+                return -1;
+
+        return now - tstamp;
+}
+
+/* Determines whether the ebpf_exporter macro or the local C implementation is used */
+#define CONFIG_MAP_MACROS 1
+#ifdef CONFIG_MAP_MACROS
+#include "maps.bpf.h"
+#define _record_latency_since(tstamp, key) \
+        ktime_t latency = time_since(tstamp); \
+        if (latency >= 0) \
+                increment_exp2_histogram_nosync(&netstack_latency_seconds, \
+                                                key, latency, \
+                                                HIST_MAX_LATENCY_SLOT);
+#else /* !CONFIG_MAP_MACROS */
+#define _record_latency_since(tstamp, key) \
+        record_latency_since(tstamp, &key)
+
+static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
+{
+        u64 zero = 0;
+        u64 *val;
+
+        val = bpf_map_lookup_elem(map, key);
+        if (val)
+                return val;
+
+        // Key not in map - try to insert it and look it up again
+        bpf_map_update_elem(map, key, &zero, BPF_NOEXIST);
+        return bpf_map_lookup_elem(map, key);
+}
+
+static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket)
+{
+        u32 bucket = log2l(value);
+
+        // Right-inclusive histogram, so "round up" the log value
+        if (bucket > 0 && 1ULL << bucket < value)
+                bucket++;
+
+        if (bucket > max_bucket)
+                bucket = max_bucket;
+
+        return bucket;
+}
+
+/*
+ * Same call signature as the increment_exp2_histogram_nosync macro from
+ * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h
+ * but provided as a function.
+ *
+ * Unlike the macro, only works with keys of type struct hist_key. The hist_key
+ * struct must be provided by value (rather than as a pointer) to keep the same
+ * call signature as the ebpf-exporter macro, although this will get inefficient
+ * if struct hist_key grows large.
+ */
+static void increment_exp2_histogram_nosync(void *map, struct hist_key key,
+                                            u64 value, u32 max_bucket)
+{
+        u64 *bucket_count;
+
+        // Increment histogram
+        key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket);
+        bucket_count = lookup_or_zeroinit_histentry(map, &key);
+        if (bucket_count)
+                (*bucket_count)++;
+
+        // Increment sum at end of histogram
+        if (value == 0)
+                return;
+
+        key.bucket = max_bucket + 1;
+        bucket_count = lookup_or_zeroinit_histentry(map, &key);
+        if (bucket_count)
+                *bucket_count += value;
+}
+
+static void record_latency(ktime_t latency, const struct hist_key *key)
+{
+        increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency,
+                                        HIST_MAX_LATENCY_SLOT);
+}
+static void record_latency_since(ktime_t tstamp, const struct hist_key *key)
+{
+        ktime_t latency = time_since(tstamp);
+        if (latency >= 0)
+                record_latency(latency, key);
+}
+#endif /* !CONFIG_MAP_MACROS */
+
+static bool filter_ifindex(u32 ifindex)
+{
+        if (!user_config.filter_ifindex)
+                // No ifindex filter - all ok
+                return true;
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+        u64 *ifindex_ok;
+
+        ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex);
+        if (!ifindex_ok)
+                return false;
+
+        return *ifindex_ok > 0;
+#else
+        /* Hack for production:
+         * - We want to exclude 'lo', which has ifindex == 1.
+         * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5).
+         */
+        if (ifindex > 1 && ifindex < 6)
+                return true;
+
+        return false;
+#endif
+}
+
+static bool filter_network_ns(u32 ns)
+{
+        if (user_config.network_ns == 0)
+                return true;
+
+        return ns == user_config.network_ns;
+}
+
+static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk)
+{
+        /*
+         * Favor reading from sk due to less indirection (fewer probe reads),
+         * and because skb->dev is not always set.
+         */
+        if (sk)
+                return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum);
+        else if (skb)
+                return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum);
+        return 0;
+}
+
+#if defined(CONFIG_HOOKS_EARLY_RCV) || defined(CONFIG_HOOKS_ENQUEUE)
+static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook)
+{
+        struct hist_key key = { .hook = hook };
+        u32 ifindex;
+
+        if (bpf_core_field_exists(skb->tstamp_type)) {
+                /*
+                 * For kernels >= v6.11, the tstamp_type being non-zero
+                 * (i.e. not SKB_CLOCK_REALTIME) implies that skb->tstamp holds
+                 * a preserved TX timestamp rather than an RX timestamp. See
+                 * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/
+                 */
+                if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0)
+                        return;
+
+        } else {
+                /*
+                 * For kernels < v6.11, the field was called mono_delivery_time
+                 * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/
+                 * Kernels < v5.18 do not have the mono_delivery_time field either,
+                 * but we do not support those anyway (as they lack the
+                 * bpf_ktime_get_tai_ns helper).
+                 */
+                struct sk_buff___old *skb_old = (void *)skb;
+                if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0)
+                        return;
+        }
+
+        ifindex = skb->skb_iif;
+        if (!filter_ifindex(ifindex))
+                return;
+
+        if (!filter_network_ns(get_network_ns(skb, sk)))
+                return;
+
+        if (user_config.groupby_ifindex)
+                key.ifindex = ifindex;
+
+        _record_latency_since(skb->tstamp, key);
+}
+#endif
+
+#ifdef CONFIG_PID_FILTER_MAP
+static bool filter_pid(u32 pid)
+{
+        u64 *pid_ok;
+
+        if (!user_config.filter_pid)
+                // No PID filter - all PIDs ok
+                return true;
+
+        pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid);
+        if (!pid_ok)
+                return false;
+
+        return *pid_ok > 0;
+}
+#endif /* CONFIG_PID_FILTER_MAP */
+
+static bool filter_cgroup(u64 cgroup_id)
+{
+        if (!user_config.filter_cgroup)
+                // No cgroup filter - all cgroups ok
+                return true;
+
+        return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL;
+}
+
+static bool filter_current_task(u64 cgroup)
+{
+        bool ok = true;
+
+#ifdef CONFIG_PID_FILTER_MAP
+        __u32 tgid;
+
+        if (user_config.filter_pid) {
+                tgid = bpf_get_current_pid_tgid() >> 32;
+                ok = ok && filter_pid(tgid);
+        }
+#endif
+        if (user_config.filter_cgroup)
+                ok = ok && filter_cgroup(cgroup);
+
+        return ok;
+}
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/**
+ * skb_queue_empty - check if a queue is empty
+ * @list: queue head
+ *
+ * Returns true if the queue is empty, false otherwise.
+ *
+ * Copied from /include/linux/skbuff.h
+ */
+static inline int skb_queue_empty(const struct sk_buff_head *list)
+{
+        return READ_ONCE(list->next) == (const struct sk_buff *)list;
+}
+
+static bool filter_nonempty_sockqueue(struct sock *sk)
+{
+#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE
+        if (!user_config.filter_nonempty_sockqueue)
+                return true;
+#endif
+
+        return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+/* To lower runtime overhead, skip recording timestamps for sockets with very
+ * few packets. Use sk_buff_head->qlen to check whether the queue has more
+ * than e.g. 2 elements.
+ */
+static inline __u32 sk_queue_len(const struct sk_buff_head *list_)
+{
+        return READ_ONCE(list_->qlen);
+}
+
+static bool filter_queue_len(struct sock *sk, const __u32 above_len)
+{
+        if (sk_queue_len(&sk->sk_receive_queue) > above_len)
+                return true;
+        return false;
+}
+
+static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
+                                  ktime_t tstamp, enum netstacklat_hook hook)
+{
+        struct hist_key key = { .hook = hook };
+        u64 cgroup = 0;
+        u32 ifindex;
+
+        if (user_config.filter_cgroup || user_config.groupby_cgroup)
+                cgroup = bpf_get_current_cgroup_id();
+
+        if (!filter_current_task(cgroup))
+                return;
+
+        ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
+        if (!filter_ifindex(ifindex))
+                return;
+
+        if (!filter_network_ns(get_network_ns(skb, sk)))
+                return;
+
+        if (user_config.groupby_ifindex)
+                key.ifindex = ifindex;
+        if (user_config.groupby_cgroup)
+                key.cgroup = cgroup;
+
+        _record_latency_since(tstamp, key);
+}
+
+#ifdef CONFIG_HOOKS_EARLY_RCV
+SEC("fentry/ip_rcv_core")
+int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block,
+             void *tp, void *res, bool compat_mode)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+        return 0;
+}
+
+SEC("fentry/ip6_rcv_core")
+int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block,
+             void *tp, void *res, bool compat_mode)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+        return 0;
+}
+
+SEC("fentry/tcp_v4_rcv")
+int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+        return 0;
+}
+
+SEC("fentry/tcp_v6_rcv")
+int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+        return 0;
+}
+
+SEC("fentry/udp_rcv")
+int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+        return 0;
+}
+
+SEC("fentry/udpv6_rcv")
+int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_EARLY_RCV */
+
+#ifdef CONFIG_HOOKS_ENQUEUE
+SEC("fexit/tcp_queue_rcv")
+int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb)
+{
+        record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED);
+        return 0;
+}
+
+SEC("fexit/__udp_enqueue_schedule_skb")
+int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk,
+             struct sk_buff *skb, int retval)
+{
+        if (retval == 0)
+                record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_ENQUEUE */
+
+#ifdef CONFIG_HOOKS_DEQUEUE
+SEC("fentry/tcp_recv_timestamp")
+int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
+             struct scm_timestamping_internal *tss)
+{
+        if (!filter_nonempty_sockqueue(sk))
+                return 0;
+
+        if (!filter_queue_len(sk, 3))
+                return 0;
+
+        struct timespec64 *ts = &tss->ts[0];
+        record_socket_latency(sk, NULL,
+                              (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
+                              NETSTACKLAT_HOOK_TCP_SOCK_READ);
+        return 0;
+}
+
+SEC("fentry/skb_consume_udp")
+int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
+             int len)
+{
+        if (!filter_nonempty_sockqueue(sk))
+                return 0;
+
+        record_socket_latency(sk, skb, skb->tstamp,
+                              NETSTACKLAT_HOOK_UDP_SOCK_READ);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_DEQUEUE */
diff --git a/examples/netstacklat.h b/examples/netstacklat.h
new file mode 100644
index 00000000..4811da4c
--- /dev/null
+++ b/examples/netstacklat.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef NETSTACKLAT_H
+#define NETSTACKLAT_H
+
+#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s
+/*
+ * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key"
+ * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys)
+ * that ebpf_exporter expects for exp2 hists (see how it's used in the
+ * increment_exp2_histogram_nosync() function)
+ */
+#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2)
+
+#define NS_PER_S 1000000000
+
+// The highest possible PID on a Linux system (from /include/linux/threads.h)
+#define PID_MAX_LIMIT (4 * 1024 * 1024)
+// The highest ifindex we expect to encounter
+#define IFINDEX_MAX 16384
+// The maximum number of different cgroups we can filter for
+#define MAX_TRACKED_CGROUPS 4096
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+#endif
+
+#ifndef max
+#define max(a, b)                           \
+        ({                                  \
+                typeof(a) _a = (a);         \
+                typeof(b) _b = (b);         \
+                _a > _b ? _a : _b;          \
+        })
+#endif
+
+#ifndef min
+#define min(a, b)                           \
+        ({                                  \
+                typeof(a) _a = (a);         \
+                typeof(b) _b = (b);         \
+                _a < _b ? _a : _b;          \
+        })
+#endif
+
+enum netstacklat_hook {
+        NETSTACKLAT_HOOK_INVALID = 0,
+        NETSTACKLAT_HOOK_IP_RCV,
+        NETSTACKLAT_HOOK_TCP_START,
+        NETSTACKLAT_HOOK_UDP_START,
+        NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED,
+        NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED,
+        NETSTACKLAT_HOOK_TCP_SOCK_READ,
+        NETSTACKLAT_HOOK_UDP_SOCK_READ,
+        NETSTACKLAT_N_HOOKS,
+};
+
+/*
+ * Key used for the histogram map
+ * To be compatible with ebpf-exporter, all histograms need a key struct whose
+ * final member is named "bucket" and is the histogram bucket index.
+ */
+struct hist_key {
+        __u64 cgroup;
+        __u32 ifindex;
+        __u16 hook; // needs a well-defined size for ebpf-exporter to decode
+        __u16 bucket; // needs to be last to be compatible with ebpf-exporter
+};
+
+struct netstacklat_bpf_config {
+        __u32 network_ns;
+        bool filter_pid;
+        bool filter_ifindex;
+        bool filter_cgroup;
+        bool filter_nonempty_sockqueue;
+        bool groupby_ifindex;
+        bool groupby_cgroup;
+};
+
+#endif
diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml
new file mode 100644
index 00000000..d59d3b6b
--- /dev/null
+++ b/examples/netstacklat.yaml
@@ -0,0 +1,49 @@
+metrics:
+  histograms:
+    - name: netstack_latency_seconds
+      help: Latency for packets (skbs) to reach various points in the kernel network stack
+      bucket_type: exp2
+      bucket_min: 0
+      bucket_max: 34
+      bucket_multiplier: 0.000000001 # nanoseconds to seconds
+      labels:
+        - name: cgroup
+          size: 8
+          decoders:
+            - name: uint
+            - name: cgroup
+        - name: iface
+          size: 4
+          decoders:
+            # If including output from a different network namespace than the one
+            # ebpf-exporter runs in, you probably just want to decode as a uint (ifindex) instead
+            # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others
+            - name: ifname
+        - name: hook
+          size: 2
+          decoders:
+            - name: uint
+            - name: static_map
+              static_map:
+                1: "ip-start"
+                2: "tcp-start"
+                3: "udp-start"
+                4: "tcp-socket-enqueued"
+                5: "udp-socket-enqueued"
+                6: "tcp-socket-read"
+                7: "udp-socket-read"
+        - name: bucket
+          size: 2
+          decoders:
+            - name: uint
+
+cgroup_id_map:
+  name: netstack_cgroupfilter
+  type: hash
+  regexps:
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-backend-router.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-origin.service).*$
+#    - ^.*(system.slice/.*)$
+#    - ^.*(user.slice/.*)$
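
Note for reviewers (not part of the patch): the stand-alone sketch below mirrors
the right-inclusive exp2 bucketing that get_exp2_histogram_bucket_idx() implements
in netstacklat.bpf.c, so the bucket indices exported via bucket_multiplier can be
sanity-checked in user space. The floor_log2() helper stands in for log2l() from
bits.bpf.h, and the sample latencies are made-up illustrative values.

/* exp2_bucket_demo.c - user-space sketch of netstacklat's histogram bucketing */
#include <stdint.h>
#include <stdio.h>

#define HIST_MAX_LATENCY_SLOT 34 /* same value as in netstacklat.h */

/* Stand-in for log2l() from bits.bpf.h: floor(log2(v)), with floor_log2(0) == 0 */
static uint32_t floor_log2(uint64_t v)
{
        uint32_t r = 0;

        while (v >>= 1)
                r++;
        return r;
}

/* Same logic as get_exp2_histogram_bucket_idx() in netstacklat.bpf.c */
static uint32_t get_exp2_histogram_bucket_idx(uint64_t value, uint32_t max_bucket)
{
        uint32_t bucket = floor_log2(value);

        /* Right-inclusive histogram, so "round up" the log value */
        if (bucket > 0 && 1ULL << bucket < value)
                bucket++;

        if (bucket > max_bucket)
                bucket = max_bucket;

        return bucket;
}

int main(void)
{
        /* Illustrative latencies in nanoseconds */
        uint64_t samples_ns[] = { 800, 1024, 1025, 1000000, 2000000000ULL };
        size_t i;

        for (i = 0; i < sizeof(samples_ns) / sizeof(samples_ns[0]); i++) {
                uint32_t b = get_exp2_histogram_bucket_idx(samples_ns[i],
                                                           HIST_MAX_LATENCY_SLOT);

                /* bucket_multiplier 0.000000001 in the YAML turns the ns bound into seconds */
                printf("%llu ns -> bucket %u (upper bound %.9f s)\n",
                       (unsigned long long)samples_ns[i], b,
                       (double)(1ULL << b) / 1e9);
        }
        return 0;
}

With HIST_MAX_LATENCY_SLOT = 34, bucket 34 covers everything up to roughly 17 s and
bucket 35 carries the ebpf_exporter "sum key", matching the comment in netstacklat.h.
Assuming the usual ebpf_exporter v2 flags from the project README, the config would be
picked up with something like: ebpf_exporter --config.dir examples --config.names netstacklat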