From 137d570ea5098dfd7f158f2cebf469c73ce923e0 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Wed, 28 May 2025 22:22:00 +0200
Subject: [PATCH 01/14] netstacklat: Update socket enqueue hooks

Previously netstacklat used fexit:tcp_data_queue as the socket enqueue
point for TCP and fexit:udp[v6]_queue_rcv_one_skb for UDP. For both of
these functions, the skb may actually have been freed by the time they
return, leading us to read invalid data. Furthermore, not all calls to
these functions necessarily end up enqueuing the skb to the socket, as
the skb may be dropped for various reasons. For TCP, there are also
some fast paths that may enqueue data to the socket without going
through tcp_data_queue. Therefore, update these probes to hook more
suitable functions.

For TCP, use the tcp_queue_rcv function (which is called by
tcp_data_queue when the data is actually queued to the socket). This
function is much closer to the actual socket enqueue point, will never
free the skb itself (although its return value may indicate to the
calling function that the skb should be freed, as it has been coalesced
into the tail skb in the receive queue), is only called when the skb is
actually queued to the socket, and is also called in a fast path of
tcp_rcv_established that bypasses tcp_data_queue.

For UDP, use __udp_enqueue_schedule_skb, which the
udp[v6]_queue_rcv_one_skb functions call when they actually attempt to
enqueue the skb to the socket. This function may still fail to enqueue
the skb to the socket (if e.g. the socket buffer is full), so check the
return value so that we only report the instances where the skb is
successfully enqueued. This function is called by both the IPv4 and
IPv6 UDP paths, so similar to the TCP case we only need to hook a
single function now.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 21 +++++++--------------
 netstacklat/netstacklat.c     |  9 ++++-----
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 513ddb73..5d237a89 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -301,26 +301,19 @@ int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb)
 	return 0;
 }
 
-SEC("fexit/tcp_data_queue")
-int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb)
+SEC("fexit/tcp_queue_rcv")
+int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb)
 {
 	record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED);
 	return 0;
 }
 
-SEC("fexit/udp_queue_rcv_one_skb")
-int BPF_PROG(netstacklat_udp_queue_rcv_one_skb, struct sock *sk,
-	     struct sk_buff *skb)
+SEC("fexit/__udp_enqueue_schedule_skb")
+int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk,
+	     struct sk_buff *skb, int retval)
 {
-	record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
-	return 0;
-}
-
-SEC("fexit/udpv6_queue_rcv_one_skb")
-int BPF_PROG(netstacklat_udpv6_queue_rcv_one_skb, struct sock *sk,
-	     struct sk_buff *skb)
-{
-	record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
+	if (retval == 0)
+		record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
 	return 0;
 }
 
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 80567689..4b6b2945 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -251,14 +251,13 @@ static void hook_to_progs(struct hook_prog_collection *progs,
 		progs->nprogs = 2;
 		break;
 	case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED:
-		progs->progs[0] = obj->progs.netstacklat_tcp_data_queue;
+		progs->progs[0] =
+			obj->progs.netstacklat_tcp_queue_rcv;
 		progs->nprogs = 1;
 		break;
 	case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED:
-		progs->progs[0] = obj->progs.netstacklat_udp_queue_rcv_one_skb;
-		progs->progs[1] =
-			obj->progs.netstacklat_udpv6_queue_rcv_one_skb;
-		progs->nprogs = 2;
+		progs->progs[0] =
+			obj->progs.netstacklat_udp_enqueue_schedule_skb;
+		progs->nprogs = 1;
 		break;
 	case NETSTACKLAT_HOOK_TCP_SOCK_READ:
 		progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp;

From 2bb9e4eba3e12bb638749e7b36e2d66aecd78c6e Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Wed, 28 May 2025 10:50:41 +0200
Subject: [PATCH 02/14] netstacklat: Multiplex all histograms in a single map

Change the way that the latency histograms are stored in BPF maps.
Instead of keeping a separate array map for each histogram, store all
histograms in a single hash map, encoding the hook as part of the key.
This results in higher overhead (as hash lookups are slower than array
lookups), but is much more flexible. This makes it easier to add
additional hook points, as no new maps (and related code for mapping
hooks to maps) need to be added. Furthermore, in the future it allows
the results to be easily grouped on various aspects by adding
additional members to the key.

On the userspace side, maintain a sorted array of the encountered
histogram keys and a mapping to the corresponding histogram buckets.
Instead of keeping a separate key for each histogram bucket (as the BPF
maps do to be compatible with ebpf-exporter), restructure the data so
only a single key is used per histogram. Essentially, remove the bucket
member from the key, keeping a full histogram (where any missing
buckets are zeroed) for the remaining unique members in the histogram
key (so far just the hook identifier).

Keeping the array of histogram keys sorted allows for relatively quick
lookups using binary search. When a new histogram key is encountered it
will incur significant overhead the first time, as it needs to be
inserted into the right place in the array, but lookups ought to be
much more common than insertions of new keys. While this data structure
will not scale well to a very large number of unique keys (insertion
time is O(n), lookup O(log n)), it avoids implementing or adding
dependencies to more complicated data structures like trees or hash
maps. As long as we do not need to keep track of many thousands of
histograms, this solution should be good enough.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 107 +++----------
 netstacklat/netstacklat.c     | 282 +++++++++++++++++++++++-----------
 netstacklat/netstacklat.h     |  14 +-
 3 files changed, 226 insertions(+), 177 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 5d237a89..533bfe6f 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -30,67 +30,12 @@ struct sk_buff___old {
 	__u8 mono_delivery_time: 1;
 } __attribute__((preserve_access_index));
 
-/*
- * To be compatible with ebpf-exporter, all histograms need a key struct whose final
- * member is named "bucket" and is the histogram bucket index.
- * As we store the histograms in array maps, the key type for each array map
- * below has to be a u32 (and not a struct), but as this struct consists of a
- * single u32 member we can still use a pointer to the hist_key struct in
- * lookup-functions, and the u32 bucket index will implicitly be mapped to the
- * array map index.
- */ -struct hist_key { - u32 bucket; -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_ip_start_seconds SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_tcp_start_seconds SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_udp_start_seconds SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_tcp_sock_enqueued_seconds SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_udp_sock_enqueued_seconds SEC(".maps"); - struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); - __type(value, u64); -} netstack_latency_tcp_sock_read_seconds SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, HIST_NBUCKETS); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS); + __type(key, struct hist_key); __type(value, u64); -} netstack_latency_udp_sock_read_seconds SEC(".maps"); +} netstack_latency_seconds SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY); @@ -99,6 +44,20 @@ struct { __type(value, u8); } netstack_pidfilter SEC(".maps"); +static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) +{ + u64 zero = 0; + u64 *val; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + // Key not in map - try insert it and lookup again + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + return bpf_map_lookup_elem(map, key); +} + static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket) { u32 bucket = log2l(value); @@ -130,7 +89,7 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, // Increment histogram key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket); - bucket_count = bpf_map_lookup_elem(map, &key); + bucket_count = lookup_or_zeroinit_histentry(map, &key); if (bucket_count) (*bucket_count)++; @@ -139,33 +98,11 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, return; key.bucket = max_bucket + 1; - bucket_count = bpf_map_lookup_elem(map, &key); + bucket_count = lookup_or_zeroinit_histentry(map, &key); if (bucket_count) *bucket_count += value; } -static void *hook_to_histmap(enum netstacklat_hook hook) -{ - switch (hook) { - case NETSTACKLAT_HOOK_IP_RCV: - return &netstack_latency_ip_start_seconds; - case NETSTACKLAT_HOOK_TCP_START: - return &netstack_latency_tcp_start_seconds; - case NETSTACKLAT_HOOK_UDP_START: - return &netstack_latency_udp_start_seconds; - case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: - return &netstack_latency_tcp_sock_enqueued_seconds; - case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: - return &netstack_latency_udp_sock_enqueued_seconds; - case NETSTACKLAT_HOOK_TCP_SOCK_READ: - return &netstack_latency_tcp_sock_read_seconds; - case NETSTACKLAT_HOOK_UDP_SOCK_READ: - return &netstack_latency_udp_sock_read_seconds; - default: - return NULL; - } -} - static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -182,8 +119,8 @@ static ktime_t time_since(ktime_t tstamp) 
static void record_latency(ktime_t latency, enum netstacklat_hook hook) { - struct hist_key key = { 0 }; - increment_exp2_histogram_nosync(hook_to_histmap(hook), key, latency, + struct hist_key key = { .hook = hook }; + increment_exp2_histogram_nosync(&netstack_latency_seconds, key, latency, HIST_MAX_LATENCY_SLOT); } diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 4b6b2945..47feb752 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -44,6 +44,8 @@ static const char *__doc__ = #define MAX_BUCKETCOUNT_STRLEN 10 #define MAX_BAR_STRLEN (80 - 6 - MAX_BUCKETSPAN_STRLEN - MAX_BUCKETCOUNT_STRLEN) +#define LOOKUP_BATCH_SIZE 128 + #define MAX_HOOK_PROGS 4 // Maximum number of different pids that can be filtered for @@ -54,6 +56,17 @@ struct hook_prog_collection { int nprogs; }; +struct histogram_entry { + struct hist_key key; + __u64 *buckets; +}; + +struct histogram_buffer { + struct histogram_entry *hists; + size_t max_size; + size_t current_size; +}; + struct netstacklat_config { struct netstacklat_bpf_config bpf_conf; double report_interval_s; @@ -201,35 +214,6 @@ static const char *hook_to_description(enum netstacklat_hook hook) } } -static int hook_to_histmap(enum netstacklat_hook hook, - const struct netstacklat_bpf *obj) -{ - switch (hook) { - case NETSTACKLAT_HOOK_IP_RCV: - return bpf_map__fd(obj->maps.netstack_latency_ip_start_seconds); - case NETSTACKLAT_HOOK_TCP_START: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_start_seconds); - case NETSTACKLAT_HOOK_UDP_START: - return bpf_map__fd( - obj->maps.netstack_latency_udp_start_seconds); - case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_sock_enqueued_seconds); - case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: - return bpf_map__fd( - obj->maps.netstack_latency_udp_sock_enqueued_seconds); - case NETSTACKLAT_HOOK_TCP_SOCK_READ: - return bpf_map__fd( - obj->maps.netstack_latency_tcp_sock_read_seconds); - case NETSTACKLAT_HOOK_UDP_SOCK_READ: - return bpf_map__fd( - obj->maps.netstack_latency_udp_sock_read_seconds); - default: - return -EINVAL; - } -} - static void hook_to_progs(struct hook_prog_collection *progs, enum netstacklat_hook hook, const struct netstacklat_bpf *obj) @@ -620,93 +604,192 @@ static void print_log2hist(FILE *stream, size_t n, const __u64 hist[n], } } -static void merge_percpu_hist(size_t n, int ncpus, - const __u64 percpu_hist[n][ncpus], - __u64 merged_hist[n]) +static void print_histkey(FILE *stream, const struct hist_key *key) +{ + fprintf(stream, "%s", hook_to_str(key->hook)); +} + +static int cmp_histkey(const void *val1, const void *val2) { - int idx, cpu; + const struct hist_key *key1 = val1, *key2 = val2; - memset(merged_hist, 0, sizeof(__u64) * n); + return key1->hook == key2->hook ? 0 : key1->hook > key2->hook ? 
1 : -1; +} - for (idx = 0; idx < n; idx++) { - for (cpu = 0; cpu < ncpus; cpu++) { - merged_hist[idx] += percpu_hist[idx][cpu]; - } +static int cmp_histentry(const void *val1, const void *val2) +{ + const struct histogram_entry *entry1 = val1, *entry2 = val2; + + return cmp_histkey(&entry1->key, &entry2->key); +} + +static int insert_last_hist_sorted(struct histogram_buffer *buf) +{ + struct histogram_entry *hists = buf->hists; + int i, last = buf->current_size - 1; + struct histogram_entry tmp; + + if (buf->current_size < 2) + return 0; + + i = last; + while (i > 0 && cmp_histentry(&hists[last], &hists[i - 1]) < 0) + i--; + + if (i == last) + // Last hist already in the right place, no need to swap it in + return i; + + // Swap in hist to the correct position + memcpy(&tmp, &hists[last], sizeof(tmp)); + memmove(&hists[i + 1], &hists[i], (last - i) * sizeof(*hists)); + memcpy(&hists[i], &tmp, sizeof(*hists)); + + return i; +} + +static struct histogram_entry * +lookup_or_zeroinit_hist(const struct hist_key *key, + struct histogram_buffer *buf) +{ + struct histogram_entry *hist; + __u64 *buckets; + int i; + + hist = bsearch(key, buf->hists, buf->current_size, sizeof(*buf->hists), + cmp_histentry); + if (hist) + return hist; + + // No matching histogram key found - create new histogram entry and insert it + if (buf->current_size >= buf->max_size) { + errno = ENOSPC; + return NULL; + } + + buckets = calloc(HIST_NBUCKETS, sizeof(*buckets)); + if (!buckets) { + errno = ENOMEM; + return NULL; } + + hist = &buf->hists[buf->current_size++]; + memcpy(&hist->key, key, sizeof(hist->key)); + hist->key.bucket = 0; + hist->buckets = buckets; + + i = insert_last_hist_sorted(buf); + return &buf->hists[i]; } -static int fetch_hist_map(int map_fd, __u64 hist[HIST_NBUCKETS]) +static int update_histogram_entry_bucket(const struct hist_key *key, + __u64 count, + struct histogram_buffer *buf) { - __u32 in_batch, out_batch, count = HIST_NBUCKETS; + struct histogram_entry *hist; + int bucket = key->bucket; + + hist = lookup_or_zeroinit_hist(key, buf); + if (!hist) + return -errno; + + hist->buckets[bucket] = count; + return 0; +} + +static __u64 sum_percpu_vals(int cpus, __u64 vals[cpus]) +{ + __u64 sum = 0; + int i; + + for (i = 0; i < cpus; i++) + sum += vals[i]; + + return sum; +} + +static int fetch_histograms(int map_fd, struct histogram_buffer *buf) +{ + __u32 in_batch, out_batch, count = LOOKUP_BATCH_SIZE; int ncpus = libbpf_num_possible_cpus(); - __u32 idx, buckets_fetched = 0; - __u64 (*percpu_hist)[ncpus]; - __u32 *keys; - int err = 0; + int i, nentries = 0, err, err2 = 0; + __u64(*percpu_buckets)[ncpus]; + bool entries_remain = true; + struct hist_key *keys; - DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts, .flags = BPF_EXIST); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); - percpu_hist = calloc(HIST_NBUCKETS, sizeof(*percpu_hist)); - keys = calloc(HIST_NBUCKETS, sizeof(*keys)); - if (!percpu_hist || !keys) { + percpu_buckets = calloc(LOOKUP_BATCH_SIZE, sizeof(*percpu_buckets)); + keys = calloc(LOOKUP_BATCH_SIZE, sizeof(*keys)); + if (!percpu_buckets || !keys) { err = -ENOMEM; goto exit; } - while (buckets_fetched < HIST_NBUCKETS) { + while (entries_remain) { err = bpf_map_lookup_batch(map_fd, - buckets_fetched > 0 ? &in_batch : NULL, - &out_batch, keys + buckets_fetched, - percpu_hist + buckets_fetched, &count, - &batch_opts); - if (err == -ENOENT) // All entries fetched + nentries > 0 ? 
&in_batch : NULL, + &out_batch, keys, percpu_buckets, + &count, &batch_opts); + if (err == -ENOENT) { // All entries fetched + entries_remain = false; err = 0; - else if (err) + } else if (err) { goto exit; + } - // Verify keys match expected idx range - for (idx = buckets_fetched; idx < buckets_fetched + count; idx++) { - if (keys[idx] != idx) { - err = -EBADSLT; + for (i = 0; i < count; i++) { + err = update_histogram_entry_bucket( + &keys[i], + sum_percpu_vals(ncpus, percpu_buckets[i]), buf); + if (err == -ENOSPC) { + /* + * Out of histogram entries. + * Record error, but continue. + * Use error code that should not clash with + * bpf_map_lookup_batch + */ + err2 = -ETOOMANYREFS; + err = 0; + } else if (err) { + // Critical error - abort goto exit; } } + nentries += count; + count = LOOKUP_BATCH_SIZE; in_batch = out_batch; - buckets_fetched += count; - count = HIST_NBUCKETS - buckets_fetched; } - merge_percpu_hist(HIST_NBUCKETS, ncpus, percpu_hist, hist); - exit: - free(percpu_hist); + free(percpu_buckets); free(keys); - return err; + return err ?: err2; } -static int report_stats(const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int report_stats(const struct netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { - enum netstacklat_hook hook; - __u64 hist[HIST_NBUCKETS] = { 0 }; + int i, err; time_t t; - int err; + + err = fetch_histograms(bpf_map__fd(obj->maps.netstack_latency_seconds), + hist_buf); + if (err == -ETOOMANYREFS) + fprintf(stderr, + "Warning: Histogram buffer ran out of space - some histograms may not be reported\n"); + else if (err) + return err; time(&t); printf("%s", ctime(&t)); - for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { - if (!conf->enabled_hooks[hook]) - continue; - - printf("%s:\n", hook_to_str(hook)); - - err = fetch_hist_map(hook_to_histmap(hook, obj), hist); - if (err) - return err; - - print_log2hist(stdout, ARRAY_SIZE(hist), hist, 1); + for (i = 0; i < hist_buf->current_size; i++) { + print_histkey(stdout, &hist_buf->hists[i].key); + printf(":\n"); + print_log2hist(stdout, HIST_NBUCKETS, + hist_buf->hists[i].buckets, 1); printf("\n"); } fflush(stdout); @@ -714,6 +797,19 @@ static int report_stats(const struct netstacklat_config *conf, return 0; } +static int init_histogram_buffer(struct histogram_buffer *buf) +{ + int max_hists = NETSTACKLAT_N_HOOKS; + + buf->hists = calloc(max_hists, sizeof(*buf->hists)); + if (!buf->hists) + return -errno; + + buf->max_size = max_hists; + buf->current_size = 0; + return 0; +} + static int enable_sw_rx_tstamps(void) { int tstamp_opt = SOF_TIMESTAMPING_RX_SOFTWARE; @@ -835,8 +931,8 @@ static int setup_timer(__u64 interval_ns) return fd; } -static int handle_timer(int timer_fd, const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int handle_timer(int timer_fd, const struct netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { __u64 timer_exps; ssize_t size; @@ -853,7 +949,7 @@ static int handle_timer(int timer_fd, const struct netstacklat_config *conf, fprintf(stderr, "Warning: Missed %llu reporting intervals\n", timer_exps - 1); - return report_stats(conf, obj); + return report_stats(obj, hist_buf); } static int epoll_add_event(int epoll_fd, int fd, __u64 event_type, __u64 value) @@ -893,8 +989,8 @@ static int setup_epoll_instance(int sig_fd, int timer_fd) return err; } -static int poll_events(int epoll_fd, const struct netstacklat_config *conf, - const struct netstacklat_bpf *obj) +static int poll_events(int epoll_fd, const struct 
netstacklat_bpf *obj, + struct histogram_buffer *hist_buf) { struct epoll_event events[MAX_EPOLL_EVENTS]; int i, n, fd, err = 0; @@ -913,7 +1009,7 @@ static int poll_events(int epoll_fd, const struct netstacklat_config *conf, err = handle_signal(fd); break; case NETSTACKLAT_EPOLL_TIMER: - err = handle_timer(fd, conf, obj); + err = handle_timer(fd, obj, hist_buf); break; default: fprintf(stderr, "Warning: unexpected epoll data: %lu\n", @@ -952,6 +1048,7 @@ int main(int argc, char *argv[]) struct netstacklat_config config = { .report_interval_s = 5, }; + struct histogram_buffer hist_buf; struct netstacklat_bpf *obj; char errmsg[128]; @@ -962,6 +1059,13 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + err = init_histogram_buffer(&hist_buf); + if (err) { + fprintf(stderr, "Failed allocating buffer for histograms: %s\n", + strerror(-err)); + return EXIT_FAILURE; + } + sock_fd = enable_sw_rx_tstamps(); if (sock_fd < 0) { err = sock_fd; @@ -1031,12 +1135,12 @@ int main(int argc, char *argv[]) // Report stats until user shuts down program while (true) { - err = poll_events(epoll_fd, &config, obj); + err = poll_events(epoll_fd, obj, &hist_buf); if (err) { if (err == NETSTACKLAT_ABORT) { // Report stats a final time before terminating - err = report_stats(&config, obj); + err = report_stats(obj, &hist_buf); } else { libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed polling fds: %s\n", diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index bb0162a1..cdd571a4 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -41,10 +41,18 @@ enum netstacklat_hook { NETSTACKLAT_N_HOOKS, }; -struct netstacklat_bpf_config -{ +/* + * Key used for the histogram map + * To be compatible with ebpf-exporter, all histograms need a key struct whose final + * member is named "bucket" and is the histogram bucket index. + */ +struct hist_key { + __u16 hook; // need well defined size for ebpf-exporter to decode + __u16 bucket; // needs to be last to be compatible with ebpf-exporter +}; + +struct netstacklat_bpf_config { bool filter_pid; }; #endif - From 2745820780913fd5fce92d6848fca554c2e0025f Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Thu, 22 May 2025 16:32:32 +0200 Subject: [PATCH 03/14] Update the ebpf-exporter config for multiplexed histogram maps Update the ebpf-exporter config to match the change to how the histograms are stored in the previous commit. As all histograms are stored in a single map, adding additional hooks in the future will only require adding a single line to the hook static_map. 
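As an illustration (a hypothetical sketch, not part of this patch), the
2-byte "hook" and "bucket" label sizes in the new config must stay in
sync with the layout of struct hist_key from netstacklat.h; a
compile-time check along these lines could make that dependency
explicit:

	/* Hypothetical sketch: assert that the key layout matches the
	 * YAML decoders (hook: size 2; bucket: size 2, final member).
	 */
	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	struct hist_key {
		uint16_t hook;   /* decoded by the "hook" label */
		uint16_t bucket; /* decoded by the "bucket" label, last */
	};

	static_assert(sizeof(struct hist_key) == 4,
		      "hook + bucket labels decode 2 + 2 bytes");
	static_assert(offsetof(struct hist_key, bucket) == 2,
		      "bucket must be the final member");

If the key struct ever grows, the label list and sizes in the YAML need
to be updated in the same commit.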
Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.yaml | 82 +++++++-----------------------------
 1 file changed, 15 insertions(+), 67 deletions(-)

diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml
index 2fb99530..18e1614a 100644
--- a/netstacklat/netstacklat.yaml
+++ b/netstacklat/netstacklat.yaml
@@ -1,79 +1,27 @@
 metrics:
   histograms:
-    - name: netstack_latency_ip_start_seconds
-      help: Time for packet to reach the start of the IP-stack
+    - name: netstack_latency_seconds
+      help: Latency for packets (skbs) to reach various points in the kernel network stack
       bucket_type: exp2
       bucket_min: 0
       bucket_max: 34
       bucket_multiplier: 0.000000001 # nanoseconds to seconds
       labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_tcp_start_seconds
-      help: Time for packet to reach the start of the TCP stack
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_udp_start_seconds
-      help: Time until packet to reach the start of the UDP stack
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_tcp_sock_enqueued_seconds
-      help: Time until packet is queued to TCP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_udp_sock_enqueued_seconds
-      help: Time until packet is queued to UDP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
+        - name: hook
+          size: 2
           decoders:
             - name: uint
-    - name: netstack_latency_tcp_sock_read_seconds
-      help: Time until packet data is read from TCP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
-        - name: bucket
-          size: 4
-          decoders:
-            - name: uint
-    - name: netstack_latency_udp_sock_read_seconds
-      help: Time until packet data is read from UDP socket
-      bucket_type: exp2
-      bucket_min: 0
-      bucket_max: 34
-      bucket_multiplier: 0.000000001 # nanoseconds to seconds
-      labels:
+            - name: static_map
+              static_map:
+                1: "ip-start"
+                2: "tcp-start"
+                3: "udp-start"
+                4: "tcp-socket-enqueued"
+                5: "udp-socket-enqueued"
+                6: "tcp-socket-read"
+                7: "udp-socket-read"
         - name: bucket
-          size: 4
+          size: 2
           decoders:
             - name: uint
+

From d54e31e40c13673f79cdcd0418d454f6e04d21e6 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Wed, 28 May 2025 11:50:46 +0200
Subject: [PATCH 04/14] netstacklat: Refactor parsing of argument lists

Refactor the parsing of arguments that accept lists of values
(e.g. --pids 1,2,3). Introduce a generic function for parsing delimited
string lists and reuse that function to avoid repeating similar logic.
This will simplify adding additional arguments that accept lists of
values in the future.
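To illustrate the pattern this enables, a hypothetical future option
taking a comma-separated list would only need a per-value callback. In
the sketch below, parse_port() and parse_ports() are made up for
illustration, while parse_strlist_to_arr() and parse_bounded_long() are
the real helpers from this file:

	/* Hypothetical example: parse "80,443,8080" into a __u16 array
	 * by supplying just a per-value callback to the generic parser.
	 */
	static int parse_port(const char *str, void *portout)
	{
		long long lval;
		int err;

		err = parse_bounded_long(&lval, str, 1, 65535, "port");
		if (err)
			return err;

		*(__u16 *)portout = lval;
		return 0;
	}

	static int parse_ports(size_t size, __u16 arr[size], const char *str)
	{
		return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",",
					    parse_port);
	}

Like parse_pids() below, this returns the number of parsed values on
success or a negative error code on failure.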
Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.c | 121 ++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 47feb752..df93225e 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -51,6 +51,8 @@ static const char *__doc__ = // Maximum number of different pids that can be filtered for #define MAX_FILTER_PIDS 4096 +typedef int (*t_parse_val_func)(const char *, void *); + struct hook_prog_collection { struct bpf_program *progs[MAX_HOOK_PROGS]; int nprogs; @@ -318,67 +320,31 @@ static int parse_bounded_long(long long *res, const char *str, long long low, return 0; } -/* - * Parses a comma-delimited string of hook-names, and sets the positions for - * the hooks that appear in the string to true. - */ -static int parse_hooks(bool hooks[NETSTACKLAT_N_HOOKS], const char *_str) +static int parse_strlist_to_arr(const char *_str, void *arr, size_t nelem, + size_t elem_size, const char *delim, + t_parse_val_func parse_func) { - enum netstacklat_hook hook; - char *tokp = NULL; - char str[1024]; - char *hookstr; - int i; - - for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) - hooks[i] = false; - - if (strlen(_str) >= sizeof(str)) - return -E2BIG; - strcpy(str, _str); - - hookstr = strtok_r(str, ",", &tokp); - while (hookstr) { - hook = str_to_hook(hookstr); - if (hook == NETSTACKLAT_HOOK_INVALID) { - fprintf(stderr, "%s is not a valid hook\n", hookstr); - return -EINVAL; - } - - hooks[hook] = true; - - hookstr = strtok_r(NULL, ",", &tokp); - } - - return 0; -} - -static int parse_pids(size_t size, __u32 arr[size], const char *_str, - const char *name) -{ - char *pidstr, *str; - char *tokp = NULL; - int err, i = 0; - long long val; + char *tokstr, *str; + char *saveptr = NULL; + int err = 0, i = 0; str = malloc(strlen(_str) + 1); if (!str) return -ENOMEM; strcpy(str, _str); - pidstr = strtok_r(str, ",", &tokp); - while (pidstr && i < size) { - err = parse_bounded_long(&val, pidstr, 1, PID_MAX_LIMIT, name); + tokstr = strtok_r(str, delim, &saveptr); + while (tokstr && i < nelem) { + err = parse_func(tokstr, (char *)arr + i * elem_size); if (err) goto exit; - arr[i] = val; - pidstr = strtok_r(NULL, ",", &tokp); + tokstr = strtok_r(NULL, delim, &saveptr); i++; } - if (pidstr) - // Parsed size pids, but more still remain + if (tokstr) + // Parsed size values, but more still remain err = -E2BIG; exit: @@ -386,6 +352,62 @@ static int parse_pids(size_t size, __u32 arr[size], const char *_str, return err ?: i; } +int parse_hook(const char *str, void *hookout) +{ + enum netstacklat_hook hook; + + hook = str_to_hook(str); + if (hook == NETSTACKLAT_HOOK_INVALID) { + fprintf(stderr, "%s is not a valid hook\n", str); + return -EINVAL; + } + + *(enum netstacklat_hook *)hookout = hook; + return 0; +} + +/* + * Parses a comma-delimited string of hook-names, and sets the positions for + * the hooks that appear in the string to true. 
+ */
+static int parse_hooks(bool hooks[NETSTACKLAT_N_HOOKS], const char *str)
+{
+	enum netstacklat_hook ehooks[NETSTACKLAT_N_HOOKS * 2];
+	int len, i;
+
+	len = parse_strlist_to_arr(str, ehooks, ARRAY_SIZE(ehooks),
+				   sizeof(*ehooks), ",", parse_hook);
+	if (len < 0)
+		return len;
+
+	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
+		hooks[i] = false;
+
+	for (i = 0; i < len; i++)
+		hooks[ehooks[i]] = true;
+
+	return 0;
+}
+
+static int parse_pid(const char *str, void *pidout)
+{
+	long long lval;
+	int err;
+
+	err = parse_bounded_long(&lval, str, 1, PID_MAX_LIMIT, "pid");
+	if (err)
+		return err;
+
+	*(__u32 *)pidout = lval;
+	return 0;
+}
+
+static int parse_pids(size_t size, __u32 arr[size], const char *str)
+{
+	return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",",
+				    parse_pid);
+}
+
 static int parse_arguments(int argc, char *argv[],
 			   struct netstacklat_config *conf)
 {
@@ -444,8 +466,7 @@ static int parse_arguments(int argc, char *argv[],
 			break;
 		case 'p': // filter-pids
 			ret = parse_pids(ARRAY_SIZE(conf->pids) - conf->npids,
-					 conf->pids + conf->npids, optarg,
-					 optval_to_longopt(opt)->name);
+					 conf->pids + conf->npids, optarg);
 			if (ret < 0)
 				return ret;

From c427f3e347d796302e9e332ef92d6cd706cdaa0a Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Wed, 28 May 2025 11:57:42 +0200
Subject: [PATCH 05/14] netstacklat: Move pid-list to heap

The full array of pid-values that could be parsed from the user was
kept inside the config struct, which is allocated on the stack (in the
main function). Change the config struct to only keep a pointer to this
array, and allocate the array on the heap instead to avoid keeping this
relatively large data structure on the stack. While not necessarily a
large problem yet, establishing this pattern reduces the risk of
running out of stack as new fields to filter for are added down the
line or the maximum number of values to parse is increased.

Also rename MAX_FILTER_PIDS to MAX_PARSED_PIDS to better reflect what
it actually is, and update the comment in parse_arguments() to reflect
that the option is called pids and not filter-pids.
Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index df93225e..d2740440 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -48,8 +48,8 @@ static const char *__doc__ =
 
 #define MAX_HOOK_PROGS 4
 
-// Maximum number of different pids that can be filtered for
-#define MAX_FILTER_PIDS 4096
+// Maximum number of PIDs to read from user
+#define MAX_PARSED_PIDS 4096
 
 typedef int (*t_parse_val_func)(const char *, void *);
 
@@ -74,7 +74,7 @@ struct netstacklat_config {
 	double report_interval_s;
 	bool enabled_hooks[NETSTACKLAT_N_HOOKS];
 	int npids;
-	__u32 pids[MAX_FILTER_PIDS];
+	__u32 *pids;
 };
 
 static const struct option long_options[] = {
@@ -420,6 +420,10 @@ static int parse_arguments(int argc, char *argv[],
 	conf->npids = 0;
 	conf->bpf_conf.filter_pid = false;
 
+	conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids));
+	if (!conf->pids)
+		return -errno;
+
 	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++)
 		// All probes enabled by default
 		conf->enabled_hooks[i] = true;
@@ -464,8 +468,8 @@ static int parse_arguments(int argc, char *argv[],
 			conf->enabled_hooks[i] = !hooks[i];
 			hooks_off = true;
 			break;
-		case 'p': // filter-pids
-			ret = parse_pids(ARRAY_SIZE(conf->pids) - conf->npids,
+		case 'p': // pids
+			ret = parse_pids(MAX_PARSED_PIDS - conf->npids,
 					 conf->pids + conf->npids, optarg);
 			if (ret < 0)
 				return ret;

From 87ba9f9fc8f6acc8c2e2298dbbd03713698202ce Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Tue, 27 May 2025 17:34:03 +0200
Subject: [PATCH 06/14] netstacklat: Generalize initialization of array filter
 maps

The filtering for specific pids (--pids/-p) makes use of a BPF array
map where each entry to be included is set to 1 (the rest remain 0, as
all entries in array maps are zeroed by default). Generalize the user
space logic that initializes the entries in this filter map so that it
can be reused by other similar features in the future.

Furthermore, change the value type of the filter map from u8 to u64.
While only one bit is really required, each BPF map entry takes up at
least 8 bytes regardless, so using u8 does not save any space.
Additionally, a future commit will add functionality to also filter by
cgroup, and to keep that feature compatible with ebpf-exporter the
value size needs to be 8 bytes. So use u64 values for all filter maps
from here on out to make it easier to reuse logic.
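As a sketch of the intended reuse (this exact call is introduced for
the ifindex filter by a later commit in this series), populating
another array-based filter map then becomes a single call at setup
time:

	/* Fill an array-based filter map from an array of parsed keys */
	err = init_filtermap(bpf_map__fd(obj->maps.netstack_ifindexfilter),
			     config.ifindices, config.nifindices,
			     sizeof(*config.ifindices));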
Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c |  4 ++--
 netstacklat/netstacklat.c     | 38 +++++++++++++++++------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 533bfe6f..0774ea68 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -41,7 +41,7 @@ struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__uint(max_entries, PID_MAX_LIMIT);
 	__type(key, u32);
-	__type(value, u8);
+	__type(value, u64);
 } netstack_pidfilter SEC(".maps");
 
 static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
@@ -161,7 +161,7 @@ static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook)
 
 static bool filter_pid(u32 pid)
 {
-	u8 *pid_ok;
+	u64 *pid_ok;
 
 	if (!user_config.filter_pid)
 		// No PID filter - all PIDs ok

diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index d2740440..c962d157 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -887,6 +887,22 @@ static void set_programs_to_load(const struct netstacklat_config *conf,
 	}
 }
 
+static int init_filtermap(int map_fd, void *keys, size_t nelem,
+			  size_t elem_size)
+{
+	__u64 ok_val = 1;
+	int i, err;
+
+	for (i = 0; i < nelem; i++) {
+		err = bpf_map_update_elem(map_fd, (char *)keys + i * elem_size,
+					  &ok_val, 0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int init_signalfd(void)
 {
 	sigset_t mask;
@@ -1049,24 +1065,6 @@ static int poll_events(int epoll_fd, const struct netstacklat_bpf *obj,
 	return err;
 }
 
-static int init_pidfilter_map(const struct netstacklat_bpf *obj,
-			      const struct netstacklat_config *conf)
-{
-	__u8 pid_ok_val = 1;
-	int map_fd, err;
-	__u32 i;
-
-	map_fd = bpf_map__fd(obj->maps.netstack_pidfilter);
-	for (i = 0; i < conf->npids; i++) {
-		err = bpf_map_update_elem(map_fd, &conf->pids[i], &pid_ok_val,
-					  0);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
 int main(int argc, char *argv[])
 {
 	int sig_fd, timer_fd, epoll_fd, sock_fd, err;
@@ -1120,7 +1118,9 @@ int main(int argc, char *argv[])
 		goto exit_destroy_bpf;
 	}
 
-	err = init_pidfilter_map(obj, &config);
+	err = init_filtermap(bpf_map__fd(obj->maps.netstack_pidfilter),
+			     config.pids, config.npids, sizeof(*config.pids));
+
 	if (err) {
 		libbpf_strerror(err, errmsg, sizeof(errmsg));
 		fprintf(stderr, "Failed filling the pid filter map: %s\n",

From 487c7e4bbf973d789651aac11c7ead2bb4595b1d Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 22 May 2025 20:35:43 +0200
Subject: [PATCH 07/14] netstacklat: Add filtering for network interfaces

Add the -i/--interfaces option to filter for specific network
interfaces. The interfaces can either be provided as interface names or
indices, although if the interfaces are in another namespace than the
one the netstacklat userspace agent is running in, they SHOULD be
provided as indices. The names are resolved in the current namespace,
and may therefore fail or yield incorrect indices if the interface is
in another namespace.

Unlike the previous -p/--pids option, this option applies to all
existing probe points in netstacklat. On the eBPF side, use
skb->skb_iif if the skb is available as context, otherwise use
sk->sk_rx_dst_ifindex from the socket.

Use a similar approach to the previous PID filter, where an array map
is used to hold the ifindices that should be filtered for, allowing a
quick lookup (sketched below).
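In sketch form (condensed from the diff below), the dense-array-as-set
idiom on the eBPF side looks like this:

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, IFINDEX_MAX);
		__type(key, u32);
		__type(value, u64);
	} netstack_ifindexfilter SEC(".maps");

	/* An ifindex passes if filtering is off or its entry is non-zero */
	static bool filter_ifindex(u32 ifindex)
	{
		u64 *ifindex_ok;

		if (!user_config.filter_ifindex)
			return true;

		ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter,
						 &ifindex);
		return ifindex_ok && *ifindex_ok > 0;
	}

The array map keeps the lookup O(1) at the cost of preallocating
IFINDEX_MAX entries, which is why the index range is capped.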
While the ifindex (unlike the PID) does not seem to have a clear upper
limit, limit it to 16384 (IFINDEX_MAX) to keep the filter map
reasonably small while still supporting the vast majority of scenarios.

Note that internally the filtering is applied on the interface index
(ifindex), regardless of whether the option provided the index or the
name for the interface. If the same ifindex is repeated in multiple
network namespaces, it will include traffic for all of them. A future
commit will add an option to also filter for a specific network
namespace.

Signed-off-by: Simon Sundberg
---
 headers/vmlinux/vmlinux_net.h |  6 ++++
 netstacklat/netstacklat.bpf.c | 42 +++++++++++++++++---
 netstacklat/netstacklat.c     | 66 +++++++++++++++++++++++++++++++--
 netstacklat/netstacklat.h     |  3 ++
 4 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h
index 64b26212..013c42de 100644
--- a/headers/vmlinux/vmlinux_net.h
+++ b/headers/vmlinux/vmlinux_net.h
@@ -150,4 +150,10 @@ struct scm_timestamping_internal {
 	struct timespec64 ts[3];
 };
 
+struct sock {
+	struct dst_entry *sk_rx_dst;
+	int sk_rx_dst_ifindex;
+	u32 sk_rx_dst_cookie;
+};
+
 #endif /* __VMLINUX_NET_H__ */

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 0774ea68..ecdcb83f 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -15,6 +15,7 @@ char LICENSE[] SEC("license") = "GPL";
 volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S);
 volatile const struct netstacklat_bpf_config user_config = {
 	.filter_pid = false,
+	.filter_ifindex = false,
 };
 
 /*
@@ -44,6 +45,13 @@ struct {
 	__type(value, u64);
 } netstack_pidfilter SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, IFINDEX_MAX);
+	__type(key, u32);
+	__type(value, u64);
+} netstack_ifindexfilter SEC(".maps");
+
 static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
 {
 	u64 zero = 0;
@@ -131,6 +139,21 @@ static void record_latency_since(ktime_t tstamp, enum netstacklat_hook hook)
 	record_latency(latency, hook);
 }
 
+static bool filter_ifindex(u32 ifindex)
+{
+	u64 *ifindex_ok;
+
+	if (!user_config.filter_ifindex)
+		// No ifindex filter - all ok
+		return true;
+
+	ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex);
+	if (!ifindex_ok)
+		return false;
+
+	return *ifindex_ok > 0;
+}
+
 static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook)
 {
 	if (bpf_core_field_exists(skb->tstamp_type)) {
@@ -156,6 +179,9 @@ static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook)
 		return;
 	}
 
+	if (!filter_ifindex(skb->skb_iif))
+		return;
+
 	record_latency_since(skb->tstamp, hook);
 }
 
@@ -185,12 +211,18 @@ static bool filter_current_task(void)
 	return filter_pid(tgid);
 }
 
-static void record_socket_latency(struct sock *sk, ktime_t tstamp,
-				  enum netstacklat_hook hook)
+static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
+				  ktime_t tstamp, enum netstacklat_hook hook)
 {
+	u32 ifindex;
+
 	if (!filter_current_task())
 		return;
 
+	ifindex = skb ?
skb->skb_iif : sk->sk_rx_dst_ifindex; + if (!filter_ifindex(ifindex)) + return; + record_latency_since(tstamp, hook); } @@ -259,7 +291,8 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { struct timespec64 *ts = &tss->ts[0]; - record_socket_latency(sk, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, + record_socket_latency(sk, NULL, + (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, NETSTACKLAT_HOOK_TCP_SOCK_READ); return 0; } @@ -268,6 +301,7 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { - record_socket_latency(sk, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ); + record_socket_latency(sk, skb, skb->tstamp, + NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; } diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index c962d157..ce3186d5 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -10,6 +10,7 @@ static const char *__doc__ = #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ static const char *__doc__ = // Maximum number of PIDs to read from user #define MAX_PARSED_PIDS 4096 +#define MAX_PARSED_IFACES 4096 typedef int (*t_parse_val_func)(const char *, void *); @@ -74,7 +76,9 @@ struct netstacklat_config { double report_interval_s; bool enabled_hooks[NETSTACKLAT_N_HOOKS]; int npids; + int nifindices; __u32 *pids; + __u32 *ifindices; }; static const struct option long_options[] = { @@ -84,6 +88,7 @@ static const struct option long_options[] = { { "enable-probes", required_argument, NULL, 'e' }, { "disable-probes", required_argument, NULL, 'd' }, { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, { 0, 0, 0, 0 } }; @@ -408,6 +413,40 @@ static int parse_pids(size_t size, __u32 arr[size], const char *str) parse_pid); } +static int parse_iface(const char *str, void *ifindexout) +{ + int ifindex, err = 0; + long long lval; + + ifindex = if_nametoindex(str); + if (ifindex > IFINDEX_MAX) { + fprintf(stderr, + "%s has ifindex %d which is above the supported limit %d\n", + str, ifindex, IFINDEX_MAX); + return -ENOTSUP; + } else if (ifindex == 0) { + // Not a valid interface name - try parsing it as an index instead + err = parse_bounded_long(&lval, str, 1, IFINDEX_MAX, + "interface"); + if (!err) + ifindex = lval; + } + + if (ifindex > 0) + *(__u32 *)ifindexout = ifindex; + else + fprintf(stderr, + "%s is not a recognized interface name, nor a valid interface index\n", + str); + + return err; +} + +static int parse_ifaces(size_t size, __u32 arr[size], const char *str) +{ + return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", parse_iface); +} + static int parse_arguments(int argc, char *argv[], struct netstacklat_config *conf) { @@ -418,11 +457,14 @@ static int parse_arguments(int argc, char *argv[], double fval; conf->npids = 0; + conf->nifindices = 0; conf->bpf_conf.filter_pid = false; + conf->bpf_conf.filter_ifindex = false; conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids)); - if (!conf->pids) - return -errno; + conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices)); + if (!conf->pids || !conf->ifindices) + return -ENOMEM; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) // All probes enabled by default @@ -477,6 +519,16 @@ static int parse_arguments(int argc, char *argv[], conf->npids += ret; conf->bpf_conf.filter_pid = true; break; + case 'i': // interfaces + ret = parse_ifaces(MAX_PARSED_IFACES - conf->nifindices, + 
conf->ifindices + conf->nifindices,
+					   optarg);
+			if (ret < 0)
+				return ret;
+
+			conf->nifindices += ret;
+			conf->bpf_conf.filter_ifindex = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
@@ -1128,6 +1180,16 @@ int main(int argc, char *argv[])
 		goto exit_destroy_bpf;
 	}
 
+	err = init_filtermap(bpf_map__fd(obj->maps.netstack_ifindexfilter),
+			     config.ifindices, config.nifindices,
+			     sizeof(*config.ifindices));
+	if (err) {
+		libbpf_strerror(err, errmsg, sizeof(errmsg));
+		fprintf(stderr, "Failed filling the ifindex filter map: %s\n",
+			errmsg);
+		goto exit_destroy_bpf;
+	}
+
 	err = netstacklat_bpf__attach(obj);
 	if (err) {
 		libbpf_strerror(err, errmsg, sizeof(errmsg));

diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index cdd571a4..50333f65 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -15,6 +15,8 @@
 
 // The highest possible PID on a Linux system (from /include/linux/threads.h)
 #define PID_MAX_LIMIT (4 * 1024 * 1024)
+// The highest ifindex we expect to encounter
+#define IFINDEX_MAX 16384
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
@@ -53,6 +55,7 @@ struct hist_key {
 
 struct netstacklat_bpf_config {
 	bool filter_pid;
+	bool filter_ifindex;
 };
 
 #endif

From 197e7bf804dd8a42f32b57e64dd31a2eb96754d8 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Mon, 2 Jun 2025 12:45:58 +0200
Subject: [PATCH 08/14] netstacklat: Add filtering for network namespace

Add the -n/--network-namespace option, which lets the user specify
which network namespace ID (inode number) should be monitored. Apply
the filtering to all current netstacklat probe points. Use the value 0
(default) to filter for the network namespace that the netstacklat
application itself is running in. Use value -1 to disable the
filtering, including data from all network namespaces (equivalent to
the behavior before this commit).

Only support filtering for a single network namespace (or all
namespaces if the filtering is disabled). This minimizes runtime
overhead by allowing the ID to filter for to be kept as a constant in
the eBPF program. Supporting multiple values would require an
additional map lookup, and due to the wide range of IDs it would have
to be a hashmap lookup, which would add considerable overhead for a
rather niche use case (monitoring multiple network namespaces).

Note that this option will interact with the -i/--interfaces option, as
the ifindices that the --interfaces option filters for are relative to
the network namespace set by this option.

Signed-off-by: Simon Sundberg
---
 headers/vmlinux/vmlinux_net.h | 25 ++++++++++++++++-
 netstacklat/netstacklat.bpf.c | 46 ++++++++++++++++++++++++-------
 netstacklat/netstacklat.c     | 51 ++++++++++++++++++++++++++++++-----
 netstacklat/netstacklat.h     |  1 +
 4 files changed, 106 insertions(+), 17 deletions(-)

diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h
index 013c42de..15e877b5 100644
--- a/headers/vmlinux/vmlinux_net.h
+++ b/headers/vmlinux/vmlinux_net.h
@@ -3,6 +3,15 @@
 
 typedef __u32 __wsum;
 
+typedef struct {
+	struct net *net;
+} possible_net_t;
+
+struct net_device {
+	int ifindex;
+	possible_net_t nd_net;
+};
+
 typedef unsigned int sk_buff_data_t; // Assumes 64-bit.
FIXME see below /* // BITS_PER_LONG can be wrong with -target bpf @@ -147,10 +156,24 @@ enum ip_conntrack_status { }; struct scm_timestamping_internal { - struct timespec64 ts[3]; + struct timespec64 ts[3]; +}; + +struct ns_common { + struct dentry *stashed; + unsigned int inum; +}; + +struct net { + struct ns_common ns; +}; + +struct sock_common { + possible_net_t skc_net; }; struct sock { + struct sock_common __sk_common; struct dst_entry *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index ecdcb83f..ed19c10d 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -14,6 +14,7 @@ char LICENSE[] SEC("license") = "GPL"; volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { + .network_ns = 0, .filter_pid = false, .filter_ifindex = false, }; @@ -154,7 +155,28 @@ static bool filter_ifindex(u32 ifindex) return *ifindex_ok > 0; } -static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook) +static bool filter_network_ns(u32 ns) +{ + if (user_config.network_ns == 0) + return true; + + return ns == user_config.network_ns; +} + +static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) +{ + /* + * Favor reading from sk due to less redirection (fewer probe reads) + * and skb->dev is not always set. + */ + if (sk) + return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum); + else if (skb) + return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum); + return 0; +} + +static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { if (bpf_core_field_exists(skb->tstamp_type)) { /* @@ -182,6 +204,9 @@ static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook) if (!filter_ifindex(skb->skb_iif)) return; + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + record_latency_since(skb->tstamp, hook); } @@ -223,6 +248,9 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (!filter_ifindex(ifindex)) return; + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + record_latency_since(tstamp, hook); } @@ -230,7 +258,7 @@ SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) { - record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } @@ -238,42 +266,42 @@ SEC("fentry/ip6_rcv_core") int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) { - record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } SEC("fentry/tcp_v4_rcv") int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } SEC("fentry/tcp_v6_rcv") int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } SEC("fentry/udp_rcv") int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } SEC("fentry/udpv6_rcv") int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); + 
record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } SEC("fexit/tcp_queue_rcv") int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { - record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); return 0; } @@ -282,7 +310,7 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, struct sk_buff *skb, int retval) { if (retval == 0) - record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index ce3186d5..5a86fde2 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -16,6 +16,7 @@ static const char *__doc__ = #include #include #include +#include #include #include @@ -82,13 +83,14 @@ struct netstacklat_config { }; static const struct option long_options[] = { - { "help", no_argument, NULL, 'h' }, - { "report-interval", required_argument, NULL, 'r' }, - { "list-probes", no_argument, NULL, 'l' }, - { "enable-probes", required_argument, NULL, 'e' }, - { "disable-probes", required_argument, NULL, 'd' }, - { "pids", required_argument, NULL, 'p' }, - { "interfaces", required_argument, NULL, 'i' }, + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probes", required_argument, NULL, 'e' }, + { "disable-probes", required_argument, NULL, 'd' }, + { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, + { "network-namespace", required_argument, NULL, 'n' }, { 0, 0, 0, 0 } }; @@ -274,6 +276,18 @@ static void list_hooks(FILE *stream) hook_to_description(hook)); } +static long long get_current_network_ns(void) +{ + struct stat ns_stat; + int err; + + err = stat("/proc/self/ns/net", &ns_stat); + if (err) + return -errno; + + return ns_stat.st_ino; +} + static int parse_bounded_double(double *res, const char *str, double low, double high, const char *name) { @@ -452,6 +466,7 @@ static int parse_arguments(int argc, char *argv[], { bool hooks_on = false, hooks_off = false; bool hooks[NETSTACKLAT_N_HOOKS]; + long long network_ns = 0; int opt, err, ret, i; char optstr[64]; double fval; @@ -529,6 +544,13 @@ static int parse_arguments(int argc, char *argv[], conf->nifindices += ret; conf->bpf_conf.filter_ifindex = true; break; + case 'n': // network-namespace + err = parse_bounded_long(&network_ns, optarg, -1, + UINT32_MAX, + optval_to_longopt(opt)->name); + if (err) + return err; + break; case 'h': // help print_usage(stdout, argv[0]); exit(EXIT_SUCCESS); @@ -547,6 +569,21 @@ static int parse_arguments(int argc, char *argv[], return -EINVAL; } + if (network_ns < 0) { + conf->bpf_conf.network_ns = 0; + } else if (network_ns == 0) { + network_ns = get_current_network_ns(); + if (network_ns < 0) { + fprintf(stderr, + "Failed getting current network namespace: %s\n", + strerror(-network_ns)); + return network_ns; + } + conf->bpf_conf.network_ns = network_ns; + } else { + conf->bpf_conf.network_ns = network_ns; + } + return 0; } diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index 50333f65..dcd178e6 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -54,6 +54,7 @@ struct hist_key { }; struct netstacklat_bpf_config { + __u32 network_ns; bool filter_pid; bool filter_ifindex; }; From a806f161777925339bcdd75ccd57d29ebb15d0a1 Mon Sep 17 00:00:00 2001 
From: Simon Sundberg
Date: Mon, 2 Jun 2025 19:03:00 +0200
Subject: [PATCH 09/14] netstacklat: Add filtering for cgroups

Add the -c/--cgroups option that lets the user specify one or more
cgroups (v2) to filter for. The cgroups can either be provided through
their absolute path (including mount path) or as the cgroup ID (inode).
This filter only applies to the probe points running in process
context, just like the PIDs filter, which is currently only the socket
dequeue probes (tcp-read and udp-read).

To keep it simple (and avoid high run-time overhead), only do an exact
match on the cgroup ID. Do not consider the hierarchical relationship
between cgroups. I.e. if the output is filtered for a parent cgroup,
the children of that cgroup will NOT be included unless the children
cgroups have also been explicitly specified.

To support the wide range of possible cgroup IDs, keep the cgroups to
filter for in a sparse hashmap (where only values to include have
entries) instead of the dense array maps (where all possible values
have keys but those to include have non-zero values) like previous
multi-valued filters. This unfortunately adds the considerable overhead
of an additional hash map lookup, but keeping a dense map for all
possible IDs is not feasible.

If running with ebpf-exporter, the cgroup filter map can be populated
and maintained by ebpf-exporter by configuring the cgroup_id_map
setting in the YAML config. Update the YAML config to show an example
of how it can be set (in this case including all cgroups under the
system.slice).

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c |  33 +++++++++--
 netstacklat/netstacklat.c     | 108 +++++++++++++++++++++++++++++++++-
 netstacklat/netstacklat.h     |   3 +
 netstacklat/netstacklat.yaml  |   6 ++
 4 files changed, 145 insertions(+), 5 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index ed19c10d..de177615 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -17,6 +17,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.network_ns = 0,
 	.filter_pid = false,
 	.filter_ifindex = false,
+	.filter_cgroup = false,
 };
 
 /*
@@ -53,6 +54,13 @@ struct {
 	__type(value, u64);
 } netstack_ifindexfilter SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_TRACKED_CGROUPS);
+	__type(key, u64);
+	__type(value, u64);
+} netstack_cgroupfilter SEC(".maps");
+
 static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
 {
 	u64 zero = 0;
@@ -225,15 +233,32 @@ static bool filter_pid(u32 pid)
 	return *pid_ok > 0;
 }
 
+static bool filter_cgroup(u64 cgroup_id)
+{
+	if (!user_config.filter_cgroup)
+		// No cgroup filter - all cgroups ok
+		return true;
+
+	return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL;
+}
+
 static bool filter_current_task(void)
 {
+	bool ok = true;
+	__u64 cgroup;
 	__u32 tgid;
 
-	if (!user_config.filter_pid)
-		return true;
+	if (user_config.filter_pid) {
+		tgid = bpf_get_current_pid_tgid() >> 32;
+		ok = ok && filter_pid(tgid);
+	}
+
+	if (user_config.filter_cgroup) {
+		cgroup = bpf_get_current_cgroup_id();
+		ok = ok && filter_cgroup(cgroup);
+	}
 
-	tgid = bpf_get_current_pid_tgid() >> 32;
-	return filter_pid(tgid);
+	return ok;
 }
 
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,

diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 5a86fde2..861a915b 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -2,6 +2,7 @@
 static const char *__doc__ =
"Netstacklat - Monitor latency to various points in the ingress network stack"; +#define _GNU_SOURCE // to get name_to_handle_at #include #include #include @@ -10,6 +11,7 @@ static const char *__doc__ = #include #include #include +#include #include #include #include @@ -78,8 +80,10 @@ struct netstacklat_config { bool enabled_hooks[NETSTACKLAT_N_HOOKS]; int npids; int nifindices; + int ncgroups; __u32 *pids; __u32 *ifindices; + __u64 *cgroups; }; static const struct option long_options[] = { @@ -91,6 +95,7 @@ static const struct option long_options[] = { { "pids", required_argument, NULL, 'p' }, { "interfaces", required_argument, NULL, 'i' }, { "network-namespace", required_argument, NULL, 'n' }, + { "cgroups", required_argument, NULL, 'c' }, { 0, 0, 0, 0 } }; @@ -461,6 +466,86 @@ static int parse_ifaces(size_t size, __u32 arr[size], const char *str) return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", parse_iface); } +/** + * get_cgroup_id_from_path - Get cgroup id for a particular cgroup path + * @cgroup_workdir: The absolute cgroup path + * + * On success, it returns the cgroup id. On failure it returns 0, + * which is an invalid cgroup id, and errno is set. + * + * Slightly modified version of get_cgroup_id_from_path from + * /tools/testing/selftests/bpf/cgroup_helpers.c that does not + * print out the errors + */ +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +{ + int dirfd, err, flags, mount_id, fhsize; + union { + unsigned long long cgid; + unsigned char raw_bytes[8]; + } id; + struct file_handle *fhp, *fhp2; + unsigned long long ret = 0; + + dirfd = AT_FDCWD; + flags = 0; + fhsize = sizeof(*fhp); + fhp = calloc(1, fhsize); + if (!fhp) + return 0; + + err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags); + if (err >= 0 || fhp->handle_bytes != 8) { + errno = EBADE; + goto free_mem; + } + + fhsize = sizeof(struct file_handle) + fhp->handle_bytes; + fhp2 = realloc(fhp, fhsize); + if (!fhp2) + goto free_mem; + + err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags); + fhp = fhp2; + if (err < 0) + goto free_mem; + + memcpy(id.raw_bytes, fhp->f_handle, 8); + ret = id.cgid; + +free_mem: + free(fhp); + return ret; +} + +static int parse_cgroup(const char *str, void *cgroupout) +{ + long long lval; + __u64 cgroup; + int err = 0; + + cgroup = get_cgroup_id_from_path(str); + + if (cgroup == 0) { + // Not a valid cgroup path - try parse it as an int instead + err = parse_bounded_long(&lval, str, 0, INT64_MAX, "cgroup"); + if (!err) + cgroup = lval; + } + + if (cgroup != 0) + *(__u64 *)cgroupout = cgroup; + else + fprintf(stderr, "%s is not a valid cgroup path or ID\n", str); + + return err; +} + +static int parse_cgroups(size_t size, __u64 arr[size], const char *str) +{ + return parse_strlist_to_arr(str, arr, size, sizeof(*arr), ",", parse_cgroup); +} + static int parse_arguments(int argc, char *argv[], struct netstacklat_config *conf) { @@ -478,7 +563,8 @@ static int parse_arguments(int argc, char *argv[], conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids)); conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices)); - if (!conf->pids || !conf->ifindices) + conf->cgroups = calloc(MAX_TRACKED_CGROUPS, sizeof(*conf->cgroups)); + if (!conf->pids || !conf->ifindices || !conf->cgroups) return -ENOMEM; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) @@ -551,6 +637,16 @@ static int parse_arguments(int argc, char *argv[], if (err) return err; break; + case 'c': // cgroups + ret = parse_cgroups(MAX_TRACKED_CGROUPS - + 
conf->ncgroups, + conf->cgroups, optarg); + if (ret < 0) + return ret; + + conf->ncgroups += ret; + conf->bpf_conf.filter_cgroup = true; + break; case 'h': // help print_usage(stdout, argv[0]); exit(EXIT_SUCCESS); @@ -1227,6 +1323,16 @@ int main(int argc, char *argv[]) goto exit_destroy_bpf; } + err = init_filtermap(bpf_map__fd(obj->maps.netstack_cgroupfilter), + config.cgroups, config.ncgroups, + sizeof(*config.cgroups)); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed filling the cgroup filter map: %s\n", + errmsg); + goto exit_destroy_bpf; + } + err = netstacklat_bpf__attach(obj); if (err) { libbpf_strerror(err, errmsg, sizeof(errmsg)); diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index dcd178e6..7044f5fc 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -17,6 +17,8 @@ #define PID_MAX_LIMIT (4 * 1024 * 1024) // The highest ifindex we expect to encounter #define IFINDEX_MAX 16384 +// The maximum number of different cgroups we can filter for +#define MAX_TRACKED_CGROUPS 4096 #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -57,6 +59,7 @@ struct netstacklat_bpf_config { __u32 network_ns; bool filter_pid; bool filter_ifindex; + bool filter_cgroup; }; #endif diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml index 18e1614a..7ac8ea9c 100644 --- a/netstacklat/netstacklat.yaml +++ b/netstacklat/netstacklat.yaml @@ -25,3 +25,9 @@ metrics: decoders: - name: uint +cgroup_id_map: + name: netstack_cgroupfilter + type: hash + regexps: + - ^.*(system.slice/.*)$ + From bea360a1f105228cf2b3866b5422dd4d1cc25df7 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 9 Jun 2025 17:46:24 +0200 Subject: [PATCH 10/14] netstacklat: Add filtering for non-empty rxqueue Add the -q/--nonempty-queue option, which when enabled only includes latency values when the socket receive queue is non-empty. Only apply this to the socket-read hooks (tcp-socket-read, udp-socket-read), where the probes are triggered AFTER the skbs have been read from the socket queue, and a non-empty queue therefore signifies that additional data remains after the read. The idea behind this hook is to offer a way to reduce overhead (by early aborting for all instances where the socket receive queue is empty) while still capturing latency for applications that can be assumed is under some load (enough load that more data queues up than the application will immediately read). 
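
As an illustration, a hypothetical invocation (assuming the hyphenated
probe names above and the same comma-separated list syntax as the other
multi-valued options) could look like:

  $ ./netstacklat --enable-probes tcp-socket-read,udp-socket-read --nonempty-queue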
Signed-off-by: Simon Sundberg
---
 headers/vmlinux/vmlinux_net.h | 17 +++++++++++++++++
 netstacklat/netstacklat.bpf.c | 25 +++++++++++++++++++++++++
 netstacklat/netstacklat.c     |  5 +++++
 netstacklat/netstacklat.h     |  1 +
 4 files changed, 48 insertions(+)

diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h
index 15e877b5..0d5ffc0b 100644
--- a/headers/vmlinux/vmlinux_net.h
+++ b/headers/vmlinux/vmlinux_net.h
@@ -26,6 +26,22 @@ typedef unsigned char *sk_buff_data_t;
 #endif
 */
 
+struct sk_buff_list {
+	struct sk_buff *next;
+	struct sk_buff *prev;
+};
+
+struct sk_buff_head {
+	union {
+		struct {
+			struct sk_buff *next;
+			struct sk_buff *prev;
+		};
+		struct sk_buff_list list;
+	};
+	__u32 qlen;
+};
+
 struct sk_buff {
 	union {
 		struct {
@@ -174,6 +190,7 @@ struct sock_common {
 
 struct sock {
 	struct sock_common __sk_common;
+	struct sk_buff_head sk_receive_queue;
 	struct dst_entry *sk_rx_dst;
 	int sk_rx_dst_ifindex;
 	u32 sk_rx_dst_cookie;
diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index de177615..3ef17f85 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -18,6 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.filter_pid = false,
 	.filter_ifindex = false,
 	.filter_cgroup = false,
+	.filter_nonempty_sockqueue = false,
 };
 
 /*
@@ -261,11 +262,35 @@ static bool filter_current_task(void)
 	return ok;
 }
 
+/**
+ * skb_queue_empty - check if a queue is empty
+ * @list: queue head
+ *
+ * Returns true if the queue is empty, false otherwise.
+ *
+ * Copied from /include/linux/skbuff.h
+ */
+static inline int skb_queue_empty(const struct sk_buff_head *list)
+{
+	return list->next == (const struct sk_buff *)list;
+}
+
+static bool filter_nonempty_sockqueue(struct sock *sk)
+{
+	if (!user_config.filter_nonempty_sockqueue)
+		return true;
+
+	return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 				  ktime_t tstamp, enum netstacklat_hook hook)
 {
 	u32 ifindex;
 
+	if (!filter_nonempty_sockqueue(sk))
+		return;
+
 	if (!filter_current_task())
 		return;
 
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 861a915b..36c65bd8 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -96,6 +96,7 @@ static const struct option long_options[] = {
 	{ "interfaces",        required_argument, NULL, 'i' },
 	{ "network-namespace", required_argument, NULL, 'n' },
 	{ "cgroups",           required_argument, NULL, 'c' },
+	{ "nonempty-queue",    no_argument,       NULL, 'q' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -560,6 +561,7 @@ static int parse_arguments(int argc, char *argv[],
 	conf->nifindices = 0;
 	conf->bpf_conf.filter_pid = false;
 	conf->bpf_conf.filter_ifindex = false;
+	conf->bpf_conf.filter_nonempty_sockqueue = false;
 
 	conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids));
 	conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices));
@@ -647,6 +649,9 @@ static int parse_arguments(int argc, char *argv[],
 			conf->ncgroups += ret;
 			conf->bpf_conf.filter_cgroup = true;
 			break;
+		case 'q': // nonempty-queue
+			conf->bpf_conf.filter_nonempty_sockqueue = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index 7044f5fc..fc167bbd 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -60,6 +60,7 @@ struct netstacklat_bpf_config {
 	bool filter_pid;
 	bool filter_ifindex;
 	bool filter_cgroup;
+	bool filter_nonempty_sockqueue;
 };
 
 #endif

From 3f9e2334d342bd4851d82820082a87f8e1695a07 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 5 Jun 2025 13:38:46 +0200
Subject: [PATCH 11/14] netstacklat: Dynamically configure map sizes

Make the userspace agent set the BPF map sizes based on the configured
options. This allows more suitable map sizes to be used during run time
than the static limits set in the BPF programs. It avoids wasting
memory on unnecessarily large maps, and might slightly improve hashmap
lookup performance by sizing them based on the expected number of
entries (small enough that many entries may fit in cache, large enough
to avoid excessive hash collisions).

Scale the histogram map based on the expected number of histograms, the
PID and ifindex filter maps to fit the largest key they need to
include, and the cgroup filter map to fit all tracked cgroups.

This also fixes a bug where the maximum allowed PID (PID_MAX_LIMIT) and
ifindex (IFINDEX_MAX) did not fit in their corresponding filter maps
(off-by-one error).

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.c | 74 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 36c65bd8..34ba80ee 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -1012,9 +1012,15 @@ static int report_stats(const struct netstacklat_bpf *obj,
 	return 0;
 }
 
-static int init_histogram_buffer(struct histogram_buffer *buf)
+static int init_histogram_buffer(struct histogram_buffer *buf,
+				 const struct netstacklat_config *conf)
 {
-	int max_hists = NETSTACKLAT_N_HOOKS;
+	int max_hists = 0, i;
+
+	for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) {
+		if (conf->enabled_hooks[i])
+			max_hists++;
+	}
 
 	buf->hists = calloc(max_hists, sizeof(*buf->hists));
 	if (!buf->hists)
@@ -1077,6 +1083,59 @@ static void set_programs_to_load(const struct netstacklat_config *conf,
 	}
 }
 
+static int set_map_sizes(const struct netstacklat_config *conf,
+			 struct netstacklat_bpf *obj, int max_hists)
+{
+	__u32 size;
+	int err, i;
+
+	size = max_hists * HIST_NBUCKETS;
+	err = bpf_map__set_max_entries(obj->maps.netstack_latency_seconds,
+				       size);
+	if (err) {
+		fprintf(stderr, "Failed setting size of histogram map to %u\n",
+			size);
+		return err;
+	}
+
+	// PID filter - arraymap, needs max PID + 1 entries
+	for (i = 0, size = 1; i < conf->npids; i++) {
+		if (conf->pids[i] >= size)
+			size = conf->pids[i] + 1;
+	}
+	err = bpf_map__set_max_entries(obj->maps.netstack_pidfilter, size);
+	if (err) {
+		fprintf(stderr, "Failed setting size of PID filter map to %u\n",
+			size);
+		return err;
+	}
+
+	// ifindex filter - arraymap, needs max ifindex + 1 entries
+	for (i = 0, size = 1; i < conf->nifindices; i++) {
+		if (conf->ifindices[i] >= size)
+			size = conf->ifindices[i] + 1;
+	}
+	err = bpf_map__set_max_entries(obj->maps.netstack_ifindexfilter, size);
+	if (err) {
+		fprintf(stderr,
+			"Failed setting size of ifindex filter map to %u\n",
+			size);
+		return err;
+	}
+
+	// cgroup filter - hashmap, should be ~2x expected number of entries
+	size = conf->bpf_conf.filter_cgroup ?
+		       conf->ncgroups * 2 : 1;
+	err = bpf_map__set_max_entries(obj->maps.netstack_cgroupfilter, size);
+	if (err) {
+		fprintf(stderr,
+			"Failed setting size of cgroup filter map to %u\n",
+			size);
+		return err;
+	}
+
+	return 0;
+}
+
 static int init_filtermap(int map_fd, void *keys, size_t nelem,
 			  size_t elem_size)
 {
@@ -1272,7 +1331,7 @@ int main(int argc, char *argv[])
 		return EXIT_FAILURE;
 	}
 
-	err = init_histogram_buffer(&hist_buf);
+	err = init_histogram_buffer(&hist_buf, &config);
 	if (err) {
 		fprintf(stderr, "Failed allocating buffer for histograms: %s\n",
 			strerror(-err));
@@ -1290,7 +1349,7 @@ int main(int argc, char *argv[])
 
 	obj = netstacklat_bpf__open();
 	if (!obj) {
-		err = libbpf_get_error(obj);
+		err = -errno;
 		libbpf_strerror(err, errmsg, sizeof(errmsg));
 		fprintf(stderr, "Failed opening eBPF object file: %s\n", errmsg);
 		goto exit_sockfd;
@@ -1301,6 +1360,13 @@ int main(int argc, char *argv[])
 
 	set_programs_to_load(&config, obj);
 
+	err = set_map_sizes(&config, obj, hist_buf.max_size);
+	if (err) {
+		libbpf_strerror(err, errmsg, sizeof(errmsg));
+		fprintf(stderr, "Failed configuring map sizes: %s\n", errmsg);
+		goto exit_destroy_bpf;
+	}
+
 	err = netstacklat_bpf__load(obj);
 	if (err) {
 		libbpf_strerror(err, errmsg, sizeof(errmsg));

From 7ab0dc5b63b48365323257f5bfdd4673d83fc615 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Mon, 9 Jun 2025 19:23:40 +0200
Subject: [PATCH 12/14] netstacklat: Add option to group by interface

Add the -I/--groupby-interface option to collect and report the data
on a per-interface (or rather per-ifindex) basis. Note that the
network interfaces are tracked based on their ifindex, so if network
namespace filtering has been disabled and there exist interfaces in
different namespaces with a common ifindex, their data will be merged
into the same histogram.

Always write the interface index rather than the interface name in the
output. While interface names for the network namespace the user space
agent runs in can easily be retrieved with e.g. if_indextoname(), that
will only be valid if the user has configured netstacklat to only
monitor its own network namespace. If a different network namespace is
monitored, or network namespace filtering is disabled, translating to
the interface names in the current namespace might produce misleading
results.

An alternative could be to print out the interface names in case the
current network namespace is the one monitored (the default), or the
index if there's a risk that the data might be from a different
namespace. However, in addition to the added complexity, that would
produce somewhat inconsistent output (i.e. you might get interface
names or interface indices depending on how you configure
netstacklat).
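
For reference, if netstacklat is restricted to the current network
namespace, the translation would be safe. A minimal sketch (the
print_iface helper is hypothetical, not part of netstacklat):

  #include <net/if.h>
  #include <stdio.h>

  /* Hypothetical helper: resolve an ifindex to a name in the *current*
   * namespace, falling back to the raw index when no name is found. */
  static void print_iface(unsigned int ifindex)
  {
          char name[IF_NAMESIZE];

          if (if_indextoname(ifindex, name))
                  printf("interface=%s\n", name);
          else
                  printf("interface=%u\n", ifindex);
  }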
Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 29 ++++++++++++++++---------
 netstacklat/netstacklat.c     | 22 +++++++++++++++++++++-
 netstacklat/netstacklat.h     | 11 +++++++++++
 netstacklat/netstacklat.yaml  |  7 +++++++
 4 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 3ef17f85..efc72359 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -19,6 +19,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.filter_ifindex = false,
 	.filter_cgroup = false,
 	.filter_nonempty_sockqueue = false,
+	.groupby_ifindex = false,
 };
 
 /*
@@ -36,7 +37,7 @@ struct sk_buff___old {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
-	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS);
+	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 16);
 	__type(key, struct hist_key);
 	__type(value, u64);
 } netstack_latency_seconds SEC(".maps");
@@ -135,18 +136,17 @@ static ktime_t time_since(ktime_t tstamp)
 	return now - tstamp;
 }
 
-static void record_latency(ktime_t latency, enum netstacklat_hook hook)
+static void record_latency(ktime_t latency, const struct hist_key *key)
 {
-	struct hist_key key = { .hook = hook };
-	increment_exp2_histogram_nosync(&netstack_latency_seconds, key, latency,
+	increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency,
 					HIST_MAX_LATENCY_SLOT);
 }
 
-static void record_latency_since(ktime_t tstamp, enum netstacklat_hook hook)
+static void record_latency_since(ktime_t tstamp, const struct hist_key *key)
 {
 	ktime_t latency = time_since(tstamp);
 	if (latency >= 0)
-		record_latency(latency, hook);
+		record_latency(latency, key);
 }
 
 static bool filter_ifindex(u32 ifindex)
@@ -187,6 +187,9 @@ static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk)
 static void record_skb_latency(struct sk_buff *skb, struct sock *sk,
 			       enum netstacklat_hook hook)
 {
+	struct hist_key key = { .hook = hook };
+	u32 ifindex;
+
 	if (bpf_core_field_exists(skb->tstamp_type)) {
 		/*
 		 * For kernels >= v6.11 the tstamp_type being non-zero
@@ -210,13 +213,17 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk,
 		return;
 	}
 
-	if (!filter_ifindex(skb->skb_iif))
+	ifindex = skb->skb_iif;
+	if (!filter_ifindex(ifindex))
 		return;
 
 	if (!filter_network_ns(get_network_ns(skb, sk)))
 		return;
 
-	record_latency_since(skb->tstamp, hook);
+	if (user_config.groupby_ifindex)
+		key.ifindex = ifindex;
+
+	record_latency_since(skb->tstamp, &key);
 }
 
 static bool filter_pid(u32 pid)
@@ -286,6 +293,7 @@ static bool filter_nonempty_sockqueue(struct sock *sk)
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 				  ktime_t tstamp, enum netstacklat_hook hook)
 {
+	struct hist_key key = { .hook = hook };
 	u32 ifindex;
 
 	if (!filter_nonempty_sockqueue(sk))
@@ -301,7 +309,10 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 	if (!filter_network_ns(get_network_ns(skb, sk)))
 		return;
 
-	record_latency_since(tstamp, hook);
+	if (user_config.groupby_ifindex)
+		key.ifindex = ifindex;
+
+	record_latency_since(tstamp, &key);
 }
 
 SEC("fentry/ip_rcv_core")
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 34ba80ee..0884cda7 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -97,6 +97,7 @@ static const struct option long_options[] = {
 	{ "network-namespace", required_argument, NULL, 'n' },
 	{ "cgroups",           required_argument, NULL, 'c' },
 	{ "nonempty-queue",    no_argument,       NULL, 'q' },
+	{ "groupby-interface", no_argument,       NULL, 'I' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -562,6 +563,7 @@ static int parse_arguments(int argc, char *argv[],
 	conf->bpf_conf.filter_pid = false;
 	conf->bpf_conf.filter_ifindex = false;
 	conf->bpf_conf.filter_nonempty_sockqueue = false;
+	conf->bpf_conf.groupby_ifindex = false;
 
 	conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids));
 	conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices));
@@ -649,9 +651,13 @@ static int parse_arguments(int argc, char *argv[],
 			conf->ncgroups += ret;
 			conf->bpf_conf.filter_cgroup = true;
 			break;
+
 		case 'q': // nonempty-queue
 			conf->bpf_conf.filter_nonempty_sockqueue = true;
 			break;
+		case 'I': // groupby-interface
+			conf->bpf_conf.groupby_ifindex = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
@@ -822,13 +828,22 @@ static void print_log2hist(FILE *stream, size_t n, const __u64 hist[n],
 
 static void print_histkey(FILE *stream, const struct hist_key *key)
 {
 	fprintf(stream, "%s", hook_to_str(key->hook));
+
+	if (key->ifindex)
+		fprintf(stream, ", interface=%u", key->ifindex);
 }
 
 static int cmp_histkey(const void *val1, const void *val2)
 {
 	const struct hist_key *key1 = val1, *key2 = val2;
 
-	return key1->hook == key2->hook ? 0 : key1->hook > key2->hook ? 1 : -1;
+	if (key1->hook != key2->hook)
+		return key1->hook > key2->hook ? 1 : -1;
+
+	if (key1->ifindex != key2->ifindex)
+		return key1->ifindex > key2->ifindex ? 1 : -1;
+
+	return 0;
 }
 
 static int cmp_histentry(const void *val1, const void *val2)
@@ -1022,6 +1037,11 @@ static int init_histogram_buffer(struct histogram_buffer *buf,
 			max_hists++;
 	}
 
+	if (conf->bpf_conf.groupby_ifindex)
+		max_hists *= conf->bpf_conf.filter_ifindex ?
+				     min(conf->nifindices, 64) :
+				     32;
+
 	buf->hists = calloc(max_hists, sizeof(*buf->hists));
 	if (!buf->hists)
 		return -errno;
diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index fc167bbd..a6cfaf6f 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -33,6 +33,15 @@
 })
 #endif
 
+#ifndef min
+#define min(a, b)                   \
+	({                          \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		_a < _b ? _a : _b;  \
+	})
+#endif
+
 enum netstacklat_hook {
 	NETSTACKLAT_HOOK_INVALID = 0,
 	NETSTACKLAT_HOOK_IP_RCV,
@@ -51,6 +60,7 @@ enum netstacklat_hook {
  * member is named "bucket" and is the histogram bucket index.
  */
 struct hist_key {
+	__u32 ifindex;
 	__u16 hook;   // need well defined size for ebpf-exporter to decode
 	__u16 bucket; // needs to be last to be compatible with ebpf-exporter
 };
@@ -61,6 +71,7 @@ struct netstacklat_bpf_config {
 	bool filter_ifindex;
 	bool filter_cgroup;
 	bool filter_nonempty_sockqueue;
+	bool groupby_ifindex;
 };
 
 #endif
diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml
index 7ac8ea9c..e3305457 100644
--- a/netstacklat/netstacklat.yaml
+++ b/netstacklat/netstacklat.yaml
@@ -7,6 +7,13 @@ metrics:
       bucket_max: 34
       bucket_multiplier: 0.000000001 # nanoseconds to seconds
       labels:
+        - name: iface
+          size: 4
+          decoders:
+            # If including output from a different network namespace than ebpf-exporter
+            # you probably just want to decode as a uint (ifindex) instead:
+            # - name: uint # The ifname decoder apparently doesn't need a preceding uint decoder like the others
+            - name: ifname
         - name: hook
           size: 2
           decoders:

From 31ae18679c0673322f20a59742c391f926c7d293 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Mon, 9 Jun 2025 19:17:45 +0200
Subject: [PATCH 13/14] netstacklat: Add option to group by cgroup

Add the -C/--groupby-cgroup option to collect and report data on a
per-cgroup basis.
Just like the -c/--cgroups option, this will only apply to the probes
running in process context, which is currently only tcp-socket-read
and udp-socket-read.

When reporting the data, print out the cgroup ID (inode number)
directly instead of the cgroup path. As far as I can tell, the only
way to resolve the ID into a path is to walk the entire cgroup mount
(e.g. /sys/fs/cgroup) and stat each path to find the matching inode.
Doing this every time the cgroup needs to be printed seems highly
inefficient, and to create an efficient cache the most suitable data
structure seems like a hashmap, which C lacks. Adding support for
printing out the cgroup path would thus be a significant
implementation effort for something we will ultimately rely on
ebpf-exporter for anyway.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 18 +++++++++++-------
 netstacklat/netstacklat.c     | 16 ++++++++++++++++
 netstacklat/netstacklat.h     |  2 ++
 netstacklat/netstacklat.yaml  |  5 +++++
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index efc72359..574cdbd4 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -20,6 +20,7 @@ volatile const struct netstacklat_bpf_config user_config = {
 	.filter_cgroup = false,
 	.filter_nonempty_sockqueue = false,
 	.groupby_ifindex = false,
+	.groupby_cgroup = false,
 };
 
 /*
@@ -37,7 +38,7 @@ struct sk_buff___old {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
-	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 16);
+	__uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64);
 	__type(key, struct hist_key);
 	__type(value, u64);
 } netstack_latency_seconds SEC(".maps");
@@ -250,10 +251,9 @@ static bool filter_cgroup(u64 cgroup_id)
 	return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL;
 }
 
-static bool filter_current_task(void)
+static bool filter_current_task(u64 cgroup)
 {
 	bool ok = true;
-	__u64 cgroup;
 	__u32 tgid;
 
 	if (user_config.filter_pid) {
@@ -261,10 +261,8 @@ static bool filter_current_task(void)
 		ok = ok && filter_pid(tgid);
 	}
 
-	if (user_config.filter_cgroup) {
-		cgroup = bpf_get_current_cgroup_id();
+	if (user_config.filter_cgroup)
 		ok = ok && filter_cgroup(cgroup);
-	}
 
 	return ok;
 }
@@ -294,12 +292,16 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 				  ktime_t tstamp, enum netstacklat_hook hook)
 {
 	struct hist_key key = { .hook = hook };
+	u64 cgroup = 0;
 	u32 ifindex;
 
 	if (!filter_nonempty_sockqueue(sk))
 		return;
 
-	if (!filter_current_task())
+	if (user_config.filter_cgroup || user_config.groupby_cgroup)
+		cgroup = bpf_get_current_cgroup_id();
+
+	if (!filter_current_task(cgroup))
 		return;
 
 	ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
@@ -311,6 +313,8 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 	if (user_config.groupby_ifindex)
 		key.ifindex = ifindex;
+	if (user_config.groupby_cgroup)
+		key.cgroup = cgroup;
 
 	record_latency_since(tstamp, &key);
 }
diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c
index 0884cda7..7a311e28 100644
--- a/netstacklat/netstacklat.c
+++ b/netstacklat/netstacklat.c
@@ -98,6 +98,7 @@ static const struct option long_options[] = {
 	{ "cgroups",           required_argument, NULL, 'c' },
 	{ "nonempty-queue",    no_argument,       NULL, 'q' },
 	{ "groupby-interface", no_argument,       NULL, 'I' },
+	{ "groupby-cgroup",    no_argument,       NULL, 'C' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -564,6 +565,7 @@ static int parse_arguments(int argc, char *argv[],
 	conf->bpf_conf.filter_ifindex = false;
 	conf->bpf_conf.filter_nonempty_sockqueue = false;
 	conf->bpf_conf.groupby_ifindex = false;
+	conf->bpf_conf.groupby_cgroup = false;
 
 	conf->pids = calloc(MAX_PARSED_PIDS, sizeof(*conf->pids));
 	conf->ifindices = calloc(MAX_PARSED_IFACES, sizeof(*conf->ifindices));
@@ -658,6 +660,9 @@ static int parse_arguments(int argc, char *argv[],
 		case 'I': // groupby-interface
 			conf->bpf_conf.groupby_ifindex = true;
 			break;
+		case 'C': // groupby-cgroup
+			conf->bpf_conf.groupby_cgroup = true;
+			break;
 		case 'h': // help
 			print_usage(stdout, argv[0]);
 			exit(EXIT_SUCCESS);
@@ -831,6 +836,9 @@ static void print_histkey(FILE *stream, const struct hist_key *key)
 
 	if (key->ifindex)
 		fprintf(stream, ", interface=%u", key->ifindex);
+
+	if (key->cgroup)
+		fprintf(stream, ", cgroup=%llu", key->cgroup);
 }
 
 static int cmp_histkey(const void *val1, const void *val2)
@@ -843,6 +851,9 @@ static int cmp_histkey(const void *val1, const void *val2)
 	if (key1->ifindex != key2->ifindex)
 		return key1->ifindex > key2->ifindex ? 1 : -1;
 
+	if (key1->cgroup != key2->cgroup)
+		return key1->cgroup > key2->cgroup ? 1 : -1;
+
 	return 0;
 }
 
@@ -1042,6 +1053,11 @@ static int init_histogram_buffer(struct histogram_buffer *buf,
 				     min(conf->nifindices, 64) :
 				     32;
 
+	if (conf->bpf_conf.groupby_cgroup)
+		max_hists *= conf->bpf_conf.filter_cgroup ?
+				     min(conf->ncgroups, 128) :
+				     64;
+
 	buf->hists = calloc(max_hists, sizeof(*buf->hists));
 	if (!buf->hists)
 		return -errno;
diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h
index a6cfaf6f..4811da4c 100644
--- a/netstacklat/netstacklat.h
+++ b/netstacklat/netstacklat.h
@@ -60,6 +60,7 @@ enum netstacklat_hook {
 * member is named "bucket" and is the histogram bucket index.
 */
 struct hist_key {
+	__u64 cgroup;
 	__u32 ifindex;
 	__u16 hook;   // need well defined size for ebpf-exporter to decode
 	__u16 bucket; // needs to be last to be compatible with ebpf-exporter
 };
@@ -72,6 +73,7 @@ struct netstacklat_bpf_config {
 	bool filter_cgroup;
 	bool filter_nonempty_sockqueue;
 	bool groupby_ifindex;
+	bool groupby_cgroup;
 };
 
 #endif
diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml
index e3305457..3b6e5dc8 100644
--- a/netstacklat/netstacklat.yaml
+++ b/netstacklat/netstacklat.yaml
@@ -7,6 +7,11 @@ metrics:
       bucket_multiplier: 0.000000001 # nanoseconds to seconds
       labels:
+        - name: cgroup
+          size: 8
+          decoders:
+            - name: uint
+            - name: cgroup
         - name: iface
           size: 4
           decoders:

From 35ac2d00fc374905531378e950d287e0166b5d36 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Tue, 17 Jun 2025 11:42:36 +0200
Subject: [PATCH 14/14] netstacklat: Add script to fill in filter maps
 externally

Many of the options for filtering on a subset of values (--pids,
--interfaces, and --cgroups) rely on filling BPF maps with the values
to include. When running together with the provided netstacklat user
space process, that process handles filling these maps with the values
passed on the command line. However, when using the netstacklat eBPF
programs with some external loader, like ebpf-exporter, these maps
have to be filled by the user in some other manner.

To make it easier to use netstacklat with external eBPF loaders,
provide the fill_filter_maps.sh script. In the script, rely on bpftool
to fill the BPF maps (based on the map names as defined in
netstacklat.bpf.c).

Make the script support all the current filters that make use of maps,
i.e. PIDs (pid), network interfaces (iface) and cgroups (cgroup). The
pid option only supports integers. The iface option supports either
interface names or their ifindex. The cgroup option accepts either the
cgroup ID (their inode number) or the full path to the cgroup.
Examples:

$ ./fill_filter_maps.sh pid 1234 98765
$ ./fill_filter_maps.sh iface veth0 lo 123
$ ./fill_filter_maps.sh cgroup /sys/fs/cgroup/system.slice/prometheus.service/ 12345

Note that for the values in the filter maps to actually be used by the
netstacklat eBPF programs, the corresponding filter_{pid,ifindex,cgroup}
value must be true (by default they're all false). The netstacklat
user space process normally takes care of enabling these as needed,
but when using an external loader the easiest way to enable them is
probably to just change them in user_config at the start of
netstacklat.bpf.c (and recompile).

Also note that this script can be used together with the netstacklat
user space loader to add additional values to filter for after the
program has started. However, the netstacklat user space loader
automatically minimizes the size of the filter maps since commit
"netstacklat: Dynamically configure map sizes". So unless the initial
filter values provided as netstacklat CLI arguments resulted in
sufficiently large filter maps, the fill_filter_maps.sh script may not
be able to add the desired values.

Finally, note that as bpftool expects to be fed the individual bytes
of the keys, the order of the bytes will depend on the endianness of
the machine. Currently the script only supports little-endian
machines; big-endian support can be added later if needed.
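
To make the byte encoding concrete, adding PID 1234 to the (u32-keyed)
PID filter map by hand corresponds to the following bpftool invocation
on a little-endian machine, where 1234 = 0x000004d2 and the u64 value 1
marks the key as included:

$ bpftool map update name netstack_pidfil \
      key 0xd2 0x04 0x00 0x00 \
      value 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00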
Signed-off-by: Simon Sundberg
---
 netstacklat/fill_filter_maps.sh | 134 ++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100755 netstacklat/fill_filter_maps.sh

diff --git a/netstacklat/fill_filter_maps.sh b/netstacklat/fill_filter_maps.sh
new file mode 100755
index 00000000..b3d10ea7
--- /dev/null
+++ b/netstacklat/fill_filter_maps.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+declare -rA bpf_maps=(
+	[pid]="netstack_pidfil"
+	[iface]="netstack_ifinde"
+	[cgroup]="netstack_cgroup"
+)
+
+declare -rA key_converters=(
+	[pid]=pid_to_bpftool
+	[iface]=iface_to_bpftool
+	[cgroup]=cgroup_to_bpftool
+)
+
+print_usage()
+{
+	echo "usage: $0 TYPE val1 [val2 val3 val4...]"
+	echo "TYPE: { $(echo "${!bpf_maps[@]}" | tr ' ' '\|') }"
+}
+
+pid_to_bpftool()
+{
+	local val="$1"
+
+	uint_to_bpftool_u32 "$val"
+}
+
+# Supports ifname or ifindex
+iface_to_bpftool()
+{
+	local val="$1"
+
+	if ! is_uint "$val"; then
+		val="$(ifname_to_idx "$val")" || return 1
+	fi
+
+	uint_to_bpftool_u32 "$val"
+}
+
+# Supports full cgroup path or direct cgroup id (inode)
+cgroup_to_bpftool()
+{
+	local val="$1"
+
+	if ! is_uint "$val"; then
+		val="$(cgroup_path_to_id "$val")" || return 1
+	fi
+
+	uint_to_bpftool_u64 "$val"
+}
+
+is_uint()
+{
+	local val="$1"
+
+	[[ "$val" == +([0-9]) ]]
+}
+
+ifname_to_idx()
+{
+	local ifname="$1"
+	local ifindex=0
+
+	ifindex="$(ip address show "$ifname" | grep "[0-9][0-9]*: ${ifname}.*: <")"
+	ifindex="${ifindex%%:*}"
+
+	if [[ -z "$ifindex" ]]; then
+		return 1
+	fi
+
+	echo "$ifindex"
+}
+
+cgroup_path_to_id()
+{
+	local cpath="$1"
+
+	stat -L -c '%i' "$(realpath "$cpath")"
+}
+
+# When providing keys/values to bpftool map update, it basically wants one
+# argument for each byte in the key/value. So if you have a u32 key (as in any
+# array map) and you want to update key 1234, then you will have to provide
+# key 0xd2 0x04 0x00 0x00 (1234 in hex split up as the 4 bytes in a u32 in
+# little-endian order). These helpers assume you're on a little-endian machine.
+uint_to_bpftool_u32()
+{
+	local val="$1"
+
+	printf "0x%02x 0x%02x 0x%02x 0x%02x\n" \
+		$((val & 0xff)) $(((val >> 8) & 0xff)) $(((val >> 16) & 0xff)) $(((val >> 24) & 0xff))
+}
+
+uint_to_bpftool_u64()
+{
+	local val="$1"
+
+	printf "0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n" \
+		$((val & 0xff)) $(((val >> 8) & 0xff)) $(((val >> 16) & 0xff)) $(((val >> 24) & 0xff)) \
+		$(((val >> 32) & 0xff)) $(((val >> 40) & 0xff)) $(((val >> 48) & 0xff)) $(((val >> 56) & 0xff))
+}
+
+add_to_filter_map()
+{
+	local map="$1"
+	local key="$2"
+
+	# All the filter maps use a u64 as value
+	# Set the value to 1 to indicate that the key should be included in the filter
+	bpftool map update name "$map" key $key value $(uint_to_bpftool_u64 1)
+}
+
+if (( $# < 2 )); then
+	print_usage
+	exit 1
+fi
+
+type=$1
+if [[ -z "${bpf_maps[$type]}" ]]; then
+	echo "Error: unrecognized type $type, must be one of: ${!bpf_maps[*]}"
+	exit 1
+fi
+
+map=${bpf_maps[$type]}
+converter=${key_converters[$type]}
+
+for val in "${@:2}"; do
+	key=$($converter "$val")
+	if ! add_to_filter_map "$map" "$key"; then
+		echo "Error adding $val ($key) to map $map"
+		exit 1
+	fi
+done
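
A quick way to sanity-check the script (assuming the netstacklat maps
are loaded and visible to bpftool) is to add an entry and then dump the
corresponding map:

$ ./fill_filter_maps.sh iface lo
$ bpftool map dump name netstack_ifinde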