diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c
new file mode 100644
index 00000000..17c69d2c
--- /dev/null
+++ b/examples/netstacklat.bpf.c
@@ -0,0 +1,490 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <vmlinux.h>
+//#include
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "netstacklat.h"
+#include "bits.bpf.h"
+
+char LICENSE[] SEC("license") = "GPL";
+
+volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S);
+volatile const struct netstacklat_bpf_config user_config = {
+        .network_ns = 0,
+        .filter_pid = false,
+        .filter_ifindex = true,
+        .filter_cgroup = true,
+        .filter_nonempty_sockqueue = true,
+#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1
+        .groupby_ifindex = true,
+        .groupby_cgroup = true,
+};
+
+/* This provides an easy way to disable some hooks at compile time */
+//#define CONFIG_HOOKS_EARLY_RCV 1
+#undef CONFIG_HOOKS_EARLY_RCV
+//#define CONFIG_HOOKS_ENQUEUE 1
+#undef CONFIG_HOOKS_ENQUEUE
+#define CONFIG_HOOKS_DEQUEUE 1
+
+/* Allows compile-time disabling of the ifindex filter map, as the YAML config cannot control it */
+//#define CONFIG_IFINDEX_FILTER_MAP 1
+#undef CONFIG_IFINDEX_FILTER_MAP
+
+/* Allows compile-time disabling of the PID filter map, as it is very large */
+//#define CONFIG_PID_FILTER_MAP 1
+#undef CONFIG_PID_FILTER_MAP
+
+/*
+ * Alternative definition of sk_buff to handle renaming of the field
+ * mono_delivery_time to tstamp_type. See
+ * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
+ */
+struct sk_buff___old {
+        union {
+                ktime_t tstamp;
+                u64 skb_mstamp_ns;
+        };
+        __u8 mono_delivery_time: 1;
+} __attribute__((preserve_access_index));
+
+/* NOTICE: max_entries needs to be adjusted based on the maximum
+ * number of cgroups and ifindexes (that we "groupby" when collecting)
+ * and the "enabled" hooks (as we want to disable some)
+ */
+#define N_CGROUPS 2 /* depends on cgroup_id_map matches in YAML config */
+#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it the same until we disable some */
+#define N_IFACES 64 /* In production, only interested in ext0 and vlan100@ext0 */
+struct {
+        __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+        __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64);
+        __type(key, struct hist_key);
+        __type(value, u64);
+} netstack_latency_seconds SEC(".maps");
+
+#ifdef CONFIG_PID_FILTER_MAP
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, PID_MAX_LIMIT);
+        __type(key, u32);
+        __type(value, u64);
+} netstack_pidfilter SEC(".maps");
+#endif
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, IFINDEX_MAX);
+        __type(key, u32);
+        __type(value, u64);
+} netstack_ifindexfilter SEC(".maps");
+#endif
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, MAX_TRACKED_CGROUPS);
+        __type(key, u64);
+        __type(value, u64);
+} netstack_cgroupfilter SEC(".maps");
+
+static ktime_t time_since(ktime_t tstamp)
+{
+        ktime_t now;
+
+        if (tstamp <= 0)
+                return -1;
+
+        now = bpf_ktime_get_tai_ns() - TAI_OFFSET;
+        if (tstamp > now)
+                return -1;
+
+        return now - tstamp;
+}
+
+/* Determines whether the ebpf_exporter macro or the local C implementation is used */
+#define CONFIG_MAP_MACROS 1
+#ifdef CONFIG_MAP_MACROS
+#include "maps.bpf.h"
+#define _record_latency_since(tstamp, key) \
+        ktime_t latency = time_since(tstamp); \
+        if (latency >= 0) \
+                increment_exp2_histogram_nosync(&netstack_latency_seconds, \
+                                                key, latency, \
+                                                HIST_MAX_LATENCY_SLOT);
+#else /* !CONFIG_MAP_MACROS */
+#define _record_latency_since(tstamp, key) \
+        record_latency_since(tstamp, &key)
+
+static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
+{
+        u64 zero = 0;
+        u64 *val;
+
+        val = bpf_map_lookup_elem(map, key);
+        if (val)
+                return val;
+
+        // Key not in map - try to insert it and look it up again
+        bpf_map_update_elem(map, key, &zero, BPF_NOEXIST);
+        return bpf_map_lookup_elem(map, key);
+}
+
+static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket)
+{
+        u32 bucket = log2l(value);
+
+        // Right-inclusive histogram, so "round up" the log value
+        if (bucket > 0 && 1ULL << bucket < value)
+                bucket++;
+
+        if (bucket > max_bucket)
+                bucket = max_bucket;
+
+        return bucket;
+}
+
+/*
+ * Same call signature as the increment_exp2_histogram_nosync macro from
+ * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h
+ * but provided as a function.
+ *
+ * Unlike the macro, only works with keys of type struct hist_key. The hist_key
+ * struct must be provided by value (rather than as a pointer) to keep the same
+ * call signature as the ebpf-exporter macro, although this will get inefficient
+ * if struct hist_key grows large.
+ */
+static void increment_exp2_histogram_nosync(void *map, struct hist_key key,
+                                            u64 value, u32 max_bucket)
+{
+        u64 *bucket_count;
+
+        // Increment histogram
+        key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket);
+        bucket_count = lookup_or_zeroinit_histentry(map, &key);
+        if (bucket_count)
+                (*bucket_count)++;
+
+        // Increment sum at end of histogram
+        if (value == 0)
+                return;
+
+        key.bucket = max_bucket + 1;
+        bucket_count = lookup_or_zeroinit_histentry(map, &key);
+        if (bucket_count)
+                *bucket_count += value;
+}
+
+static void record_latency(ktime_t latency, const struct hist_key *key)
+{
+        increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency,
+                                        HIST_MAX_LATENCY_SLOT);
+}
+static void record_latency_since(ktime_t tstamp, const struct hist_key *key)
+{
+        ktime_t latency = time_since(tstamp);
+        if (latency >= 0)
+                record_latency(latency, key);
+}
+#endif /* !CONFIG_MAP_MACROS */
+
+static bool filter_ifindex(u32 ifindex)
+{
+        if (!user_config.filter_ifindex)
+                // No ifindex filter - all ok
+                return true;
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+        u64 *ifindex_ok;
+
+        ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex);
+        if (!ifindex_ok)
+                return false;
+
+        return *ifindex_ok > 0;
+#else
+        /* Hack for production:
+         * - We want to exclude 'lo', which has ifindex == 1.
+         * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5).
+         */
+        if (ifindex > 1 && ifindex < 6)
+                return true;
+
+        return false;
+#endif
+}
+
+static bool filter_network_ns(u32 ns)
+{
+        if (user_config.network_ns == 0)
+                return true;
+
+        return ns == user_config.network_ns;
+}
+
+static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk)
+{
+        /*
+         * Favor reading from sk due to less indirection (fewer probe reads),
+         * and because skb->dev is not always set.
+         */
+        if (sk)
+                return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum);
+        else if (skb)
+                return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum);
+        return 0;
+}
+
+#if defined(CONFIG_HOOKS_EARLY_RCV) || defined(CONFIG_HOOKS_ENQUEUE)
+static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook)
+{
+        struct hist_key key = { .hook = hook };
+        u32 ifindex;
+
+        if (bpf_core_field_exists(skb->tstamp_type)) {
+                /*
+                 * For kernels >= v6.11, the tstamp_type being non-zero
+                 * (i.e. not SKB_CLOCK_REALTIME) implies that skb->tstamp holds
+                 * a preserved TX timestamp rather than an RX timestamp. See
+                 * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/
+                 */
+                if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0)
+                        return;
+
+        } else {
+                /*
+                 * For kernels < v6.11, the field was called mono_delivery_time
+                 * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/
+                 * Kernels < v5.18 do not have the mono_delivery_time field either,
+                 * but we do not support those anyway (as they lack the
+                 * bpf_ktime_get_tai_ns helper).
+                 */
+                struct sk_buff___old *skb_old = (void *)skb;
+                if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0)
+                        return;
+        }
+
+        ifindex = skb->skb_iif;
+        if (!filter_ifindex(ifindex))
+                return;
+
+        if (!filter_network_ns(get_network_ns(skb, sk)))
+                return;
+
+        if (user_config.groupby_ifindex)
+                key.ifindex = ifindex;
+
+        _record_latency_since(skb->tstamp, key);
+}
+#endif
+
+#ifdef CONFIG_PID_FILTER_MAP
+static bool filter_pid(u32 pid)
+{
+        u64 *pid_ok;
+
+        if (!user_config.filter_pid)
+                // No PID filter - all PIDs ok
+                return true;
+
+        pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid);
+        if (!pid_ok)
+                return false;
+
+        return *pid_ok > 0;
+}
+#endif /* CONFIG_PID_FILTER_MAP */
+
+static bool filter_cgroup(u64 cgroup_id)
+{
+        if (!user_config.filter_cgroup)
+                // No cgroup filter - all cgroups ok
+                return true;
+
+        return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL;
+}
+
+static bool filter_current_task(u64 cgroup)
+{
+        bool ok = true;
+
+#ifdef CONFIG_PID_FILTER_MAP
+        __u32 tgid;
+
+        if (user_config.filter_pid) {
+                tgid = bpf_get_current_pid_tgid() >> 32;
+                ok = ok && filter_pid(tgid);
+        }
+#endif
+        if (user_config.filter_cgroup)
+                ok = ok && filter_cgroup(cgroup);
+
+        return ok;
+}
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/**
+ * skb_queue_empty - check if a queue is empty
+ * @list: queue head
+ *
+ * Returns true if the queue is empty, false otherwise.
+ *
+ * Copied from /include/linux/skbuff.h
+ */
+static inline int skb_queue_empty(const struct sk_buff_head *list)
+{
+        return READ_ONCE(list->next) == (const struct sk_buff *)list;
+}
+
+static bool filter_nonempty_sockqueue(struct sock *sk)
+{
+#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE
+        if (!user_config.filter_nonempty_sockqueue)
+                return true;
+#endif
+
+        return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+/* To lower runtime overhead, skip recording timestamps for sockets with very
+ * few packets. Use sk_buff_head->qlen to check whether the queue has more
+ * than e.g. 2 elements.
+ */
+static inline __u32 sk_queue_len(const struct sk_buff_head *list_)
+{
+        return READ_ONCE(list_->qlen);
+}
+
+static bool filter_queue_len(struct sock *sk, const __u32 above_len)
+{
+        if (sk_queue_len(&sk->sk_receive_queue) > above_len)
+                return true;
+        return false;
+}
+
+static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
+                                  ktime_t tstamp, enum netstacklat_hook hook)
+{
+        struct hist_key key = { .hook = hook };
+        u64 cgroup = 0;
+        u32 ifindex;
+
+        if (user_config.filter_cgroup || user_config.groupby_cgroup)
+                cgroup = bpf_get_current_cgroup_id();
+
+        if (!filter_current_task(cgroup))
+                return;
+
+        ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
+        if (!filter_ifindex(ifindex))
+                return;
+
+        if (!filter_network_ns(get_network_ns(skb, sk)))
+                return;
+
+        if (user_config.groupby_ifindex)
+                key.ifindex = ifindex;
+        if (user_config.groupby_cgroup)
+                key.cgroup = cgroup;
+
+        _record_latency_since(tstamp, key);
+}
+
+#ifdef CONFIG_HOOKS_EARLY_RCV
+SEC("fentry/ip_rcv_core")
+int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block,
+             void *tp, void *res, bool compat_mode)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+        return 0;
+}
+
+SEC("fentry/ip6_rcv_core")
+int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block,
+             void *tp, void *res, bool compat_mode)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+        return 0;
+}
+
+SEC("fentry/tcp_v4_rcv")
+int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+        return 0;
+}
+
+SEC("fentry/tcp_v6_rcv")
+int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+        return 0;
+}
+
+SEC("fentry/udp_rcv")
+int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+        return 0;
+}
+
+SEC("fentry/udpv6_rcv")
+int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb)
+{
+        record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_EARLY_RCV */
+
+#ifdef CONFIG_HOOKS_ENQUEUE
+SEC("fexit/tcp_queue_rcv")
+int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb)
+{
+        record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED);
+        return 0;
+}
+
+SEC("fexit/__udp_enqueue_schedule_skb")
+int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk,
+             struct sk_buff *skb, int retval)
+{
+        if (retval == 0)
+                record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_ENQUEUE */
+
+#ifdef CONFIG_HOOKS_DEQUEUE
+SEC("fentry/tcp_recv_timestamp")
+int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
+             struct scm_timestamping_internal *tss)
+{
+        if (!filter_nonempty_sockqueue(sk))
+                return 0;
+
+        if (!filter_queue_len(sk, 3))
+                return 0;
+
+        struct timespec64 *ts = &tss->ts[0];
+        record_socket_latency(sk, NULL,
+                              (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
+                              NETSTACKLAT_HOOK_TCP_SOCK_READ);
+        return 0;
+}
+
+SEC("fentry/skb_consume_udp")
+int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
+             int len)
+{
+        if (!filter_nonempty_sockqueue(sk))
+                return 0;
+
+        record_socket_latency(sk, skb, skb->tstamp,
+                              NETSTACKLAT_HOOK_UDP_SOCK_READ);
+        return 0;
+}
+#endif /* CONFIG_HOOKS_DEQUEUE */
diff --git a/examples/netstacklat.h b/examples/netstacklat.h
new file mode 100644
index 00000000..4811da4c
--- /dev/null
+++ b/examples/netstacklat.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef NETSTACKLAT_H
+#define NETSTACKLAT_H
+
+#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s
+/*
+ * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key"
+ * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys)
+ * that ebpf_exporter expects for exp2 hists (see how it's used in the
+ * increment_exp2_histogram_nosync() function)
+ */
+#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2)
+
+#define NS_PER_S 1000000000
+
+// The highest possible PID on a Linux system (from /include/linux/threads.h)
+#define PID_MAX_LIMIT (4 * 1024 * 1024)
+// The highest ifindex we expect to encounter
+#define IFINDEX_MAX 16384
+// The maximum number of different cgroups we can filter for
+#define MAX_TRACKED_CGROUPS 4096
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+#endif
+
+#ifndef max
+#define max(a, b)                           \
+        ({                                  \
+                typeof(a) _a = (a);         \
+                typeof(b) _b = (b);         \
+                _a > _b ? _a : _b;          \
+        })
+#endif
+
+#ifndef min
+#define min(a, b)                           \
+        ({                                  \
+                typeof(a) _a = (a);         \
+                typeof(b) _b = (b);         \
+                _a < _b ? _a : _b;          \
+        })
+#endif
+
+enum netstacklat_hook {
+        NETSTACKLAT_HOOK_INVALID = 0,
+        NETSTACKLAT_HOOK_IP_RCV,
+        NETSTACKLAT_HOOK_TCP_START,
+        NETSTACKLAT_HOOK_UDP_START,
+        NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED,
+        NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED,
+        NETSTACKLAT_HOOK_TCP_SOCK_READ,
+        NETSTACKLAT_HOOK_UDP_SOCK_READ,
+        NETSTACKLAT_N_HOOKS,
+};
+
+/*
+ * Key used for the histogram map
+ * To be compatible with ebpf-exporter, all histograms need a key struct whose
+ * final member is named "bucket" and is the histogram bucket index.
+ */
+struct hist_key {
+        __u64 cgroup;
+        __u32 ifindex;
+        __u16 hook; // needs a well-defined size for ebpf-exporter to decode
+        __u16 bucket; // needs to be last to be compatible with ebpf-exporter
+};
+
+struct netstacklat_bpf_config {
+        __u32 network_ns;
+        bool filter_pid;
+        bool filter_ifindex;
+        bool filter_cgroup;
+        bool filter_nonempty_sockqueue;
+        bool groupby_ifindex;
+        bool groupby_cgroup;
+};
+
+#endif
diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml
new file mode 100644
index 00000000..d59d3b6b
--- /dev/null
+++ b/examples/netstacklat.yaml
@@ -0,0 +1,49 @@
+metrics:
+  histograms:
+    - name: netstack_latency_seconds
+      help: Latency for packets (skbs) to reach various points in the kernel network stack
+      bucket_type: exp2
+      bucket_min: 0
+      bucket_max: 34
+      bucket_multiplier: 0.000000001 # nanoseconds to seconds
+      labels:
+        - name: cgroup
+          size: 8
+          decoders:
+            - name: uint
+            - name: cgroup
+        - name: iface
+          size: 4
+          decoders:
+            # If including output from a different network namespace than the one
+            # ebpf-exporter runs in, you probably just want to decode as a uint (ifindex) instead
+            # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others
+            - name: ifname
+        - name: hook
+          size: 2
+          decoders:
+            - name: uint
+            - name: static_map
+              static_map:
+                1: "ip-start"
+                2: "tcp-start"
+                3: "udp-start"
+                4: "tcp-socket-enqueued"
+                5: "udp-socket-enqueued"
+                6: "tcp-socket-read"
+                7: "udp-socket-read"
+        - name: bucket
+          size: 2
+          decoders:
+            - name: uint
+
+cgroup_id_map:
+  name: netstack_cgroupfilter
+  type: hash
+  regexps:
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-backend-router.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-origin.service).*$
+#    - ^.*(system.slice/.*)$
+#    - ^.*(user.slice/.*)$
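
Note for reviewers (not part of the patch): the stand-alone sketch below mirrors
the right-inclusive exp2 bucketing that get_exp2_histogram_bucket_idx() implements
in netstacklat.bpf.c, so the bucket indices exported via bucket_multiplier can be
sanity-checked in user space. The floor_log2() helper stands in for log2l() from
bits.bpf.h, and the sample latencies are made-up illustrative values.

/* exp2_bucket_demo.c - user-space sketch of netstacklat's histogram bucketing */
#include <stdint.h>
#include <stdio.h>

#define HIST_MAX_LATENCY_SLOT 34 /* same value as in netstacklat.h */

/* Stand-in for log2l() from bits.bpf.h: floor(log2(v)), with floor_log2(0) == 0 */
static uint32_t floor_log2(uint64_t v)
{
        uint32_t r = 0;

        while (v >>= 1)
                r++;
        return r;
}

/* Same logic as get_exp2_histogram_bucket_idx() in netstacklat.bpf.c */
static uint32_t get_exp2_histogram_bucket_idx(uint64_t value, uint32_t max_bucket)
{
        uint32_t bucket = floor_log2(value);

        /* Right-inclusive histogram, so "round up" the log value */
        if (bucket > 0 && 1ULL << bucket < value)
                bucket++;

        if (bucket > max_bucket)
                bucket = max_bucket;

        return bucket;
}

int main(void)
{
        /* Illustrative latencies in nanoseconds */
        uint64_t samples_ns[] = { 800, 1024, 1025, 1000000, 2000000000ULL };
        size_t i;

        for (i = 0; i < sizeof(samples_ns) / sizeof(samples_ns[0]); i++) {
                uint32_t b = get_exp2_histogram_bucket_idx(samples_ns[i],
                                                           HIST_MAX_LATENCY_SLOT);

                /* bucket_multiplier 0.000000001 in the YAML turns the ns bound into seconds */
                printf("%llu ns -> bucket %u (upper bound %.9f s)\n",
                       (unsigned long long)samples_ns[i], b,
                       (double)(1ULL << b) / 1e9);
        }
        return 0;
}

With HIST_MAX_LATENCY_SLOT = 34, bucket 34 covers everything up to roughly 17 s and
bucket 35 carries the ebpf_exporter "sum key", matching the comment in netstacklat.h.
Assuming the usual ebpf_exporter v2 flags from the project README, the config would be
picked up with something like: ebpf_exporter --config.dir examples --config.names netstacklat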