tcpdump and ebpf

以内核 v6.6 代码为例,介绍 tcpdump 程序如何与内核交互、如何加载 bpf 过滤程序。

加载:

libpcap 通过 setsockopt 的 SO_ATTACH_FILTER 选项,
把编译好的过滤 prog 挂载到 packet socket 的 sk->sk_filter 上。

运行:

当 skb 报文到达 packet_rcv 时,最终通过 ___bpf_prog_run 函数(注意,这个函数名是 3 个下划线,区别于 2 个下划线的 __bpf_prog_run)
解释执行 sk_filter 里挂载的 prog。运行时调用栈概览:
==> packet_rcv
==> ==> run_filter
==> ==> ==> bpf_prog_run_clear_cb

563 struct sk_filter {
564 refcount_t refcnt;
565 struct rcu_head rcu;
566 struct bpf_prog *prog;
567 };

加载调用栈:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
==> pcap_setfilter
==> ==> p->setfilter_op 相当于pcap_setfilter_linux
==> ==> ==> pcap_setfilter_linux
==> ==> ==> ==> set_kernel_filter
==> ==> ==> ==> ==> setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
——————————————————kernel——————————————————
==> setsockopt
==> ==> __sys_setsockopt
==> ==> ==> sock_setsockopt
==> ==> ==> ==> sk_setsockopt
==> ==> ==> ==> ==> sk_attach_filter
==> ==> ==> ==> ==> ==> __get_filter
==> ==> ==> ==> ==> ==> ==> bpf_prepare_filter
==> ==> ==> ==> ==> ==> ==> ==> bpf_migrate_filter
==> ==> ==> ==> ==> ==> ==> ==> ==> bpf_prog_select_runtime
==> ==> ==> ==> ==> ==> ==> ==> ==> ==> bpf_prog_select_func
==> ==> ==> ==> ==> ==> ==> ==> ==> ==> ==> fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
==> ==> ==> ==> ==> ==> __sk_attach_prog
==> ==> ==> ==> ==> ==> ==> rcu_assign_pointer(sk->sk_filter, fp);

函数定义

1
2
3
4
5
3723 int
3724 pcap_setfilter(pcap_t *p, struct bpf_program *fp)
3725 {
3726 return (p->setfilter_op(p, fp));
3727 }
1
2
3
4
5
6
7
992 static int
993 pcap_activate_linux(pcap_t *handle)
994 {
...
1119 handle->setfilter_op = pcap_setfilter_linux;
1120 handle->setdirection_op = pcap_setdirection_linux;
1121 handle->set_datalink_op = pcap_set_datalink_linux;
1
2
3
4
5
6
4315 static int
4316 pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
4317 {
...
4429 if (can_filter_in_kernel) {
4430 if ((err = set_kernel_filter(handle, &fcode)) == 0)
1
2
3
4
5
6
5414 static int
5415 set_kernel_filter(pcap_t *handle, struct sock_fprog *fcode)
5416 {

5453 if (setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
5454 &total_fcode, sizeof(total_fcode)) == 0) {
1
2
3
4
5
6
7
8
9
10
11
12
13
14
——————————————————kernel——————————————————
==> setsockopt
==> ==> __sys_setsockopt
==> ==> ==> sock_setsockopt
==> ==> ==> ==> sk_setsockopt
==> ==> ==> ==> ==> sk_attach_filter
==> ==> ==> ==> ==> ==> __get_filter
==> ==> ==> ==> ==> ==> ==> bpf_prepare_filter
==> ==> ==> ==> ==> ==> ==> ==> bpf_migrate_filter
==> ==> ==> ==> ==> ==> ==> ==> ==> bpf_prog_select_runtime
==> ==> ==> ==> ==> ==> ==> ==> ==> ==> bpf_prog_select_func
==> ==> ==> ==> ==> ==> ==> ==> ==> ==> ==> fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
==> ==> ==> ==> ==> ==> __sk_attach_prog
==> ==> ==> ==> ==> ==> ==> rcu_assign_pointer(sk->sk_filter, fp);
1
2
3
4
5
6
7
8
9
10
11
12
1094 int sk_setsockopt(struct sock *sk, int level, int optname,
1095 sockptr_t optval, unsigned int optlen)
1096 {
...
1306 case SO_ATTACH_FILTER: {
1307 struct sock_fprog fprog;
1308
1309 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1310 if (!ret)
1311 ret = sk_attach_filter(&fprog, sk);
1312 break;
1313 }
1
2
3
4
5
6
7
8
1528 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1529 {
1530 struct bpf_prog *prog = __get_filter(fprog, sk);

1536 err = __sk_attach_prog(prog, sk);
...
1543 }
1544 EXPORT_SYMBOL_GPL(sk_attach_filter);
1
2
3
4
5
6
7
8
9
10
11
12
13
1481 static
1482 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1483 {
1484 unsigned int fsize = bpf_classic_proglen(fprog);
1485 struct bpf_prog *prog;
...
1495 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
...
1499 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
...

1515 return bpf_prepare_filter(prog, NULL);
1516 }
1
2
3
4
5
6
7
8
9
10
11
12
13
 1312 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1313 bpf_aux_classic_check_t trans)
1314 {
...
1318 fp->jited = 0;
...
1340 bpf_jit_compile(fp);
...
1345 if (!fp->jited)
1346 fp = bpf_migrate_filter(fp);
1347
1348 return fp;
1349 }
1
2
3
4
 1242 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1243 {
...
1298 fp = bpf_prog_select_runtime(fp, &err);
1
2
3
4
2179 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
2180 {
...
2193 bpf_prog_select_func(fp);
1
2
3
4
5
6
7
8
9
10
2157 static void bpf_prog_select_func(struct bpf_prog *fp)
2158 {
2159 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
2160 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
2161
2162 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
2163 #else
2164 fp->bpf_func = __bpf_prog_ret0_warn;
2165 #endif
2166 }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
1455 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1456 {
1457 struct sk_filter *fp, *old_fp;
1458
1459 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1460 if (!fp)
1461 return -ENOMEM;
1462
1463 fp->prog = prog;
1464
1465 if (!__sk_filter_charge(sk, fp)) {
1466 kfree(fp);
1467 return -ENOMEM;
1468 }
1469 refcount_set(&fp->refcnt, 1);
1470
1471 old_fp = rcu_dereference_protected(sk->sk_filter,
1472 lockdep_sock_is_held(sk));
1473 rcu_assign_pointer(sk->sk_filter, fp);
1474
1475 if (old_fp)
1476 sk_filter_uncharge(sk, old_fp);
1477
1478 return 0;
1479 }

============================== 内核态运行 ========================================
运行:
当 skb 报文到达 packet_rcv 时,通过 bpf_prog_run 系列调用,
运行 sk_filter 上挂载的 prog。
prog 的 bpf_func 在 attach 时被初始化为 interpreters 数组中与栈深度对应的函数指针(即 __bpf_prog_runNN 系列封装,未开启 JIT 时最终进入 ___bpf_prog_run 解释器)。

1
2
3
4
5
6
7
8
9
==>packet_rcv
==> ==> run_filter
==> ==> ==> bpf_prog_run_clear_cb
==> ==> ==> ==> bpf_prog_run_pin_on_cpu
==> ==> ==> ==> ==> bpf_prog_run
==> ==> ==> ==> ==> ==>__bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); //注意这个函数是两个下划线,不是三个下划线的___bpf_prog_run
==> ==> ==> ==> ==> ==> ==> dfunc(ctx, prog->insnsi, prog->bpf_func); //dfunc,相当于bpf_dispatcher_nop_func(ctx, prog->insnsi, prog->bpf_func);
==> ==> ==> ==> ==> ==> ==> bpf_dispatcher_nop_func(ctx, prog->insnsi, prog->bpf_func) //bpf_func
==> ==> ==> ==> ==> ==> ==> ==> bpf_func(ctx, insnsi); //
1
2
3
4
5
2121 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2122 struct packet_type *pt, struct net_device *orig_dev)
2123 {
...
2161 res = run_filter(skb, sk, snaplen);
1
2
3
4
5
6
7
8
9
10
11
2079 static unsigned int run_filter(struct sk_buff *skb,
2080 const struct sock *sk,
2081 unsigned int res)
2082 {
...
2085 rcu_read_lock();
2086 filter = rcu_dereference(sk->sk_filter);
2087 if (filter != NULL)
2088 res = bpf_prog_run_clear_cb(filter->prog, skb);
2089 rcu_read_unlock();
...
1
2
3
4
5
 749 static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
750 struct sk_buff *skb)
751 {
...
758 res = bpf_prog_run_pin_on_cpu(prog, skb);
1
2
3
4
5
6
7
618 static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
619 const void *ctx)
620 {
...
623 migrate_disable();
624 ret = bpf_prog_run(prog, ctx);
625 migrate_enable();
1
2
3
4
605 static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
606 {
607 return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
608 }
1
2
3
4
5
6
 581 static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
582 const void *ctx,
583 bpf_dispatcher_fn dfunc)
584 {
...
600 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
1
2
3
4
5
6
7
1162 static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
1163 const void *ctx,
1164 const struct bpf_insn *insnsi,
1165 bpf_func_t bpf_func)
1166 {
1167 return bpf_func(ctx, insnsi);
1168 }
1
2
3
4
5
6
7
8
9
10
11
1644 /**
1645 * ___bpf_prog_run - run eBPF program on a given context
1646 * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
1647 * @insn: is the array of eBPF instructions
1648 *
1649 * Decode and execute eBPF instructions.
1650 *
1651 * Return: whatever value is in %BPF_R0 at program exit
1652 */
1653 static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
1654 {