how tcpdump works with cbpf

tcpdump通过libpcap库以及内核的af_packet对数据包进行抓取。
关于这两部分如何协作抓包,之前blog里已经写过。
这里主要记录分析,在ebpf之前的内核(以v3.0为例)是如何处理tcpdump里的filter的。

filter编译后,如何加载到内核里的:

在filter被翻译为一系列的指令后,这个指令buff被libpcap,
通过setsockopt里的SO_ATTACH_FILTER选项,
最终挂载到AF_PACKET socket下的sk_filter上。

函数调用栈:

1
2
3
4
5
6
7
8
9
--> pcap_setfilter
--> --> p->setfilter_op 相当于pcap_setfilter_linux
--> --> --> pcap_setfilter_linux
--> --> --> --> set_kernel_filter
--> --> --> --> --> setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
——————————————————kernel——————————————————
--> --> --> --> --> --> sock_setsockopt
--> --> --> --> --> --> --> sk_attach_filter
--> --> --> --> --> --> --> --> rcu_assign_pointer(sk->sk_filter, fp);

cbpf程序是如何被运行的:

当skb报文到达packet_rcv时候, 通过调用sk_run_filter函数,运行sk_filter对应的prog。

1
2
3
4
5
6
--> packet_rcv
--> --> run_filter
--> --> --> SK_RUN_FILTER
--> --> --> --> (*FILTER->bpf_func)
--> --> --> --> 相当于sk_run_filter, bpf_func在sk_attach_filter时候被赋值为sk_run_filter
--> --> --> --> --> sk_run_filter

关键数据结构struct sk_filter

1
2
3
4
5
6
7
8
9
138 struct sk_filter
139 {
140 atomic_t refcnt;
141 unsigned int len; /* Number of filter blocks */
142 unsigned int (*bpf_func)(const struct sk_buff *skb,
143 const struct sock_filter *filter);
144 struct rcu_head rcu;
145 struct sock_filter insns[0];
146 };

用来存放cbpf对应的prog程序,其中insns和len用来指定程序指令和总长度。
bpf_func在sk_attach_filter中被赋值为sk_run_filter。

函数摘录

libpcap函数

1
2
3
4
5
3723 int
3724 pcap_setfilter(pcap_t *p, struct bpf_program *fp)
3725 {
3726 return (p->setfilter_op(p, fp));
3727 }
1
2
3
4
5
6
7
 992 static int
993 pcap_activate_linux(pcap_t *handle)
994 {
...
1119 handle->setfilter_op = pcap_setfilter_linux;
1120 handle->setdirection_op = pcap_setdirection_linux;
1121 handle->set_datalink_op = pcap_set_datalink_linux;

setfilter_op被赋值为pcap_setfilter_linux,所以pcap_setfilter相当于调用pcap_setfilter_linux.
pcap_setfilter_linux借助set_kernel_filter,通过系统调用setsockopt的SO_ATTACH_FILTER选项,把prog程序加载到内核里。

1
2
3
4
5
6
4315 static int
4316 pcap_setfilter_linux(pcap_t *handle, struct bpf_program *filter)
4317 {
...
4429 if (can_filter_in_kernel) {
4430 if ((err = set_kernel_filter(handle, &fcode)) == 0)
1
2
3
4
5
6
5414 static int
5415 set_kernel_filter(pcap_t *handle, struct sock_fprog *fcode)
5416 {

5453 if (setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER,
5454 &total_fcode, sizeof(total_fcode)) == 0) {

内核态处理函数。

内核态系统入口函数,最终会调用sock_setsockopt,并走到case SO_ATTACH_FILTER这个分支。
sk_attach_filter把用户下发的prog程序挂载到tcpdump进程创建的AF_PACKET socket对应的sk_filter上,
同时还会把bpf_func初始化为sk_run_filter,而sk_run_filter这个函数在收包处理的时候,用来运行filter对应的prog,
这个在下面一节里详细说明。

1
2
3
4
5
6
480 int sock_setsockopt(struct socket *sock, int level, int optname,
482 {
...
705 case SO_ATTACH_FILTER:
...
714 ret = sk_attach_filter(&fprog, sk);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
600 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
601 {
602 struct sk_filter *fp, *old_fp;
...
610 fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
613 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
...
619 fp->len = fprog->len;
620 fp->bpf_func = sk_run_filter;
...
628 bpf_jit_compile(fp);
629
630 old_fp = rcu_dereference_protected(sk->sk_filter,
631 sock_owned_by_user(sk));
632 rcu_assign_pointer(sk->sk_filter, fp);
...
637 }
638 EXPORT_SYMBOL_GPL(sk_attach_filter);

内核运行时的函数摘录

当报文到达cbpf在内核里的核心入口函数packet_rcv时,根据上一节提到的、
对应sk下面保存的sk_filter,找到对应的运行函数(sk_run_filter)和prog程序,并运行。
sk_run_filter是一个与具体过滤逻辑无关的cbpf指令解释执行函数,它返回报文需要保留的长度(0表示不匹配),以此让packet协议
决定是否把报文通过AF_PACKET socket上送给应用层的libpcap处理。

在新版内核里,这里的sk_run_filter被bpf里更通用的函数bpf_prog_run代替,最新内核代码的分析在下一篇里介绍。

1
2
3
4
5
2121 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2122 struct packet_type *pt, struct net_device *orig_dev)
2123 {
...
2161 res = run_filter(skb, sk, snaplen);
1
163 #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
532 static inline unsigned int run_filter(const struct sk_buff *skb,
533 const struct sock *sk,
534 unsigned int res)
535 {
536 struct sk_filter *filter;
537
538 rcu_read_lock();
539 filter = rcu_dereference(sk->sk_filter);
540 if (filter != NULL)
541 res = SK_RUN_FILTER(filter, skb);
542 rcu_read_unlock();
543
544 return res;
545 }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
100 /**
101 * sk_run_filter - run a filter on a socket
102 * @skb: buffer to run the filter on
103 * @fentry: filter to apply
104 *
105 * Decode and apply filter instructions to the skb->data.
106 * Return length to keep, 0 for none. @skb is the data we are
107 * filtering, @filter is the array of filter instructions.
108 * Because all jumps are guaranteed to be before last instruction,
109 * and last instruction guaranteed to be a RET, we dont need to check
110 * flen. (We used to pass to this function the length of filter)
111 */
112 unsigned int sk_run_filter(const struct sk_buff *skb,
113 const struct sock_filter *fentry)
114 {
115 void *ptr;
116 u32 A = 0; /* Accumulator */
117 u32 X = 0; /* Index Register */
118 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
119 u32 tmp;
120 int k;