how tcpdump's direction filter works

tcpdump's support for the direction filter.

The key data structure in the kernel is the skb's pkt_type field.
On the receive and transmit paths this field is set to PACKET_OUTGOING or one of the other values below.
The value is passed up to user space, where libpcap uses it to decide whether a packet's direction matches the one requested.

Possible values of pkt_type

#define PACKET_HOST             0    /* To us                */
#define PACKET_BROADCAST        1    /* To all               */
#define PACKET_MULTICAST        2    /* To group             */
#define PACKET_OTHERHOST        3    /* To someone else      */
#define PACKET_OUTGOING         4    /* Outgoing of any type */
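
On the user-space side, libpcap exposes this via pcap_setdirection(). A minimal sketch (the interface name and callback are illustrative):

#include <pcap/pcap.h>
#include <stdio.h>

static void on_packet(u_char *user, const struct pcap_pkthdr *h,
                      const u_char *bytes)
{
        printf("captured %u bytes\n", h->caplen);
}

int main(void)
{
        char errbuf[PCAP_ERRBUF_SIZE];
        pcap_t *p = pcap_open_live("eth0", 65535, 1, 1000, errbuf);

        if (!p)
                return 1;
        /* Capture only packets sent by this host; on Linux libpcap
         * implements this check with the pkt_type the kernel attached
         * to each packet (PACKET_OUTGOING for the transmit direction). */
        pcap_setdirection(p, PCAP_D_OUT);
        pcap_loop(p, -1, on_packet, NULL);
        pcap_close(p);
        return 0;
}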

vhost net study

The purpose of vhost net is to avoid scheduling qemu on the host kernel for every packet, improving performance.
xmit: transmit the vm's packets directly from the host kernel.
rcv:

Core data structure

vhost_poll is the most important data structure in vhost.

/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
struct vhost_poll {
        poll_table                table;
        wait_queue_head_t        *wqh;
        wait_queue_t              wait;
        struct vhost_work         work;
        unsigned long             mask;
        struct vhost_dev         *dev;
};
  • table: responsible for putting the wait entry onto wqh each time.
    vhost_net_open initializes its callback to vhost_poll_func.
  • wqh: initialized to point at an eventfd's ctx (its wait queue head).
  • wait: queued onto the wqh list each time; when the guest vm transmits a packet,
    wait is dequeued and its func runs. vhost_net_open initializes that func to
    vhost_poll_wakeup, which puts work onto the work_list of the corresponding vhost_dev.
  • work: each vhost_dev has a thread that dequeues work nodes from the work_list
    and runs each node's fn; fn is the function that does the real work.
    For the rx vhost_virtqueue, vhost_net_open initializes fn to handle_rx_kick;
    for the tx vhost_virtqueue, it initializes fn to handle_tx_kick
    (see the sketch after this list).
  • mask: the set of eventfd events to listen for.
  • dev: the vhost_dev this vhost_poll belongs to.
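
A simplified sketch of the wakeup step described above; this is close to, but not literally, the kernel's vhost_poll_wakeup (older kernels' wait_queue_t type assumed):

/* The eventfd wakeup does no real work itself: it only queues the
 * vhost_work, which the per-device worker thread later dequeues in
 * order to run work->fn (handle_tx_kick or handle_rx_kick). */
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
                             void *key)
{
        struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

        /* ignore events outside the mask this poll registered for */
        if (!((unsigned long)key & poll->mask))
                return 0;

        /* hand the work to the vhost_dev worker thread */
        vhost_work_queue(poll->dev, &poll->work);
        return 0;
}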

ipvlan study

L2 mode

xmit packet
xmit a normal pkt to another physical machine
==> ipvlan_start_xmit
==> ==> ipvlan_xmit_mode_l2
==> ==> ==> skb->dev = ipvlan->phy_dev;
==> ==> ==> return dev_queue_xmit(skb);
xmit a normal pkt to another namespace
==> ipvlan_start_xmit
==> ==> ipvlan_xmit_mode_l2
==> ==> ==> if (ether_addr_equal(eth->h_dest, eth->h_source))
==> ==> ==> addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
==> ==> ==> ipvlan_rcv_frame(addr, skb, true);
==> ==> ==> ==> skb->dev = dev; <== dst namespace dev
==> ==> ==> ==> dev_forward_skb(ipvlan->dev, skb)
==> ==> ==> ==> ==> return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
==> ==> ==> ==> ==> ==> enqueue_to_backlog(skb, get_cpu(), &qtail);
xmit a multicast pkt
==> ipvlan_start_xmit
==> ==> ipvlan_xmit_mode_l2
==> ==> ==> else if (is_multicast_ether_addr(eth->h_dest))
==> ==> ==> ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
==> ==> ==> ==> list_for_each_entry(ipvlan, &port->ipvlans, pnode)
==> ==> ==> ==> ==> dev_forward_skb or netif_rx(nskb);
recv packet

All packets are received via the rx_handler, ipvlan_handle_frame.
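
ipvlan attaches this handler to the physical device when the port is created; roughly like the sketch below (ipvlan_port_hook_rx is an illustrative wrapper, not a kernel symbol; error handling and the rest of port setup are omitted):

#include <linux/netdevice.h>

/* Sketch: hook ipvlan_handle_frame into phy_dev's receive path. After
 * this call every packet received on the physical NIC is handed to
 * ipvlan_handle_frame before the regular protocol stack sees it. */
static int ipvlan_port_hook_rx(struct net_device *phy_dev,
                               struct ipvl_port *port)
{
        return netdev_rx_handler_register(phy_dev, ipvlan_handle_frame, port);
}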

unicast packet

Look up the destination ipvlan device (net_device)
by the destination IPv4/IPv6 address, then forward the packet to it.

==> ipvlan_handle_frame
==> ==> ipvlan_handle_mode_l2
==> ==> ==> ipvlan_addr_lookup(port, lyr3h, addr_type, true);
==> ==> ==> ==> skb->dev = dev;
==> ==> ==> ==> dev_forward_skb or ret = RX_HANDLER_ANOTHER;
multicast packet
==> ipvlan_handle_frame
==> ==> ipvlan_handle_mode_l2
==> ==> ==> if (is_multicast_ether_addr(eth->h_dest)) {
==> ==> ==> ipvlan_addr_lookup(port, lyr3h, addr_type, true);
==> ==> ==> ==> if (ipvlan_external_frame(skb, port))
==> ==> ==> ==> ==> ipvlan_multicast_frame(port, skb, NULL, false);

L3 mode

ipvlan_start_xmit

static const struct net_device_ops ipvlan_netdev_ops = {
        .ndo_init               = ipvlan_init,
        .ndo_uninit             = ipvlan_uninit,
        .ndo_open               = ipvlan_open,
        .ndo_stop               = ipvlan_stop,
        .ndo_start_xmit         = ipvlan_start_xmit,
        .ndo_fix_features       = ipvlan_fix_features,
        .ndo_change_rx_flags    = ipvlan_change_rx_flags,
        .ndo_set_rx_mode        = ipvlan_set_multicast_mac_filter,
        .ndo_get_stats64        = ipvlan_get_stats64,
        .ndo_vlan_rx_add_vid    = ipvlan_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid   = ipvlan_vlan_rx_kill_vid,
};

int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev);

        if (!port)
                goto out;

        if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                goto out;

        switch(port->mode) {
        case IPVLAN_MODE_L2:
                return ipvlan_xmit_mode_l2(skb, dev);
        case IPVLAN_MODE_L3:
                return ipvlan_xmit_mode_l3(skb, dev);
        }

        /* Should not reach here */
        WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
                  port->mode);
out:
        kfree_skb(skb);
        return NET_XMIT_DROP;
}

static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *addr;
        void *lyr3h;
        int addr_type;

        if (ether_addr_equal(eth->h_dest, eth->h_source)) {
                lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
                if (lyr3h) {
                        addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
                        if (addr)
                                return ipvlan_rcv_frame(addr, skb, true);
                }
                skb = skb_share_check(skb, GFP_ATOMIC);
                if (!skb)
                        return NET_XMIT_DROP;

                /* Packet definitely does not belong to any of the
                 * virtual devices, but the dest is local. So forward
                 * the skb for the main-dev. At the RX side we just return
                 * RX_PASS for it to be processed further on the stack.
                 */
                return dev_forward_skb(ipvlan->phy_dev, skb);

        } else if (is_multicast_ether_addr(eth->h_dest)) {
                u8 ip_summed = skb->ip_summed;

                skb->ip_summed = CHECKSUM_UNNECESSARY;
                ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
                skb->ip_summed = ip_summed;
        }

        skb->dev = ipvlan->phy_dev;
        return dev_queue_xmit(skb);
}

ftrace study

test case

We found that the ixgbe rx softirq poll function, ixgbe_poll, was called even when no packets were arriving.
How can we prove this, and find out who calls it?

analysis

From browsing the source, ixgbe_poll should be invoked via NAPI scheduling.
If that is true, __napi_schedule should be getting called.
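
One way to confirm it (an illustrative alternative to ftrace's function tracer, using the standard kprobes API): hook __napi_schedule and dump the stack on every hit to see the caller.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/smp.h>

/* Dump the call chain on each __napi_schedule hit. */
static int napi_sched_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("__napi_schedule hit on cpu %d\n", smp_processor_id());
        dump_stack();
        return 0;
}

static struct kprobe kp = {
        .symbol_name = "__napi_schedule",
        .pre_handler = napi_sched_pre,
};

static int __init kp_init(void)
{
        return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");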

netdevice watchdog causes tx queue scheduling

test case

For the ixgbe nic, we want to assign a tx hardware queue to each cpu,
and have the tx softirq use the corresponding hardware queue.

Each packet selects a software queue in dev_queue_xmit.
We rewrote the ixgbe driver's ndo_select_queue (ixgbe_select_queue)
to return the current cpu index (0-based) when a packet selects its queue,
so that each cpu uses its own tx queue; a sketch follows below.
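
A minimal sketch of such a select_queue hook (the ndo_select_queue prototype varies across kernel versions; this is illustrative, not the exact patch):

#include <linux/netdevice.h>
#include <linux/smp.h>

/* Map every transmit on cpu N to tx hardware queue N. Assumes the
 * device is configured with at least as many tx queues as cpus. */
static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
{
        return (u16)smp_processor_id();
}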

However, we found that some packets carried a queue index that did not match
the cpu they were sent on.

For example, a packet whose queue index is 5 may be sent by cpu3;
cpu3 then operates tx hw queue 5, which should only be touched by cpu5.

how does ixgbe use queue index

data structure

static const struct net_device_ops ixgbe_netdev_ops = {
...
        .ndo_start_xmit         = ixgbe_xmit_frame,
        .ndo_select_queue       = ixgbe_select_queue,
...

receive skb: record receive queue index

The recorded value is queue_index + 1; 0 is reserved to mean "not recorded".
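
For reference, the helpers in include/linux/skbuff.h encode exactly this +1 convention:

static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        /* store rx_queue + 1 so that 0 can mean "not recorded" */
        skb->queue_mapping = rx_queue + 1;
}

static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        return skb->queue_mapping - 1;
}

static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return skb->queue_mapping != 0;
}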

call trace

> ixgbe_poll
> > ixgbe_clean_rx_irq
> > > ixgbe_process_skb_fields
> > > > skb_record_rx_queue

tcpdump with a bonding interface

test case

On RedHat 5, why does tcpdump not work properly on a bonding interface?
OS: RedHat 5.
There are two 82599 interfaces, eth0 and eth1.
These two interfaces are used as slaves of bond0;
eth1 is the backup of eth0.

We ping the default gateway from the test machine.
The ping works OK, and tcpdump on bond0 shows both the ICMP request and ICMP reply packets,
while on eth0 only ICMP requests appear, and eth1 shows no packets at all.

register irq handler

call trace

Take handle_level_irq as an example.

==> handle_level_irq
==> ==> handle_irq_event
==> ==> ==> handle_irq_event_percpu
==> ==> ==> ==> action->handler

where the handler is registered

static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
            const char *name, void *dev)
{
        return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
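
For reference, a typical driver registers its handler roughly like this (my_handler, my_probe_irq, and the IRQF_SHARED flag are illustrative, not from any specific driver):

#include <linux/interrupt.h>

/* Illustrative handler: dev_id is the cookie passed to request_irq(). */
static irqreturn_t my_handler(int irq, void *dev_id)
{
        /* acknowledge the device, do the minimal urgent work here,
         * and defer the rest to a softirq/tasklet/workqueue */
        return IRQ_HANDLED;
}

static int my_probe_irq(unsigned int irq, void *my_dev)
{
        /* after this succeeds, action->handler == my_handler, and the
         * handle_irq_event_percpu step above will invoke it */
        return request_irq(irq, my_handler, IRQF_SHARED, "my_dev", my_dev);
}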

irq framework

The interrupt handling flow:

From the hardware interrupt to the interrupt controller

register value --> irq (int) --> struct irq_desc

==> On an interrupt, a register holds the vector of the interrupt source.
==> ==> `arch/x86/kernel/entry_64.S` calls the function `do_IRQ`.
==> ==> ==> `do_IRQ` uses `vector_irq` and the vector value to find the irq number, then calls `handle_irq`.
==> ==> ==> ==> `handle_irq` converts the irq number into a `struct irq_desc` via irq_to_desc.
==> ==> ==> ==> generic_handle_irq_desc(irq, desc);
==> ==> ==> ==> ==> `generic_handle_irq_desc` calls desc->handle_irq(irq, desc);

Note: the handle_irq here is not the real device interrupt handler, but one of a few
classes of interrupt-controller flow handlers, e.g. for the 82599, MSI, etc.
For a detailed analysis see: irq study1
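
For reference, the core of do_IRQ on x86 looks roughly like the sketch below (abridged; the details differ across kernel versions):

/* Abridged sketch of x86 do_IRQ: translate the saved vector into an irq
 * number, then dispatch to the irq_desc flow handler. Not literal code. */
unsigned int do_IRQ(struct pt_regs *regs)
{
        struct pt_regs *old_regs = set_irq_regs(regs);
        unsigned vector = ~regs->orig_ax;                   /* saved by the entry stub */
        unsigned irq = __this_cpu_read(vector_irq[vector]); /* vector -> irq */

        irq_enter();
        handle_irq(irq, regs);  /* irq -> irq_desc, then desc->handle_irq() */
        irq_exit();

        set_irq_regs(old_regs);
        return 1;
}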

From the interrupt controller to the concrete interrupt handler

==> handle_level_irq
==> ==> irqreturn_t handle_irq_event(struct irq_desc *desc)
==> ==> ==> struct irqaction *action = desc->action
==> ==> ==> ret = handle_irq_event_percpu(desc, action);
==> ==> ==> ==> action->handler(irq, action->dev_id);

This action->handler is the interrupt handler we registered with request_irq.
For a detailed analysis see: irq study2