irq vector

中断处理过程:

reg value —> irq (int) —> struct irq_desc

1
2
3
4
5
6
==> 中断时有一个寄存器会保存中断源的vector值.
==> ==> `arch/x86/kernel/entry_64.S`调用函数`do_IRQ`.
==> ==> ==> `do_IRQ`依据`vector_irq`和vector值, 找到对应的中断号,并调用`handle_irq`.
==> ==> ==> ==> `handle_irq`通过函数`irq_to_desc`,可将中断号转化为`struct irq_desc`.
==> ==> ==> ==> generic_handle_irq_desc(irq, desc);
==> ==> ==> ==> ==> `generic_handle_irq_desc`调用 desc->handle_irq(irq, desc);

注:这里的`desc->handle_irq`不是真正的中断处理函数,而是几大类中断控制器的处理函数,
如8259, msi等.

`do_IRQ(struct pt_regs *regs)`

File: arch/x86/kernel/irq.c

arch/x86/kernel/entry_64.S
will call do_IRQ

Read More

Delayed work: dst_gc_work

summary

A delayed work first starts a timer,
and when the timer expires, the delayed work is put on a worker_pool's
worklist or a pool_workqueue's delayed_works list.

how to use delayed work

data structure

1
2
3
4
5
6
7
8
113 struct delayed_work {
114 struct work_struct work;
115 struct timer_list timer;
116
117 /* target workqueue and CPU ->timer uses to queue ->work */
118 struct workqueue_struct *wq;
119 int cpu;
120 };

Read More

struct worker_pool->nr_running

Definition

1
2
3
4
5
6
7
8
9
10
141 /* struct worker is defined in workqueue_internal.h */
142
143 struct worker_pool {
...
173 /*
174 * The current concurrency level. As it's likely to be accessed
175 * from other CPUs during try_to_wake_up(), put it in a separate
176 * cacheline.
177 */
178 atomic_t nr_running ____cacheline_aligned_in_smp;

Increase

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
813 /**
814 * wq_worker_waking_up - a worker is waking up
815 * @task: task waking up
816 * @cpu: CPU @task is waking up to
817 *
818 * This function is called during try_to_wake_up() when a worker is
819 * being awoken.
820 *
821 * CONTEXT:
822 * spin_lock_irq(rq->lock)
823 */
824 void wq_worker_waking_up(struct task_struct *task, int cpu)
825 {
826 struct worker *worker = kthread_data(task);
827
828 if (!(worker->flags & WORKER_NOT_RUNNING)) {
829 WARN_ON_ONCE(worker->pool->cpu != cpu);
830 atomic_inc(&worker->pool->nr_running);
831 }
832 }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
923 /**
924 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
925 * @worker: self
926 * @flags: flags to clear
927 *
928 * Clear @flags in @worker->flags and adjust nr_running accordingly.
929 *
930 * CONTEXT:
931 * spin_lock_irq(pool->lock)
932 */
933 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
934 {
935 struct worker_pool *pool = worker->pool;
936 unsigned int oflags = worker->flags;
937
938 WARN_ON_ONCE(worker->task != current);
939
940 worker->flags &= ~flags;
941
942 /*
943 * If transitioning out of NOT_RUNNING, increment nr_running. Note
944 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
945 * of multiple flags, not a single flag.
946 */
947 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
948 if (!(worker->flags & WORKER_NOT_RUNNING))
949 atomic_inc(&pool->nr_running);
950 }

Decrease

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
885 /**
886 * worker_set_flags - set worker flags and adjust nr_running accordingly
887 * @worker: self
888 * @flags: flags to set
889 * @wakeup: wakeup an idle worker if necessary
890 *
891 * Set @flags in @worker->flags and adjust nr_running accordingly. If
892 * nr_running becomes zero and @wakeup is %true, an idle worker is
893 * woken up.
894 *
895 * CONTEXT:
896 * spin_lock_irq(pool->lock)
897 */
898 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
899 bool wakeup)
900 {
901 struct worker_pool *pool = worker->pool;
902
903 WARN_ON_ONCE(worker->task != current);
904
905 /*
906 * If transitioning into NOT_RUNNING, adjust nr_running and
907 * wake up an idle worker as necessary if requested by
908 * @wakeup.
909 */
910 if ((flags & WORKER_NOT_RUNNING) &&
911 !(worker->flags & WORKER_NOT_RUNNING)) {
912 if (wakeup) {
913 if (atomic_dec_and_test(&pool->nr_running) &&
914 !list_empty(&pool->worklist))
915 wake_up_worker(pool);
916 } else
917 atomic_dec(&pool->nr_running);
918 }
919
920 worker->flags |= flags;
921 }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
834 /**
835 * wq_worker_sleeping - a worker is going to sleep
836 * @task: task going to sleep
837 * @cpu: CPU in question, must be the current CPU number
838 *
839 * This function is called during schedule() when a busy worker is
840 * going to sleep. Worker on the same cpu can be woken up by
841 * returning pointer to its task.
842 *
843 * CONTEXT:
844 * spin_lock_irq(rq->lock)
845 *
846 * Return:
847 * Worker task on @cpu to wake up, %NULL if none.
848 */
849 struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
850 {
851 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
852 struct worker_pool *pool;
853
854 /*
855 * Rescuers, which may not have all the fields set up like normal
856 * workers, also reach here, let's not access anything before
857 * checking NOT_RUNNING.
858 */
859 if (worker->flags & WORKER_NOT_RUNNING)
860 return NULL;
861
862 pool = worker->pool;
863
864 /* this can only happen on the local cpu */
865 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
866 return NULL;
867
868 /*
869 * The counterpart of the following dec_and_test, implied mb,
870 * worklist not empty test sequence is in insert_work().
871 * Please read comment there.
872 *
873 * NOT_RUNNING is clear. This means that we're bound to and
874 * running on the local cpu w/ rq lock held and preemption
875 * disabled, which in turn means that none else could be
876 * manipulating idle_list, so dereferencing idle_list without pool
877 * lock is safe.
878 */
879 if (atomic_dec_and_test(&pool->nr_running) &&
880 !list_empty(&pool->worklist))
881 to_wakeup = first_worker(pool);
882 return to_wakeup ? to_wakeup->task : NULL;
883 }

worker and worker_thread

Summary

The struct worker is the real scheduling unit in a workqueue.
Each struct worker has a corresponding thread (task), referenced by worker->task.
A struct worker is linked to struct worker_pool->idle_list when the worker is idle,
and moved to struct worker_pool->busy_hash when it is busy.

worker_thread

  1. remove the worker from pool->idle_list and clear the worker's WORKER_IDLE flag.
  2. check the pool and manage the workers (create/destroy).
  3. Iterate all the `struct work_struct *work` in the `struct worker_pool->worklist`,
    
    and run them in sequence with process_one_work(worker, work);.
  4. move worker into idle list again.
  5. schedule();

Read More

dst garbage

dst garbage summary

Garbage collection is a common method used in the kernel.
When an object (struct, memory) becomes invalid, we need to
free it, but the object may still be referenced by others.

For example, a dst_entry is no longer valid, but it is still
referenced (used) by others.

then __dst_free will be called for this case.
It will first set dst to dirty(dead),
and then put it into dst_garbage.list by dst->next.

Then a workqueue task will check the dst's reference count,
and free (destroy) it when there are no references to it.

Two key structures: struct dst_garbage and dst_gc_work

Read More

dst ops

Call trace

forward a packet.

1
2
3
4
5
6
> ip_rcv_finish
> > ip_route_input_noref
> > > ip_route_input_slow
> > > > fib_lookup
> > > > ip_mkroute_input
> > dst_input(skb)
1
2
3
4
> > > > ip_mkroute_input
> > > > > __mkroute_input
> > > > > > rth = rt_dst_alloc(...)
> > > > > > skb_dst_set(skb, &rth->dst);

Read More

Qdisc running flag

Summary

In struct Qdisc, there are two similar fields.
running flag is stored in __state of struct Qdisc, NOT state.
Every time, when we send a packet from qdisc, the running flag is
set by qdisc_run_begin, and after that, it is removed by qdisc_run_end.

1
2
3
84         unsigned long           state;
...
87 unsigned int __state;

todo

Why is busylock needed?

Read More

how to xmit a packet with Qdisc

summary

We consider an ideal and simple case:

Call Trace:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
> dev_queue_xmit
> > __dev_queue_xmit(skb, NULL);
> > > rcu_read_lock_bh();
> > > txq = netdev_pick_tx(dev, skb, accel_priv);
> > > q = rcu_dereference_bh(txq->qdisc);
> > > rc = __dev_xmit_skb(skb, q, dev, txq);
> > > > skb_dst_force(skb);
> > > > q->enqueue(skb, q);
> > > > qdisc_run_begin(q)
> > > > __qdisc_run(q);
> > > > > while (qdisc_restart(q))
> > > > > > __netif_schedule
> > > > > qdisc_run_end(q)
> > > rcu_read_unlock_bh();
> > > return rc;

Read More

how to create dev qdisc

Summary

Part 1: Register multi queue net device.

In this part, only the framework is prepared for qdisc,
and the noop_qdisc is set as default.

prepare netdev_queues.

For example: Intel igb hardware has 8 hardware tx queues,
and the NIC driver creates 8 corresponding struct netdev_queue
entries in the _tx field of struct net_device.

prepare mq_qdisc

The mq_qdisc is attached to the corresponding device.
In the mq_qdisc private field, a default qdisc will be
created for each of the NIC's hardware queues.
This is done in mq_init.
The default qdisc is pfifo_fast_ops.

attach mq_qdisc to netdev_queue.

In mq_attach, these qdiscs are attached to the corresponding
struct netdev_queue.

Part 2: Activate a net device with the right qdiscs

Here we only trace the mq_qdisc case.
When the dev is brought up, dev_open is called, which will call dev_activate.

Read More