softirq

Why softirqs are needed

Softirqs exist so that interrupt handlers can return as quickly as possible: work that is not urgent is deferred to the bottom half. The softirq is one implementation of the bottom half; tasklets and workqueues are the others.

Characteristics of softirqs

The same softirq can run concurrently on multiple CPUs, whereas a given tasklet runs on at most one CPU at a time in the whole system. Neither softirqs nor tasklets are allowed to sleep, so they must avoid any operation that may sleep; a workqueue runs in process context and is allowed to sleep.
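To make the contrast concrete, here is a minimal module sketch (hypothetical; the demo_* names are invented) using the classic deferral APIs: the tasklet callback runs in softirq context and must not sleep, while the work item runs from a kernel thread and may.

/* Hypothetical sketch (demo_* names invented): deferring work via a
 * tasklet vs. a workqueue. */
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>

static void demo_tasklet_fn(unsigned long data)
{
        /* atomic (softirq) context: must not sleep */
        pr_info("tasklet ran on cpu %d\n", smp_processor_id());
}
static DECLARE_TASKLET(demo_tasklet, demo_tasklet_fn, 0);

static void demo_work_fn(struct work_struct *work)
{
        /* process context: sleeping (e.g. kmalloc(GFP_KERNEL)) is fine */
        pr_info("workqueue item ran\n");
}
static DECLARE_WORK(demo_work, demo_work_fn);

static int __init demo_init(void)
{
        tasklet_schedule(&demo_tasklet);   /* runs from softirq context */
        schedule_work(&demo_work);         /* runs from a kernel thread */
        return 0;
}

static void __exit demo_exit(void)
{
        tasklet_kill(&demo_tasklet);
        flush_work(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");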


the mem_init on mips (Octeon)

arch/mips/kernel/setup.c

static void __init bootmem_init(void)
{
        unsigned long reserved_end;
        unsigned long mapstart = ~0UL;
        unsigned long bootmap_size;
        int i;

        /*
         * Init any data related to initrd. It's a nop if INITRD is
         * not selected. Once that done we can determine the low bound
         * of usable memory.
         */
        reserved_end = max(init_initrd(),
                           (unsigned long) PFN_UP(__pa_symbol(&_end)));

        /*
         * max_low_pfn is not a number of pages. The number of pages
         * of the system is given by 'max_low_pfn - min_low_pfn'.
         */
        min_low_pfn = ~0UL;
        max_low_pfn = 0;

        /*
         * Find the highest page frame number we have available.
         */
        for (i = 0; i < boot_mem_map.nr_map; i++) {
                unsigned long start, end;

                if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
                        continue;

                start = PFN_UP(boot_mem_map.map[i].addr);
                end = PFN_DOWN(boot_mem_map.map[i].addr
                               + boot_mem_map.map[i].size);

                if (end > max_low_pfn)
                        max_low_pfn = end;
                if (start < min_low_pfn)
                        min_low_pfn = start;
                if (end <= reserved_end)
                        continue;
                if (start >= mapstart)
                        continue;
                mapstart = max(reserved_end, start);
        }

        if (min_low_pfn >= max_low_pfn)
                panic("Incorrect memory mapping !!!");
        if (min_low_pfn > ARCH_PFN_OFFSET) {
                pr_info("Wasting %lu bytes for tracking %lu unused pages\n",
                        (min_low_pfn - ARCH_PFN_OFFSET) * sizeof(struct page),
                        min_low_pfn - ARCH_PFN_OFFSET);
        } else if (min_low_pfn < ARCH_PFN_OFFSET) {
                pr_info("%lu free pages won't be used\n",
                        ARCH_PFN_OFFSET - min_low_pfn);
        }
        min_low_pfn = ARCH_PFN_OFFSET;

        /*
         * Determine low and high memory ranges
         */
        max_pfn = max_low_pfn;
        if (max_low_pfn > PFN_DOWN(HIGHMEM_START)) {
#ifdef CONFIG_HIGHMEM
                highstart_pfn = PFN_DOWN(HIGHMEM_START);
                highend_pfn = max_low_pfn;
#endif
                max_low_pfn = PFN_DOWN(HIGHMEM_START);
        }

        /*
         * Initialize the boot-time allocator with low memory only.
         */
        bootmap_size = init_bootmem_node(NODE_DATA(0), mapstart,
                                         min_low_pfn, max_low_pfn);


        for (i = 0; i < boot_mem_map.nr_map; i++) {
                unsigned long start, end;

                start = PFN_UP(boot_mem_map.map[i].addr);
                end = PFN_DOWN(boot_mem_map.map[i].addr
                               + boot_mem_map.map[i].size);

                if (start <= min_low_pfn)
                        start = min_low_pfn;
                if (start >= end)
                        continue;

#ifndef CONFIG_HIGHMEM
                if (end > max_low_pfn)
                        end = max_low_pfn;

                /*
                 * ... finally, is the area going away?
                 */
                if (end <= start)
                        continue;
#endif

                add_active_range(0, start, end);
        }

        /*
         * Register fully available low RAM pages with the bootmem allocator.
         */
        for (i = 0; i < boot_mem_map.nr_map; i++) {
                unsigned long start, end, size;

                /*
                 * Reserve usable memory.
                 */
                if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
                        continue;

                start = PFN_UP(boot_mem_map.map[i].addr);
                end = PFN_DOWN(boot_mem_map.map[i].addr
                               + boot_mem_map.map[i].size);
                /*
                 * We are rounding up the start address of usable memory
                 * and at the end of the usable range downwards.
                 */
                if (start >= max_low_pfn)
                        continue;
                if (start < reserved_end)
                        start = reserved_end;
                if (end > max_low_pfn)
                        end = max_low_pfn;

                /*
                 * ... finally, is the area going away?
                 */
                if (end <= start)
                        continue;
                size = end - start;

                /* Register lowmem ranges */
                free_bootmem(PFN_PHYS(start), size << PAGE_SHIFT);
                memory_present(0, start, end);
        }

        /*
         * Reserve the bootmap memory.
         */
        reserve_bootmem(PFN_PHYS(mapstart), bootmap_size, BOOTMEM_DEFAULT);

        /*
         * Reserve initrd memory if needed.
         */
        finalize_initrd();
}



struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
};



mm/page_alloc.c
===================
static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];



/**
 * add_active_range - Register a range of PFNs backed by physical memory
 * @nid: The node ID the range resides on
 * @start_pfn: The start PFN of the available physical memory
 * @end_pfn: The end PFN of the available physical memory
 *
 * These ranges are stored in an early_node_map[] and later used by
 * free_area_init_nodes() to calculate zone sizes and holes. If the
 * range spans a memory hole, it is up to the architecture to ensure
 * the memory is not freed by the bootmem allocator. If possible
 * the range being registered will be merged with existing ranges.
 */
void __init add_active_range(unsigned int nid, unsigned long start_pfn,
                             unsigned long end_pfn)
{
        int i;

        mminit_dprintk(MMINIT_TRACE, "memory_register",
                       "Entering add_active_range(%d, %#lx, %#lx) "
                       "%d entries of %d used\n",
                       nid, start_pfn, end_pfn,
                       nr_nodemap_entries, MAX_ACTIVE_REGIONS);

        mminit_validate_memmodel_limits(&start_pfn, &end_pfn);

        /* Merge with existing active regions if possible */
        for (i = 0; i < nr_nodemap_entries; i++) {
                if (early_node_map[i].nid != nid)
                        continue;

                /* Skip if an existing region covers this new one */
                if (start_pfn >= early_node_map[i].start_pfn &&
                    end_pfn <= early_node_map[i].end_pfn)
                        return;

                /* Merge forward if suitable */
                if (start_pfn <= early_node_map[i].end_pfn &&
                    end_pfn > early_node_map[i].end_pfn) {
                        early_node_map[i].end_pfn = end_pfn;
                        return;
                }

                /* Merge backward if suitable */
                if (start_pfn < early_node_map[i].start_pfn &&
                    end_pfn >= early_node_map[i].start_pfn) {
                        early_node_map[i].start_pfn = start_pfn;
                        return;
                }
        }

        /* Check that early_node_map is large enough */
        if (i >= MAX_ACTIVE_REGIONS) {
                printk(KERN_CRIT "More than %d memory regions, truncating\n",
                       MAX_ACTIVE_REGIONS);
                return;
        }

        early_node_map[i].nid = nid;
        early_node_map[i].start_pfn = start_pfn;
        early_node_map[i].end_pfn = end_pfn;
        nr_nodemap_entries = i + 1;
}





arch/mips/mm/init.c
====================================


void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        unsigned long lastpfn __maybe_unused;

        pagetable_init();

#ifdef CONFIG_HIGHMEM
        kmap_init();
#endif
        kmap_coherent_init();

#ifdef CONFIG_ZONE_DMA
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
        lastpfn = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
        lastpfn = highend_pfn;

        if (cpu_has_dc_aliases && max_low_pfn != highend_pfn) {
                printk(KERN_WARNING "This processor doesn't support highmem."
                       " %ldk highmem ignored\n",
                       (highend_pfn - max_low_pfn) << (PAGE_SHIFT - 10));
                max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn;
                lastpfn = max_low_pfn;
        }
#endif

        free_area_init_nodes(max_zone_pfns);
}

#ifdef CONFIG_64BIT
static struct kcore_list kcore_kseg0;
#endif


arch/mips/kernel/setup.c
================================
void __init setup_arch(char **cmdline_p)
{
        cpu_probe();
        prom_init();<=====

#ifdef CONFIG_EARLY_PRINTK
        setup_early_printk();
#endif
        cpu_report();
        check_bugs_early();

#if defined(CONFIG_VT)
#if defined(CONFIG_VGA_CONSOLE)
        conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
        conswitchp = &dummy_con;
#endif
#endif

        arch_mem_init(cmdline_p);<=====

        resource_init();
        plat_smp_setup();
}

So the call chain is setup_arch() -> arch_mem_init() -> bootmem_init() -> add_active_range(); arch_mem_init() then calls paging_init(), whose free_area_init_nodes() consumes early_node_map[] to size the zones.

neighbour study notes (kernel 3.0)

  1. For Ethernet devices, dev->header_ops is eth_header_ops; the call chain below shows where it is assigned.
static int __devinit e1000_probe(struct pci_dev *pdev,
                                 const struct pci_device_id *ent)
...
        netdev = alloc_etherdev(sizeof(struct e1000_adapter));

include/linux/etherdevice.h

#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)

net/ethernet/eth.c

struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                      unsigned int rxqs)
{
        return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs);
}

net/core/dev.c

struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs)
...
        dev->priv_flags = IFF_XMIT_DST_RELEASE;
        setup(dev); <===

        dev->num_tx_queues = txqs;

net/ethernet/eth.c

void ether_setup(struct net_device *dev)
{
        dev->header_ops = &eth_header_ops;<===
        dev->type = ARPHRD_ETHER;
        dev->hard_header_len = ETH_HLEN;
        dev->mtu = ETH_DATA_LEN;
        dev->addr_len = ETH_ALEN;
        dev->tx_queue_len = 1000; /* Ethernet wants good queues */
        dev->flags = IFF_BROADCAST|IFF_MULTICAST;
        dev->priv_flags = IFF_TX_SKB_SHARING;

        memset(dev->broadcast, 0xFF, ETH_ALEN);
}
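Why header_ops matters for the neighbour code: the output path builds the link-layer header through dev_hard_header(), which dispatches to header_ops->create (eth_header() for Ethernet). A sketch of that helper, abridged from memory of the kernel-3.0-era include/linux/netdevice.h rather than quoted verbatim:

/* Abridged sketch of dev_hard_header(): the neighbour output path uses
 * it to build the link-layer header, so for Ethernet devices it ends
 * up calling eth_header() via eth_header_ops. */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                                  unsigned short type, const void *daddr,
                                  const void *saddr, unsigned len)
{
        if (!dev->header_ops || !dev->header_ops->create)
                return 0;

        return dev->header_ops->create(skb, dev, type, daddr, saddr, len);
}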

register_pernet_subsys notes

pernet ops

/**
* register_pernet_subsys - register a network namespace subsystem
* @ops: pernet operations structure for the subsystem
*
* Register a subsystem which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
* When registered all network namespace init functions are
* called for every existing network namespace. Allowing kernel
* modules to have a race free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
int register_pernet_subsys(struct pernet_operations *ops)
{
        int error;
        mutex_lock(&net_mutex);
        error = register_pernet_operations(first_device, ops);
        mutex_unlock(&net_mutex);
        return error;
}
===>static int register_pernet_operations(struct list_head *list,
                                          struct pernet_operations *ops)
{
        error = __register_pernet_operations(list, ops);
}

======>#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        struct net *net;
        int error;
        LIST_HEAD(net_exit_list);

        list_add_tail(&ops->list, list);
        if (ops->init || (ops->id && ops->size)) {
                for_each_net(net) {
=============>          error = ops_init(ops, net);
                        if (error)
                                goto out_undo;
=============>          list_add_tail(&net->exit_list, &net_exit_list); <<< confusing?! isn't net_exit_list a local variable?
                }
        }
        return 0;

(Answer to the confusion above: net_exit_list is deliberately local. It collects the namespaces whose init has already run, so that on failure the out_undo path can invoke the exit methods on exactly those namespaces.)

=============>static int ops_init(const struct pernet_operations *ops, struct net *net)
{
        int err;
        if (ops->id && ops->size) {
                void *data = kzalloc(ops->size, GFP_KERNEL);
                if (!data)
                        return -ENOMEM;

                err = net_assign_generic(net, *ops->id, data);
                if (err) {
                        kfree(data);
                        return err;
                }
        }
        if (ops->init)
                return ops->init(net);<====== the ops->init will be called.
        return 0;
}

####For example
inet6_init registers itself via pernet.

static struct pernet_operations inet6_net_ops = {
        .init = inet6_net_init,
        .exit = inet6_net_exit,
};

static int __init inet6_init(void)
{
        .....
        err = register_pernet_subsys(&inet6_net_ops);
        if (err)
                goto register_pernet_fail;
        .....
}
So the call ops->init(net) seen above resolves, through .init = inet6_net_init, to simply
inet6_net_init(net);
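Tying the pieces together, here is a hypothetical minimal pernet subsystem (all demo_* names invented) that also sets .id/.size, so ops_init() kzalloc()s the per-namespace data and net_generic() retrieves it, using the kernel-3.0-era API:

/* Hypothetical minimal pernet subsystem with per-namespace data. */
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct demo_net { int counter; };        /* invented per-net state */

static int demo_net_id __read_mostly;

static int __net_init demo_net_init(struct net *net)
{
        /* ops_init() already allocated .size bytes and stored them
         * under *ops->id; net_generic() fetches that allocation. */
        struct demo_net *dn = net_generic(net, demo_net_id);

        dn->counter = 0;  /* runs once for every existing and new netns */
        return 0;
}

static void __net_exit demo_net_exit(struct net *net)
{
        /* the per-net allocation itself is freed by the pernet core */
}

static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .exit = demo_net_exit,
        .id   = &demo_net_id,
        .size = sizeof(struct demo_net),
};

static int __init demo_init(void)
{
        return register_pernet_subsys(&demo_net_ops);
}

static void __exit demo_exit(void)
{
        unregister_pernet_subsys(&demo_net_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");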

fix bug: timezone of toolchain

When we compile glibc (or eglibc), the timezone data files are generated along with it, although they are stable and almost never change from one version to the next.

Today we hit a problem with them.

We use the old glibc's timezone files, which are shared by many different toolchains across several platforms.

Unfortunately, the data files were changed upstream by GNU some time after 2007, and I could not pin down the exact glibc version (or date) in which the timezone data files changed.

BTW: toolchain = binutils + gcc + glibc (eglibc) + kernel (headers)

pf_key module summary

###af_key.c
The Linux kernel provides three methods to manage SAs/SPs,
such as add/del/flush/dump of SAs/SPs:

  1. pf_key socket.
  2. netlink message.
  3. socket option.

af_key.c implements the pf_key socket.

###part 1. pf_key socket definition and socket operations
The important functions are
pfkey_create, pfkey_sendmsg, pfkey_recvmsg,
pfkey_release, and datagram_poll.

static const struct proto_ops pfkey_ops = {
        .family         = PF_KEY,
        .owner          = THIS_MODULE,
        /* Operations that make no sense on pfkey sockets. */
        .bind           = sock_no_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,

        /* Now the operations that really occur. */
        .release        = pfkey_release,
        .poll           = datagram_poll,
        .sendmsg        = pfkey_sendmsg,
        .recvmsg        = pfkey_recvmsg,
};


static struct net_proto_family pfkey_family_ops = {
        .family = PF_KEY,
        .create = pfkey_create,
        .owner  = THIS_MODULE,
};


struct pfkey_sock {
        /* struct sock must be the first member of struct pfkey_sock */
        struct sock     sk;
        int             registered;
        int             promisc;

        struct {
                uint8_t         msg_version;
                uint32_t        msg_pid;
                int             (*dump)(struct pfkey_sock *sk);
                void            (*done)(struct pfkey_sock *sk);
                union {
                        struct xfrm_policy_walk policy;
                        struct xfrm_state_walk  state;
                } u;
                struct sk_buff  *skb;
        } dump;
};

###part 2. pf_key kernel message

static struct xfrm_mgr pfkeyv2_mgr =
{
        .id             = "pfkeyv2",
        .notify         = pfkey_send_notify,
        .acquire        = pfkey_send_acquire,
        .compile_policy = pfkey_compile_policy,
        .new_mapping    = pfkey_send_new_mapping,
        .notify_policy  = pfkey_send_policy_notify,
        .migrate        = pfkey_send_migrate,
};

pf_key message processing.

In kernel 3.0, the pf_key message format is a traditional TLV format:

header + (extension header + extension value) * n

The header is struct sadb_msg.
The extension header is struct sadb_ext.
The extension value differs according to the extension header,
e.g. struct sadb_sa, struct sadb_x_policy, and so on.
Note that every PF_KEY length field counts 64-bit words, so a bare
16-byte sadb_msg carries sadb_msg_len == 2.

struct sadb_msg {
        uint8_t         sadb_msg_version;
        uint8_t         sadb_msg_type;
        uint8_t         sadb_msg_errno;
        uint8_t         sadb_msg_satype;
        uint16_t        sadb_msg_len;
        uint16_t        sadb_msg_reserved;
        uint32_t        sadb_msg_seq;
        uint32_t        sadb_msg_pid;
} __attribute__((packed));
/* sizeof(struct sadb_msg) == 16 */

struct sadb_ext {
        uint16_t        sadb_ext_len;
        uint16_t        sadb_ext_type;
} __attribute__((packed));
/* sizeof(struct sadb_ext) == 4 */


struct sadb_sa {
        uint16_t        sadb_sa_len;
        uint16_t        sadb_sa_exttype;
        __be32          sadb_sa_spi;
        uint8_t         sadb_sa_replay;
        uint8_t         sadb_sa_state;
        uint8_t         sadb_sa_auth;
        uint8_t         sadb_sa_encrypt;
        uint32_t        sadb_sa_flags;
} __attribute__((packed));
/* sizeof(struct sadb_sa) == 16 */

struct sadb_x_policy {
        uint16_t        sadb_x_policy_len;
        uint16_t        sadb_x_policy_exttype;
        uint16_t        sadb_x_policy_type;
        uint8_t         sadb_x_policy_dir;
        uint8_t         sadb_x_policy_reserved;
        uint32_t        sadb_x_policy_id;
        uint32_t        sadb_x_policy_priority;
} __attribute__((packed));
/* sizeof(struct sadb_x_policy) == 16 */

The application program (such as setkey) sends a command to the kernel
through the sendmsg system call, so in the kernel pf_key ends up in
pfkey_sendmsg. pfkey_sendmsg calls pfkey_get_base_msg to do some simple
checks, and then calls pfkey_process.

pfkey_process first calls pfkey_broadcast, then splits the extension
headers one by one into a pointer array:
void *ext_hdrs[SADB_EXT_MAX];
SADB_EXT_SA —->
SADB_EXT_ADDRESS_SRC —->
SADB_EXT_ADDRESS_DST —->
This pointer array is used by the handlers that follow.

It then dispatches on sadb_msg_type in the pf_key message header to the
matching pfkey_handler (see the table below).
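For illustration, here is a hypothetical user-space distillation of that extension walk (the in-kernel version is parse_exthdrs() in net/key/af_key.c; walk_exthdrs is an invented name, and all PF_KEY lengths count 64-bit words):

/* Hypothetical distillation of the extension walk: slice the TLVs
 * after the sadb_msg header into the ext_hdrs pointer array. */
#include <stddef.h>
#include <linux/pfkeyv2.h>

static int walk_exthdrs(const struct sadb_msg *hdr, void **ext_hdrs)
{
        const char *p = (const char *)(hdr + 1);
        size_t left = hdr->sadb_msg_len * 8 - sizeof(*hdr);

        while (left > 0) {
                const struct sadb_ext *ext = (const struct sadb_ext *)p;
                size_t len = ext->sadb_ext_len * 8;

                if (len < sizeof(*ext) || len > left)
                        return -1;              /* malformed TLV */
                if (ext->sadb_ext_type != SADB_EXT_RESERVED &&
                    ext->sadb_ext_type <= SADB_EXT_MAX)
                        ext_hdrs[ext->sadb_ext_type - 1] = (void *)ext;
                p += len;
                left -= len;
        }
        return 0;
}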

typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
                             struct sadb_msg *hdr, void **ext_hdrs);
static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
        [SADB_RESERVED]         = pfkey_reserved,
        [SADB_GETSPI]           = pfkey_getspi,
        [SADB_UPDATE]           = pfkey_add,
        [SADB_ADD]              = pfkey_add,
        [SADB_DELETE]           = pfkey_delete,
        [SADB_GET]              = pfkey_get,
        [SADB_ACQUIRE]          = pfkey_acquire,
        [SADB_REGISTER]         = pfkey_register,
        [SADB_EXPIRE]           = NULL,
        [SADB_FLUSH]            = pfkey_flush,
        [SADB_DUMP]             = pfkey_dump,
        [SADB_X_PROMISC]        = pfkey_promisc,
        [SADB_X_PCHANGE]        = NULL,
        [SADB_X_SPDUPDATE]      = pfkey_spdadd,
        [SADB_X_SPDADD]         = pfkey_spdadd,
        [SADB_X_SPDDELETE]      = pfkey_spddelete,
        [SADB_X_SPDGET]         = pfkey_spdget,
        [SADB_X_SPDACQUIRE]     = NULL,
        [SADB_X_SPDDUMP]        = pfkey_spddump,
        [SADB_X_SPDFLUSH]       = pfkey_spdflush,
        [SADB_X_SPDSETIDX]      = pfkey_spdadd,
        [SADB_X_SPDDELETE2]     = pfkey_spdget,
        [SADB_X_MIGRATE]        = pfkey_migrate,
};

The policy-related functions are implemented in xfrm_policy.c and xfrm_state.c.

3.1 policy add handler: pfkey_spdadd

3.2 policy dump handler: pfkey_spddump
(see function pfkey_xfrm_policy2msg)

3.3 policy flush handler: pfkey_spdflush

3.4 SA add handler: pfkey_add

3.5 SA dump handler: pfkey_dump

3.6 SA flush handler: pfkey_flush
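As a user-space counterpart, here is a hypothetical minimal program that exercises the SADB_FLUSH path (it needs CAP_NET_ADMIN; the kernel dispatches the message to pfkey_flush via the table above):

/* Hypothetical example: flush all SAs through a PF_KEY socket, as
 * setkey's "flush;" command does (RFC 2367 framing). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/pfkeyv2.h>

int main(void)
{
        int s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
        struct sadb_msg msg;

        if (s < 0) {
                perror("socket");
                return 1;
        }

        memset(&msg, 0, sizeof(msg));
        msg.sadb_msg_version = PF_KEY_V2;
        msg.sadb_msg_type    = SADB_FLUSH;    /* dispatched to pfkey_flush */
        msg.sadb_msg_satype  = SADB_SATYPE_UNSPEC;
        msg.sadb_msg_len     = sizeof(msg) / 8;  /* length in 64-bit words */
        msg.sadb_msg_seq     = 1;
        msg.sadb_msg_pid     = getpid();

        if (write(s, &msg, sizeof(msg)) < 0)
                perror("write");
        close(s);
        return 0;
}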

xfrm in kernel

Global variables and structures:

The per-SPI state hash is per network namespace: struct netns_xfrm (net->xfrm, include/net/netns/xfrm.h) holds struct hlist_head *state_byspi.
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
xfrm_policy_afinfo is a big array with one element per address family, e.g. IPv4 (AF_INET) and IPv6 (AF_INET6).


USB storage support on embedded Linux

Goal: support reading and writing USB flash drives on an embedded Linux system on the mipsel architecture.
Approach: add scsimod.ko, sdmod.ko, usbstorage.ko, fat.ko and vfat.ko as loadable modules.

##Implementation
###Step I: prepare the required drivers (loadable modules)

  1. scsimod.ko: built from the files under drivers/scsi in the source tree
  2. sdmod.ko: built from the files under drivers/scsi
  3. usbstorage.ko: built from the files under drivers/usb/storage
  4. fat.ko: built from the files under fs/fat
  5. vfat.ko: built from the files under fs/vfat
    For the build procedure of these five loadable modules, see the Makefile in each directory.
