debuggers.hg

view xen/net/dev.c @ 618:4480b471191c

bitkeeper revision 1.259.2.7 (3f0c428fGYxQAV_56B2hOOjYs1PF0A)

Port a bunch of network drivers for low-quality NICs (which will incur extra copying overheads within Xen). This will at least allow us to work on a wider range of systems.
author kaf24@scramble.cl.cam.ac.uk
date Wed Jul 09 16:27:59 2003 +0000 (2003-07-09)
parents 0d6b14f25be6
children 125f43340354
line source
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
10 #include <asm/uaccess.h>
11 #include <asm/system.h>
12 #include <asm/bitops.h>
13 #include <linux/config.h>
14 #include <linux/delay.h>
15 #include <linux/lib.h>
16 #include <linux/types.h>
17 #include <linux/sched.h>
18 #include <linux/mm.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/errno.h>
22 #include <linux/interrupt.h>
23 #include <linux/if_ether.h>
24 #include <linux/netdevice.h>
25 #include <linux/etherdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/brlock.h>
28 #include <linux/init.h>
29 #include <linux/module.h>
31 #include <linux/event.h>
32 #include <asm/domain_page.h>
33 #include <asm/pgalloc.h>
35 #define BUG_TRAP ASSERT
36 #define notifier_call_chain(_a,_b,_c) ((void)0)
37 #define rtmsg_ifinfo(_a,_b,_c) ((void)0)
38 #define rtnl_lock() ((void)0)
39 #define rtnl_unlock() ((void)0)
41 #if 0
42 #define DPRINTK(_f, _a...) printk(_f , ## _a)
43 #else
44 #define DPRINTK(_f, _a...) ((void)0)
45 #endif
47 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
48 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
49 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
50 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
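/*
 * NB. The masking above assumes TX_RING_SIZE and RX_RING_SIZE are powers
 * of two. For example, with TX_RING_SIZE == 256, TX_RING_INC(255) ==
 * (256 & 255) == 0, so the index wraps cleanly to the start of the ring.
 */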
52 static void make_tx_response(net_vif_t *vif,
53 unsigned short id,
54 unsigned char st);
55 static void make_rx_response(net_vif_t *vif,
56 unsigned short id,
57 unsigned short size,
58 unsigned char st,
59 unsigned char off);
61 struct net_device *the_dev = NULL;
63 /*
64 * Transmitted packets are fragmented, so we can copy the important headers
65 * before checking them for validity. Avoids need for page protection.
66 */
67 /* Ethernet + IP headers */
68 #define PKT_PROT_LEN (ETH_HLEN + 20)
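/*
 * ETH_HLEN is 14 bytes, so PKT_PROT_LEN is 34 bytes: the Ethernet header
 * plus a minimal (option-free) IPv4 header.
 */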
69 static kmem_cache_t *net_header_cachep;
71 /**
72 * __dev_get_by_name - find a device by its name
73 * @name: name to find
74 *
75 * Find an interface by name. Must be called under RTNL semaphore
76 * or @dev_base_lock. If the name is found a pointer to the device
77 * is returned. If the name is not found then %NULL is returned. The
78 * reference counters are not incremented so the caller must be
79 * careful with locks.
80 */
83 struct net_device *__dev_get_by_name(const char *name)
84 {
85 struct net_device *dev;
87 for (dev = dev_base; dev != NULL; dev = dev->next) {
88 if (strncmp(dev->name, name, IFNAMSIZ) == 0)
89 return dev;
90 }
91 return NULL;
92 }
94 /**
95 * dev_get_by_name - find a device by its name
96 * @name: name to find
97 *
98 * Find an interface by name. This can be called from any
99 * context and does its own locking. The returned handle has
100 * the usage count incremented and the caller must use dev_put() to
101 * release it when it is no longer needed. %NULL is returned if no
102 * matching device is found.
103 */
105 struct net_device *dev_get_by_name(const char *name)
106 {
107 struct net_device *dev;
109 read_lock(&dev_base_lock);
110 dev = __dev_get_by_name(name);
111 if (dev)
112 dev_hold(dev);
113 read_unlock(&dev_base_lock);
114 return dev;
115 }
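/*
 * A minimal usage sketch (illustrative only; "eth0" is an assumed name,
 * not something this file relies on). dev_get_by_name() takes
 * dev_base_lock and a device reference itself, so the caller's only
 * obligation is a matching dev_put() when finished.
 */
#if 0
static void example_dev_lookup(void)
{
    struct net_device *dev = dev_get_by_name("eth0");
    if ( dev != NULL )
    {
        printk("%s has ifindex %d\n", dev->name, dev->ifindex);
        dev_put(dev); /* drop the reference taken by dev_get_by_name() */
    }
}
#endif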
117 /**
118 * dev_get - test if a device exists
119 * @name: name to test for
120 *
121 * Test if a name exists. Returns true if the name is found. In order
122 * to be sure the name is not allocated or removed during the test the
123 * caller must hold the rtnl semaphore.
124 *
125 * This function primarily exists for back compatibility with older
126 * drivers.
127 */
129 int dev_get(const char *name)
130 {
131 struct net_device *dev;
133 read_lock(&dev_base_lock);
134 dev = __dev_get_by_name(name);
135 read_unlock(&dev_base_lock);
136 return dev != NULL;
137 }
139 /**
140 * __dev_get_by_index - find a device by its ifindex
141 * @ifindex: index of device
142 *
143 * Search for an interface by index. Returns %NULL if the device
144 * is not found or a pointer to the device. The device has not
145 * had its reference counter increased so the caller must be careful
146 * about locking. The caller must hold either the RTNL semaphore
147 * or @dev_base_lock.
148 */
150 struct net_device * __dev_get_by_index(int ifindex)
151 {
152 struct net_device *dev;
154 for (dev = dev_base; dev != NULL; dev = dev->next) {
155 if (dev->ifindex == ifindex)
156 return dev;
157 }
158 return NULL;
159 }
162 /**
163 * dev_get_by_index - find a device by its ifindex
164 * @ifindex: index of device
165 *
166 * Search for an interface by index. Returns NULL if the device
167 * is not found or a pointer to the device. The device returned has
168 * had a reference added and the pointer is safe until the user calls
169 * dev_put to indicate they have finished with it.
170 */
172 struct net_device * dev_get_by_index(int ifindex)
173 {
174 struct net_device *dev;
176 read_lock(&dev_base_lock);
177 dev = __dev_get_by_index(ifindex);
178 if (dev)
179 dev_hold(dev);
180 read_unlock(&dev_base_lock);
181 return dev;
182 }
184 /**
185 * dev_getbyhwaddr - find a device by its hardware address
186 * @type: media type of device
187 * @ha: hardware address
188 *
189 * Search for an interface by MAC address. Returns NULL if the device
190 * is not found or a pointer to the device. The caller must hold the
191 * rtnl semaphore. The returned device has not had its ref count increased
192 * and the caller must therefore be careful about locking
193 *
194 * BUGS:
195 * If the API was consistent this would be __dev_get_by_hwaddr
196 */
198 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
199 {
200 struct net_device *dev;
202 for (dev = dev_base; dev != NULL; dev = dev->next) {
203 if (dev->type == type &&
204 memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
205 return dev;
206 }
207 return NULL;
208 }
210 /**
211 * dev_alloc_name - allocate a name for a device
212 * @dev: device
213 * @name: name format string
214 *
215 * Passed a format string - eg "lt%d" it will try and find a suitable
216 * id. Not efficient for many devices, not called a lot. The caller
217 * must hold the dev_base or rtnl lock while allocating the name and
218 * adding the device in order to avoid duplicates. Returns the number
219 * of the unit assigned or a negative errno code.
220 */
222 int dev_alloc_name(struct net_device *dev, const char *name)
223 {
224 int i;
225 char buf[32];
226 char *p;
228 /*
229 * Verify the string as this thing may have come from
230 * the user. There must be either one "%d" and no other "%"
231 * characters, or no "%" characters at all.
232 */
233 p = strchr(name, '%');
234 if (p && (p[1] != 'd' || strchr(p+2, '%')))
235 return -EINVAL;
237 /*
238 * If you need over 100 please also fix the algorithm...
239 */
240 for (i = 0; i < 100; i++) {
241 snprintf(buf,sizeof(buf),name,i);
242 if (__dev_get_by_name(buf) == NULL) {
243 strcpy(dev->name, buf);
244 return i;
245 }
246 }
247 return -ENFILE; /* Over 100 of the things .. bail out! */
248 }
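/*
 * For example (illustrative only): passed "eth%d" with eth0 and eth1
 * already registered, the loop above finds "eth2" free, copies it into
 * dev->name, and returns 2.
 */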
250 /**
251 * dev_alloc - allocate a network device and name
252 * @name: name format string
253 * @err: error return pointer
254 *
255 * Passed a format string, eg. "lt%d", it will allocate a network device
256 * and space for the name. %NULL is returned if no memory is available.
257 * If the allocation succeeds then the name is assigned and the
258 * device pointer returned. %NULL is returned if the name allocation
259 * failed. The cause of an error is returned as a negative errno code
260 * in the variable @err points to.
261 *
262 * The caller must hold the @dev_base or RTNL locks when doing this in
263 * order to avoid duplicate name allocations.
264 */
266 struct net_device *dev_alloc(const char *name, int *err)
267 {
268 struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL);
269 if (dev == NULL) {
270 *err = -ENOBUFS;
271 return NULL;
272 }
273 memset(dev, 0, sizeof(struct net_device));
274 *err = dev_alloc_name(dev, name);
275 if (*err < 0) {
276 kfree(dev);
277 return NULL;
278 }
279 return dev;
280 }
282 /**
283 * netdev_state_change - device changes state
284 * @dev: device to cause notification
285 *
286 * Called to indicate a device has changed state. This function calls
287 * the notifier chains for netdev_chain and sends a NEWLINK message
288 * to the routing socket.
289 */
291 void netdev_state_change(struct net_device *dev)
292 {
293 if (dev->flags&IFF_UP) {
294 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
295 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
296 }
297 }
300 #ifdef CONFIG_KMOD
302 /**
303 * dev_load - load a network module
304 * @name: name of interface
305 *
306 * If a network interface is not present and the process has suitable
307 * privileges this function loads the module. If module loading is not
308 * available in this kernel then it becomes a nop.
309 */
311 void dev_load(const char *name)
312 {
313 if (!dev_get(name) && capable(CAP_SYS_MODULE))
314 request_module(name);
315 }
317 #else
319 extern inline void dev_load(const char *unused){;}
321 #endif
323 static int default_rebuild_header(struct sk_buff *skb)
324 {
325 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
326 skb->dev ? skb->dev->name : "NULL!!!");
327 kfree_skb(skb);
328 return 1;
329 }
331 /**
332 * dev_open - prepare an interface for use.
333 * @dev: device to open
334 *
335 * Takes a device from down to up state. The device's private open
336 * function is invoked and then the multicast lists are loaded. Finally
337 * the device is moved into the up state and a %NETDEV_UP message is
338 * sent to the netdev notifier chain.
339 *
340 * Calling this function on an active interface is a nop. On a failure
341 * a negative errno code is returned.
342 */
344 int dev_open(struct net_device *dev)
345 {
346 int ret = 0;
348 /*
349 * Is it already up?
350 */
352 if (dev->flags&IFF_UP)
353 return 0;
355 /*
356 * Is it even present?
357 */
358 if (!netif_device_present(dev))
359 return -ENODEV;
361 /*
362 * Call device private open method
363 */
364 if (try_inc_mod_count(dev->owner)) {
365 if (dev->open) {
366 ret = dev->open(dev);
367 if (ret != 0 && dev->owner)
368 __MOD_DEC_USE_COUNT(dev->owner);
369 }
370 } else {
371 ret = -ENODEV;
372 }
374 /*
375 * If it went open OK then:
376 */
378 if (ret == 0)
379 {
380 /*
381 * Set the flags.
382 */
383 dev->flags |= IFF_UP;
385 set_bit(__LINK_STATE_START, &dev->state);
387 /*
388 * Initialize multicasting status
389 */
390 dev_mc_upload(dev);
392 /*
393 * Wakeup transmit queue engine
394 */
395 dev_activate(dev);
397 /*
398 * ... and announce new interface.
399 */
400 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
401 }
402 return(ret);
403 }
406 /**
407 * dev_close - shutdown an interface.
408 * @dev: device to shutdown
409 *
410 * This function moves an active device into down state. A
411 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
412 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
413 * chain.
414 */
416 int dev_close(struct net_device *dev)
417 {
418 if (!(dev->flags&IFF_UP))
419 return 0;
421 /*
422 * Tell people we are going down, so that they can
423 * prepare for its death while the device is still operating.
424 */
425 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
427 dev_deactivate(dev);
429 clear_bit(__LINK_STATE_START, &dev->state);
431 /*
432 * Call the device specific close. This cannot fail.
433 * Only if device is UP
434 *
435 * We allow it to be called even after a DETACH hot-plug
436 * event.
437 */
439 if (dev->stop)
440 dev->stop(dev);
442 /*
443 * Device is now down.
444 */
446 dev->flags &= ~IFF_UP;
448 /*
449 * Tell people we are down
450 */
451 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
453 /*
454 * Drop the module refcount
455 */
456 if (dev->owner)
457 __MOD_DEC_USE_COUNT(dev->owner);
459 return(0);
460 }
463 #ifdef CONFIG_HIGHMEM
464 /* Actually, we should eliminate this check as soon as we know that:
465 * 1. An IOMMU is present and can map all of memory.
466 * 2. No high memory really exists on this machine.
467 */
469 static inline int
470 illegal_highdma(struct net_device *dev, struct sk_buff *skb)
471 {
472 int i;
474 if (dev->features&NETIF_F_HIGHDMA)
475 return 0;
477 for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
478 if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
479 return 1;
481 return 0;
482 }
483 #else
484 #define illegal_highdma(dev, skb) (0)
485 #endif
488 /*=======================================================================
489 Receiver routines
490 =======================================================================*/
492 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
494 void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
495 {
496 rx_shadow_entry_t *rx;
497 unsigned long *ptep;
498 struct pfn_info *old_page, *new_page, *pte_page;
499 unsigned int i;
500 unsigned short size;
501 unsigned char offset, status = RING_STATUS_OK;
503 memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
504 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
505 memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
507 /*
508 * Slightly gross: we need the page_lock so that we can do PTE checking.
509 * However, we take it slightly early so that it can protect the update
510 * of rx_cons. This saves us from grabbing two locks.
511 */
512 spin_lock(&vif->domain->page_lock);
514 if ( (i = vif->rx_cons) == vif->rx_prod )
515 {
516 spin_unlock(&vif->domain->page_lock);
517 return;
518 }
519 rx = vif->rx_shadow_ring + i;
520 vif->rx_cons = RX_RING_INC(i);
522 size = (unsigned short)skb->len;
523 offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
525 /* Release the page-table page. */
526 pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
527 put_page_type(pte_page);
528 put_page_tot(pte_page);
530 old_page = frame_table + rx->buf_pfn;
531 new_page = skb->pf;
533 ptep = map_domain_mem(rx->pte_ptr);
535 if ( (*ptep & _PAGE_PRESENT) )
536 {
537 /* Bail out if the PTE has been reused under our feet. */
538 list_add(&old_page->list, &vif->domain->pg_head);
539 old_page->flags = vif->domain->domain;
540 unmap_domain_mem(ptep);
541 spin_unlock(&vif->domain->page_lock);
542 status = RING_STATUS_BAD_PAGE;
543 goto out;
544 }
546 /* Give the new page to the domain, marking it writeable. */
547 new_page->tot_count = new_page->type_count = 1;
548 new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
549 list_add(&new_page->list, &vif->domain->pg_head);
551 /* Patch the PTE to map the new page as writeable. */
552 machine_to_phys_mapping[new_page - frame_table]
553 = machine_to_phys_mapping[old_page - frame_table];
554 *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
555 (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
557 unmap_domain_mem(ptep);
559 spin_unlock(&vif->domain->page_lock);
561 /* Our skbuff now points at the guest's old frame. */
562 skb->pf = old_page;
564 /* Updates must happen before releasing the descriptor. */
565 smp_wmb();
567 /*
568 * NB. The remote flush here should be safe, as we hold no locks. The
569 * network driver that called us should also have no nasty locks.
570 */
571 if ( rx->flush_count == (unsigned short)
572 atomic_read(&tlb_flush_count[vif->domain->processor]) )
573 flush_tlb_cpu(vif->domain->processor);
575 out:
576 make_rx_response(vif, rx->id, size, status, offset);
578 /* record this so they can be billed */
579 vif->total_packets_received++;
580 vif->total_bytes_received += size;
581 }
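/*
 * To summarise the exchange above: on success, the frame the packet
 * actually arrived in (new_page) is mapped into the guest in place of
 * the buffer it posted, while the guest's old frame (old_page) is handed
 * to the skbuff for reuse. Delivery is a page flip, not a copy.
 */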
583 /**
584 * netif_rx - post buffer to the network code
585 * @skb: buffer to post
586 *
587 * This function receives a packet from a device driver and queues it for
588 * the upper (protocol) levels to process. It always succeeds. The buffer
589 * may be dropped during processing for congestion control or by the
590 * protocol layers.
591 *
592 * return values:
593 * NET_RX_SUCCESS (no congestion)
594 * NET_RX_DROP (packet was dropped)
595 */
597 int netif_rx(struct sk_buff *skb)
598 {
599 int offset, this_cpu = smp_processor_id();
600 unsigned long flags;
602 local_irq_save(flags);
604 ASSERT(skb->skb_type == SKB_ZERO_COPY);
606 /*
607 * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes for
608 * ethernet header, plus any other alignment padding added by the driver.
609 */
610 offset = (int)skb->data & ~PAGE_MASK;
611 skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) << PAGE_SHIFT));
612 skb->data = skb->nh.raw = skb->head + offset;
613 skb->tail = skb->data + skb->len;
614 skb_push(skb, ETH_HLEN);
615 skb->mac.raw = skb->data;
617 netdev_rx_stat[this_cpu].total++;
619 if ( skb->dst_vif == NULL )
620 skb->dst_vif = net_get_target_vif(skb->data, skb->len, skb->src_vif);
622 if ( (skb->dst_vif == VIF_PHYS) || (skb->dst_vif == VIF_DROP) )
623 {
624 netdev_rx_stat[this_cpu].dropped++;
625 unmap_domain_mem(skb->head);
626 kfree_skb(skb);
627 local_irq_restore(flags);
628 return NET_RX_DROP;
629 }
631 deliver_packet(skb, skb->dst_vif);
632 put_vif(skb->dst_vif);
634 unmap_domain_mem(skb->head);
635 kfree_skb(skb);
636 local_irq_restore(flags);
637 return NET_RX_SUCCESS;
638 }
641 /*************************************************************
642 * NEW TRANSMIT SCHEDULER
643 *
644 * NB. We ought also to only send a limited number of bytes to the NIC
645 * for transmission at any one time (to avoid head-of-line blocking).
646 * However, driver rings are small enough that they provide a reasonable
647 * limit.
648 *
649 * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
650 * e1000 has 256 descriptors == 128 packets, at 1000Mbps
651 * tg3 has 512 descriptors == 256 packets, at 1000Mbps
652 *
653 * So, worst case is tg3 with 256 1500-bytes packets == 375kB.
654 * This would take 3ms, and represents our worst-case HoL blocking cost.
655 *
656 * We think this is reasonable.
657 */
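/*
 * Checking the arithmetic: 256 packets x 1500 bytes = 384000 bytes
 * (~375kB); at 1000Mbps (125MB/s) that drains in roughly 3ms, which is
 * the worst-case head-of-line blocking figure quoted above.
 */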
659 struct list_head net_schedule_list;
660 spinlock_t net_schedule_list_lock;
662 static int __on_net_schedule_list(net_vif_t *vif)
663 {
664 return vif->list.next != NULL;
665 }
667 static void remove_from_net_schedule_list(net_vif_t *vif)
668 {
669 unsigned long flags;
670 spin_lock_irqsave(&net_schedule_list_lock, flags);
671 ASSERT(__on_net_schedule_list(vif));
672 list_del(&vif->list);
673 vif->list.next = NULL;
674 put_vif(vif);
675 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
676 }
678 static void add_to_net_schedule_list_tail(net_vif_t *vif)
679 {
680 unsigned long flags;
681 if ( __on_net_schedule_list(vif) ) return;
682 spin_lock_irqsave(&net_schedule_list_lock, flags);
683 if ( !__on_net_schedule_list(vif) )
684 {
685 list_add_tail(&vif->list, &net_schedule_list);
686 get_vif(vif);
687 }
688 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
689 }
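/*
 * Note the check-lock-recheck pattern above: the unlocked
 * __on_net_schedule_list() test is only a fast-path optimisation, so it
 * is repeated under the lock before the vif is actually linked in.
 */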
692 /* Destructor function for tx skbs. */
693 static void tx_skb_release(struct sk_buff *skb)
694 {
695 int i;
696 net_vif_t *vif = skb->src_vif;
697 unsigned long flags;
699 spin_lock_irqsave(&vif->domain->page_lock, flags);
700 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
701 put_page_tot(skb_shinfo(skb)->frags[i].page);
702 spin_unlock_irqrestore(&vif->domain->page_lock, flags);
704 if ( skb->skb_type == SKB_NODATA )
705 kmem_cache_free(net_header_cachep, skb->head);
707 skb_shinfo(skb)->nr_frags = 0;
709 make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
711 put_vif(vif);
712 }
715 static void net_tx_action(unsigned long unused)
716 {
717 struct net_device *dev = the_dev;
718 struct list_head *ent;
719 struct sk_buff *skb;
720 net_vif_t *vif;
721 tx_shadow_entry_t *tx;
723 spin_lock(&dev->xmit_lock);
724 while ( !netif_queue_stopped(dev) &&
725 !list_empty(&net_schedule_list) )
726 {
727 /* Get a vif from the list with work to do. */
728 ent = net_schedule_list.next;
729 vif = list_entry(ent, net_vif_t, list);
730 get_vif(vif);
731 remove_from_net_schedule_list(vif);
732 if ( vif->tx_cons == vif->tx_prod )
733 {
734 put_vif(vif);
735 continue;
736 }
738 if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
739 {
740 printk("Out of memory in net_tx_action()!\n");
741 add_to_net_schedule_list_tail(vif);
742 put_vif(vif);
743 break;
744 }
746 /* Pick an entry from the transmit queue. */
747 tx = &vif->tx_shadow_ring[vif->tx_cons];
748 vif->tx_cons = TX_RING_INC(vif->tx_cons);
749 if ( vif->tx_cons != vif->tx_prod )
750 add_to_net_schedule_list_tail(vif);
752 skb->destructor = tx_skb_release;
754 skb->head = skb->data = tx->header;
755 skb->end = skb->tail = skb->head + PKT_PROT_LEN;
757 skb->dev = the_dev;
758 skb->src_vif = vif;
759 skb->dst_vif = NULL;
760 skb->mac.raw = skb->data;
761 skb->guest_id = tx->id;
763 skb_shinfo(skb)->frags[0].page = frame_table +
764 (tx->payload >> PAGE_SHIFT);
765 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
766 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
767 skb_shinfo(skb)->nr_frags = 1;
769 skb->data_len = tx->size - PKT_PROT_LEN;
770 skb->len = tx->size;
772 /* record the transmission so they can be billed */
773 vif->total_packets_sent++;
774 vif->total_bytes_sent += tx->size;
776 /* Is the NIC crap? */
777 if ( !(dev->features & NETIF_F_SG) )
778 skb_linearize(skb, GFP_KERNEL);
780 /* Transmit should always work, or the queue would be stopped. */
781 if ( dev->hard_start_xmit(skb, dev) != 0 )
782 {
783 printk("Weird failure in hard_start_xmit!\n");
784 kfree_skb(skb);
785 break;
786 }
787 }
788 spin_unlock(&dev->xmit_lock);
789 }
791 DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);
793 static inline void maybe_schedule_tx_action(void)
794 {
795 smp_mb();
796 if ( !netif_queue_stopped(the_dev) &&
797 !list_empty(&net_schedule_list) )
798 tasklet_schedule(&net_tx_tasklet);
799 }
802 /*
803 * We need this ioctl for efficient implementation of the
804 * if_indextoname() function required by the IPv6 API. Without
805 * it, we would have to search all the interfaces to find a
806 * match. --pb
807 */
809 static int dev_ifname(struct ifreq *arg)
810 {
811 struct net_device *dev;
812 struct ifreq ifr;
814 /*
815 * Fetch the caller's info block.
816 */
818 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
819 return -EFAULT;
821 read_lock(&dev_base_lock);
822 dev = __dev_get_by_index(ifr.ifr_ifindex);
823 if (!dev) {
824 read_unlock(&dev_base_lock);
825 return -ENODEV;
826 }
828 strcpy(ifr.ifr_name, dev->name);
829 read_unlock(&dev_base_lock);
831 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
832 return -EFAULT;
833 return 0;
834 }
837 /**
838 * netdev_set_master - set up master/slave pair
839 * @slave: slave device
840 * @master: new master device
841 *
842 * Changes the master device of the slave. Pass %NULL to break the
843 * bonding. The caller must hold the RTNL semaphore. On a failure
844 * a negative errno code is returned. On success the reference counts
845 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
846 * function returns zero.
847 */
849 int netdev_set_master(struct net_device *slave, struct net_device *master)
850 {
851 struct net_device *old = slave->master;
853 if (master) {
854 if (old)
855 return -EBUSY;
856 dev_hold(master);
857 }
859 br_write_lock_bh(BR_NETPROTO_LOCK);
860 slave->master = master;
861 br_write_unlock_bh(BR_NETPROTO_LOCK);
863 if (old)
864 dev_put(old);
866 if (master)
867 slave->flags |= IFF_SLAVE;
868 else
869 slave->flags &= ~IFF_SLAVE;
871 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
872 return 0;
873 }
875 /**
876 * dev_set_promiscuity - update promiscuity count on a device
877 * @dev: device
878 * @inc: modifier
879 *
880 * Add or remove promiscuity from a device. While the count in the device
881 * remains above zero the interface remains promiscuous. Once it hits zero
882 * the device reverts back to normal filtering operation. A negative inc
883 * value is used to drop promiscuity on the device.
884 */
886 void dev_set_promiscuity(struct net_device *dev, int inc)
887 {
888 unsigned short old_flags = dev->flags;
890 dev->flags |= IFF_PROMISC;
891 if ((dev->promiscuity += inc) == 0)
892 dev->flags &= ~IFF_PROMISC;
893 if (dev->flags^old_flags) {
894 #ifdef CONFIG_NET_FASTROUTE
895 if (dev->flags&IFF_PROMISC) {
896 netdev_fastroute_obstacles++;
897 dev_clear_fastroute(dev);
898 } else
899 netdev_fastroute_obstacles--;
900 #endif
901 dev_mc_upload(dev);
902 printk(KERN_INFO "device %s %s promiscuous mode\n",
903 dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
904 }
905 }
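/*
 * Usage sketch (illustrative only): each independent user of promiscuous
 * mode calls dev_set_promiscuity(dev, 1) to enable it and later
 * dev_set_promiscuity(dev, -1) to release it; the interface leaves
 * promiscuous mode only when the count returns to zero.
 */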
907 /**
908 * dev_set_allmulti - update allmulti count on a device
909 * @dev: device
910 * @inc: modifier
911 *
912 * Add or remove reception of all multicast frames to a device. While the
913 * count in the device remains above zero the interface continues to
914 * receive all multicast frames. Once it hits zero the device reverts to normal
915 * filtering operation. A negative @inc value is used to drop the counter
916 * when releasing a resource needing all multicasts.
917 */
919 void dev_set_allmulti(struct net_device *dev, int inc)
920 {
921 unsigned short old_flags = dev->flags;
923 dev->flags |= IFF_ALLMULTI;
924 if ((dev->allmulti += inc) == 0)
925 dev->flags &= ~IFF_ALLMULTI;
926 if (dev->flags^old_flags)
927 dev_mc_upload(dev);
928 }
930 int dev_change_flags(struct net_device *dev, unsigned flags)
931 {
932 int ret;
933 int old_flags = dev->flags;
935 /*
936 * Set the flags on our device.
937 */
939 dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
940 IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
941 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
943 /*
944 * Load in the correct multicast list now the flags have changed.
945 */
947 dev_mc_upload(dev);
949 /*
950 * Have we downed the interface? We handle IFF_UP ourselves
951 * according to user attempts to set it, rather than blindly
952 * setting it.
953 */
955 ret = 0;
956 if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
957 {
958 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
960 if (ret == 0)
961 dev_mc_upload(dev);
962 }
964 if (dev->flags&IFF_UP &&
965 ((old_flags^dev->flags)&
966 ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
967 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
969 if ((flags^dev->gflags)&IFF_PROMISC) {
970 int inc = (flags&IFF_PROMISC) ? +1 : -1;
971 dev->gflags ^= IFF_PROMISC;
972 dev_set_promiscuity(dev, inc);
973 }
975 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
976 is important. Some (broken) drivers set IFF_PROMISC when
977 IFF_ALLMULTI is requested, without asking us and without reporting it.
978 */
979 if ((flags^dev->gflags)&IFF_ALLMULTI) {
980 int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
981 dev->gflags ^= IFF_ALLMULTI;
982 dev_set_allmulti(dev, inc);
983 }
985 if (old_flags^dev->flags)
986 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);
988 return ret;
989 }
991 /*
992 * Perform the SIOCxIFxxx calls.
993 */
995 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
996 {
997 struct net_device *dev;
998 int err;
1000 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
1001 return -ENODEV;
1003 switch(cmd)
1004 {
1005 case SIOCGIFFLAGS: /* Get interface flags */
1006 ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
1007 |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
1008 if (netif_running(dev) && netif_carrier_ok(dev))
1009 ifr->ifr_flags |= IFF_RUNNING;
1010 return 0;
1012 case SIOCSIFFLAGS: /* Set interface flags */
1013 return dev_change_flags(dev, ifr->ifr_flags);
1015 case SIOCGIFMETRIC: /* Get the metric on the interface */
1016 ifr->ifr_metric = 0;
1017 return 0;
1019 case SIOCSIFMETRIC: /* Set the metric on the interface */
1020 return -EOPNOTSUPP;
1022 case SIOCGIFMTU: /* Get the MTU of a device */
1023 ifr->ifr_mtu = dev->mtu;
1024 return 0;
1026 case SIOCSIFMTU: /* Set the MTU of a device */
1027 if (ifr->ifr_mtu == dev->mtu)
1028 return 0;
1030 /*
1031 * MTU must be positive.
1032 */
1034 if (ifr->ifr_mtu<0)
1035 return -EINVAL;
1037 if (!netif_device_present(dev))
1038 return -ENODEV;
1040 if (dev->change_mtu)
1041 err = dev->change_mtu(dev, ifr->ifr_mtu);
1042 else {
1043 dev->mtu = ifr->ifr_mtu;
1044 err = 0;
1045 }
1046 if (!err && dev->flags&IFF_UP)
1047 notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
1048 return err;
1050 case SIOCGIFHWADDR:
1051 memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
1052 ifr->ifr_hwaddr.sa_family=dev->type;
1053 return 0;
1055 case SIOCSIFHWADDR:
1056 if (dev->set_mac_address == NULL)
1057 return -EOPNOTSUPP;
1058 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1059 return -EINVAL;
1060 if (!netif_device_present(dev))
1061 return -ENODEV;
1062 err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
1063 if (!err)
1064 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1065 return err;
1067 case SIOCSIFHWBROADCAST:
1068 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1069 return -EINVAL;
1070 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
1071 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1072 return 0;
1074 case SIOCGIFMAP:
1075 ifr->ifr_map.mem_start=dev->mem_start;
1076 ifr->ifr_map.mem_end=dev->mem_end;
1077 ifr->ifr_map.base_addr=dev->base_addr;
1078 ifr->ifr_map.irq=dev->irq;
1079 ifr->ifr_map.dma=dev->dma;
1080 ifr->ifr_map.port=dev->if_port;
1081 return 0;
1083 case SIOCSIFMAP:
1084 if (dev->set_config) {
1085 if (!netif_device_present(dev))
1086 return -ENODEV;
1087 return dev->set_config(dev,&ifr->ifr_map);
1088 }
1089 return -EOPNOTSUPP;
1091 case SIOCADDMULTI:
1092 if (dev->set_multicast_list == NULL ||
1093 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
1094 return -EINVAL;
1095 if (!netif_device_present(dev))
1096 return -ENODEV;
1097 dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
1098 return 0;
1100 case SIOCDELMULTI:
1101 if (dev->set_multicast_list == NULL ||
1102 ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
1103 return -EINVAL;
1104 if (!netif_device_present(dev))
1105 return -ENODEV;
1106 dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
1107 return 0;
1109 case SIOCGIFINDEX:
1110 ifr->ifr_ifindex = dev->ifindex;
1111 return 0;
1113 case SIOCSIFNAME:
1114 if (dev->flags&IFF_UP)
1115 return -EBUSY;
1116 if (__dev_get_by_name(ifr->ifr_newname))
1117 return -EEXIST;
1118 memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
1119 dev->name[IFNAMSIZ-1] = 0;
1120 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1121 return 0;
1123 #ifdef WIRELESS_EXT
1124 case SIOCGIWSTATS:
1125 return dev_iwstats(dev, ifr);
1126 #endif /* WIRELESS_EXT */
1128 /*
1129 * Unknown or private ioctl
1130 */
1132 default:
1133 if ((cmd >= SIOCDEVPRIVATE &&
1134 cmd <= SIOCDEVPRIVATE + 15) ||
1135 cmd == SIOCBONDENSLAVE ||
1136 cmd == SIOCBONDRELEASE ||
1137 cmd == SIOCBONDSETHWADDR ||
1138 cmd == SIOCBONDSLAVEINFOQUERY ||
1139 cmd == SIOCBONDINFOQUERY ||
1140 cmd == SIOCBONDCHANGEACTIVE ||
1141 cmd == SIOCETHTOOL ||
1142 cmd == SIOCGMIIPHY ||
1143 cmd == SIOCGMIIREG ||
1144 cmd == SIOCSMIIREG) {
1145 if (dev->do_ioctl) {
1146 if (!netif_device_present(dev))
1147 return -ENODEV;
1148 return dev->do_ioctl(dev, ifr, cmd);
1149 }
1150 return -EOPNOTSUPP;
1151 }
1153 #ifdef WIRELESS_EXT
1154 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1155 if (dev->do_ioctl) {
1156 if (!netif_device_present(dev))
1157 return -ENODEV;
1158 return dev->do_ioctl(dev, ifr, cmd);
1159 }
1160 return -EOPNOTSUPP;
1161 }
1162 #endif /* WIRELESS_EXT */
1164 }
1165 return -EINVAL;
1166 }
1168 /*
1169 * This function handles all "interface"-type I/O control requests. The actual
1170 * 'doing' part of this is dev_ifsioc above.
1171 */
1173 /**
1174 * dev_ioctl - network device ioctl
1175 * @cmd: command to issue
1176 * @arg: pointer to a struct ifreq in user space
1178 * Issue ioctl functions to devices. This is normally called by the
1179 * user space syscall interfaces but can sometimes be useful for
1180 * other purposes. The return value is the return from the syscall if
1181 * positive or a negative errno code on error.
1182 */
1184 int dev_ioctl(unsigned int cmd, void *arg)
1185 {
1186 struct ifreq ifr;
1187 int ret;
1188 char *colon;
1190 /* One special case: SIOCGIFCONF takes ifconf argument
1191 and requires shared lock, because it sleeps writing
1192 to user space.
1193 */
1195 if (cmd == SIOCGIFCONF) {
1196 return -ENOSYS;
1197 }
1198 if (cmd == SIOCGIFNAME) {
1199 return dev_ifname((struct ifreq *)arg);
1200 }
1202 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1203 return -EFAULT;
1205 ifr.ifr_name[IFNAMSIZ-1] = 0;
1207 colon = strchr(ifr.ifr_name, ':');
1208 if (colon)
1209 *colon = 0;
1211 /*
1212 * See which interface the caller is talking about.
1213 */
1215 switch(cmd)
1216 {
1217 /*
1218 * These ioctl calls:
1219 * - can be done by all.
1220 * - atomic and do not require locking.
1221 * - return a value
1222 */
1224 case SIOCGIFFLAGS:
1225 case SIOCGIFMETRIC:
1226 case SIOCGIFMTU:
1227 case SIOCGIFHWADDR:
1228 case SIOCGIFSLAVE:
1229 case SIOCGIFMAP:
1230 case SIOCGIFINDEX:
1231 dev_load(ifr.ifr_name);
1232 read_lock(&dev_base_lock);
1233 ret = dev_ifsioc(&ifr, cmd);
1234 read_unlock(&dev_base_lock);
1235 if (!ret) {
1236 if (colon)
1237 *colon = ':';
1238 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1239 return -EFAULT;
1240 }
1241 return ret;
1243 /*
1244 * These ioctl calls:
1245 * - require superuser power.
1246 * - require strict serialization.
1247 * - return a value
1248 */
1250 case SIOCETHTOOL:
1251 case SIOCGMIIPHY:
1252 case SIOCGMIIREG:
1253 if (!capable(CAP_NET_ADMIN))
1254 return -EPERM;
1255 dev_load(ifr.ifr_name);
1256 dev_probe_lock();
1257 rtnl_lock();
1258 ret = dev_ifsioc(&ifr, cmd);
1259 rtnl_unlock();
1260 dev_probe_unlock();
1261 if (!ret) {
1262 if (colon)
1263 *colon = ':';
1264 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1265 return -EFAULT;
1266 }
1267 return ret;
1269 /*
1270 * These ioctl calls:
1271 * - require superuser power.
1272 * - require strict serialization.
1273 * - do not return a value
1274 */
1276 case SIOCSIFFLAGS:
1277 case SIOCSIFMETRIC:
1278 case SIOCSIFMTU:
1279 case SIOCSIFMAP:
1280 case SIOCSIFHWADDR:
1281 case SIOCSIFSLAVE:
1282 case SIOCADDMULTI:
1283 case SIOCDELMULTI:
1284 case SIOCSIFHWBROADCAST:
1285 case SIOCSIFNAME:
1286 case SIOCSMIIREG:
1287 case SIOCBONDENSLAVE:
1288 case SIOCBONDRELEASE:
1289 case SIOCBONDSETHWADDR:
1290 case SIOCBONDSLAVEINFOQUERY:
1291 case SIOCBONDINFOQUERY:
1292 case SIOCBONDCHANGEACTIVE:
1293 if (!capable(CAP_NET_ADMIN))
1294 return -EPERM;
1295 dev_load(ifr.ifr_name);
1296 dev_probe_lock();
1297 rtnl_lock();
1298 ret = dev_ifsioc(&ifr, cmd);
1299 rtnl_unlock();
1300 dev_probe_unlock();
1301 return ret;
1303 case SIOCGIFMEM:
1304 /* Get the per device memory space. We can add this but currently
1305 do not support it */
1306 case SIOCSIFMEM:
1307 /* Set the per device memory buffer space. */
1308 case SIOCSIFLINK:
1309 return -EINVAL;
1311 /*
1312 * Unknown or private ioctl.
1313 */
1315 default:
1316 if (cmd >= SIOCDEVPRIVATE &&
1317 cmd <= SIOCDEVPRIVATE + 15) {
1318 dev_load(ifr.ifr_name);
1319 dev_probe_lock();
1320 rtnl_lock();
1321 ret = dev_ifsioc(&ifr, cmd);
1322 rtnl_unlock();
1323 dev_probe_unlock();
1324 if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1325 return -EFAULT;
1326 return ret;
1327 }
1328 #ifdef WIRELESS_EXT
1329 /* Take care of Wireless Extensions */
1330 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1331 /* If command is `set a parameter', or
1332 * `get the encoding parameters', check if
1333 * the user has the right to do it */
1334 if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
1335 if(!capable(CAP_NET_ADMIN))
1336 return -EPERM;
1337 }
1338 dev_load(ifr.ifr_name);
1339 rtnl_lock();
1340 ret = dev_ifsioc(&ifr, cmd);
1341 rtnl_unlock();
1342 if (!ret && IW_IS_GET(cmd) &&
1343 copy_to_user(arg, &ifr,
1344 sizeof(struct ifreq)))
1345 return -EFAULT;
1346 return ret;
1347 }
1348 #endif /* WIRELESS_EXT */
1349 return -EINVAL;
1350 }
1351 }
1354 /**
1355 * dev_new_index - allocate an ifindex
1357 * Returns a suitable unique value for a new device interface
1358 * number. The caller must hold the rtnl semaphore or the
1359 * dev_base_lock to be sure it remains unique.
1360 */
1362 int dev_new_index(void)
1363 {
1364 static int ifindex;
1365 for (;;) {
1366 if (++ifindex <= 0)
1367 ifindex=1;
1368 if (__dev_get_by_index(ifindex) == NULL)
1369 return ifindex;
1370 }
1371 }
1373 static int dev_boot_phase = 1;
1375 /**
1376 * register_netdevice - register a network device
1377 * @dev: device to register
1379 * Take a completed network device structure and add it to the kernel
1380 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
1381 * chain. 0 is returned on success. A negative errno code is returned
1382 * on a failure to set up the device, or if the name is a duplicate.
1384 * Callers must hold the rtnl semaphore. See the comment at the
1385 * end of Space.c for details about the locking. You may want
1386 * register_netdev() instead of this.
1388 * BUGS:
1389 * The locking appears insufficient to guarantee two parallel registers
1390 * will not get the same name.
1391 */
1393 int net_dev_init(void);
1395 int register_netdevice(struct net_device *dev)
1396 {
1397 struct net_device *d, **dp;
1398 #ifdef CONFIG_NET_DIVERT
1399 int ret;
1400 #endif
1402 spin_lock_init(&dev->queue_lock);
1403 spin_lock_init(&dev->xmit_lock);
1404 dev->xmit_lock_owner = -1;
1405 #ifdef CONFIG_NET_FASTROUTE
1406 dev->fastpath_lock=RW_LOCK_UNLOCKED;
1407 #endif
1409 if (dev_boot_phase)
1410 net_dev_init();
1412 #ifdef CONFIG_NET_DIVERT
1413 ret = alloc_divert_blk(dev);
1414 if (ret)
1415 return ret;
1416 #endif /* CONFIG_NET_DIVERT */
1418 dev->iflink = -1;
1420 /* Init, if this function is available */
1421 if (dev->init && dev->init(dev) != 0) {
1422 #ifdef CONFIG_NET_DIVERT
1423 free_divert_blk(dev);
1424 #endif
1425 return -EIO;
1426 }
1428 dev->ifindex = dev_new_index();
1429 if (dev->iflink == -1)
1430 dev->iflink = dev->ifindex;
1432 /* Check for existence, and append to tail of chain */
1433 for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
1434 if (d == dev || strcmp(d->name, dev->name) == 0) {
1435 #ifdef CONFIG_NET_DIVERT
1436 free_divert_blk(dev);
1437 #endif
1438 return -EEXIST;
1439 }
1440 }
1441 /*
1442 * A nil rebuild_header routine,
1443 * which should never be called; it is installed purely as a bug trap.
1444 */
1446 if (dev->rebuild_header == NULL)
1447 dev->rebuild_header = default_rebuild_header;
1449 /*
1450 * Default initial state at registry is that the
1451 * device is present.
1452 */
1454 set_bit(__LINK_STATE_PRESENT, &dev->state);
1456 dev->next = NULL;
1457 dev_init_scheduler(dev);
1458 write_lock_bh(&dev_base_lock);
1459 *dp = dev;
1460 dev_hold(dev);
1461 dev->deadbeaf = 0;
1462 write_unlock_bh(&dev_base_lock);
1464 /* Notify protocols, that a new device appeared. */
1465 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
1467 return 0;
1468 }
1470 /**
1471 * netdev_finish_unregister - complete unregistration
1472 * @dev: device
1474 * Destroy and free a dead device. A value of zero is returned on
1475 * success.
1476 */
1478 int netdev_finish_unregister(struct net_device *dev)
1479 {
1480 BUG_TRAP(dev->ip_ptr==NULL);
1481 BUG_TRAP(dev->ip6_ptr==NULL);
1482 BUG_TRAP(dev->dn_ptr==NULL);
1484 if (!dev->deadbeaf) {
1485 printk(KERN_ERR "Freeing alive device %p, %s\n",
1486 dev, dev->name);
1487 return 0;
1488 }
1489 #ifdef NET_REFCNT_DEBUG
1490 printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
1491 (dev->features & NETIF_F_DYNALLOC)?"":", old style");
1492 #endif
1493 if (dev->destructor)
1494 dev->destructor(dev);
1495 if (dev->features & NETIF_F_DYNALLOC)
1496 kfree(dev);
1497 return 0;
1498 }
1500 /**
1501 * unregister_netdevice - remove device from the kernel
1502 * @dev: device
1504 * This function shuts down a device interface and removes it
1505 * from the kernel tables. On success 0 is returned, on a failure
1506 * a negative errno code is returned.
1508 * Callers must hold the rtnl semaphore. See the comment at the
1509 * end of Space.c for details about the locking. You may want
1510 * unregister_netdev() instead of this.
1511 */
1513 int unregister_netdevice(struct net_device *dev)
1514 {
1515 unsigned long now, warning_time;
1516 struct net_device *d, **dp;
1518 /* If device is running, close it first. */
1519 if (dev->flags & IFF_UP)
1520 dev_close(dev);
1522 BUG_TRAP(dev->deadbeaf==0);
1523 dev->deadbeaf = 1;
1525 /* And unlink it from device chain. */
1526 for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
1527 if (d == dev) {
1528 write_lock_bh(&dev_base_lock);
1529 *dp = d->next;
1530 write_unlock_bh(&dev_base_lock);
1531 break;
1532 }
1533 }
1534 if (d == NULL) {
1535 printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
1536 " not registered\n", dev->name, dev);
1537 return -ENODEV;
1538 }
1540 /* Synchronize to net_rx_action. */
1541 br_write_lock_bh(BR_NETPROTO_LOCK);
1542 br_write_unlock_bh(BR_NETPROTO_LOCK);
1544 if (dev_boot_phase == 0) {
1546 /* Shutdown queueing discipline. */
1547 dev_shutdown(dev);
1549 /* Notify protocols, that we are about to destroy
1550 this device. They should clean all the things.
1551 */
1552 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1554 /*
1555 * Flush the multicast chain
1556 */
1557 dev_mc_discard(dev);
1558 }
1560 if (dev->uninit)
1561 dev->uninit(dev);
1563 /* Notifier chain MUST detach us from master device. */
1564 BUG_TRAP(dev->master==NULL);
1566 #ifdef CONFIG_NET_DIVERT
1567 free_divert_blk(dev);
1568 #endif
1570 if (dev->features & NETIF_F_DYNALLOC) {
1571 #ifdef NET_REFCNT_DEBUG
1572 if (atomic_read(&dev->refcnt) != 1)
1573 printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
1574 dev->name, atomic_read(&dev->refcnt)-1);
1575 #endif
1576 dev_put(dev);
1577 return 0;
1578 }
1580 /* Last reference is our one */
1581 if (atomic_read(&dev->refcnt) == 1) {
1582 dev_put(dev);
1583 return 0;
1584 }
1586 #ifdef NET_REFCNT_DEBUG
1587 printk("unregister_netdevice: waiting %s refcnt=%d\n",
1588 dev->name, atomic_read(&dev->refcnt));
1589 #endif
1591 /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
1592 it means that someone in the kernel still has a reference
1593 to this device and we cannot release it.
1595 "New style" devices have destructors, hence we can return from this
1596 function and destructor will do all the work later. As of kernel 2.4.0
1597 there are very few "New Style" devices.
1599 "Old style" devices expect that the device is free of any references
1600 upon exit from this function.
1601 We cannot return from this function until all such references have
1602 fallen away. This is because the caller of this function will probably
1603 immediately kfree(*dev) and then be unloaded via sys_delete_module.
1605 So, we linger until all references fall away. The duration of the
1606 linger is basically unbounded! It is driven by, for example, the
1607 current setting of sysctl_ipfrag_time.
1609 After 1 second, we start to rebroadcast unregister notifications
1610 in hope that careless clients will release the device.
1612 */
1614 now = warning_time = jiffies;
1615 while (atomic_read(&dev->refcnt) != 1) {
1616 if ((jiffies - now) > 1*HZ) {
1617 /* Rebroadcast unregister notification */
1618 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1619 }
1620 mdelay(250);
1621 if ((jiffies - warning_time) > 10*HZ) {
1622 printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
1623 "become free. Usage count = %d\n",
1624 dev->name, atomic_read(&dev->refcnt));
1625 warning_time = jiffies;
1626 }
1627 }
1628 dev_put(dev);
1629 return 0;
1630 }
1633 /*
1634 * Initialize the DEV module. At boot time this walks the device list and
1635 * unhooks any devices that fail to initialise (normally hardware not
1636 * present) and leaves us with a valid list of present and active devices.
1638 */
1640 extern void net_device_init(void);
1641 extern void ip_auto_config(void);
1642 #ifdef CONFIG_NET_DIVERT
1643 extern void dv_init(void);
1644 #endif /* CONFIG_NET_DIVERT */
1647 /*
1648 * Callers must hold the rtnl semaphore. See the comment at the
1649 * end of Space.c for details about the locking.
1650 */
1651 int __init net_dev_init(void)
1652 {
1653 struct net_device *dev, **dp;
1655 if ( !dev_boot_phase )
1656 return 0;
1658 skb_init();
1660 net_header_cachep = kmem_cache_create(
1661 "net_header_cache",
1662 (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
1663 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1665 spin_lock_init(&net_schedule_list_lock);
1666 INIT_LIST_HEAD(&net_schedule_list);
1668 /*
1669 * Add the devices.
1670 * If the call to dev->init fails, the dev is removed
1671 * from the chain disconnecting the device until the
1672 * next reboot.
1674 * NB At boot phase networking is dead. No locking is required.
1675 * But we still preserve dev_base_lock for sanity.
1676 */
1677 dp = &dev_base;
1678 while ((dev = *dp) != NULL) {
1679 spin_lock_init(&dev->queue_lock);
1680 spin_lock_init(&dev->xmit_lock);
1682 dev->xmit_lock_owner = -1;
1683 dev->iflink = -1;
1684 dev_hold(dev);
1686 /*
1687 * Allocate name. If the init() fails
1688 * the name will be reissued correctly.
1689 */
1690 if (strchr(dev->name, '%'))
1691 dev_alloc_name(dev, dev->name);
1693 if (dev->init && dev->init(dev)) {
1694 /*
1695 * It failed to come up. It will be unhooked later.
1696 * dev_alloc_name can now advance to next suitable
1697 * name that is checked next.
1698 */
1699 dev->deadbeaf = 1;
1700 dp = &dev->next;
1701 } else {
1702 dp = &dev->next;
1703 dev->ifindex = dev_new_index();
1704 if (dev->iflink == -1)
1705 dev->iflink = dev->ifindex;
1706 if (dev->rebuild_header == NULL)
1707 dev->rebuild_header = default_rebuild_header;
1708 dev_init_scheduler(dev);
1709 set_bit(__LINK_STATE_PRESENT, &dev->state);
1710 }
1711 }
1713 /*
1714 * Unhook devices that failed to come up
1715 */
1716 dp = &dev_base;
1717 while ((dev = *dp) != NULL) {
1718 if (dev->deadbeaf) {
1719 write_lock_bh(&dev_base_lock);
1720 *dp = dev->next;
1721 write_unlock_bh(&dev_base_lock);
1722 dev_put(dev);
1723 } else {
1724 dp = &dev->next;
1725 }
1726 }
1728 dev_boot_phase = 0;
1730 dev_mcast_init();
1732 /*
1733 * Initialise network devices
1734 */
1736 net_device_init();
1738 return 0;
1739 }
1741 inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
1742 {
1743 memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
1745 switch ( ntohs(*(unsigned short *)(data + 12)) )
1746 {
1747 case ETH_P_ARP:
1748 if ( len < 42 ) break;
1749 memcpy(data + 22, dev->dev_addr, ETH_ALEN);
1750 return ETH_P_ARP;
1751 case ETH_P_IP:
1752 return ETH_P_IP;
1753 }
1754 return 0;
1755 }
1758 /*
1759 * do_net_update:
1761 * Called from guest OS to notify updates to its transmit and/or receive
1762 * descriptor rings.
1763 */
1765 long do_net_update(void)
1766 {
1767 net_ring_t *shared_rings;
1768 net_vif_t *vif;
1769 net_idx_t *shared_idxs;
1770 unsigned int i, j, idx;
1771 struct sk_buff *skb;
1772 tx_req_entry_t tx;
1773 rx_req_entry_t rx;
1774 unsigned long pte_pfn, buf_pfn;
1775 struct pfn_info *pte_page, *buf_page;
1776 unsigned long *ptep;
1777 net_vif_t *target;
1778 u8 *g_data;
1779 unsigned short protocol;
1781 for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ )
1782 {
1783 if ( (vif = current->net_vif_list[idx]) == NULL )
1784 break;
1786 shared_idxs = vif->shared_idxs;
1787 shared_rings = vif->shared_rings;
1789 /*
1790 * PHASE 1 -- TRANSMIT RING
1791 */
1793 /*
1794 * Collect up new transmit buffers. We collect up to the guest OS's
1795 * new producer index, but take care not to catch up with our own
1796 * consumer index.
1797 */
1798 j = vif->tx_prod;
1799 for ( i = vif->tx_req_cons;
1800 (i != shared_idxs->tx_req_prod) &&
1801 (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
1802 i = TX_RING_INC(i) )
1803 {
1804 tx = shared_rings->tx_ring[i].req;
1805 target = VIF_DROP;
1807 if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
1808 {
1809 DPRINTK("Bad packet size: %d\n", tx.size);
1810 make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1811 continue;
1812 }
1814 /* The payload must not cross a page boundary, as it must not be fragmented. */
1815 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
1816 {
1817 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
1818 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
1819 make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1820 continue;
1821 }
1823 buf_pfn = tx.addr >> PAGE_SHIFT;
1824 buf_page = frame_table + buf_pfn;
1825 spin_lock_irq(&current->page_lock);
1826 if ( (buf_pfn >= max_page) ||
1827 ((buf_page->flags & PG_domain_mask) != current->domain) )
1828 {
1829 DPRINTK("Bad page frame\n");
1830 spin_unlock_irq(&current->page_lock);
1831 make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1832 continue;
1833 }
1835 g_data = map_domain_mem(tx.addr);
1837 protocol = __constant_htons(
1838 init_tx_header(g_data, tx.size, the_dev));
1839 if ( protocol == 0 )
1840 {
1841 make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1842 goto tx_unmap_and_continue;
1843 }
1845 target = net_get_target_vif(g_data, tx.size, vif);
1847 if ( target == VIF_PHYS )
1848 {
1849 vif->tx_shadow_ring[j].id = tx.id;
1850 vif->tx_shadow_ring[j].size = tx.size;
1851 vif->tx_shadow_ring[j].header =
1852 kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
1853 if ( vif->tx_shadow_ring[j].header == NULL )
1854 {
1855 make_tx_response(vif, tx.id, RING_STATUS_OK);
1856 goto tx_unmap_and_continue;
1857 }
1859 memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
1860 vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
1861 get_page_tot(buf_page);
1862 j = TX_RING_INC(j);
1863 }
1864 else if ( target != VIF_DROP )
1865 {
1866 /* Local delivery */
1867 if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
1868 {
1869 make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1870 put_vif(target);
1871 goto tx_unmap_and_continue;
1872 }
1874 skb->src_vif = vif;
1875 skb->dst_vif = target;
1876 skb->protocol = protocol;
1878 /*
1879 * We don't need a well-formed skb as netif_rx will fill these
1880 * fields in as necessary. All we actually need is the right
1881 * page offset in skb->data, and the right length in skb->len.
1882 * Note that the correct address/length *excludes* link header.
1883 */
1884 skb->head = (u8 *)map_domain_mem(
1885 ((skb->pf - frame_table) << PAGE_SHIFT));
1886 skb->data = skb->head + 18;
1887 memcpy(skb->data, g_data, tx.size);
1888 skb->data += ETH_HLEN;
1889 skb->len = tx.size - ETH_HLEN;
1890 unmap_domain_mem(skb->head);
1892 (void)netif_rx(skb);
1894 make_tx_response(vif, tx.id, RING_STATUS_OK);
1895 }
1897 tx_unmap_and_continue:
1898 unmap_domain_mem(g_data);
1899 spin_unlock_irq(&current->page_lock);
1900 }
1902 vif->tx_req_cons = i;
1904 if ( vif->tx_prod != j )
1905 {
1906 smp_mb(); /* Let other CPUs see new descriptors first. */
1907 vif->tx_prod = j;
1908 add_to_net_schedule_list_tail(vif);
1909 maybe_schedule_tx_action();
1910 }
1912 /*
1913 * PHASE 2 -- RECEIVE RING
1914 */
1916 /*
1917 * Collect up new receive buffers. We collect up to the guest OS's
1918 * new producer index, but take care not to catch up with our own
1919 * consumer index.
1920 */
1921 j = vif->rx_prod;
1922 for ( i = vif->rx_req_cons;
1923 (i != shared_idxs->rx_req_prod) &&
1924 (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
1925 i = RX_RING_INC(i) )
1926 {
1927 rx = shared_rings->rx_ring[i].req;
1929 pte_pfn = rx.addr >> PAGE_SHIFT;
1930 pte_page = frame_table + pte_pfn;
1932 spin_lock_irq(&current->page_lock);
1933 if ( (pte_pfn >= max_page) ||
1934 ((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
1935 (PGT_l1_page_table | current->domain)) )
1936 {
1937 DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
1938 current->domain, pte_pfn, max_page, pte_page->flags);
1939 spin_unlock_irq(&current->page_lock);
1940 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
1941 continue;
1942 }
1944 ptep = map_domain_mem(rx.addr);
1946 if ( !(*ptep & _PAGE_PRESENT) )
1947 {
1948 DPRINTK("Invalid PTE passed down (not present)\n");
1949 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
1950 goto rx_unmap_and_continue;
1951 }
1953 buf_pfn = *ptep >> PAGE_SHIFT;
1954 buf_page = frame_table + buf_pfn;
1956 if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
1957 (PGT_writeable_page | current->domain)) ||
1958 (buf_page->tot_count != 1) )
1959 {
1960 DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
1961 buf_page->type_count, buf_page->tot_count, buf_page->flags);
1962 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
1963 goto rx_unmap_and_continue;
1964 }
1966 /*
1967 * The pte they passed was good, so take it away from them. We
1968 * also lock down the page-table page, so it doesn't go away.
1969 */
1970 get_page_type(pte_page);
1971 get_page_tot(pte_page);
1972 *ptep &= ~_PAGE_PRESENT;
1973 buf_page->flags = buf_page->type_count = buf_page->tot_count = 0;
1974 list_del(&buf_page->list);
1976 vif->rx_shadow_ring[j].id = rx.id;
1977 vif->rx_shadow_ring[j].pte_ptr = rx.addr;
1978 vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
1979 vif->rx_shadow_ring[j].flush_count = (unsigned short)
1980 atomic_read(&tlb_flush_count[smp_processor_id()]);
1981 j = RX_RING_INC(j);
1983 rx_unmap_and_continue:
1984 unmap_domain_mem(ptep);
1985 spin_unlock_irq(&current->page_lock);
1986 }
1988 vif->rx_req_cons = i;
1990 if ( vif->rx_prod != j )
1991 {
1992 smp_mb(); /* Let other CPUs see new descriptors first. */
1993 vif->rx_prod = j;
1994 }
1995 }
1997 return 0;
1998 }
2001 static void make_tx_response(net_vif_t *vif,
2002 unsigned short id,
2003 unsigned char st)
2004 {
2005 unsigned long flags;
2006 unsigned int pos;
2007 tx_resp_entry_t *resp;
2009 /* Place on the response ring for the relevant domain. */
2010 spin_lock_irqsave(&vif->tx_lock, flags);
2011 pos = vif->tx_resp_prod;
2012 resp = &vif->shared_rings->tx_ring[pos].resp;
2013 resp->id = id;
2014 resp->status = st;
2015 pos = TX_RING_INC(pos);
2016 vif->tx_resp_prod = vif->shared_idxs->tx_resp_prod = pos;
2017 if ( pos == vif->shared_idxs->tx_event )
2018 {
2019 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX);
2020 guest_event_notify(cpu_mask);
2021 }
2022 spin_unlock_irqrestore(&vif->tx_lock, flags);
2023 }
2026 static void make_rx_response(net_vif_t *vif,
2027 unsigned short id,
2028 unsigned short size,
2029 unsigned char st,
2030 unsigned char off)
2031 {
2032 unsigned long flags;
2033 unsigned int pos;
2034 rx_resp_entry_t *resp;
2036 /* Place on the response ring for the relevant domain. */
2037 spin_lock_irqsave(&vif->rx_lock, flags);
2038 pos = vif->rx_resp_prod;
2039 resp = &vif->shared_rings->rx_ring[pos].resp;
2040 resp->id = id;
2041 resp->size = size;
2042 resp->status = st;
2043 resp->offset = off;
2044 pos = RX_RING_INC(pos);
2045 vif->rx_resp_prod = vif->shared_idxs->rx_resp_prod = pos;
2046 if ( pos == vif->shared_idxs->rx_event )
2047 {
2048 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_RX);
2049 guest_event_notify(cpu_mask);
2050 }
2051 spin_unlock_irqrestore(&vif->rx_lock, flags);
2052 }
2055 int setup_network_devices(void)
2056 {
2057 int ret;
2058 extern char opt_ifname[];
2059 struct net_device *dev = dev_get_by_name(opt_ifname);
2061 if ( dev == NULL )
2062 {
2063 printk("Could not find device %s\n", opt_ifname);
2064 return 0;
2065 }
2067 ret = dev_open(dev);
2068 if ( ret != 0 )
2069 {
2070 printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
2071 return 0;
2072 }
2073 printk("Device %s opened and ready for use.\n", opt_ifname);
2074 the_dev = dev;
2076 tasklet_enable(&net_tx_tasklet);
2078 return 1;
2079 }