
view linux-2.6.10-xen-sparse/drivers/xen/blkfront/blkfront.c @ 3484:515888a1b568

bitkeeper revision 1.1159.170.99 (41eb7325xet5XQ2BbrNnK2ynj_F4XQ)

Change __initcall to module_init as suggested by Rusty Russell
author iap10@labyrinth.cl.cam.ac.uk
date Mon Jan 17 08:11:17 2005 +0000 (2005-01-17)
parents ea428d76cfb3
children 4abfb7f9fa7a 17e50d7ed675
line source
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 *
10 * This file may be distributed separately from the Linux kernel, or
11 * incorporated into other software packages, subject to the following license:
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this source file (the "Software"), to deal in the Software without
15 * restriction, including without limitation the rights to use, copy, modify,
16 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17 * and to permit persons to whom the Software is furnished to do so, subject to
18 * the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29 * IN THE SOFTWARE.
30 */
32 #include <linux/version.h>
34 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
35 #include "block.h"
36 #else
37 #include "common.h"
38 #include <linux/blk.h>
39 #include <linux/tqueue.h>
40 #endif
42 #include <linux/cdrom.h>
43 #include <linux/sched.h>
44 #include <linux/interrupt.h>
45 #include <scsi/scsi.h>
46 #include <asm-xen/ctrl_if.h>
48 typedef unsigned char byte; /* from linux/ide.h */
50 /* Control whether runtime update of vbds is enabled. */
51 #define ENABLE_VBD_UPDATE 1
53 #if ENABLE_VBD_UPDATE
54 static void vbd_update(void);
55 #else
56 static void vbd_update(void){};
57 #endif
59 #define BLKIF_STATE_CLOSED 0
60 #define BLKIF_STATE_DISCONNECTED 1
61 #define BLKIF_STATE_CONNECTED 2
63 #define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)
65 static int blkif_handle = 0;
66 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
67 static unsigned int blkif_evtchn = 0;
68 static unsigned int blkif_irq = 0;
70 static int blkif_control_rsp_valid;
71 static blkif_response_t blkif_control_rsp;
73 static blkif_ring_t *blk_ring = NULL;
74 static BLKIF_RING_IDX resp_cons; /* Response consumer for comms ring. */
75 static BLKIF_RING_IDX req_prod; /* Private request producer. */
77 unsigned long rec_ring_free;
78 blkif_request_t rec_ring[BLKIF_RING_SIZE];
80 static int recovery = 0; /* "Recovery in progress" flag. Protected
81 * by the blkif_io_lock */
83 /* We plug the I/O ring if the driver is suspended or if the ring is full. */
84 #define BLKIF_RING_FULL (((req_prod - resp_cons) == BLKIF_RING_SIZE) || \
85 (blkif_state != BLKIF_STATE_CONNECTED))
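/* NB. req_prod and resp_cons are free-running counters; ring slots are
 * addressed modulo the ring size via MASK_BLKIF_IDX(), so the difference
 * (req_prod - resp_cons) is the number of requests currently in flight. */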
87 static void kick_pending_request_queues(void);
89 int __init xlblk_init(void);
91 void blkif_completion( blkif_request_t *req );
93 static inline int GET_ID_FROM_FREELIST( void )
94 {
95 unsigned long free = rec_ring_free;
97 if ( free > BLKIF_RING_SIZE )
98 BUG();
100 rec_ring_free = rec_ring[free].id;
102 rec_ring[free].id = 0x0fffffee; /* debug */
104 return free;
105 }
107 static inline void ADD_ID_TO_FREELIST( unsigned long id )
108 {
109 rec_ring[id].id = rec_ring_free;
110 rec_ring_free = id;
111 }
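/* rec_ring[] doubles as its own free list: the 'id' field of an unused
 * entry holds the index of the next free entry, with rec_ring_free as the
 * head. GET_ID_FROM_FREELIST() pops an index for a new request and
 * ADD_ID_TO_FREELIST() pushes it back once the response has been consumed. */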
114 /************************ COMMON CODE (inlined) ************************/
116 /* Kernel-specific definitions used in the common code */
117 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
118 #define DISABLE_SCATTERGATHER()
119 #else
120 static int sg_operation = -1;
121 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
122 #endif
124 static inline void translate_req_to_pfn(blkif_request_t *xreq,
125 blkif_request_t *req)
126 {
127 int i;
129 xreq->operation = req->operation;
130 xreq->nr_segments = req->nr_segments;
131 xreq->device = req->device;
132 /* preserve id */
133 xreq->sector_number = req->sector_number;
135 for ( i = 0; i < req->nr_segments; i++ )
136 xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]);
137 }
139 static inline void translate_req_to_mfn(blkif_request_t *xreq,
140 blkif_request_t *req)
141 {
142 int i;
144 xreq->operation = req->operation;
145 xreq->nr_segments = req->nr_segments;
146 xreq->device = req->device;
147 xreq->id = req->id; /* copy id (unlike above) */
148 xreq->sector_number = req->sector_number;
150 for ( i = 0; i < req->nr_segments; i++ )
151 xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]);
152 }
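/* These two translators keep the shadow copy in rec_ring[] machine-
 * independent: translate_req_to_pfn() rewrites a request's frame addresses
 * as pseudo-physical frames for storage, and translate_req_to_mfn()
 * rewrites them back to machine frames when a saved request is reissued
 * during recovery (see blkif_recover()). */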
155 static inline void flush_requests(void)
156 {
157 DISABLE_SCATTERGATHER();
158 wmb(); /* Ensure that the backend can see the requests. */
159 blk_ring->req_prod = req_prod;
160 notify_via_evtchn(blkif_evtchn);
161 }
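/* flush_requests() publishes the privately accumulated req_prod to the
 * shared ring and then notifies the backend over the event channel;
 * callers can queue several requests and then issue a single flush. */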
166 /************************** KERNEL VERSION 2.6 **************************/
168 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
170 module_init(xlblk_init);
172 #if ENABLE_VBD_UPDATE
173 static void vbd_update(void)
174 {
175 }
176 #endif /* ENABLE_VBD_UPDATE */
178 static void kick_pending_request_queues(void)
179 {
181 if ( (xlbd_blk_queue != NULL) &&
182 test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
183 {
184 blk_start_queue(xlbd_blk_queue);
185 /* XXXcl call to request_fn should not be needed but
186 * we get stuck without... needs investigating
187 */
188 xlbd_blk_queue->request_fn(xlbd_blk_queue);
189 }
191 }
194 int blkif_open(struct inode *inode, struct file *filep)
195 {
196 struct gendisk *gd = inode->i_bdev->bd_disk;
197 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
199 /* Update of usage count is protected by per-device semaphore. */
200 di->mi->usage++;
202 return 0;
203 }
206 int blkif_release(struct inode *inode, struct file *filep)
207 {
208 struct gendisk *gd = inode->i_bdev->bd_disk;
209 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
211 /*
212 * When usage drops to zero it may allow more VBD updates to occur.
213 * Update of usage count is protected by a per-device semaphore.
214 */
215 if (--di->mi->usage == 0) {
216 vbd_update();
217 }
219 return 0;
220 }
223 int blkif_ioctl(struct inode *inode, struct file *filep,
224 unsigned command, unsigned long argument)
225 {
226 /* struct gendisk *gd = inode->i_bdev->bd_disk; */
228 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
229 command, (long)argument, inode->i_rdev);
231 switch (command) {
233 case HDIO_GETGEO:
234 /* return ENOSYS to use defaults */
235 return -ENOSYS;
237 default:
238 printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
239 command);
240 return -ENOSYS;
241 }
243 return 0;
244 }
246 #if 0
247 /* check media change: should probably do something here in some cases :-) */
248 int blkif_check(kdev_t dev)
249 {
250 DPRINTK("blkif_check\n");
251 return 0;
252 }
254 int blkif_revalidate(kdev_t dev)
255 {
256 struct block_device *bd;
257 struct gendisk *gd;
258 xen_block_t *disk;
259 unsigned long capacity;
260 int i, rc = 0;
262 if ( (bd = bdget(dev)) == NULL )
263 return -EINVAL;
265 /*
266 * Update of partition info, and check of usage count, is protected
267 * by the per-block-device semaphore.
268 */
269 down(&bd->bd_sem);
271 if ( ((gd = get_gendisk(dev)) == NULL) ||
272 ((disk = xldev_to_xldisk(dev)) == NULL) ||
273 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
274 {
275 rc = -EINVAL;
276 goto out;
277 }
279 if ( disk->usage > 1 )
280 {
281 rc = -EBUSY;
282 goto out;
283 }
285 /* Only reread partition table if VBDs aren't mapped to partitions. */
286 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
287 {
288 for ( i = gd->max_p - 1; i >= 0; i-- )
289 {
290 invalidate_device(dev+i, 1);
291 gd->part[MINOR(dev+i)].start_sect = 0;
292 gd->part[MINOR(dev+i)].nr_sects = 0;
293 gd->sizes[MINOR(dev+i)] = 0;
294 }
296 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
297 }
299 out:
300 up(&bd->bd_sem);
301 bdput(bd);
302 return rc;
303 }
304 #endif
306 /*
307 * blkif_queue_request
308 *
309 * request block io
310 *
311 * id: for guest use only.
312 * operation: BLKIF_OP_{READ,WRITE,PROBE}
313 * buffer: buffer to read/write into. this should be a
314 * virtual address in the guest os.
315 */
316 static int blkif_queue_request(struct request *req)
317 {
318 struct xlbd_disk_info *di =
319 (struct xlbd_disk_info *)req->rq_disk->private_data;
320 unsigned long buffer_ma;
321 blkif_request_t *ring_req;
322 struct bio *bio;
323 struct bio_vec *bvec;
324 int idx;
325 unsigned long id;
326 unsigned int fsect, lsect;
328 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
329 return 1;
331 /* Fill out a communications ring structure. */
332 ring_req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
333 id = GET_ID_FROM_FREELIST();
334 rec_ring[id].id = (unsigned long) req;
336 ring_req->id = id;
337 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
338 BLKIF_OP_READ;
339 ring_req->sector_number = (blkif_sector_t)req->sector;
340 ring_req->device = di->xd_device;
342 ring_req->nr_segments = 0;
343 rq_for_each_bio(bio, req)
344 {
345 bio_for_each_segment(bvec, bio, idx)
346 {
347 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
348 BUG();
349 buffer_ma = page_to_phys(bvec->bv_page);
350 fsect = bvec->bv_offset >> 9;
351 lsect = fsect + (bvec->bv_len >> 9) - 1;
352 ring_req->frame_and_sects[ring_req->nr_segments++] =
353 buffer_ma | (fsect << 3) | lsect;
354 }
355 }
357 req_prod++;
359 /* Keep a private copy so we can reissue requests when recovering. */
360 translate_req_to_pfn(&rec_ring[id], ring_req);
362 return 0;
363 }
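/* Each frame_and_sects[] entry packs one segment into a single word: the
 * machine address of the page, the first 512-byte sector within that page
 * (fsect, bits 3-5) and the last sector (lsect, bits 0-2). For example, a
 * 1024-byte bvec starting 1024 bytes into its page gives fsect = 2 and
 * lsect = 3. A segment therefore never crosses a page boundary. */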
366 /*
367 * do_blkif_request
368 * read a block; request is in a request queue
369 */
370 void do_blkif_request(request_queue_t *rq)
371 {
372 struct request *req;
373 int queued;
375 DPRINTK("Entered do_blkif_request\n");
377 queued = 0;
379 while ((req = elv_next_request(rq)) != NULL) {
380 if (!blk_fs_request(req)) {
381 end_request(req, 0);
382 continue;
383 }
385 if ( BLKIF_RING_FULL )
386 {
387 blk_stop_queue(rq);
388 break;
389 }
390 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
391 req, req->cmd, req->sector, req->current_nr_sectors,
392 req->nr_sectors, req->buffer,
393 rq_data_dir(req) ? "write" : "read");
394 blkdev_dequeue_request(req);
395 if (blkif_queue_request(req)) {
396 blk_stop_queue(rq);
397 break;
398 }
399 queued++;
400 }
402 if (queued != 0)
403 flush_requests();
404 }
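/* Requests are taken from the elevator queue one at a time; the queue is
 * stopped (and later restarted by kick_pending_request_queues()) whenever
 * the shared ring fills, and a single flush_requests() at the end tells
 * the backend about everything queued in this pass. */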
407 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
408 {
409 struct request *req;
410 blkif_response_t *bret;
411 BLKIF_RING_IDX i, rp;
412 unsigned long flags;
414 spin_lock_irqsave(&blkif_io_lock, flags);
416 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
417 unlikely(recovery) )
418 {
419 spin_unlock_irqrestore(&blkif_io_lock, flags);
420 return IRQ_HANDLED;
421 }
423 rp = blk_ring->resp_prod;
424 rmb(); /* Ensure we see queued responses up to 'rp'. */
426 for ( i = resp_cons; i != rp; i++ )
427 {
428 unsigned long id;
429 bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
431 id = bret->id;
432 req = (struct request *)rec_ring[id].id;
434 blkif_completion( &rec_ring[id] );
436 ADD_ID_TO_FREELIST(id); /* overwrites req */
438 switch ( bret->operation )
439 {
440 case BLKIF_OP_READ:
441 case BLKIF_OP_WRITE:
442 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
443 DPRINTK("Bad return from blkdev data request: %x\n",
444 bret->status);
446 if ( unlikely(end_that_request_first
447 (req,
448 (bret->status == BLKIF_RSP_OKAY),
449 req->hard_nr_sectors)) )
450 BUG();
451 end_that_request_last(req);
453 break;
454 case BLKIF_OP_PROBE:
455 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
456 blkif_control_rsp_valid = 1;
457 break;
458 default:
459 BUG();
460 }
461 }
463 resp_cons = i;
465 kick_pending_request_queues();
467 spin_unlock_irqrestore(&blkif_io_lock, flags);
469 return IRQ_HANDLED;
470 }
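/* Response handling: every response between resp_cons and the backend's
 * resp_prod carries the id handed out when the request was queued; that id
 * indexes rec_ring[] to recover the originating struct request, which is
 * completed before the id goes back on the free list and the stopped queue
 * is kicked. */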
472 #else
473 /************************** KERNEL VERSION 2.4 **************************/
475 static kdev_t sg_dev;
476 static unsigned long sg_next_sect;
478 /*
479 * Request queues with outstanding work, but ring is currently full.
480 * We need no special lock here, as we always access this with the
481 * blkif_io_lock held. We only need a small maximum list.
482 */
483 #define MAX_PENDING 8
484 static request_queue_t *pending_queues[MAX_PENDING];
485 static int nr_pending;
488 #define blkif_io_lock io_request_lock
490 /*============================================================================*/
491 #if ENABLE_VBD_UPDATE
493 /*
494 * blkif_update_int/update_vbds_task - handle VBD update events.
495 * Schedule a task for keventd to run, which will update the VBDs and perform
496 * the corresponding updates to our view of VBD state.
497 */
498 static void update_vbds_task(void *unused)
499 {
500 xlvbd_update_vbds();
501 }
503 static void vbd_update(void)
504 {
505 static struct tq_struct update_tq;
506 update_tq.routine = update_vbds_task;
507 schedule_task(&update_tq);
508 }
510 #endif /* ENABLE_VBD_UPDATE */
511 /*============================================================================*/
514 static void kick_pending_request_queues(void)
515 {
516 /* We kick pending request queues if the ring is reasonably empty. */
517 if ( (nr_pending != 0) &&
518 ((req_prod - resp_cons) < (BLKIF_RING_SIZE >> 1)) )
519 {
520 /* Attempt to drain the queue, but bail if the ring becomes full. */
521 while ( (nr_pending != 0) && !BLKIF_RING_FULL )
522 do_blkif_request(pending_queues[--nr_pending]);
523 }
524 }
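/* Queues stopped because the ring was full are only retried once the ring
 * has drained to at most half its capacity, and pending_queues[] is
 * serviced in LIFO order. */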
526 int blkif_open(struct inode *inode, struct file *filep)
527 {
528 short xldev = inode->i_rdev;
529 struct gendisk *gd = get_gendisk(xldev);
530 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
531 short minor = MINOR(xldev);
533 if ( gd->part[minor].nr_sects == 0 )
534 {
535 /*
536 * Device either doesn't exist, or has zero capacity; we use a few
537 * cheesy heuristics to return the relevant error code
538 */
539 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
540 ((minor & (gd->max_p - 1)) != 0) )
541 {
542 /*
543 * We have a real device, but no such partition, or we just have a
544 * partition number so guess this is the problem.
545 */
546 return -ENXIO; /* no such device or address */
547 }
548 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
549 {
550 /* This is a removable device => assume that media is missing. */
551 return -ENOMEDIUM; /* media not present (this is a guess) */
552 }
553 else
554 {
555 /* Just go for the general 'no such device' error. */
556 return -ENODEV; /* no such device */
557 }
558 }
560 /* Update of usage count is protected by per-device semaphore. */
561 disk->usage++;
563 return 0;
564 }
567 int blkif_release(struct inode *inode, struct file *filep)
568 {
569 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
571 /*
572 * When usage drops to zero it may allow more VBD updates to occur.
573 * Update of usage count is protected by a per-device semaphore.
574 */
575 if ( --disk->usage == 0 ) {
576 vbd_update();
577 }
579 return 0;
580 }
583 int blkif_ioctl(struct inode *inode, struct file *filep,
584 unsigned command, unsigned long argument)
585 {
586 kdev_t dev = inode->i_rdev;
587 struct hd_geometry *geo = (struct hd_geometry *)argument;
588 struct gendisk *gd;
589 struct hd_struct *part;
590 int i;
591 unsigned short cylinders;
592 byte heads, sectors;
594 /* NB. No need to check permissions. That is done for us. */
596 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
597 command, (long) argument, dev);
599 gd = get_gendisk(dev);
600 part = &gd->part[MINOR(dev)];
602 switch ( command )
603 {
604 case BLKGETSIZE:
605 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
606 return put_user(part->nr_sects, (unsigned long *) argument);
608 case BLKGETSIZE64:
609 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
610 (u64)part->nr_sects * 512);
611 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
613 case BLKRRPART: /* re-read partition table */
614 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
615 return blkif_revalidate(dev);
617 case BLKSSZGET:
618 return hardsect_size[MAJOR(dev)][MINOR(dev)];
620 case BLKBSZGET: /* get block size */
621 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
622 break;
624 case BLKBSZSET: /* set block size */
625 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
626 break;
628 case BLKRASET: /* set read-ahead */
629 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
630 break;
632 case BLKRAGET: /* get read-ahead */
633 DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET);
634 break;
636 case HDIO_GETGEO:
637 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
638 if (!argument) return -EINVAL;
640 /* We don't have real geometry info, but let's at least return
641 values consistent with the size of the device */
643 heads = 0xff;
644 sectors = 0x3f;
645 cylinders = part->nr_sects / (heads * sectors);
647 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
648 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
649 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
650 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
652 return 0;
654 case HDIO_GETGEO_BIG:
655 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
656 if (!argument) return -EINVAL;
658 /* We don't have real geometry info, but let's at least return
659 values consistent with the size of the device */
661 heads = 0xff;
662 sectors = 0x3f;
663 cylinders = part->nr_sects / (heads * sectors);
665 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
666 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
667 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
668 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
670 return 0;
672 case CDROMMULTISESSION:
673 DPRINTK("FIXME: support multisession CDs later\n");
674 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
675 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
676 return 0;
678 case SCSI_IOCTL_GET_BUS_NUMBER:
679 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
680 return -ENOSYS;
682 default:
683 printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
684 return -ENOSYS;
685 }
687 return 0;
688 }
692 /* check media change: should probably do something here in some cases :-) */
693 int blkif_check(kdev_t dev)
694 {
695 DPRINTK("blkif_check\n");
696 return 0;
697 }
699 int blkif_revalidate(kdev_t dev)
700 {
701 struct block_device *bd;
702 struct gendisk *gd;
703 xl_disk_t *disk;
704 unsigned long capacity;
705 int i, rc = 0;
707 if ( (bd = bdget(dev)) == NULL )
708 return -EINVAL;
710 /*
711 * Update of partition info, and check of usage count, is protected
712 * by the per-block-device semaphore.
713 */
714 down(&bd->bd_sem);
716 if ( ((gd = get_gendisk(dev)) == NULL) ||
717 ((disk = xldev_to_xldisk(dev)) == NULL) ||
718 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
719 {
720 rc = -EINVAL;
721 goto out;
722 }
724 if ( disk->usage > 1 )
725 {
726 rc = -EBUSY;
727 goto out;
728 }
730 /* Only reread partition table if VBDs aren't mapped to partitions. */
731 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
732 {
733 for ( i = gd->max_p - 1; i >= 0; i-- )
734 {
735 invalidate_device(dev+i, 1);
736 gd->part[MINOR(dev+i)].start_sect = 0;
737 gd->part[MINOR(dev+i)].nr_sects = 0;
738 gd->sizes[MINOR(dev+i)] = 0;
739 }
741 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
742 }
744 out:
745 up(&bd->bd_sem);
746 bdput(bd);
747 return rc;
748 }
751 /*
752 * blkif_queue_request
753 *
754 * request block io
755 *
756 * id: for guest use only.
757 * operation: BLKIF_OP_{READ,WRITE,PROBE}
758 * buffer: buffer to read/write into. this should be a
759 * virtual address in the guest os.
760 */
761 static int blkif_queue_request(unsigned long id,
762 int operation,
763 char * buffer,
764 unsigned long sector_number,
765 unsigned short nr_sectors,
766 kdev_t device)
767 {
768 unsigned long buffer_ma = virt_to_bus(buffer);
769 unsigned long xid;
770 struct gendisk *gd;
771 blkif_request_t *req;
772 struct buffer_head *bh;
773 unsigned int fsect, lsect;
775 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
776 lsect = fsect + nr_sectors - 1;
778 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
779 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
780 BUG();
781 if ( lsect > 7 )
782 BUG();
784 buffer_ma &= PAGE_MASK;
786 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
787 return 1;
789 switch ( operation )
790 {
792 case BLKIF_OP_READ:
793 case BLKIF_OP_WRITE:
794 gd = get_gendisk(device);
796 /*
797 * Update the sector_number we'll pass down as appropriate; note that
798 * we could sanity check that resulting sector will be in this
799 * partition, but this will happen in driver backend anyhow.
800 */
801 sector_number += gd->part[MINOR(device)].start_sect;
803 /*
804 * If this unit doesn't consist of virtual partitions then we clear
805 * the partn bits from the device number.
806 */
807 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
808 GENHD_FL_VIRT_PARTNS) )
809 device &= ~(gd->max_p - 1);
811 if ( (sg_operation == operation) &&
812 (sg_dev == device) &&
813 (sg_next_sect == sector_number) )
814 {
816 req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod-1)].req;
817 bh = (struct buffer_head *)id;
819 bh->b_reqnext = (struct buffer_head *)rec_ring[req->id].id;
822 rec_ring[req->id].id = id;
824 req->frame_and_sects[req->nr_segments] =
825 buffer_ma | (fsect<<3) | lsect;
826 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
827 sg_next_sect += nr_sectors;
828 else
829 DISABLE_SCATTERGATHER();
831 /* Update the copy of the request in the recovery ring. */
832 translate_req_to_pfn(&rec_ring[req->id], req );
834 return 0;
835 }
836 else if ( BLKIF_RING_FULL )
837 {
838 return 1;
839 }
840 else
841 {
842 sg_operation = operation;
843 sg_dev = device;
844 sg_next_sect = sector_number + nr_sectors;
845 }
846 break;
848 default:
849 panic("unknown op %d\n", operation);
850 }
852 /* Fill out a communications ring structure. */
853 req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
855 xid = GET_ID_FROM_FREELIST();
856 rec_ring[xid].id = id;
858 req->id = xid;
859 req->operation = operation;
860 req->sector_number = (blkif_sector_t)sector_number;
861 req->device = device;
862 req->nr_segments = 1;
863 req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
865 req_prod++;
867 /* Keep a private copy so we can reissue requests when recovering. */
868 translate_req_to_pfn(&rec_ring[xid], req );
870 return 0;
871 }
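/* The sg_operation/sg_dev/sg_next_sect trio gives the 2.4 path a simple
 * scatter-gather scheme: a buffer head that continues the same operation on
 * the same device at the next sector is folded into the previous ring
 * request as an extra segment (up to BLKIF_MAX_SEGMENTS_PER_REQUEST), and
 * the merged buffer heads are chained through b_reqnext so that blkif_int()
 * can complete them together. */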
874 /*
875 * do_blkif_request
876 * read a block; request is in a request queue
877 */
878 void do_blkif_request(request_queue_t *rq)
879 {
880 struct request *req;
881 struct buffer_head *bh, *next_bh;
882 int rw, nsect, full, queued = 0;
884 DPRINTK("Entered do_blkif_request\n");
886 while ( !rq->plugged && !list_empty(&rq->queue_head))
887 {
888 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
889 goto out;
891 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
892 req, req->cmd, req->sector,
893 req->current_nr_sectors, req->nr_sectors, req->bh);
895 rw = req->cmd;
896 if ( rw == READA )
897 rw = READ;
898 if ( unlikely((rw != READ) && (rw != WRITE)) )
899 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
901 req->errors = 0;
903 bh = req->bh;
904 while ( bh != NULL )
905 {
906 next_bh = bh->b_reqnext;
907 bh->b_reqnext = NULL;
909 full = blkif_queue_request(
910 (unsigned long)bh,
911 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
912 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
914 if ( full )
915 {
916 bh->b_reqnext = next_bh;
917 pending_queues[nr_pending++] = rq;
918 if ( unlikely(nr_pending >= MAX_PENDING) )
919 BUG();
920 goto out;
921 }
923 queued++;
925 /* Dequeue the buffer head from the request. */
926 nsect = bh->b_size >> 9;
927 bh = req->bh = next_bh;
929 if ( bh != NULL )
930 {
931 /* There's another buffer head to do. Update the request. */
932 req->hard_sector += nsect;
933 req->hard_nr_sectors -= nsect;
934 req->sector = req->hard_sector;
935 req->nr_sectors = req->hard_nr_sectors;
936 req->current_nr_sectors = bh->b_size >> 9;
937 req->buffer = bh->b_data;
938 }
939 else
940 {
941 /* That was the last buffer head. Finalise the request. */
942 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
943 BUG();
944 blkdev_dequeue_request(req);
945 end_that_request_last(req);
946 }
947 }
948 }
950 out:
951 if ( queued != 0 )
952 flush_requests();
953 }
956 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
957 {
958 BLKIF_RING_IDX i, rp;
959 unsigned long flags;
960 struct buffer_head *bh, *next_bh;
962 spin_lock_irqsave(&io_request_lock, flags);
964 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
965 {
966 spin_unlock_irqrestore(&io_request_lock, flags);
967 return;
968 }
970 rp = blk_ring->resp_prod;
971 rmb(); /* Ensure we see queued responses up to 'rp'. */
973 for ( i = resp_cons; i != rp; i++ )
974 {
975 unsigned long id;
976 blkif_response_t *bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
978 id = bret->id;
979 bh = (struct buffer_head *)rec_ring[id].id;
981 blkif_completion( &rec_ring[id] );
983 ADD_ID_TO_FREELIST(id);
985 switch ( bret->operation )
986 {
987 case BLKIF_OP_READ:
988 case BLKIF_OP_WRITE:
989 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
990 DPRINTK("Bad return from blkdev data request: %lx\n",
991 bret->status);
992 for ( ; bh != NULL; bh = next_bh )
993 {
994 next_bh = bh->b_reqnext;
995 bh->b_reqnext = NULL;
996 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
997 }
999 break;
1000 case BLKIF_OP_PROBE:
1001 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
1002 blkif_control_rsp_valid = 1;
1003 break;
1004 default:
1005 BUG();
1006 }
1007 }
1009 resp_cons = i;
1011 kick_pending_request_queues();
1013 spin_unlock_irqrestore(&io_request_lock, flags);
1014 }
1016 #endif
1018 /***************************** COMMON CODE *******************************/
1021 void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
1022 {
1023 unsigned long flags, id;
1025 retry:
1026 while ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
1027 {
1028 set_current_state(TASK_INTERRUPTIBLE);
1029 schedule_timeout(1);
1030 }
1032 spin_lock_irqsave(&blkif_io_lock, flags);
1033 if ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
1034 {
1035 spin_unlock_irqrestore(&blkif_io_lock, flags);
1036 goto retry;
1037 }
1039 DISABLE_SCATTERGATHER();
1040 blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req = *req;
1042 id = GET_ID_FROM_FREELIST();
1043 blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req.id = id;
1044 rec_ring[id].id = (unsigned long) req;
1046 translate_req_to_pfn( &rec_ring[id], req );
1048 req_prod++;
1049 flush_requests();
1051 spin_unlock_irqrestore(&blkif_io_lock, flags);
1053 while ( !blkif_control_rsp_valid )
1054 {
1055 set_current_state(TASK_INTERRUPTIBLE);
1056 schedule_timeout(1);
1057 }
1059 memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
1060 blkif_control_rsp_valid = 0;
1061 }
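/* Rough usage sketch for the synchronous control path above (assumed to
 * mirror how the VBD probe in vbd.c drives it; not taken from this file).
 * 'buf' is assumed to be a page-aligned, page-sized buffer for the probe
 * records:
 *
 *     blkif_request_t req;
 *     blkif_response_t rsp;
 *     memset(&req, 0, sizeof(req));
 *     req.operation          = BLKIF_OP_PROBE;
 *     req.nr_segments        = 1;
 *     req.frame_and_sects[0] = virt_to_machine(buf) | 7;   (sectors 0-7)
 *     blkif_control_send(&req, &rsp);
 *
 * rsp.status then holds the probe result once the backend has replied. */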
1064 /* Send a driver status notification to the domain controller. */
1065 static void send_driver_status(int ok)
1066 {
1067 ctrl_msg_t cmsg = {
1068 .type = CMSG_BLKIF_FE,
1069 .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
1070 .length = sizeof(blkif_fe_driver_status_t),
1071 };
1072 blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
1074 msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
1076 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1077 }
1079 /* Tell the controller to bring up the interface. */
1080 static void blkif_send_interface_connect(void)
1081 {
1082 ctrl_msg_t cmsg = {
1083 .type = CMSG_BLKIF_FE,
1084 .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
1085 .length = sizeof(blkif_fe_interface_connect_t),
1086 };
1087 blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
1089 msg->handle = 0;
1090 msg->shmem_frame = (virt_to_machine(blk_ring) >> PAGE_SHIFT);
1092 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1093 }
1095 static void blkif_free(void)
1096 {
1097 /* Prevent new requests being issued until we fix things up. */
1098 spin_lock_irq(&blkif_io_lock);
1099 recovery = 1;
1100 blkif_state = BLKIF_STATE_DISCONNECTED;
1101 spin_unlock_irq(&blkif_io_lock);
1103 /* Free resources associated with old device channel. */
1104 if ( blk_ring != NULL )
1105 {
1106 free_page((unsigned long)blk_ring);
1107 blk_ring = NULL;
1108 }
1109 free_irq(blkif_irq, NULL);
1110 blkif_irq = 0;
1112 unbind_evtchn_from_irq(blkif_evtchn);
1113 blkif_evtchn = 0;
1114 }
1116 static void blkif_close(void)
1117 {
1118 }
1120 /* Move from CLOSED to DISCONNECTED state. */
1121 static void blkif_disconnect(void)
1122 {
1123 if ( blk_ring != NULL )
1124 free_page((unsigned long)blk_ring);
1125 blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
1126 blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
1127 blkif_state = BLKIF_STATE_DISCONNECTED;
1128 blkif_send_interface_connect();
1129 }
1131 static void blkif_reset(void)
1132 {
1133 blkif_free();
1134 blkif_disconnect();
1135 }
1137 static void blkif_recover(void)
1138 {
1139 int i;
1141 /* Hmm, requests might be re-ordered when we re-issue them.
1142 * This will need to be fixed once we have barriers */
1144 /* Stage 1 : Find active and move to safety. */
1145 for ( i = 0; i < BLKIF_RING_SIZE; i++ )
1146 {
1147 if ( rec_ring[i].id >= PAGE_OFFSET )
1148 {
1149 translate_req_to_mfn(
1150 &blk_ring->ring[req_prod].req, &rec_ring[i]);
1151 req_prod++;
1152 }
1153 }
1155 /* Stage 2 : Set up shadow list. */
1156 for ( i = 0; i < req_prod; i++ )
1157 {
1158 rec_ring[i].id = blk_ring->ring[i].req.id;
1159 blk_ring->ring[i].req.id = i;
1160 translate_req_to_pfn(&rec_ring[i], &blk_ring->ring[i].req);
1161 }
1163 /* Stage 3 : Set up free list. */
1164 for ( ; i < BLKIF_RING_SIZE; i++ )
1165 rec_ring[i].id = i+1;
1166 rec_ring_free = req_prod;
1167 rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff;
1169 /* blk_ring->req_prod will be set when we flush_requests().*/
1170 wmb();
1172 /* Switch off recovery mode, using a memory barrier to ensure that
1173 * it's seen before we flush requests - we don't want to miss any
1174 * interrupts. */
1175 recovery = 0;
1176 wmb();
1178 /* Kicks things back into life. */
1179 flush_requests();
1181 /* Now it is safe to let other people use the interface. */
1182 blkif_state = BLKIF_STATE_CONNECTED;
1183 }
1185 static void blkif_connect(blkif_fe_interface_status_t *status)
1186 {
1187 int err = 0;
1189 blkif_evtchn = status->evtchn;
1190 blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
1192 err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
1193 if ( err )
1194 {
1195 printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
1196 return;
1197 }
1199 if ( recovery )
1200 {
1201 blkif_recover();
1202 }
1203 else
1204 {
1205 /* Transition to connected in case we need to do
1206 * a partition probe on a whole disk. */
1207 blkif_state = BLKIF_STATE_CONNECTED;
1209 /* Probe for discs attached to the interface. */
1210 xlvbd_init();
1211 }
1213 /* Kick pending requests. */
1214 spin_lock_irq(&blkif_io_lock);
1215 kick_pending_request_queues();
1216 spin_unlock_irq(&blkif_io_lock);
1217 }
1219 static void unexpected(blkif_fe_interface_status_t *status)
1220 {
1221 DPRINTK(" Unexpected blkif status %u in state %u\n",
1222 status->status, blkif_state);
1223 }
1225 static void blkif_status(blkif_fe_interface_status_t *status)
1226 {
1227 if ( status->handle != blkif_handle )
1228 {
1229 WPRINTK(" Invalid blkif: handle=%u", status->handle);
1230 return;
1231 }
1233 switch ( status->status )
1234 {
1235 case BLKIF_INTERFACE_STATUS_CLOSED:
1236 switch ( blkif_state )
1237 {
1238 case BLKIF_STATE_CLOSED:
1239 unexpected(status);
1240 break;
1241 case BLKIF_STATE_DISCONNECTED:
1242 case BLKIF_STATE_CONNECTED:
1243 unexpected(status);
1244 blkif_close();
1245 break;
1246 }
1247 break;
1249 case BLKIF_INTERFACE_STATUS_DISCONNECTED:
1250 switch ( blkif_state )
1251 {
1252 case BLKIF_STATE_CLOSED:
1253 blkif_disconnect();
1254 break;
1255 case BLKIF_STATE_DISCONNECTED:
1256 case BLKIF_STATE_CONNECTED:
1257 /* unexpected(status); */ /* occurs during suspend/resume */
1258 blkif_reset();
1259 break;
1260 }
1261 break;
1263 case BLKIF_INTERFACE_STATUS_CONNECTED:
1264 switch ( blkif_state )
1265 {
1266 case BLKIF_STATE_CLOSED:
1267 unexpected(status);
1268 blkif_disconnect();
1269 blkif_connect(status);
1270 break;
1271 case BLKIF_STATE_DISCONNECTED:
1272 blkif_connect(status);
1273 break;
1274 case BLKIF_STATE_CONNECTED:
1275 unexpected(status);
1276 blkif_connect(status);
1277 break;
1278 }
1279 break;
1281 case BLKIF_INTERFACE_STATUS_CHANGED:
1282 switch ( blkif_state )
1283 {
1284 case BLKIF_STATE_CLOSED:
1285 case BLKIF_STATE_DISCONNECTED:
1286 unexpected(status);
1287 break;
1288 case BLKIF_STATE_CONNECTED:
1289 vbd_update();
1290 break;
1291 }
1292 break;
1294 default:
1295 WPRINTK(" Invalid blkif status: %d\n", status->status);
1296 break;
1297 }
1298 }
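/* State machine summary: CLOSED -> DISCONNECTED when a status message
 * reports DISCONNECTED (allocate a fresh ring and ask for a connection);
 * DISCONNECTED -> CONNECTED when a status message reports CONNECTED (bind
 * the event channel and probe for VBDs); and an already DISCONNECTED or
 * CONNECTED interface is reset if DISCONNECTED is reported again, as
 * happens across suspend/resume. */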
1301 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1302 {
1303 switch ( msg->subtype )
1304 {
1305 case CMSG_BLKIF_FE_INTERFACE_STATUS:
1306 if ( msg->length != sizeof(blkif_fe_interface_status_t) )
1307 goto parse_error;
1308 blkif_status((blkif_fe_interface_status_t *)
1309 &msg->msg[0]);
1310 break;
1311 default:
1312 goto parse_error;
1313 }
1315 ctrl_if_send_response(msg);
1316 return;
1318 parse_error:
1319 msg->length = 0;
1320 ctrl_if_send_response(msg);
1321 }
1323 int wait_for_blkif(void)
1324 {
1325 int err = 0;
1326 int i;
1327 send_driver_status(1);
1329 /*
1330 * We should read 'nr_interfaces' from response message and wait
1331 * for notifications before proceeding. For now we assume that we
1332 * will be notified of exactly one interface.
1333 */
1334 for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
1335 {
1336 set_current_state(TASK_INTERRUPTIBLE);
1337 schedule_timeout(1);
1338 }
1340 if ( blkif_state != BLKIF_STATE_CONNECTED )
1341 {
1342 printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
1343 err = -ENOSYS;
1344 }
1345 return err;
1346 }
1348 int __init xlblk_init(void)
1349 {
1350 int i;
1352 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1353 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1354 return 0;
1356 printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");
1358 rec_ring_free = 0;
1359 for ( i = 0; i < BLKIF_RING_SIZE; i++ )
1360 rec_ring[i].id = i+1;
1361 rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff;
1363 (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
1364 CALLBACK_IN_BLOCKING_CONTEXT);
1366 wait_for_blkif();
1368 return 0;
1369 }
1371 void blkdev_suspend(void)
1372 {
1373 }
1375 void blkdev_resume(void)
1376 {
1377 send_driver_status(1);
1378 }
1380 /* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */
1382 void blkif_completion(blkif_request_t *req)
1383 {
1384 int i;
1386 switch ( req->operation )
1387 {
1388 case BLKIF_OP_READ:
1389 for ( i = 0; i < req->nr_segments; i++ )
1390 {
1391 unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
1392 unsigned long mfn = phys_to_machine_mapping[pfn];
1393 xen_machphys_update(mfn, pfn);
1394 }
1395 break;