view linux-2.6.10-xen-sparse/drivers/xen/blkfront/blkfront.c @ 3491:2c56c6b39a48

bitkeeper revision 1.1159.212.18 (41ebe1caQQ_SlJestrvsT95t1oER-Q)

Merge arcadians.cl.cam.ac.uk:/auto/groups/xeno/BK/xen-unstable.bk
into arcadians.cl.cam.ac.uk:/auto/groups/xeno/users/cl349/BK/xen-unstable.bk
author cl349@arcadians.cl.cam.ac.uk
date Mon Jan 17 16:03:22 2005 +0000 (2005-01-17)
parents 2e0bc3416d78 4abfb7f9fa7a
children cee684f223ee d49c0626928e
line source
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 *
11 * This file may be distributed separately from the Linux kernel, or
12 * incorporated into other software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
33 #include <linux/version.h>
35 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
36 #include "block.h"
37 #else
38 #include "common.h"
39 #include <linux/blk.h>
40 #include <linux/tqueue.h>
41 #endif
43 #include <linux/cdrom.h>
44 #include <linux/sched.h>
45 #include <linux/interrupt.h>
46 #include <scsi/scsi.h>
47 #include <asm-xen/ctrl_if.h>
48 #include <asm-xen/evtchn.h>
50 typedef unsigned char byte; /* from linux/ide.h */
52 /* Control whether runtime update of vbds is enabled. */
53 #define ENABLE_VBD_UPDATE 1
55 #if ENABLE_VBD_UPDATE
56 static void vbd_update(void);
57 #else
58 static void vbd_update(void){};
59 #endif
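/*
 * Frontend connection state, driven by BLKIF_INTERFACE_STATUS messages in
 * blkif_status(): CLOSED -> DISCONNECTED once blkif_disconnect() has
 * allocated the shared ring and sent a connect request to the backend, then
 * DISCONNECTED -> CONNECTED once blkif_connect() has bound the event channel
 * and (re)probed the virtual block devices.
 */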
61 #define BLKIF_STATE_CLOSED 0
62 #define BLKIF_STATE_DISCONNECTED 1
63 #define BLKIF_STATE_CONNECTED 2
65 #define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)
67 static int blkif_handle = 0;
68 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
69 static unsigned int blkif_evtchn = 0;
70 static unsigned int blkif_irq = 0;
72 static int blkif_control_rsp_valid;
73 static blkif_response_t blkif_control_rsp;
75 static blkif_front_ring_t blk_ring;
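/*
 * rec_ring[] keeps a private copy of every request placed on the shared
 * ring so that outstanding I/O can be re-issued after the backend
 * reconnects (see blkif_recover()).  Unused entries are threaded into a
 * free list through their 'id' fields: rec_ring_free holds the index of the
 * first free entry, and GET_ID_FROM_FREELIST()/ADD_ID_TO_FREELIST() below
 * pop and push entries.
 */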
77 unsigned long rec_ring_free;
78 blkif_request_t rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)];
80 static int recovery = 0; /* "Recovery in progress" flag. Protected
81 * by the blkif_io_lock */
83 static void kick_pending_request_queues(void);
85 int __init xlblk_init(void);
87 void blkif_completion( blkif_request_t *req );
89 static inline int GET_ID_FROM_FREELIST( void )
90 {
91 unsigned long free = rec_ring_free;
93 if ( free > RING_SIZE(BLKIF_RING, &blk_ring) )
94 BUG();
96 rec_ring_free = rec_ring[free].id;
98 rec_ring[free].id = 0x0fffffee; /* debug */
100 return free;
101 }
103 static inline void ADD_ID_TO_FREELIST( unsigned long id )
104 {
105 rec_ring[id].id = rec_ring_free;
106 rec_ring_free = id;
107 }
110 /************************ COMMON CODE (inlined) ************************/
112 /* Kernel-specific definitions used in the common code */
113 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
114 #define DISABLE_SCATTERGATHER()
115 #else
116 static int sg_operation = -1;
117 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
118 #endif
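/*
 * The shared ring carries machine frame numbers, whereas the rec_ring[]
 * shadow copies are kept as pseudo-physical frames so they remain
 * meaningful if machine frames change across save/restore.
 * translate_req_to_pfn()/translate_req_to_mfn() convert between the two
 * forms.  Each frame_and_sects[] entry packs the page-aligned frame address
 * with the first and last sector of the segment within that frame:
 * frame | (first_sect << 3) | last_sect.
 */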
120 static inline void translate_req_to_pfn(blkif_request_t *xreq,
121 blkif_request_t *req)
122 {
123 int i;
125 xreq->operation = req->operation;
126 xreq->nr_segments = req->nr_segments;
127 xreq->device = req->device;
128 /* preserve id */
129 xreq->sector_number = req->sector_number;
131 for ( i = 0; i < req->nr_segments; i++ )
132 xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]);
133 }
135 static inline void translate_req_to_mfn(blkif_request_t *xreq,
136 blkif_request_t *req)
137 {
138 int i;
140 xreq->operation = req->operation;
141 xreq->nr_segments = req->nr_segments;
142 xreq->device = req->device;
143 xreq->id = req->id; /* copy id (unlike above) */
144 xreq->sector_number = req->sector_number;
146 for ( i = 0; i < req->nr_segments; i++ )
147 xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]);
148 }
151 static inline void flush_requests(void)
152 {
153 DISABLE_SCATTERGATHER();
154 RING_PUSH_REQUESTS(BLKIF_RING, &blk_ring);
155 notify_via_evtchn(blkif_evtchn);
156 }
161 /************************** KERNEL VERSION 2.6 **************************/
163 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
165 module_init(xlblk_init);
167 #if ENABLE_VBD_UPDATE
168 static void vbd_update(void)
169 {
170 }
171 #endif /* ENABLE_VBD_UPDATE */
173 static void kick_pending_request_queues(void)
174 {
176 if ( (xlbd_blk_queue != NULL) &&
177 test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
178 {
179 blk_start_queue(xlbd_blk_queue);
180 /* XXXcl call to request_fn should not be needed but
181 * we get stuck without... needs investigating
182 */
183 xlbd_blk_queue->request_fn(xlbd_blk_queue);
184 }
186 }
189 int blkif_open(struct inode *inode, struct file *filep)
190 {
191 struct gendisk *gd = inode->i_bdev->bd_disk;
192 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
194 /* Update of usage count is protected by per-device semaphore. */
195 di->mi->usage++;
197 return 0;
198 }
201 int blkif_release(struct inode *inode, struct file *filep)
202 {
203 struct gendisk *gd = inode->i_bdev->bd_disk;
204 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
206 /*
207 * When usage drops to zero it may allow more VBD updates to occur.
208 * Update of usage count is protected by a per-device semaphore.
209 */
210 if (--di->mi->usage == 0) {
211 vbd_update();
212 }
214 return 0;
215 }
218 int blkif_ioctl(struct inode *inode, struct file *filep,
219 unsigned command, unsigned long argument)
220 {
221 /* struct gendisk *gd = inode->i_bdev->bd_disk; */
223 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
224 command, (long)argument, inode->i_rdev);
226 switch (command) {
228 case HDIO_GETGEO:
229 /* return ENOSYS to use defaults */
230 return -ENOSYS;
232 default:
233 printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
234 command);
235 return -ENOSYS;
236 }
238 return 0;
239 }
241 #if 0
242 /* check media change: should probably do something here in some cases :-) */
243 int blkif_check(kdev_t dev)
244 {
245 DPRINTK("blkif_check\n");
246 return 0;
247 }
249 int blkif_revalidate(kdev_t dev)
250 {
251 struct block_device *bd;
252 struct gendisk *gd;
253 xen_block_t *disk;
254 unsigned long capacity;
255 int i, rc = 0;
257 if ( (bd = bdget(dev)) == NULL )
258 return -EINVAL;
260 /*
261 * Update of partition info, and check of usage count, is protected
262 * by the per-block-device semaphore.
263 */
264 down(&bd->bd_sem);
266 if ( ((gd = get_gendisk(dev)) == NULL) ||
267 ((disk = xldev_to_xldisk(dev)) == NULL) ||
268 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
269 {
270 rc = -EINVAL;
271 goto out;
272 }
274 if ( disk->usage > 1 )
275 {
276 rc = -EBUSY;
277 goto out;
278 }
280 /* Only reread partition table if VBDs aren't mapped to partitions. */
281 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
282 {
283 for ( i = gd->max_p - 1; i >= 0; i-- )
284 {
285 invalidate_device(dev+i, 1);
286 gd->part[MINOR(dev+i)].start_sect = 0;
287 gd->part[MINOR(dev+i)].nr_sects = 0;
288 gd->sizes[MINOR(dev+i)] = 0;
289 }
291 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
292 }
294 out:
295 up(&bd->bd_sem);
296 bdput(bd);
297 return rc;
298 }
299 #endif
301 /*
302 * blkif_queue_request
303 *
304 * request block io
305 *
306 * id: for guest use only.
307 * operation: BLKIF_OP_{READ,WRITE,PROBE}
308 * buffer: buffer to read/write into. this should be a
309 * virtual address in the guest os.
310 */
311 static int blkif_queue_request(struct request *req)
312 {
313 struct xlbd_disk_info *di =
314 (struct xlbd_disk_info *)req->rq_disk->private_data;
315 unsigned long buffer_ma;
316 blkif_request_t *ring_req;
317 struct bio *bio;
318 struct bio_vec *bvec;
319 int idx;
320 unsigned long id;
321 unsigned int fsect, lsect;
323 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
324 return 1;
326 /* Fill out a communications ring structure. */
327 ring_req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
328 id = GET_ID_FROM_FREELIST();
329 rec_ring[id].id = (unsigned long) req;
331 ring_req->id = id;
332 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
333 BLKIF_OP_READ;
334 ring_req->sector_number = (blkif_sector_t)req->sector;
335 ring_req->device = di->xd_device;
337 ring_req->nr_segments = 0;
338 rq_for_each_bio(bio, req)
339 {
340 bio_for_each_segment(bvec, bio, idx)
341 {
342 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
343 BUG();
344 buffer_ma = page_to_phys(bvec->bv_page);
345 fsect = bvec->bv_offset >> 9;
346 lsect = fsect + (bvec->bv_len >> 9) - 1;
347 ring_req->frame_and_sects[ring_req->nr_segments++] =
348 buffer_ma | (fsect << 3) | lsect;
349 }
350 }
352 blk_ring.req_prod_pvt++;
354 /* Keep a private copy so we can reissue requests when recovering. */
355 translate_req_to_pfn(&rec_ring[id], ring_req);
357 return 0;
358 }
361 /*
362 * do_blkif_request
363 * read a block; request is in a request queue
364 */
365 void do_blkif_request(request_queue_t *rq)
366 {
367 struct request *req;
368 int queued;
370 DPRINTK("Entered do_blkif_request\n");
372 queued = 0;
374 while ((req = elv_next_request(rq)) != NULL) {
375 if (!blk_fs_request(req)) {
376 end_request(req, 0);
377 continue;
378 }
380 if ( RING_FULL(BLKIF_RING, &blk_ring) )
381 {
382 blk_stop_queue(rq);
383 break;
384 }
385 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
386 req, req->cmd, req->sector, req->current_nr_sectors,
387 req->nr_sectors, req->buffer,
388 rq_data_dir(req) ? "write" : "read");
389 blkdev_dequeue_request(req);
390 if (blkif_queue_request(req)) {
391 blk_stop_queue(rq);
392 break;
393 }
394 queued++;
395 }
397 if (queued != 0)
398 flush_requests();
399 }
402 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
403 {
404 struct request *req;
405 blkif_response_t *bret;
406 RING_IDX i, rp;
407 unsigned long flags;
409 spin_lock_irqsave(&blkif_io_lock, flags);
411 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
412 unlikely(recovery) )
413 {
414 spin_unlock_irqrestore(&blkif_io_lock, flags);
415 return IRQ_HANDLED;
416 }
418 rp = blk_ring.sring->rsp_prod;
419 rmb(); /* Ensure we see queued responses up to 'rp'. */
421 for ( i = blk_ring.rsp_cons; i != rp; i++ )
422 {
423 unsigned long id;
425 bret = RING_GET_RESPONSE(BLKIF_RING, &blk_ring, i);
426 id = bret->id;
427 req = (struct request *)rec_ring[id].id;
428 blkif_completion( &rec_ring[id] );
430 ADD_ID_TO_FREELIST(id); /* overwrites req */
432 switch ( bret->operation )
433 {
434 case BLKIF_OP_READ:
435 case BLKIF_OP_WRITE:
436 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
437 DPRINTK("Bad return from blkdev data request: %x\n",
438 bret->status);
440 if ( unlikely(end_that_request_first
441 (req,
442 (bret->status == BLKIF_RSP_OKAY),
443 req->hard_nr_sectors)) )
444 BUG();
445 end_that_request_last(req);
447 break;
448 case BLKIF_OP_PROBE:
449 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
450 blkif_control_rsp_valid = 1;
451 break;
452 default:
453 BUG();
454 }
455 }
457 blk_ring.rsp_cons = i;
459 kick_pending_request_queues();
461 spin_unlock_irqrestore(&blkif_io_lock, flags);
463 return IRQ_HANDLED;
464 }
466 #else
467 /************************** KERNEL VERSION 2.4 **************************/
469 static kdev_t sg_dev;
470 static unsigned long sg_next_sect;
472 /*
473 * Request queues with outstanding work, but ring is currently full.
474 * We need no special lock here, as we always access this with the
475 * blkif_io_lock held. We only need a small maximum list.
476 */
477 #define MAX_PENDING 8
478 static request_queue_t *pending_queues[MAX_PENDING];
479 static int nr_pending;
482 #define blkif_io_lock io_request_lock
484 /*============================================================================*/
485 #if ENABLE_VBD_UPDATE
487 /*
488 * blkif_update_int/update_vbds_task - handle VBD update events.
489 * Schedule a task for keventd to run, which will update the VBDs and perform
490 * the corresponding updates to our view of VBD state.
491 */
492 static void update_vbds_task(void *unused)
493 {
494 xlvbd_update_vbds();
495 }
497 static void vbd_update(void)
498 {
499 static struct tq_struct update_tq;
500 update_tq.routine = update_vbds_task;
501 schedule_task(&update_tq);
502 }
504 #endif /* ENABLE_VBD_UPDATE */
505 /*============================================================================*/
507 static void kick_pending_request_queues(void)
508 {
509 /* We kick pending request queues if the ring is reasonably empty. */
510 if ( (nr_pending != 0) &&
511 (RING_PENDING_REQUESTS(BLKIF_RING, &blk_ring) <
512 (RING_SIZE(BLKIF_RING, &blk_ring) >> 1)) )
513 {
514 /* Attempt to drain the queue, but bail if the ring becomes full. */
515 while ( (nr_pending != 0) && !RING_FULL(BLKIF_RING, &blk_ring) )
516 do_blkif_request(pending_queues[--nr_pending]);
517 }
518 }
520 int blkif_open(struct inode *inode, struct file *filep)
521 {
522 short xldev = inode->i_rdev;
523 struct gendisk *gd = get_gendisk(xldev);
524 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
525 short minor = MINOR(xldev);
527 if ( gd->part[minor].nr_sects == 0 )
528 {
529 /*
530 * Device either doesn't exist, or has zero capacity; we use a few
531 * cheesy heuristics to return the relevant error code
532 */
533 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
534 ((minor & (gd->max_p - 1)) != 0) )
535 {
536 /*
537 * We have a real device, but no such partition, or we just have a
538 * partition number so guess this is the problem.
539 */
540 return -ENXIO; /* no such device or address */
541 }
542 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
543 {
544 /* This is a removable device => assume that media is missing. */
545 return -ENOMEDIUM; /* media not present (this is a guess) */
546 }
547 else
548 {
549 /* Just go for the general 'no such device' error. */
550 return -ENODEV; /* no such device */
551 }
552 }
554 /* Update of usage count is protected by per-device semaphore. */
555 disk->usage++;
557 return 0;
558 }
561 int blkif_release(struct inode *inode, struct file *filep)
562 {
563 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
565 /*
566 * When usage drops to zero it may allow more VBD updates to occur.
567 * Update of usage count is protected by a per-device semaphore.
568 */
569 if ( --disk->usage == 0 ) {
570 vbd_update();
571 }
573 return 0;
574 }
577 int blkif_ioctl(struct inode *inode, struct file *filep,
578 unsigned command, unsigned long argument)
579 {
580 kdev_t dev = inode->i_rdev;
581 struct hd_geometry *geo = (struct hd_geometry *)argument;
582 struct gendisk *gd;
583 struct hd_struct *part;
584 int i;
585 unsigned short cylinders;
586 byte heads, sectors;
588 /* NB. No need to check permissions. That is done for us. */
590 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
591 command, (long) argument, dev);
593 gd = get_gendisk(dev);
594 part = &gd->part[MINOR(dev)];
596 switch ( command )
597 {
598 case BLKGETSIZE:
599 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
600 return put_user(part->nr_sects, (unsigned long *) argument);
602 case BLKGETSIZE64:
603 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
604 (u64)part->nr_sects * 512);
605 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
607 case BLKRRPART: /* re-read partition table */
608 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
609 return blkif_revalidate(dev);
611 case BLKSSZGET:
612 return hardsect_size[MAJOR(dev)][MINOR(dev)];
614 case BLKBSZGET: /* get block size */
615 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
616 break;
618 case BLKBSZSET: /* set block size */
619 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
620 break;
622 case BLKRASET: /* set read-ahead */
623 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
624 break;
626 case BLKRAGET: /* get read-ahead */
627 DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET);
628 break;
630 case HDIO_GETGEO:
631 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
632 if (!argument) return -EINVAL;
634 /* We don't have real geometry info, but let's at least return
635 values consistent with the size of the device */
637 heads = 0xff;
638 sectors = 0x3f;
639 cylinders = part->nr_sects / (heads * sectors);
641 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
642 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
643 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
644 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
646 return 0;
648 case HDIO_GETGEO_BIG:
649 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
650 if (!argument) return -EINVAL;
652 /* We don't have real geometry info, but let's at least return
653 values consistent with the size of the device */
655 heads = 0xff;
656 sectors = 0x3f;
657 cylinders = part->nr_sects / (heads * sectors);
659 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
660 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
661 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
662 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
664 return 0;
666 case CDROMMULTISESSION:
667 DPRINTK("FIXME: support multisession CDs later\n");
668 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
669 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
670 return 0;
672 case SCSI_IOCTL_GET_BUS_NUMBER:
673 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
674 return -ENOSYS;
676 default:
677 printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
678 return -ENOSYS;
679 }
681 return 0;
682 }
686 /* check media change: should probably do something here in some cases :-) */
687 int blkif_check(kdev_t dev)
688 {
689 DPRINTK("blkif_check\n");
690 return 0;
691 }
693 int blkif_revalidate(kdev_t dev)
694 {
695 struct block_device *bd;
696 struct gendisk *gd;
697 xl_disk_t *disk;
698 unsigned long capacity;
699 int i, rc = 0;
701 if ( (bd = bdget(dev)) == NULL )
702 return -EINVAL;
704 /*
705 * Update of partition info, and check of usage count, is protected
706 * by the per-block-device semaphore.
707 */
708 down(&bd->bd_sem);
710 if ( ((gd = get_gendisk(dev)) == NULL) ||
711 ((disk = xldev_to_xldisk(dev)) == NULL) ||
712 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
713 {
714 rc = -EINVAL;
715 goto out;
716 }
718 if ( disk->usage > 1 )
719 {
720 rc = -EBUSY;
721 goto out;
722 }
724 /* Only reread partition table if VBDs aren't mapped to partitions. */
725 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
726 {
727 for ( i = gd->max_p - 1; i >= 0; i-- )
728 {
729 invalidate_device(dev+i, 1);
730 gd->part[MINOR(dev+i)].start_sect = 0;
731 gd->part[MINOR(dev+i)].nr_sects = 0;
732 gd->sizes[MINOR(dev+i)] = 0;
733 }
735 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
736 }
738 out:
739 up(&bd->bd_sem);
740 bdput(bd);
741 return rc;
742 }
745 /*
746 * blkif_queue_request
747 *
748 * request block io
749 *
750 * id: for guest use only.
751 * operation: BLKIF_OP_{READ,WRITE,PROBE}
752 * buffer: buffer to read/write into. this should be a
753 * virtual address in the guest os.
754 */
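/*
 * Contiguous buffer heads (same operation and device, consecutive sectors)
 * are merged into a single ring request, up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST segments; sg_operation/sg_dev/sg_next_sect
 * track the request currently being extended.  Returns 1 (without queuing)
 * when the ring is full, so the caller can retry later.
 */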
755 static int blkif_queue_request(unsigned long id,
756 int operation,
757 char * buffer,
758 unsigned long sector_number,
759 unsigned short nr_sectors,
760 kdev_t device)
761 {
762 unsigned long buffer_ma = virt_to_bus(buffer);
763 unsigned long xid;
764 struct gendisk *gd;
765 blkif_request_t *req;
766 struct buffer_head *bh;
767 unsigned int fsect, lsect;
769 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
770 lsect = fsect + nr_sectors - 1;
772 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
773 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
774 BUG();
775 if ( lsect > 7 )
776 BUG();
778 buffer_ma &= PAGE_MASK;
780 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
781 return 1;
783 switch ( operation )
784 {
786 case BLKIF_OP_READ:
787 case BLKIF_OP_WRITE:
788 gd = get_gendisk(device);
790 /*
791 * Update the sector_number we'll pass down as appropriate; note that
792 * we could sanity check that resulting sector will be in this
793 * partition, but this will happen in driver backend anyhow.
794 */
795 sector_number += gd->part[MINOR(device)].start_sect;
797 /*
798 * If this unit doesn't consist of virtual partitions then we clear
799 * the partn bits from the device number.
800 */
801 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
802 GENHD_FL_VIRT_PARTNS) )
803 device &= ~(gd->max_p - 1);
805 if ( (sg_operation == operation) &&
806 (sg_dev == device) &&
807 (sg_next_sect == sector_number) )
808 {
809 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring,
810 blk_ring.req_prod_pvt - 1);
811 bh = (struct buffer_head *)id;
813 bh->b_reqnext = (struct buffer_head *)rec_ring[req->id].id;
816 rec_ring[req->id].id = id;
818 req->frame_and_sects[req->nr_segments] =
819 buffer_ma | (fsect<<3) | lsect;
820 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
821 sg_next_sect += nr_sectors;
822 else
823 DISABLE_SCATTERGATHER();
825 /* Update the copy of the request in the recovery ring. */
826 translate_req_to_pfn(&rec_ring[req->id], req );
828 return 0;
829 }
830 else if ( RING_FULL(BLKIF_RING, &blk_ring) )
831 {
832 return 1;
833 }
834 else
835 {
836 sg_operation = operation;
837 sg_dev = device;
838 sg_next_sect = sector_number + nr_sectors;
839 }
840 break;
842 default:
843 panic("unknown op %d\n", operation);
844 }
846 /* Fill out a communications ring structure. */
847 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
849 xid = GET_ID_FROM_FREELIST();
850 rec_ring[xid].id = id;
852 req->id = xid;
853 req->operation = operation;
854 req->sector_number = (blkif_sector_t)sector_number;
855 req->device = device;
856 req->nr_segments = 1;
857 req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
859 /* Keep a private copy so we can reissue requests when recovering. */
860 translate_req_to_pfn(&rec_ring[xid], req );
862 blk_ring.req_prod_pvt++;
864 return 0;
865 }
868 /*
869 * do_blkif_request
870 * read a block; request is in a request queue
871 */
872 void do_blkif_request(request_queue_t *rq)
873 {
874 struct request *req;
875 struct buffer_head *bh, *next_bh;
876 int rw, nsect, full, queued = 0;
878 DPRINTK("Entered do_blkif_request\n");
880 while ( !rq->plugged && !list_empty(&rq->queue_head))
881 {
882 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
883 goto out;
885 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
886 req, req->cmd, req->sector,
887 req->current_nr_sectors, req->nr_sectors, req->bh);
889 rw = req->cmd;
890 if ( rw == READA )
891 rw = READ;
892 if ( unlikely((rw != READ) && (rw != WRITE)) )
893 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
895 req->errors = 0;
897 bh = req->bh;
898 while ( bh != NULL )
899 {
900 next_bh = bh->b_reqnext;
901 bh->b_reqnext = NULL;
903 full = blkif_queue_request(
904 (unsigned long)bh,
905 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
906 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
908 if ( full )
909 {
910 bh->b_reqnext = next_bh;
911 pending_queues[nr_pending++] = rq;
912 if ( unlikely(nr_pending >= MAX_PENDING) )
913 BUG();
914 goto out;
915 }
917 queued++;
919 /* Dequeue the buffer head from the request. */
920 nsect = bh->b_size >> 9;
921 bh = req->bh = next_bh;
923 if ( bh != NULL )
924 {
925 /* There's another buffer head to do. Update the request. */
926 req->hard_sector += nsect;
927 req->hard_nr_sectors -= nsect;
928 req->sector = req->hard_sector;
929 req->nr_sectors = req->hard_nr_sectors;
930 req->current_nr_sectors = bh->b_size >> 9;
931 req->buffer = bh->b_data;
932 }
933 else
934 {
935 /* That was the last buffer head. Finalise the request. */
936 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
937 BUG();
938 blkdev_dequeue_request(req);
939 end_that_request_last(req);
940 }
941 }
942 }
944 out:
945 if ( queued != 0 )
946 flush_requests();
947 }
950 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
951 {
952 RING_IDX i, rp;
953 unsigned long flags;
954 struct buffer_head *bh, *next_bh;
956 spin_lock_irqsave(&io_request_lock, flags);
958 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
959 {
960 spin_unlock_irqrestore(&io_request_lock, flags);
961 return;
962 }
964 rp = blk_ring.sring->rsp_prod;
965 rmb(); /* Ensure we see queued responses up to 'rp'. */
967 for ( i = blk_ring.rsp_cons; i != rp; i++ )
968 {
969 unsigned long id;
970 blkif_response_t *bret;
972 bret = RING_GET_RESPONSE(BLKIF_RING, &blk_ring, i);
973 id = bret->id;
974 bh = (struct buffer_head *)rec_ring[id].id;
976 blkif_completion( &rec_ring[id] );
978 ADD_ID_TO_FREELIST(id);
980 switch ( bret->operation )
981 {
982 case BLKIF_OP_READ:
983 case BLKIF_OP_WRITE:
984 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
985 DPRINTK("Bad return from blkdev data request: %lx\n",
986 bret->status);
987 for ( ; bh != NULL; bh = next_bh )
988 {
989 next_bh = bh->b_reqnext;
990 bh->b_reqnext = NULL;
991 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
992 }
994 break;
995 case BLKIF_OP_PROBE:
996 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
997 blkif_control_rsp_valid = 1;
998 break;
999 default:
1000 BUG();
1001 }
1002 }
1004 blk_ring.rsp_cons = i;
1006 kick_pending_request_queues();
1008 spin_unlock_irqrestore(&io_request_lock, flags);
1009 }
1011 #endif
1013 /***************************** COMMON CODE *******************************/
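/*
 * blkif_control_send() issues a control request (such as BLKIF_OP_PROBE) on
 * the ordinary I/O ring and then polls blkif_control_rsp_valid, sleeping a
 * tick at a time, until blkif_int() has copied the matching response into
 * blkif_control_rsp.
 */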
1016 void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
1017 {
1018 unsigned long flags, id;
1019 blkif_request_t *req_d;
1021 retry:
1022 while ( RING_FULL(BLKIF_RING, &blk_ring) )
1023 {
1024 set_current_state(TASK_INTERRUPTIBLE);
1025 schedule_timeout(1);
1026 }
1028 spin_lock_irqsave(&blkif_io_lock, flags);
1029 if ( RING_FULL(BLKIF_RING, &blk_ring) )
1030 {
1031 spin_unlock_irqrestore(&blkif_io_lock, flags);
1032 goto retry;
1033 }
1035 DISABLE_SCATTERGATHER();
1036 req_d = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
1037 *req_d = *req;
1039 id = GET_ID_FROM_FREELIST();
1040 req_d->id = id;
1041 rec_ring[id].id = (unsigned long) req;
1043 translate_req_to_pfn( &rec_ring[id], req );
1045 blk_ring.req_prod_pvt++;
1046 flush_requests();
1048 spin_unlock_irqrestore(&blkif_io_lock, flags);
1050 while ( !blkif_control_rsp_valid )
1051 {
1052 set_current_state(TASK_INTERRUPTIBLE);
1053 schedule_timeout(1);
1054 }
1056 memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
1057 blkif_control_rsp_valid = 0;
1058 }
1061 /* Send a driver status notification to the domain controller. */
1062 static void send_driver_status(int ok)
1063 {
1064 ctrl_msg_t cmsg = {
1065 .type = CMSG_BLKIF_FE,
1066 .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
1067 .length = sizeof(blkif_fe_driver_status_t),
1068 };
1069 blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
1071 msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
1073 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1074 }
1076 /* Tell the controller to bring up the interface. */
1077 static void blkif_send_interface_connect(void)
1078 {
1079 ctrl_msg_t cmsg = {
1080 .type = CMSG_BLKIF_FE,
1081 .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
1082 .length = sizeof(blkif_fe_interface_connect_t),
1083 };
1084 blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
1086 msg->handle = 0;
1087 msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);
1089 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1090 }
1092 static void blkif_free(void)
1093 {
1094 /* Prevent new requests being issued until we fix things up. */
1095 spin_lock_irq(&blkif_io_lock);
1096 recovery = 1;
1097 blkif_state = BLKIF_STATE_DISCONNECTED;
1098 spin_unlock_irq(&blkif_io_lock);
1100 /* Free resources associated with old device channel. */
1101 if ( blk_ring.sring != NULL )
1102 {
1103 free_page((unsigned long)blk_ring.sring);
1104 blk_ring.sring = NULL;
1105 }
1106 free_irq(blkif_irq, NULL);
1107 blkif_irq = 0;
1109 unbind_evtchn_from_irq(blkif_evtchn);
1110 blkif_evtchn = 0;
1111 }
1113 static void blkif_close(void)
1114 {
1115 }
1117 /* Move from CLOSED to DISCONNECTED state. */
1118 static void blkif_disconnect(void)
1119 {
1120 blkif_sring_t *sring;
1122 if ( blk_ring.sring != NULL )
1123 free_page((unsigned long)blk_ring.sring);
1125 sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
1126 SHARED_RING_INIT(BLKIF_RING, sring);
1127 FRONT_RING_INIT(BLKIF_RING, &blk_ring, sring);
1128 blkif_state = BLKIF_STATE_DISCONNECTED;
1129 blkif_send_interface_connect();
1130 }
1132 static void blkif_reset(void)
1133 {
1134 blkif_free();
1135 blkif_disconnect();
1136 }
1138 static void blkif_recover(void)
1139 {
1140 int i;
1141 blkif_request_t *req;
1143 /* Hmm, requests might be re-ordered when we re-issue them.
1144 * This will need to be fixed once we have barriers */
1146 /* Stage 1 : Find active and move to safety. */
1147 for ( i = 0; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1148 {
1149 if ( rec_ring[i].id >= PAGE_OFFSET )
1150 {
1151 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring,
1152 blk_ring.req_prod_pvt);
1153 translate_req_to_mfn(req, &rec_ring[i]);
1154 blk_ring.req_prod_pvt++;
1155 }
1156 }
1158 /* Stage 2 : Set up shadow list. */
1159 for ( i = 0; i < blk_ring.req_prod_pvt; i++ )
1160 {
1161 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, i);
1162 rec_ring[i].id = req->id;
1163 req->id = i;
1164 translate_req_to_pfn(&rec_ring[i], req);
1165 }
1167 /* Stage 3 : Set up free list. */
1168 for ( ; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1169 rec_ring[i].id = i+1;
1170 rec_ring_free = blk_ring.req_prod_pvt;
1171 rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)-1].id = 0x0fffffff;
1173 /* blk_ring->req_prod will be set when we flush_requests().*/
1174 wmb();
1176 /* Switch off recovery mode, using a memory barrier to ensure that
1177 * it's seen before we flush requests - we don't want to miss any
1178 * interrupts. */
1179 recovery = 0;
1180 wmb();
1182 /* Kicks things back into life. */
1183 flush_requests();
1185 /* Now it is safe to let other people use the interface. */
1186 blkif_state = BLKIF_STATE_CONNECTED;
1187 }
1189 static void blkif_connect(blkif_fe_interface_status_t *status)
1190 {
1191 int err = 0;
1193 blkif_evtchn = status->evtchn;
1194 blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
1196 err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
1197 if ( err )
1198 {
1199 printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
1200 return;
1201 }
1203 if ( recovery )
1204 {
1205 blkif_recover();
1206 }
1207 else
1208 {
1209 /* Transition to connected in case we need to do
1210 * a partition probe on a whole disk. */
1211 blkif_state = BLKIF_STATE_CONNECTED;
1213 /* Probe for discs attached to the interface. */
1214 xlvbd_init();
1215 }
1217 /* Kick pending requests. */
1218 spin_lock_irq(&blkif_io_lock);
1219 kick_pending_request_queues();
1220 spin_unlock_irq(&blkif_io_lock);
1221 }
1223 static void unexpected(blkif_fe_interface_status_t *status)
1224 {
1225 DPRINTK(" Unexpected blkif status %u in state %u\n",
1226 status->status, blkif_state);
1227 }
1229 static void blkif_status(blkif_fe_interface_status_t *status)
1230 {
1231 if ( status->handle != blkif_handle )
1232 {
1233 WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
1234 unexpected(status);
1235 return;
1236 }
1238 switch ( status->status )
1239 {
1240 case BLKIF_INTERFACE_STATUS_CLOSED:
1241 switch ( blkif_state )
1242 {
1243 case BLKIF_STATE_CLOSED:
1244 unexpected(status);
1245 break;
1246 case BLKIF_STATE_DISCONNECTED:
1247 case BLKIF_STATE_CONNECTED:
1248 unexpected(status);
1249 blkif_close();
1250 break;
1251 }
1252 break;
1254 case BLKIF_INTERFACE_STATUS_DISCONNECTED:
1255 switch ( blkif_state )
1256 {
1257 case BLKIF_STATE_CLOSED:
1258 blkif_disconnect();
1259 break;
1260 case BLKIF_STATE_DISCONNECTED:
1261 case BLKIF_STATE_CONNECTED:
1262 /* unexpected(status); */ /* occurs during suspend/resume */
1263 blkif_reset();
1264 break;
1265 }
1266 break;
1268 case BLKIF_INTERFACE_STATUS_CONNECTED:
1269 switch ( blkif_state )
1270 {
1271 case BLKIF_STATE_CLOSED:
1272 unexpected(status);
1273 blkif_disconnect();
1274 blkif_connect(status);
1275 break;
1276 case BLKIF_STATE_DISCONNECTED:
1277 blkif_connect(status);
1278 break;
1279 case BLKIF_STATE_CONNECTED:
1280 unexpected(status);
1281 blkif_connect(status);
1282 break;
1283 }
1284 break;
1286 case BLKIF_INTERFACE_STATUS_CHANGED:
1287 switch ( blkif_state )
1288 {
1289 case BLKIF_STATE_CLOSED:
1290 case BLKIF_STATE_DISCONNECTED:
1291 unexpected(status);
1292 break;
1293 case BLKIF_STATE_CONNECTED:
1294 vbd_update();
1295 break;
1296 }
1297 break;
1299 default:
1300 WPRINTK(" Invalid blkif status: %d\n", status->status);
1301 break;
1302 }
1303 }
1306 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1307 {
1308 switch ( msg->subtype )
1309 {
1310 case CMSG_BLKIF_FE_INTERFACE_STATUS:
1311 if ( msg->length != sizeof(blkif_fe_interface_status_t) )
1312 goto parse_error;
1313 blkif_status((blkif_fe_interface_status_t *)
1314 &msg->msg[0]);
1315 break;
1316 default:
1317 goto parse_error;
1318 }
1320 ctrl_if_send_response(msg);
1321 return;
1323 parse_error:
1324 msg->length = 0;
1325 ctrl_if_send_response(msg);
1326 }
1328 int wait_for_blkif(void)
1329 {
1330 int err = 0;
1331 int i;
1332 send_driver_status(1);
1334 /*
1335 * We should read 'nr_interfaces' from response message and wait
1336 * for notifications before proceeding. For now we assume that we
1337 * will be notified of exactly one interface.
1338 */
1339 for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
1340 {
1341 set_current_state(TASK_INTERRUPTIBLE);
1342 schedule_timeout(1);
1343 }
1345 if ( blkif_state != BLKIF_STATE_CONNECTED )
1346 {
1347 printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
1348 err = -ENOSYS;
1349 }
1350 return err;
1351 }
1353 int __init xlblk_init(void)
1354 {
1355 int i;
1357 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1358 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1359 return 0;
1361 printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");
1363 rec_ring_free = 0;
1364 for ( i = 0; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1365 rec_ring[i].id = i+1;
1366 rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)-1].id = 0x0fffffff;
1368 (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
1369 CALLBACK_IN_BLOCKING_CONTEXT);
1371 wait_for_blkif();
1373 return 0;
1374 }
1376 void blkdev_suspend(void)
1377 {
1378 }
1380 void blkdev_resume(void)
1381 {
1382 send_driver_status(1);
1383 }
1385 /* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */
1387 void blkif_completion(blkif_request_t *req)
1388 {
1389 int i;
1391 switch ( req->operation )
1392 {
1393 case BLKIF_OP_READ:
1394 for ( i = 0; i < req->nr_segments; i++ )
1395 {
1396 unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
1397 unsigned long mfn = phys_to_machine_mapping[pfn];
1398 xen_machphys_update(mfn, pfn);
1399 }
1400 break;
1401 }
1402 }