
view linux-2.6.10-xen-sparse/drivers/xen/blkfront/blkfront.c @ 3685:bbe8541361dd

bitkeeper revision 1.1159.1.542 (42038a42_52IAalMZRKdTn0UbVN5fw)

Merge tempest.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xeno.bk
into tempest.cl.cam.ac.uk:/local/scratch/smh22/xen-unstable.bk
author smh22@tempest.cl.cam.ac.uk
date Fri Feb 04 14:44:18 2005 +0000 (2005-02-04)
parents cee684f223ee d49c0626928e
children 0a4b76b6b5a0
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 *
11 * This file may be distributed separately from the Linux kernel, or
12 * incorporated into other software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
33 #include <linux/version.h>
35 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
36 #include "block.h"
37 #else
38 #include "common.h"
39 #include <linux/blk.h>
40 #include <linux/tqueue.h>
41 #endif
43 #include <linux/cdrom.h>
44 #include <linux/sched.h>
45 #include <linux/interrupt.h>
46 #include <scsi/scsi.h>
47 #include <asm-xen/ctrl_if.h>
48 #include <asm-xen/evtchn.h>
50 typedef unsigned char byte; /* from linux/ide.h */
52 /* Control whether runtime update of vbds is enabled. */
53 #define ENABLE_VBD_UPDATE 1
55 #if ENABLE_VBD_UPDATE
56 static void vbd_update(void);
57 #else
58 static void vbd_update(void){};
59 #endif
61 #define BLKIF_STATE_CLOSED 0
62 #define BLKIF_STATE_DISCONNECTED 1
63 #define BLKIF_STATE_CONNECTED 2
65 #define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)
67 static int blkif_handle = 0;
68 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
69 static unsigned int blkif_evtchn = 0;
70 static unsigned int blkif_irq = 0;
72 static int blkif_control_rsp_valid;
73 static blkif_response_t blkif_control_rsp;
75 static blkif_front_ring_t blk_ring;
77 unsigned long rec_ring_free;
78 blkif_request_t rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)];
80 static int recovery = 0; /* "Recovery in progress" flag. Protected
81 * by the blkif_io_lock */
83 static void kick_pending_request_queues(void);
85 int __init xlblk_init(void);
87 void blkif_completion( blkif_request_t *req );
89 static inline int GET_ID_FROM_FREELIST( void )
90 {
91 unsigned long free = rec_ring_free;
93 if ( free > RING_SIZE(BLKIF_RING, &blk_ring) )
94 BUG();
96 rec_ring_free = rec_ring[free].id;
98 rec_ring[free].id = 0x0fffffee; /* debug */
100 return free;
101 }
103 static inline void ADD_ID_TO_FREELIST( unsigned long id )
104 {
105 rec_ring[id].id = rec_ring_free;
106 rec_ring_free = id;
107 }
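/*
 * Note on the free list above: rec_ring doubles as the recovery shadow and as
 * its own free list. Each unused entry's 'id' field holds the index of the
 * next free entry, with rec_ring_free pointing at the head, so allocation and
 * release in GET_ID_FROM_FREELIST/ADD_ID_TO_FREELIST are O(1). In-use entries
 * instead store a guest pointer (a struct request or buffer_head), which is
 * how blkif_recover tells them apart by comparing against PAGE_OFFSET. The
 * 0x0fffffee value is only a debug marker for a just-allocated slot.
 */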
110 /************************ COMMON CODE (inlined) ************************/
112 /* Kernel-specific definitions used in the common code */
113 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
114 #define DISABLE_SCATTERGATHER()
115 #else
116 static int sg_operation = -1;
117 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
118 #endif
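/*
 * On 2.4, sg_operation (together with sg_dev/sg_next_sect defined further
 * below) records which request buffer heads are currently being merged into;
 * DISABLE_SCATTERGATHER() resets it so the next buffer head starts a fresh
 * ring request rather than being coalesced into the previous one.
 */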
120 static inline void translate_req_to_pfn(blkif_request_t *xreq,
121 blkif_request_t *req)
122 {
123 int i;
125 xreq->operation = req->operation;
126 xreq->nr_segments = req->nr_segments;
127 xreq->device = req->device;
128 /* preserve id */
129 xreq->sector_number = req->sector_number;
131 for ( i = 0; i < req->nr_segments; i++ )
132 xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]);
133 }
135 static inline void translate_req_to_mfn(blkif_request_t *xreq,
136 blkif_request_t *req)
137 {
138 int i;
140 xreq->operation = req->operation;
141 xreq->nr_segments = req->nr_segments;
142 xreq->device = req->device;
143 xreq->id = req->id; /* copy id (unlike above) */
144 xreq->sector_number = req->sector_number;
146 for ( i = 0; i < req->nr_segments; i++ )
147 xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]);
148 }
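/*
 * The two helpers above exist for save/restore: the shadow copy in rec_ring
 * is kept in terms of pseudo-physical frame numbers (translate_req_to_pfn),
 * which remain valid even if the underlying machine frames change across a
 * suspend/resume. When blkif_recover re-issues outstanding requests it
 * converts them back to machine frames with translate_req_to_mfn.
 */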
151 static inline void flush_requests(void)
152 {
153 DISABLE_SCATTERGATHER();
154 RING_PUSH_REQUESTS(BLKIF_RING, &blk_ring);
155 notify_via_evtchn(blkif_evtchn);
156 }
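/*
 * flush_requests() above makes privately queued requests visible to the
 * backend by pushing the ring's private producer index to the shared ring
 * (RING_PUSH_REQUESTS) and then signalling the backend over the event channel.
 */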
161 /************************** KERNEL VERSION 2.6 **************************/
163 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
165 module_init(xlblk_init);
167 #if ENABLE_VBD_UPDATE
168 static void vbd_update(void)
169 {
170 }
171 #endif /* ENABLE_VBD_UPDATE */
173 static void kick_pending_request_queues(void)
174 {
176 if ( (xlbd_blk_queue != NULL) &&
177 test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
178 {
179 blk_start_queue(xlbd_blk_queue);
180 /* XXXcl call to request_fn should not be needed but
181 * we get stuck without... needs investigating
182 */
183 xlbd_blk_queue->request_fn(xlbd_blk_queue);
184 }
186 }
189 int blkif_open(struct inode *inode, struct file *filep)
190 {
191 struct gendisk *gd = inode->i_bdev->bd_disk;
192 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
194 /* Update of usage count is protected by per-device semaphore. */
195 di->mi->usage++;
197 return 0;
198 }
201 int blkif_release(struct inode *inode, struct file *filep)
202 {
203 struct gendisk *gd = inode->i_bdev->bd_disk;
204 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
206 /*
207 * When usage drops to zero it may allow more VBD updates to occur.
208 * Update of usage count is protected by a per-device semaphore.
209 */
210 if (--di->mi->usage == 0) {
211 vbd_update();
212 }
214 return 0;
215 }
218 int blkif_ioctl(struct inode *inode, struct file *filep,
219 unsigned command, unsigned long argument)
220 {
221 int i;
222 /* struct gendisk *gd = inode->i_bdev->bd_disk; */
224 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
225 command, (long)argument, inode->i_rdev);
227 switch (command) {
229 case HDIO_GETGEO:
230 /* return ENOSYS to use defaults */
231 return -ENOSYS;
233 case CDROMMULTISESSION:
234 DPRINTK("FIXME: support multisession CDs later\n");
235 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
236 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
237 return 0;
239 default:
240 printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
241 command);
242 return -ENOSYS;
243 }
245 return 0;
246 }
248 #if 0
249 /* check media change: should probably do something here in some cases :-) */
250 int blkif_check(kdev_t dev)
251 {
252 DPRINTK("blkif_check\n");
253 return 0;
254 }
256 int blkif_revalidate(kdev_t dev)
257 {
258 struct block_device *bd;
259 struct gendisk *gd;
260 xen_block_t *disk;
261 unsigned long capacity;
262 int i, rc = 0;
264 if ( (bd = bdget(dev)) == NULL )
265 return -EINVAL;
267 /*
268 * Update of partition info, and check of usage count, is protected
269 * by the per-block-device semaphore.
270 */
271 down(&bd->bd_sem);
273 if ( ((gd = get_gendisk(dev)) == NULL) ||
274 ((disk = xldev_to_xldisk(dev)) == NULL) ||
275 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
276 {
277 rc = -EINVAL;
278 goto out;
279 }
281 if ( disk->usage > 1 )
282 {
283 rc = -EBUSY;
284 goto out;
285 }
287 /* Only reread partition table if VBDs aren't mapped to partitions. */
288 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
289 {
290 for ( i = gd->max_p - 1; i >= 0; i-- )
291 {
292 invalidate_device(dev+i, 1);
293 gd->part[MINOR(dev+i)].start_sect = 0;
294 gd->part[MINOR(dev+i)].nr_sects = 0;
295 gd->sizes[MINOR(dev+i)] = 0;
296 }
298 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
299 }
301 out:
302 up(&bd->bd_sem);
303 bdput(bd);
304 return rc;
305 }
306 #endif
308 /*
309 * blkif_queue_request
310 *
311 * request block io
312 *
313 * id: for guest use only.
314 * operation: BLKIF_OP_{READ,WRITE,PROBE}
315 * buffer: buffer to read/write into. this should be a
316 * virtual address in the guest os.
317 */
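/*
 * Each segment is packed into one frame_and_sects word below as
 * buffer_ma | (fsect << 3) | lsect: the page-aligned address of the buffer
 * with the first and last 512-byte sector numbers within that page (each 0-7)
 * in the low bits, so a single segment can never span a page boundary.
 */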
318 static int blkif_queue_request(struct request *req)
319 {
320 struct xlbd_disk_info *di =
321 (struct xlbd_disk_info *)req->rq_disk->private_data;
322 unsigned long buffer_ma;
323 blkif_request_t *ring_req;
324 struct bio *bio;
325 struct bio_vec *bvec;
326 int idx;
327 unsigned long id;
328 unsigned int fsect, lsect;
330 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
331 return 1;
333 /* Fill out a communications ring structure. */
334 ring_req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
335 id = GET_ID_FROM_FREELIST();
336 rec_ring[id].id = (unsigned long) req;
338 ring_req->id = id;
339 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
340 BLKIF_OP_READ;
341 ring_req->sector_number = (blkif_sector_t)req->sector;
342 ring_req->device = di->xd_device;
344 ring_req->nr_segments = 0;
345 rq_for_each_bio(bio, req)
346 {
347 bio_for_each_segment(bvec, bio, idx)
348 {
349 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
350 BUG();
351 buffer_ma = page_to_phys(bvec->bv_page);
352 fsect = bvec->bv_offset >> 9;
353 lsect = fsect + (bvec->bv_len >> 9) - 1;
354 ring_req->frame_and_sects[ring_req->nr_segments++] =
355 buffer_ma | (fsect << 3) | lsect;
356 }
357 }
359 blk_ring.req_prod_pvt++;
361 /* Keep a private copy so we can reissue requests when recovering. */
362 translate_req_to_pfn(&rec_ring[id], ring_req);
364 return 0;
365 }
368 /*
369 * do_blkif_request
370 * read a block; request is in a request queue
371 */
372 void do_blkif_request(request_queue_t *rq)
373 {
374 struct request *req;
375 int queued;
377 DPRINTK("Entered do_blkif_request\n");
379 queued = 0;
381 while ((req = elv_next_request(rq)) != NULL) {
382 if (!blk_fs_request(req)) {
383 end_request(req, 0);
384 continue;
385 }
387 if ( RING_FULL(BLKIF_RING, &blk_ring) )
388 {
389 blk_stop_queue(rq);
390 break;
391 }
392 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
393 req, req->cmd, req->sector, req->current_nr_sectors,
394 req->nr_sectors, req->buffer,
395 rq_data_dir(req) ? "write" : "read");
396 blkdev_dequeue_request(req);
397 if (blkif_queue_request(req)) {
398 blk_stop_queue(rq);
399 break;
400 }
401 queued++;
402 }
404 if (queued != 0)
405 flush_requests();
406 }
409 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
410 {
411 struct request *req;
412 blkif_response_t *bret;
413 RING_IDX i, rp;
414 unsigned long flags;
416 spin_lock_irqsave(&blkif_io_lock, flags);
418 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
419 unlikely(recovery) )
420 {
421 spin_unlock_irqrestore(&blkif_io_lock, flags);
422 return IRQ_HANDLED;
423 }
425 rp = blk_ring.sring->rsp_prod;
426 rmb(); /* Ensure we see queued responses up to 'rp'. */
428 for ( i = blk_ring.rsp_cons; i != rp; i++ )
429 {
430 unsigned long id;
432 bret = RING_GET_RESPONSE(BLKIF_RING, &blk_ring, i);
433 id = bret->id;
434 req = (struct request *)rec_ring[id].id;
435 blkif_completion( &rec_ring[id] );
437 ADD_ID_TO_FREELIST(id); /* overwrites req */
439 switch ( bret->operation )
440 {
441 case BLKIF_OP_READ:
442 case BLKIF_OP_WRITE:
443 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
444 DPRINTK("Bad return from blkdev data request: %x\n",
445 bret->status);
447 if ( unlikely(end_that_request_first
448 (req,
449 (bret->status == BLKIF_RSP_OKAY),
450 req->hard_nr_sectors)) )
451 BUG();
452 end_that_request_last(req);
454 break;
455 case BLKIF_OP_PROBE:
456 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
457 blkif_control_rsp_valid = 1;
458 break;
459 default:
460 BUG();
461 }
462 }
464 blk_ring.rsp_cons = i;
466 kick_pending_request_queues();
468 spin_unlock_irqrestore(&blkif_io_lock, flags);
470 return IRQ_HANDLED;
471 }
473 #else
474 /************************** KERNEL VERSION 2.4 **************************/
476 static kdev_t sg_dev;
477 static unsigned long sg_next_sect;
479 /*
480 * Request queues with outstanding work, but ring is currently full.
481 * We need no special lock here, as we always access this with the
482 * blkif_io_lock held. We only need a small maximum list.
483 */
484 #define MAX_PENDING 8
485 static request_queue_t *pending_queues[MAX_PENDING];
486 static int nr_pending;
489 #define blkif_io_lock io_request_lock
491 /*============================================================================*/
492 #if ENABLE_VBD_UPDATE
494 /*
495 * blkif_update_int/update_vbds_task - handle VBD update events.
496 * Schedule a task for keventd to run, which will update the VBDs and perform
497 * the corresponding updates to our view of VBD state.
498 */
499 static void update_vbds_task(void *unused)
500 {
501 xlvbd_update_vbds();
502 }
504 static void vbd_update(void)
505 {
506 static struct tq_struct update_tq;
507 update_tq.routine = update_vbds_task;
508 schedule_task(&update_tq);
509 }
511 #endif /* ENABLE_VBD_UPDATE */
512 /*============================================================================*/
514 static void kick_pending_request_queues(void)
515 {
516 /* We kick pending request queues if the ring is reasonably empty. */
517 if ( (nr_pending != 0) &&
518 (RING_PENDING_REQUESTS(BLKIF_RING, &blk_ring) <
519 (RING_SIZE(BLKIF_RING, &blk_ring) >> 1)) )
520 {
521 /* Attempt to drain the queue, but bail if the ring becomes full. */
522 while ( (nr_pending != 0) && !RING_FULL(BLKIF_RING, &blk_ring) )
523 do_blkif_request(pending_queues[--nr_pending]);
524 }
525 }
527 int blkif_open(struct inode *inode, struct file *filep)
528 {
529 short xldev = inode->i_rdev;
530 struct gendisk *gd = get_gendisk(xldev);
531 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
532 short minor = MINOR(xldev);
534 if ( gd->part[minor].nr_sects == 0 )
535 {
536 /*
537 * Device either doesn't exist, or has zero capacity; we use a few
538 * cheesy heuristics to return the relevant error code
539 */
540 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
541 ((minor & (gd->max_p - 1)) != 0) )
542 {
543 /*
544 * We have a real device, but no such partition, or we just have a
545 * partition number, so guess that this is the problem.
546 */
547 return -ENXIO; /* no such device or address */
548 }
549 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
550 {
551 /* This is a removable device => assume that media is missing. */
552 return -ENOMEDIUM; /* media not present (this is a guess) */
553 }
554 else
555 {
556 /* Just go for the general 'no such device' error. */
557 return -ENODEV; /* no such device */
558 }
559 }
561 /* Update of usage count is protected by per-device semaphore. */
562 disk->usage++;
564 return 0;
565 }
568 int blkif_release(struct inode *inode, struct file *filep)
569 {
570 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
572 /*
573 * When usage drops to zero it may allow more VBD updates to occur.
574 * Update of usage count is protected by a per-device semaphore.
575 */
576 if ( --disk->usage == 0 ) {
577 vbd_update();
578 }
580 return 0;
581 }
584 int blkif_ioctl(struct inode *inode, struct file *filep,
585 unsigned command, unsigned long argument)
586 {
587 kdev_t dev = inode->i_rdev;
588 struct hd_geometry *geo = (struct hd_geometry *)argument;
589 struct gendisk *gd;
590 struct hd_struct *part;
591 int i;
592 unsigned short cylinders;
593 byte heads, sectors;
595 /* NB. No need to check permissions. That is done for us. */
597 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
598 command, (long) argument, dev);
600 gd = get_gendisk(dev);
601 part = &gd->part[MINOR(dev)];
603 switch ( command )
604 {
605 case BLKGETSIZE:
606 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
607 return put_user(part->nr_sects, (unsigned long *) argument);
609 case BLKGETSIZE64:
610 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
611 (u64)part->nr_sects * 512);
612 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
614 case BLKRRPART: /* re-read partition table */
615 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
616 return blkif_revalidate(dev);
618 case BLKSSZGET:
619 return hardsect_size[MAJOR(dev)][MINOR(dev)];
621 case BLKBSZGET: /* get block size */
622 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
623 break;
625 case BLKBSZSET: /* set block size */
626 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
627 break;
629 case BLKRASET: /* set read-ahead */
630 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
631 break;
633 case BLKRAGET: /* get read-ahead */
634 DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET);
635 break;
637 case HDIO_GETGEO:
638 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
639 if (!argument) return -EINVAL;
641 /* We don't have real geometry info, but let's at least return
642 values consistent with the size of the device */
644 heads = 0xff;
645 sectors = 0x3f;
646 cylinders = part->nr_sects / (heads * sectors);
648 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
649 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
650 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
651 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
653 return 0;
655 case HDIO_GETGEO_BIG:
656 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
657 if (!argument) return -EINVAL;
659 /* We don't have real geometry info, but let's at least return
660 values consistent with the size of the device */
662 heads = 0xff;
663 sectors = 0x3f;
664 cylinders = part->nr_sects / (heads * sectors);
666 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
667 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
668 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
669 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
671 return 0;
673 case CDROMMULTISESSION:
674 DPRINTK("FIXME: support multisession CDs later\n");
675 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
676 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
677 return 0;
679 case SCSI_IOCTL_GET_BUS_NUMBER:
680 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
681 return -ENOSYS;
683 default:
684 printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
685 return -ENOSYS;
686 }
688 return 0;
689 }
693 /* check media change: should probably do something here in some cases :-) */
694 int blkif_check(kdev_t dev)
695 {
696 DPRINTK("blkif_check\n");
697 return 0;
698 }
700 int blkif_revalidate(kdev_t dev)
701 {
702 struct block_device *bd;
703 struct gendisk *gd;
704 xl_disk_t *disk;
705 unsigned long capacity;
706 int i, rc = 0;
708 if ( (bd = bdget(dev)) == NULL )
709 return -EINVAL;
711 /*
712 * Update of partition info, and check of usage count, is protected
713 * by the per-block-device semaphore.
714 */
715 down(&bd->bd_sem);
717 if ( ((gd = get_gendisk(dev)) == NULL) ||
718 ((disk = xldev_to_xldisk(dev)) == NULL) ||
719 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
720 {
721 rc = -EINVAL;
722 goto out;
723 }
725 if ( disk->usage > 1 )
726 {
727 rc = -EBUSY;
728 goto out;
729 }
731 /* Only reread partition table if VBDs aren't mapped to partitions. */
732 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
733 {
734 for ( i = gd->max_p - 1; i >= 0; i-- )
735 {
736 invalidate_device(dev+i, 1);
737 gd->part[MINOR(dev+i)].start_sect = 0;
738 gd->part[MINOR(dev+i)].nr_sects = 0;
739 gd->sizes[MINOR(dev+i)] = 0;
740 }
742 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
743 }
745 out:
746 up(&bd->bd_sem);
747 bdput(bd);
748 return rc;
749 }
752 /*
753 * blkif_queue_request
754 *
755 * request block io
756 *
757 * id: for guest use only.
758 * operation: BLKIF_OP_{READ,WRITE,PROBE}
759 * buffer: buffer to read/write into. this should be a
760 * virtual address in the guest os.
761 */
762 static int blkif_queue_request(unsigned long id,
763 int operation,
764 char * buffer,
765 unsigned long sector_number,
766 unsigned short nr_sectors,
767 kdev_t device)
768 {
769 unsigned long buffer_ma = virt_to_bus(buffer);
770 unsigned long xid;
771 struct gendisk *gd;
772 blkif_request_t *req;
773 struct buffer_head *bh;
774 unsigned int fsect, lsect;
776 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
777 lsect = fsect + nr_sectors - 1;
779 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
780 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
781 BUG();
782 if ( lsect > 7 )
783 BUG();
785 buffer_ma &= PAGE_MASK;
787 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
788 return 1;
790 switch ( operation )
791 {
793 case BLKIF_OP_READ:
794 case BLKIF_OP_WRITE:
795 gd = get_gendisk(device);
797 /*
798 * Update the sector_number we'll pass down as appropriate; note that
799 * we could sanity check that resulting sector will be in this
800 * partition, but this will happen in driver backend anyhow.
801 */
802 sector_number += gd->part[MINOR(device)].start_sect;
804 /*
805 * If this unit doesn't consist of virtual partitions then we clear
806 * the partn bits from the device number.
807 */
808 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
809 GENHD_FL_VIRT_PARTNS) )
810 device &= ~(gd->max_p - 1);
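/*
 * Scatter-gather merge: if this buffer head continues the same operation on
 * the same device at exactly the next sector, tack it onto the request most
 * recently placed in the ring (chaining the buffer heads through b_reqnext)
 * instead of consuming a new ring slot.
 */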
812 if ( (sg_operation == operation) &&
813 (sg_dev == device) &&
814 (sg_next_sect == sector_number) )
815 {
816 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring,
817 blk_ring.req_prod_pvt - 1);
818 bh = (struct buffer_head *)id;
820 bh->b_reqnext = (struct buffer_head *)rec_ring[req->id].id;
823 rec_ring[req->id].id = id;
825 req->frame_and_sects[req->nr_segments] =
826 buffer_ma | (fsect<<3) | lsect;
827 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
828 sg_next_sect += nr_sectors;
829 else
830 DISABLE_SCATTERGATHER();
832 /* Update the copy of the request in the recovery ring. */
833 translate_req_to_pfn(&rec_ring[req->id], req );
835 return 0;
836 }
837 else if ( RING_FULL(BLKIF_RING, &blk_ring) )
838 {
839 return 1;
840 }
841 else
842 {
843 sg_operation = operation;
844 sg_dev = device;
845 sg_next_sect = sector_number + nr_sectors;
846 }
847 break;
849 default:
850 panic("unknown op %d\n", operation);
851 }
853 /* Fill out a communications ring structure. */
854 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
856 xid = GET_ID_FROM_FREELIST();
857 rec_ring[xid].id = id;
859 req->id = xid;
860 req->operation = operation;
861 req->sector_number = (blkif_sector_t)sector_number;
862 req->device = device;
863 req->nr_segments = 1;
864 req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
866 /* Keep a private copy so we can reissue requests when recovering. */
867 translate_req_to_pfn(&rec_ring[xid], req );
869 blk_ring.req_prod_pvt++;
871 return 0;
872 }
875 /*
876 * do_blkif_request
877 * read a block; request is in a request queue
878 */
879 void do_blkif_request(request_queue_t *rq)
880 {
881 struct request *req;
882 struct buffer_head *bh, *next_bh;
883 int rw, nsect, full, queued = 0;
885 DPRINTK("Entered do_blkif_request\n");
887 while ( !rq->plugged && !list_empty(&rq->queue_head))
888 {
889 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
890 goto out;
892 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
893 req, req->cmd, req->sector,
894 req->current_nr_sectors, req->nr_sectors, req->bh);
896 rw = req->cmd;
897 if ( rw == READA )
898 rw = READ;
899 if ( unlikely((rw != READ) && (rw != WRITE)) )
900 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
902 req->errors = 0;
904 bh = req->bh;
905 while ( bh != NULL )
906 {
907 next_bh = bh->b_reqnext;
908 bh->b_reqnext = NULL;
910 full = blkif_queue_request(
911 (unsigned long)bh,
912 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
913 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
915 if ( full )
916 {
917 bh->b_reqnext = next_bh;
918 pending_queues[nr_pending++] = rq;
919 if ( unlikely(nr_pending >= MAX_PENDING) )
920 BUG();
921 goto out;
922 }
924 queued++;
926 /* Dequeue the buffer head from the request. */
927 nsect = bh->b_size >> 9;
928 bh = req->bh = next_bh;
930 if ( bh != NULL )
931 {
932 /* There's another buffer head to do. Update the request. */
933 req->hard_sector += nsect;
934 req->hard_nr_sectors -= nsect;
935 req->sector = req->hard_sector;
936 req->nr_sectors = req->hard_nr_sectors;
937 req->current_nr_sectors = bh->b_size >> 9;
938 req->buffer = bh->b_data;
939 }
940 else
941 {
942 /* That was the last buffer head. Finalise the request. */
943 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
944 BUG();
945 blkdev_dequeue_request(req);
946 end_that_request_last(req);
947 }
948 }
949 }
951 out:
952 if ( queued != 0 )
953 flush_requests();
954 }
957 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
958 {
959 RING_IDX i, rp;
960 unsigned long flags;
961 struct buffer_head *bh, *next_bh;
963 spin_lock_irqsave(&io_request_lock, flags);
965 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
966 {
967 spin_unlock_irqrestore(&io_request_lock, flags);
968 return;
969 }
971 rp = blk_ring.sring->rsp_prod;
972 rmb(); /* Ensure we see queued responses up to 'rp'. */
974 for ( i = blk_ring.rsp_cons; i != rp; i++ )
975 {
976 unsigned long id;
977 blkif_response_t *bret;
979 bret = RING_GET_RESPONSE(BLKIF_RING, &blk_ring, i);
980 id = bret->id;
981 bh = (struct buffer_head *)rec_ring[id].id;
983 blkif_completion( &rec_ring[id] );
985 ADD_ID_TO_FREELIST(id);
987 switch ( bret->operation )
988 {
989 case BLKIF_OP_READ:
990 case BLKIF_OP_WRITE:
991 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
992 DPRINTK("Bad return from blkdev data request: %lx\n",
993 bret->status);
994 for ( ; bh != NULL; bh = next_bh )
995 {
996 next_bh = bh->b_reqnext;
997 bh->b_reqnext = NULL;
998 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
999 }
1001 break;
1002 case BLKIF_OP_PROBE:
1003 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
1004 blkif_control_rsp_valid = 1;
1005 break;
1006 default:
1007 BUG();
1008 }
1009 }
1011 blk_ring.rsp_cons = i;
1013 kick_pending_request_queues();
1015 spin_unlock_irqrestore(&io_request_lock, flags);
1016 }
1018 #endif
1020 /***************************** COMMON CODE *******************************/
1023 void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
1024 {
1025 unsigned long flags, id;
1026 blkif_request_t *req_d;
1028 retry:
1029 while ( RING_FULL(BLKIF_RING, &blk_ring) )
1030 {
1031 set_current_state(TASK_INTERRUPTIBLE);
1032 schedule_timeout(1);
1033 }
1035 spin_lock_irqsave(&blkif_io_lock, flags);
1036 if ( RING_FULL(BLKIF_RING, &blk_ring) )
1037 {
1038 spin_unlock_irqrestore(&blkif_io_lock, flags);
1039 goto retry;
1040 }
1042 DISABLE_SCATTERGATHER();
1043 req_d = RING_GET_REQUEST(BLKIF_RING, &blk_ring, blk_ring.req_prod_pvt);
1044 *req_d = *req;
1046 id = GET_ID_FROM_FREELIST();
1047 req_d->id = id;
1048 rec_ring[id].id = (unsigned long) req;
1050 translate_req_to_pfn( &rec_ring[id], req );
1052 blk_ring.req_prod_pvt++;
1053 flush_requests();
1055 spin_unlock_irqrestore(&blkif_io_lock, flags);
1057 while ( !blkif_control_rsp_valid )
1058 {
1059 set_current_state(TASK_INTERRUPTIBLE);
1060 schedule_timeout(1);
1061 }
1063 memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
1064 blkif_control_rsp_valid = 0;
1065 }
1068 /* Send a driver status notification to the domain controller. */
1069 static void send_driver_status(int ok)
1070 {
1071 ctrl_msg_t cmsg = {
1072 .type = CMSG_BLKIF_FE,
1073 .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
1074 .length = sizeof(blkif_fe_driver_status_t),
1075 };
1076 blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
1078 msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
1080 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1081 }
1083 /* Tell the controller to bring up the interface. */
1084 static void blkif_send_interface_connect(void)
1085 {
1086 ctrl_msg_t cmsg = {
1087 .type = CMSG_BLKIF_FE,
1088 .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
1089 .length = sizeof(blkif_fe_interface_connect_t),
1090 };
1091 blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
1093 msg->handle = 0;
1094 msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);
1096 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1097 }
1099 static void blkif_free(void)
1100 {
1101 /* Prevent new requests being issued until we fix things up. */
1102 spin_lock_irq(&blkif_io_lock);
1103 recovery = 1;
1104 blkif_state = BLKIF_STATE_DISCONNECTED;
1105 spin_unlock_irq(&blkif_io_lock);
1107 /* Free resources associated with old device channel. */
1108 if ( blk_ring.sring != NULL )
1109 {
1110 free_page((unsigned long)blk_ring.sring);
1111 blk_ring.sring = NULL;
1112 }
1113 free_irq(blkif_irq, NULL);
1114 blkif_irq = 0;
1116 unbind_evtchn_from_irq(blkif_evtchn);
1117 blkif_evtchn = 0;
1118 }
1120 static void blkif_close(void)
1121 {
1122 }
1124 /* Move from CLOSED to DISCONNECTED state. */
1125 static void blkif_disconnect(void)
1126 {
1127 blkif_sring_t *sring;
1129 if ( blk_ring.sring != NULL )
1130 free_page((unsigned long)blk_ring.sring);
1132 sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
1133 SHARED_RING_INIT(BLKIF_RING, sring);
1134 FRONT_RING_INIT(BLKIF_RING, &blk_ring, sring);
1135 blkif_state = BLKIF_STATE_DISCONNECTED;
1136 blkif_send_interface_connect();
1137 }
1139 static void blkif_reset(void)
1140 {
1141 blkif_free();
1142 blkif_disconnect();
1143 }
1145 static void blkif_recover(void)
1146 {
1147 int i;
1148 blkif_request_t *req;
1150 /* Hmm, requests might be re-ordered when we re-issue them.
1151 * This will need to be fixed once we have barriers */
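/*
 * Recovery works in three stages on the shadow kept in rec_ring: (1) any
 * entry whose id looks like a kernel pointer (>= PAGE_OFFSET) is still
 * outstanding, so it is translated back to machine frames and re-queued at
 * the front of the fresh ring; (2) the shadow entries are renumbered to match
 * their new ring slots; (3) the remaining entries are rethreaded onto the
 * free list.
 */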
1153 /* Stage 1 : Find active and move to safety. */
1154 for ( i = 0; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1155 {
1156 if ( rec_ring[i].id >= PAGE_OFFSET )
1157 {
1158 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring,
1159 blk_ring.req_prod_pvt);
1160 translate_req_to_mfn(req, &rec_ring[i]);
1161 blk_ring.req_prod_pvt++;
1162 }
1163 }
1165 /* Stage 2 : Set up shadow list. */
1166 for ( i = 0; i < blk_ring.req_prod_pvt; i++ )
1167 {
1168 req = RING_GET_REQUEST(BLKIF_RING, &blk_ring, i);
1169 rec_ring[i].id = req->id;
1170 req->id = i;
1171 translate_req_to_pfn(&rec_ring[i], req);
1172 }
1174 /* Stage 3 : Set up free list. */
1175 for ( ; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1176 rec_ring[i].id = i+1;
1177 rec_ring_free = blk_ring.req_prod_pvt;
1178 rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)-1].id = 0x0fffffff;
1180 /* blk_ring->req_prod will be set when we flush_requests().*/
1181 wmb();
1183 /* Switch off recovery mode, using a memory barrier to ensure that
1184 * it's seen before we flush requests - we don't want to miss any
1185 * interrupts. */
1186 recovery = 0;
1187 wmb();
1189 /* Kicks things back into life. */
1190 flush_requests();
1192 /* Now it is safe to let other people use the interface. */
1193 blkif_state = BLKIF_STATE_CONNECTED;
1194 }
1196 static void blkif_connect(blkif_fe_interface_status_t *status)
1197 {
1198 int err = 0;
1200 blkif_evtchn = status->evtchn;
1201 blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
1203 err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
1204 if ( err )
1205 {
1206 printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
1207 return;
1208 }
1210 if ( recovery )
1211 {
1212 blkif_recover();
1213 }
1214 else
1215 {
1216 /* Transition to connected in case we need to do
1217 * a partition probe on a whole disk. */
1218 blkif_state = BLKIF_STATE_CONNECTED;
1220 /* Probe for discs attached to the interface. */
1221 xlvbd_init();
1222 }
1224 /* Kick pending requests. */
1225 spin_lock_irq(&blkif_io_lock);
1226 kick_pending_request_queues();
1227 spin_unlock_irq(&blkif_io_lock);
1228 }
1230 static void unexpected(blkif_fe_interface_status_t *status)
1231 {
1232 DPRINTK(" Unexpected blkif status %u in state %u\n",
1233 status->status, blkif_state);
1234 }
1236 static void blkif_status(blkif_fe_interface_status_t *status)
1237 {
1238 if ( status->handle != blkif_handle )
1239 {
1240 WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
1241 unexpected(status);
1242 return;
1243 }
1245 switch ( status->status )
1246 {
1247 case BLKIF_INTERFACE_STATUS_CLOSED:
1248 switch ( blkif_state )
1249 {
1250 case BLKIF_STATE_CLOSED:
1251 unexpected(status);
1252 break;
1253 case BLKIF_STATE_DISCONNECTED:
1254 case BLKIF_STATE_CONNECTED:
1255 unexpected(status);
1256 blkif_close();
1257 break;
1258 }
1259 break;
1261 case BLKIF_INTERFACE_STATUS_DISCONNECTED:
1262 switch ( blkif_state )
1263 {
1264 case BLKIF_STATE_CLOSED:
1265 blkif_disconnect();
1266 break;
1267 case BLKIF_STATE_DISCONNECTED:
1268 case BLKIF_STATE_CONNECTED:
1269 /* unexpected(status); */ /* occurs during suspend/resume */
1270 blkif_reset();
1271 break;
1272 }
1273 break;
1275 case BLKIF_INTERFACE_STATUS_CONNECTED:
1276 switch ( blkif_state )
1277 {
1278 case BLKIF_STATE_CLOSED:
1279 unexpected(status);
1280 blkif_disconnect();
1281 blkif_connect(status);
1282 break;
1283 case BLKIF_STATE_DISCONNECTED:
1284 blkif_connect(status);
1285 break;
1286 case BLKIF_STATE_CONNECTED:
1287 unexpected(status);
1288 blkif_connect(status);
1289 break;
1290 }
1291 break;
1293 case BLKIF_INTERFACE_STATUS_CHANGED:
1294 switch ( blkif_state )
1295 {
1296 case BLKIF_STATE_CLOSED:
1297 case BLKIF_STATE_DISCONNECTED:
1298 unexpected(status);
1299 break;
1300 case BLKIF_STATE_CONNECTED:
1301 vbd_update();
1302 break;
1303 }
1304 break;
1306 default:
1307 WPRINTK(" Invalid blkif status: %d\n", status->status);
1308 break;
1309 }
1310 }
1313 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1314 {
1315 switch ( msg->subtype )
1316 {
1317 case CMSG_BLKIF_FE_INTERFACE_STATUS:
1318 if ( msg->length != sizeof(blkif_fe_interface_status_t) )
1319 goto parse_error;
1320 blkif_status((blkif_fe_interface_status_t *)
1321 &msg->msg[0]);
1322 break;
1323 default:
1324 goto parse_error;
1325 }
1327 ctrl_if_send_response(msg);
1328 return;
1330 parse_error:
1331 msg->length = 0;
1332 ctrl_if_send_response(msg);
1333 }
1335 int wait_for_blkif(void)
1336 {
1337 int err = 0;
1338 int i;
1339 send_driver_status(1);
1341 /*
1342 * We should read 'nr_interfaces' from response message and wait
1343 * for notifications before proceeding. For now we assume that we
1344 * will be notified of exactly one interface.
1345 */
1346 for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
1347 {
1348 set_current_state(TASK_INTERRUPTIBLE);
1349 schedule_timeout(1);
1350 }
1352 if ( blkif_state != BLKIF_STATE_CONNECTED )
1353 {
1354 printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
1355 err = -ENOSYS;
1356 }
1357 return err;
1358 }
1360 int __init xlblk_init(void)
1361 {
1362 int i;
1364 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1365 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1366 return 0;
1368 printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");
1370 rec_ring_free = 0;
1371 for ( i = 0; i < RING_SIZE(BLKIF_RING, &blk_ring); i++ )
1372 rec_ring[i].id = i+1;
1373 rec_ring[RING_SIZE(BLKIF_RING, &blk_ring)-1].id = 0x0fffffff;
1375 (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
1376 CALLBACK_IN_BLOCKING_CONTEXT);
1378 wait_for_blkif();
1380 return 0;
1381 }
1383 void blkdev_suspend(void)
1384 {
1385 }
1387 void blkdev_resume(void)
1388 {
1389 send_driver_status(1);
1390 }
1392 /* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */
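/*
 * For completed reads, each frame in the shadow request is run through
 * xen_machphys_update() so the machine-to-physical table agrees with this
 * domain's phys_to_machine_mapping for that page; presumably the backend's
 * handling of the data pages can leave those M2P entries stale until grant
 * tables remove the need for this fix-up.
 */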
1394 void blkif_completion(blkif_request_t *req)
1395 {
1396 int i;
1398 switch ( req->operation )
1399 {
1400 case BLKIF_OP_READ:
1401 for ( i = 0; i < req->nr_segments; i++ )
1402 {
1403 unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
1404 unsigned long mfn = phys_to_machine_mapping[pfn];
1405 xen_machphys_update(mfn, pfn);
1406 }
1407 break;