debuggers.hg

view linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c @ 4649:ebeac8efe955

bitkeeper revision 1.1350 (42676ee4BkgqwvPiIyB44k55uY8cSA)

Fix blkdev suspend/resume.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Apr 21 09:14:12 2005 +0000 (2005-04-21)
parents 7fc6eac6da3a
children 3291b52e8ca6 efc62ecb53c6 d16ae85cb89e
line source
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This file may be distributed separately from the Linux kernel, or
13 * incorporated into other software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
#if 1
/*
 * Debug assertion: on failure, log the failing expression and its location,
 * then force an immediate fault by writing through NULL so the failure is
 * loud and the stack is preserved.
 *
 * Wrapped in do { } while (0) so that `ASSERT(x);` is exactly one statement
 * and is safe inside unbraced if/else bodies; the original bare-`if` form
 * had a dangling-else hazard.  The printk format also gained the missing
 * trailing newline.
 */
#define ASSERT(_p)                                                      \
    do {                                                                \
        if ( !(_p) )                                                    \
        {                                                               \
            printk("Assertion '%s' failed, line %d, file %s\n", #_p ,   \
                   __LINE__, __FILE__);                                 \
            *(int*)0 = 0; /* deliberate NULL write: halt here */        \
        }                                                               \
    } while ( 0 )
#else
/* Release build: assertion compiles to a harmless no-op statement. */
#define ASSERT(_p) ((void)0)
#endif
42 #include <linux/version.h>
44 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
45 #include "block.h"
46 #else
47 #include "common.h"
48 #include <linux/blk.h>
49 #include <linux/tqueue.h>
50 #endif
52 #include <linux/cdrom.h>
53 #include <linux/sched.h>
54 #include <linux/interrupt.h>
55 #include <scsi/scsi.h>
56 #include <asm-xen/ctrl_if.h>
57 #include <asm-xen/evtchn.h>
58 #ifdef CONFIG_XEN_BLKDEV_GRANT
59 #include <asm-xen/xen-public/grant_table.h>
60 #include <asm-xen/gnttab.h>
61 #endif
63 typedef unsigned char byte; /* from linux/ide.h */
65 /* Control whether runtime update of vbds is enabled. */
66 #define ENABLE_VBD_UPDATE 1
68 #if ENABLE_VBD_UPDATE
69 static void vbd_update(void);
70 #else
71 static void vbd_update(void){};
72 #endif
74 #define BLKIF_STATE_CLOSED 0
75 #define BLKIF_STATE_DISCONNECTED 1
76 #define BLKIF_STATE_CONNECTED 2
78 #define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)
80 static int blkif_handle = 0;
81 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
82 static unsigned int blkif_evtchn = 0;
83 static unsigned int blkif_irq = 0;
85 static int blkif_control_rsp_valid;
86 static blkif_response_t blkif_control_rsp;
88 static blkif_front_ring_t blk_ring;
90 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
92 #ifdef CONFIG_XEN_BLKDEV_GRANT
93 static domid_t rdomid = 0;
94 static grant_ref_t gref_head, gref_terminal;
95 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
96 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE)
97 #define GRANTREF_INVALID (1<<15)
98 #endif
100 static struct blk_shadow {
101 blkif_request_t req;
102 unsigned long request;
103 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
104 } blk_shadow[BLK_RING_SIZE];
105 unsigned long blk_shadow_free;
107 static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */
109 static void kick_pending_request_queues(void);
111 int __init xlblk_init(void);
113 static void blkif_completion(struct blk_shadow *s);
115 static inline int GET_ID_FROM_FREELIST(void)
116 {
117 unsigned long free = blk_shadow_free;
118 BUG_ON(free > BLK_RING_SIZE);
119 blk_shadow_free = blk_shadow[free].req.id;
120 blk_shadow[free].req.id = 0x0fffffee; /* debug */
121 return free;
122 }
124 static inline void ADD_ID_TO_FREELIST(unsigned long id)
125 {
126 blk_shadow[id].req.id = blk_shadow_free;
127 blk_shadow[id].request = 0;
128 blk_shadow_free = id;
129 }
132 /************************ COMMON CODE (inlined) ************************/
134 /* Kernel-specific definitions used in the common code */
135 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
136 #define DISABLE_SCATTERGATHER()
137 #else
138 static int sg_operation = -1;
139 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
140 #endif
/*
 * Snapshot ring request 'r' into shadow slot 's' so it can be reissued
 * after suspend/resume.  In the non-grant build the frame numbers are
 * machine addresses, meaningless after migration, so they are converted
 * to pseudo-physical here (and converted back in unpickle_request()).
 */
142 static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r)
143 {
144 #ifndef CONFIG_XEN_BLKDEV_GRANT
145 int i;
146 #endif
148 s->req = *r;
150 #ifndef CONFIG_XEN_BLKDEV_GRANT
151 for ( i = 0; i < r->nr_segments; i++ )
152 s->req.frame_and_sects[i] = machine_to_phys(r->frame_and_sects[i]);
153 #endif
154 }
/*
 * Rebuild ring request 'r' from shadow copy 's' when reissuing after
 * resume.  The non-grant path converts the saved pseudo-physical frame
 * numbers back into (possibly new) machine addresses.
 */
156 static inline void unpickle_request(blkif_request_t *r, struct blk_shadow *s)
157 {
158 #ifndef CONFIG_XEN_BLKDEV_GRANT
159 int i;
160 #endif
162 *r = s->req;
164 #ifndef CONFIG_XEN_BLKDEV_GRANT
165 for ( i = 0; i < s->req.nr_segments; i++ )
166 r->frame_and_sects[i] = phys_to_machine(s->req.frame_and_sects[i]);
167 #endif
168 }
/*
 * Publish queued requests to the backend (terminating any scatter-gather
 * run in progress) and kick it via the event channel.
 */
171 static inline void flush_requests(void)
172 {
173 DISABLE_SCATTERGATHER();
174 RING_PUSH_REQUESTS(&blk_ring);
175 notify_via_evtchn(blkif_evtchn);
176 }
179 /************************** KERNEL VERSION 2.6 **************************/
181 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
183 module_init(xlblk_init);
185 #if ENABLE_VBD_UPDATE
/* Deferred-work callback: rescan the virtual block devices. */
186 static void update_vbds_task(void *unused)
187 {
188 xlvbd_update_vbds();
189 }
/*
 * Queue a VBD rescan to run from keventd context; only queues work, so it
 * is safe to call where sleeping is not allowed.
 */
191 static void vbd_update(void)
192 {
193 static DECLARE_WORK(update_tq, update_vbds_task, NULL);
194 schedule_work(&update_tq);
195 }
196 #endif /* ENABLE_VBD_UPDATE */
/*
 * 2.6 path: restart the shared block queue if it was stopped (ring full
 * or interface down).
 */
198 static void kick_pending_request_queues(void)
199 {
200 if ( (xlbd_blk_queue != NULL) &&
201 test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
202 {
203 blk_start_queue(xlbd_blk_queue);
204 /* XXXcl call to request_fn should not be needed but
205 * we get stuck without... needs investigating
206 */
207 xlbd_blk_queue->request_fn(xlbd_blk_queue);
208 }
209 }
/* 2.6 open handler: account one more user of this virtual disk. */
212 int blkif_open(struct inode *inode, struct file *filep)
213 {
214 struct gendisk *gd = inode->i_bdev->bd_disk;
215 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
217 /* Update of usage count is protected by per-device semaphore. */
218 di->mi->usage++;
220 return 0;
221 }
/* 2.6 release handler: drop usage; at zero a deferred VBD rescan may run. */
224 int blkif_release(struct inode *inode, struct file *filep)
225 {
226 struct gendisk *gd = inode->i_bdev->bd_disk;
227 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
229 /*
230 * When usage drops to zero it may allow more VBD updates to occur.
231 * Update of usage count is protected by a per-device semaphore.
232 */
233 if ( --di->mi->usage == 0 )
234 vbd_update();
236 return 0;
237 }
/*
 * 2.6 ioctl handler.  Geometry is delegated to the generic defaults via
 * -ENOSYS; CDROMMULTISESSION is answered with an all-zero structure
 * (written to userspace one byte at a time).
 */
240 int blkif_ioctl(struct inode *inode, struct file *filep,
241 unsigned command, unsigned long argument)
242 {
243 int i;
245 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
246 command, (long)argument, inode->i_rdev);
248 switch ( command )
249 {
250 case HDIO_GETGEO:
251 /* return ENOSYS to use defaults */
252 return -ENOSYS;
254 case CDROMMULTISESSION:
255 DPRINTK("FIXME: support multisession CDs later\n");
256 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
257 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
258 return 0;
260 default:
261 printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
262 command);
263 return -ENOSYS;
264 }
266 return 0;
267 }
270 /*
271 * blkif_queue_request
272 *
273 * request block io
274 *
275 * id: for guest use only.
276 * operation: BLKIF_OP_{READ,WRITE,PROBE}
277 * buffer: buffer to read/write into. this should be a
278 * virtual address in the guest os.
279 */
/*
 * 2.6 path: translate one struct request into a single ring request,
 * one segment per bio_vec.  Returns 0 on success, 1 if the interface is
 * not connected (caller stops the queue and retries later).  The request
 * is also pickled into blk_shadow[] so it can be replayed on recovery.
 * Caller holds blkif_io_lock (the queue lock).
 */
280 static int blkif_queue_request(struct request *req)
281 {
282 struct xlbd_disk_info *di =
283 (struct xlbd_disk_info *)req->rq_disk->private_data;
284 unsigned long buffer_ma;
285 blkif_request_t *ring_req;
286 struct bio *bio;
287 struct bio_vec *bvec;
288 int idx;
289 unsigned long id;
290 unsigned int fsect, lsect;
291 #ifdef CONFIG_XEN_BLKDEV_GRANT
292 int ref;
293 #endif
295 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
296 return 1;
298 /* Fill out a communications ring structure. */
299 ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
300 id = GET_ID_FROM_FREELIST();
301 blk_shadow[id].request = (unsigned long)req;
303 ring_req->id = id;
304 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
305 BLKIF_OP_READ;
306 ring_req->sector_number = (blkif_sector_t)req->sector;
307 ring_req->device = di->xd_device;
309 ring_req->nr_segments = 0;
310 rq_for_each_bio(bio, req)
311 {
312 bio_for_each_segment(bvec, bio, idx)
313 {
314 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
315 BUG();
316 buffer_ma = page_to_phys(bvec->bv_page);
/* fsect/lsect: first/last 512-byte sector of this segment within its page. */
317 fsect = bvec->bv_offset >> 9;
318 lsect = fsect + (bvec->bv_len >> 9) - 1;
319 #ifdef CONFIG_XEN_BLKDEV_GRANT
320 /* install a grant reference. */
321 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
322 ASSERT( ref != -ENOSPC );
324 gnttab_grant_foreign_access_ref(
325 ref,
326 rdomid,
327 buffer_ma >> PAGE_SHIFT,
328 rq_data_dir(req) );
/* Remember the frame so recovery can re-grant it (see blkif_recover). */
330 blk_shadow[id].frame[ring_req->nr_segments] =
331 buffer_ma >> PAGE_SHIFT;
333 ring_req->frame_and_sects[ring_req->nr_segments++] =
334 (((u32) ref) << 16) | (fsect << 3) | lsect;
336 #else
337 ring_req->frame_and_sects[ring_req->nr_segments++] =
338 buffer_ma | (fsect << 3) | lsect;
339 #endif
340 }
341 }
343 blk_ring.req_prod_pvt++;
345 /* Keep a private copy so we can reissue requests when recovering. */
346 pickle_request(&blk_shadow[id], ring_req);
348 return 0;
349 }
352 /*
353 * do_blkif_request
354 * read a block; request is in a request queue
355 */
/*
 * 2.6 request-queue callback: drain the elevator queue into the shared
 * ring; stop the queue when the ring fills or the interface rejects a
 * request.  Called with the queue lock (blkif_io_lock) held.
 */
356 void do_blkif_request(request_queue_t *rq)
357 {
358 struct request *req;
359 int queued;
361 DPRINTK("Entered do_blkif_request\n");
363 queued = 0;
365 while ( (req = elv_next_request(rq)) != NULL )
366 {
367 if ( !blk_fs_request(req) )
368 {
369 end_request(req, 0);
370 continue;
371 }
373 if ( RING_FULL(&blk_ring) )
374 {
375 blk_stop_queue(rq);
376 break;
377 }
379 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
380 req, req->cmd, req->sector, req->current_nr_sectors,
381 req->nr_sectors, req->buffer,
382 rq_data_dir(req) ? "write" : "read");
384 blkdev_dequeue_request(req);
385 if ( blkif_queue_request(req) )
386 {
387 blk_stop_queue(rq);
388 break;
389 }
391 queued++;
392 }
/* Only kick the backend if we actually put something on the ring. */
394 if ( queued != 0 )
395 flush_requests();
396 }
/*
 * 2.6 interrupt handler: consume responses from the shared ring, complete
 * the matching requests, recycle their shadow slots, and restart the
 * stopped queue.  Responses arriving while closed or recovering are
 * dropped — the requests will be replayed from shadow state instead.
 */
399 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
400 {
401 struct request *req;
402 blkif_response_t *bret;
403 RING_IDX i, rp;
404 unsigned long flags;
406 spin_lock_irqsave(&blkif_io_lock, flags);
408 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
409 unlikely(recovery) )
410 {
411 spin_unlock_irqrestore(&blkif_io_lock, flags);
412 return IRQ_HANDLED;
413 }
415 rp = blk_ring.sring->rsp_prod;
416 rmb(); /* Ensure we see queued responses up to 'rp'. */
418 for ( i = blk_ring.rsp_cons; i != rp; i++ )
419 {
420 unsigned long id;
422 bret = RING_GET_RESPONSE(&blk_ring, i);
423 id = bret->id;
424 req = (struct request *)blk_shadow[id].request;
426 blkif_completion(&blk_shadow[id]);
428 ADD_ID_TO_FREELIST(id);
430 switch ( bret->operation )
431 {
432 case BLKIF_OP_READ:
433 case BLKIF_OP_WRITE:
434 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
435 DPRINTK("Bad return from blkdev data request: %x\n",
436 bret->status);
/* Complete the whole request in one go; partial completion is a bug. */
438 if ( unlikely(end_that_request_first
439 (req,
440 (bret->status == BLKIF_RSP_OKAY),
441 req->hard_nr_sectors)) )
442 BUG();
443 end_that_request_last(req);
445 break;
446 case BLKIF_OP_PROBE:
447 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
448 blkif_control_rsp_valid = 1;
449 break;
450 default:
451 BUG();
452 }
453 }
455 blk_ring.rsp_cons = i;
457 kick_pending_request_queues();
459 spin_unlock_irqrestore(&blkif_io_lock, flags);
461 return IRQ_HANDLED;
462 }
464 #else
465 /************************** KERNEL VERSION 2.4 **************************/
467 static kdev_t sg_dev;
468 static unsigned long sg_next_sect;
470 /*
471 * Request queues with outstanding work, but ring is currently full.
472 * We need no special lock here, as we always access this with the
473 * blkif_io_lock held. We only need a small maximum list.
474 */
475 #define MAX_PENDING 8
476 static request_queue_t *pending_queues[MAX_PENDING];
477 static int nr_pending;
480 #define blkif_io_lock io_request_lock
482 /*============================================================================*/
483 #if ENABLE_VBD_UPDATE
485 /*
486 * update_vbds_task - handle VBD update events.
487 * Schedule a task for keventd to run, which will update the VBDs and perform
488 * the corresponding updates to our view of VBD state.
489 */
490 static void update_vbds_task(void *unused)
491 {
492 xlvbd_update_vbds();
493 }
/* 2.4 flavour of vbd_update(): queue the rescan via the task queue. */
495 static void vbd_update(void)
496 {
497 static struct tq_struct update_tq;
498 update_tq.routine = update_vbds_task;
499 schedule_task(&update_tq);
500 }
502 #endif /* ENABLE_VBD_UPDATE */
503 /*============================================================================*/
/*
 * 2.4 path: re-run queues that stalled on a full ring once the ring has
 * drained to half empty.  Caller holds blkif_io_lock (== io_request_lock),
 * which is what protects pending_queues[]/nr_pending.
 */
505 static void kick_pending_request_queues(void)
506 {
507 /* We kick pending request queues if the ring is reasonably empty. */
508 if ( (nr_pending != 0) &&
509 (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) )
510 {
511 /* Attempt to drain the queue, but bail if the ring becomes full. */
512 while ( (nr_pending != 0) && !RING_FULL(&blk_ring) )
513 do_blkif_request(pending_queues[--nr_pending]);
514 }
515 }
/*
 * 2.4 open handler: verify the device/partition actually exists (nonzero
 * capacity) before bumping the usage count; otherwise pick the most
 * plausible errno from a few heuristics.
 */
517 int blkif_open(struct inode *inode, struct file *filep)
518 {
519 short xldev = inode->i_rdev;
520 struct gendisk *gd = get_gendisk(xldev);
521 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
522 short minor = MINOR(xldev);
524 if ( gd->part[minor].nr_sects == 0 )
525 {
526 /*
527 * Device either doesn't exist, or has zero capacity; we use a few
528 * cheesy heuristics to return the relevant error code
529 */
530 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
531 ((minor & (gd->max_p - 1)) != 0) )
532 {
533 /*
534 * We have a real device, but no such partition, or we just have a
535 * partition number so guess this is the problem.
536 */
537 return -ENXIO; /* no such device or address */
538 }
539 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
540 {
541 /* This is a removable device => assume that media is missing. */
542 return -ENOMEDIUM; /* media not present (this is a guess) */
543 }
544 else
545 {
546 /* Just go for the general 'no such device' error. */
547 return -ENODEV; /* no such device */
548 }
549 }
551 /* Update of usage count is protected by per-device semaphore. */
552 disk->usage++;
554 return 0;
555 }
/* 2.4 release handler: drop usage; at zero a deferred VBD rescan may run. */
558 int blkif_release(struct inode *inode, struct file *filep)
559 {
560 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
562 /*
563 * When usage drops to zero it may allow more VBD updates to occur.
564 * Update of usage count is protected by a per-device semaphore.
565 */
566 if ( --disk->usage == 0 ) {
567 vbd_update();
568 }
570 return 0;
571 }
/*
 * 2.4 ioctl handler.  Size queries come from the gendisk partition table;
 * geometry is synthesized (255 heads x 63 sectors) to be consistent with
 * the device size since no real geometry exists for a virtual disk.
 * NOTE(review): the BLKRAGET debug string says "BLKRAFET" — message typo,
 * left untouched here since it is runtime output, not a comment.
 */
574 int blkif_ioctl(struct inode *inode, struct file *filep,
575 unsigned command, unsigned long argument)
576 {
577 kdev_t dev = inode->i_rdev;
578 struct hd_geometry *geo = (struct hd_geometry *)argument;
579 struct gendisk *gd;
580 struct hd_struct *part;
581 int i;
582 unsigned short cylinders;
583 byte heads, sectors;
585 /* NB. No need to check permissions. That is done for us. */
587 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
588 command, (long) argument, dev);
590 gd = get_gendisk(dev);
591 part = &gd->part[MINOR(dev)];
593 switch ( command )
594 {
595 case BLKGETSIZE:
596 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
597 return put_user(part->nr_sects, (unsigned long *) argument);
599 case BLKGETSIZE64:
600 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
601 (u64)part->nr_sects * 512);
602 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
604 case BLKRRPART: /* re-read partition table */
605 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
606 return blkif_revalidate(dev);
608 case BLKSSZGET:
609 return hardsect_size[MAJOR(dev)][MINOR(dev)];
611 case BLKBSZGET: /* get block size */
612 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
613 break;
615 case BLKBSZSET: /* set block size */
616 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
617 break;
619 case BLKRASET: /* set read-ahead */
620 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
621 break;
623 case BLKRAGET: /* get read-ahead */
624 DPRINTK_IOCTL(" BLKRAFET: %x\n", BLKRAGET);
625 break;
627 case HDIO_GETGEO:
628 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
629 if (!argument) return -EINVAL;
631 /* We don't have real geometry info, but let's at least return
632 values consistent with the size of the device */
634 heads = 0xff;
635 sectors = 0x3f;
636 cylinders = part->nr_sects / (heads * sectors);
638 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
639 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
640 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
641 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
643 return 0;
645 case HDIO_GETGEO_BIG:
646 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
647 if (!argument) return -EINVAL;
649 /* We don't have real geometry info, but let's at least return
650 values consistent with the size of the device */
652 heads = 0xff;
653 sectors = 0x3f;
654 cylinders = part->nr_sects / (heads * sectors);
/* Same as HDIO_GETGEO except cylinders is a 32-bit field in the BIG layout. */
656 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
657 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
658 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
659 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
661 return 0;
663 case CDROMMULTISESSION:
664 DPRINTK("FIXME: support multisession CDs later\n");
665 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
666 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
667 return 0;
669 case SCSI_IOCTL_GET_BUS_NUMBER:
670 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
671 return -ENOSYS;
673 default:
674 printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
675 return -ENOSYS;
676 }
678 return 0;
679 }
683 /* check media change: should probably do something here in some cases :-) */
/* Media-change check: always reports "no change" for a virtual device. */
684 int blkif_check(kdev_t dev)
685 {
686 DPRINTK("blkif_check\n");
687 return 0;
688 }
/*
 * Re-read the partition table for 'dev' (2.4 BLKRRPART).  Fails with
 * -EBUSY if anyone else has the device open, -EINVAL if the device is
 * unknown or has zero capacity.
 */
690 int blkif_revalidate(kdev_t dev)
691 {
692 struct block_device *bd;
693 struct gendisk *gd;
694 xl_disk_t *disk;
695 unsigned long capacity;
696 int i, rc = 0;
698 if ( (bd = bdget(dev)) == NULL )
699 return -EINVAL;
701 /*
702 * Update of partition info, and check of usage count, is protected
703 * by the per-block-device semaphore.
704 */
705 down(&bd->bd_sem);
707 if ( ((gd = get_gendisk(dev)) == NULL) ||
708 ((disk = xldev_to_xldisk(dev)) == NULL) ||
709 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
710 {
711 rc = -EINVAL;
712 goto out;
713 }
715 if ( disk->usage > 1 )
716 {
717 rc = -EBUSY;
718 goto out;
719 }
721 /* Only reread partition table if VBDs aren't mapped to partitions. */
722 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
723 {
/* Wipe every partition of this unit before re-probing. */
724 for ( i = gd->max_p - 1; i >= 0; i-- )
725 {
726 invalidate_device(dev+i, 1);
727 gd->part[MINOR(dev+i)].start_sect = 0;
728 gd->part[MINOR(dev+i)].nr_sects = 0;
729 gd->sizes[MINOR(dev+i)] = 0;
730 }
732 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
733 }
735 out:
736 up(&bd->bd_sem);
737 bdput(bd);
738 return rc;
739 }
742 /*
743 * blkif_queue_request
744 *
745 * request block io
746 *
747 * id: for guest use only.
748 * operation: BLKIF_OP_{READ,WRITE,PROBE}
749 * buffer: buffer to read/write into. this should be a
750 * virtual address in the guest os.
751 */
/*
 * 2.4 path: queue one buffer_head's worth of I/O.  Consecutive calls for
 * the same operation/device with contiguous sectors are merged as extra
 * segments onto the previous ring request (poor man's scatter-gather,
 * tracked by sg_operation/sg_dev/sg_next_sect).  Returns 0 on success,
 * 1 if the ring is full or the interface is not connected.
 */
752 static int blkif_queue_request(unsigned long id,
753 int operation,
754 char * buffer,
755 unsigned long sector_number,
756 unsigned short nr_sectors,
757 kdev_t device)
758 {
759 unsigned long buffer_ma = virt_to_bus(buffer);
760 unsigned long xid;
761 struct gendisk *gd;
762 blkif_request_t *req;
763 struct buffer_head *bh;
764 unsigned int fsect, lsect;
765 #ifdef CONFIG_XEN_BLKDEV_GRANT
766 int ref;
767 #endif
769 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
770 lsect = fsect + nr_sectors - 1;
772 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
773 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
774 BUG();
775 if ( lsect > 7 )
776 BUG();
778 buffer_ma &= PAGE_MASK;
780 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
781 return 1;
783 switch ( operation )
784 {
786 case BLKIF_OP_READ:
787 case BLKIF_OP_WRITE:
788 gd = get_gendisk(device);
790 /*
791 * Update the sector_number we'll pass down as appropriate; note that
792 * we could sanity check that resulting sector will be in this
793 * partition, but this will happen in driver backend anyhow.
794 */
795 sector_number += gd->part[MINOR(device)].start_sect;
797 /*
798 * If this unit doesn't consist of virtual partitions then we clear
799 * the partn bits from the device number.
800 */
801 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
802 GENHD_FL_VIRT_PARTNS) )
803 device &= ~(gd->max_p - 1);
/* Contiguous with the previous request? Merge as an extra segment. */
805 if ( (sg_operation == operation) &&
806 (sg_dev == device) &&
807 (sg_next_sect == sector_number) )
808 {
809 req = RING_GET_REQUEST(&blk_ring,
810 blk_ring.req_prod_pvt - 1);
811 bh = (struct buffer_head *)id;
/* Chain this bh onto the shadow slot so completion can walk the list. */
813 bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request;
814 blk_shadow[req->id].request = (unsigned long)id;
816 #ifdef CONFIG_XEN_BLKDEV_GRANT
817 /* install a grant reference. */
818 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
819 ASSERT( ref != -ENOSPC );
821 gnttab_grant_foreign_access_ref(
822 ref,
823 rdomid,
824 buffer_ma >> PAGE_SHIFT,
825 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
827 blk_shadow[id].frame[req->nr_segments] =
828 buffer_ma >> PAGE_SHIFT;
830 req->frame_and_sects[req->nr_segments] =
831 (((u32) ref ) << 16) | (fsect << 3) | lsect;
832 #else
833 req->frame_and_sects[req->nr_segments] =
834 buffer_ma | (fsect << 3) | lsect;
835 #endif
836 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
837 sg_next_sect += nr_sectors;
838 else
839 DISABLE_SCATTERGATHER();
841 /* Update the copy of the request in the recovery ring. */
842 pickle_request(&blk_shadow[req->id], req );
844 return 0;
845 }
846 else if ( RING_FULL(&blk_ring) )
847 {
848 return 1;
849 }
850 else
851 {
/* Start a new scatter-gather run from this request. */
852 sg_operation = operation;
853 sg_dev = device;
854 sg_next_sect = sector_number + nr_sectors;
855 }
856 break;
858 default:
859 panic("unknown op %d\n", operation);
860 }
862 /* Fill out a communications ring structure. */
863 req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
865 xid = GET_ID_FROM_FREELIST();
866 blk_shadow[xid].request = (unsigned long)id;
868 req->id = xid;
869 req->operation = operation;
870 req->sector_number = (blkif_sector_t)sector_number;
871 req->device = device;
872 req->nr_segments = 1;
873 #ifdef CONFIG_XEN_BLKDEV_GRANT
874 /* install a grant reference. */
875 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
876 ASSERT( ref != -ENOSPC );
878 gnttab_grant_foreign_access_ref(
879 ref,
880 rdomid,
881 buffer_ma >> PAGE_SHIFT,
882 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
884 blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT;
886 req->frame_and_sects[0] = (((u32) ref)<<16) | (fsect<<3) | lsect;
887 #else
888 req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
889 #endif
891 /* Keep a private copy so we can reissue requests when recovering. */
892 pickle_request(&blk_shadow[xid], req);
894 blk_ring.req_prod_pvt++;
896 return 0;
897 }
900 /*
901 * do_blkif_request
902 * read a block; request is in a request queue
903 */
/*
 * 2.4 request-queue callback: walk each request's buffer_head chain,
 * queuing one bh at a time.  If the ring fills, the bh is relinked and
 * the whole queue is parked on pending_queues[] for
 * kick_pending_request_queues() to retry later.
 */
904 void do_blkif_request(request_queue_t *rq)
905 {
906 struct request *req;
907 struct buffer_head *bh, *next_bh;
908 int rw, nsect, full, queued = 0;
910 DPRINTK("Entered do_blkif_request\n");
912 while ( !rq->plugged && !list_empty(&rq->queue_head))
913 {
914 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
915 goto out;
917 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
918 req, req->cmd, req->sector,
919 req->current_nr_sectors, req->nr_sectors, req->bh);
921 rw = req->cmd;
922 if ( rw == READA )
923 rw = READ;
924 if ( unlikely((rw != READ) && (rw != WRITE)) )
925 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
927 req->errors = 0;
929 bh = req->bh;
930 while ( bh != NULL )
931 {
932 next_bh = bh->b_reqnext;
933 bh->b_reqnext = NULL;
935 full = blkif_queue_request(
936 (unsigned long)bh,
937 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
938 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
940 if ( full )
941 {
/* Ring full: restore the chain and park this queue for a retry. */
942 bh->b_reqnext = next_bh;
943 pending_queues[nr_pending++] = rq;
944 if ( unlikely(nr_pending >= MAX_PENDING) )
945 BUG();
946 goto out;
947 }
949 queued++;
951 /* Dequeue the buffer head from the request. */
952 nsect = bh->b_size >> 9;
953 bh = req->bh = next_bh;
955 if ( bh != NULL )
956 {
957 /* There's another buffer head to do. Update the request. */
958 req->hard_sector += nsect;
959 req->hard_nr_sectors -= nsect;
960 req->sector = req->hard_sector;
961 req->nr_sectors = req->hard_nr_sectors;
962 req->current_nr_sectors = bh->b_size >> 9;
963 req->buffer = bh->b_data;
964 }
965 else
966 {
967 /* That was the last buffer head. Finalise the request. */
968 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
969 BUG();
970 blkdev_dequeue_request(req);
971 end_that_request_last(req);
972 }
973 }
974 }
976 out:
977 if ( queued != 0 )
978 flush_requests();
979 }
/*
 * 2.4 interrupt handler: consume ring responses, complete each chained
 * buffer_head, recycle shadow slots and retry parked queues.
 * NOTE(review): this web-scraped listing has dropped several brace-only
 * lines in this function (e.g. between original lines 999/1001 and around
 * the switch) — compare against the repository before editing the logic.
 */
982 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
983 {
984 RING_IDX i, rp;
985 unsigned long flags;
986 struct buffer_head *bh, *next_bh;
988 spin_lock_irqsave(&io_request_lock, flags);
990 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
991 {
992 spin_unlock_irqrestore(&io_request_lock, flags);
993 return;
994 }
996 rp = blk_ring.sring->rsp_prod;
997 rmb(); /* Ensure we see queued responses up to 'rp'. */
999 for ( i = blk_ring.rsp_cons; i != rp; i++ )
1001 unsigned long id;
1002 blkif_response_t *bret;
1004 bret = RING_GET_RESPONSE(&blk_ring, i);
1005 id = bret->id;
1006 bh = (struct buffer_head *)blk_shadow[id].request;
1008 blkif_completion(&blk_shadow[id]);
1010 ADD_ID_TO_FREELIST(id);
1012 switch ( bret->operation )
1014 case BLKIF_OP_READ:
1015 case BLKIF_OP_WRITE:
1016 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
1017 DPRINTK("Bad return from blkdev data request: %lx\n",
1018 bret->status);
/* Complete every buffer_head that was merged into this ring request. */
1019 for ( ; bh != NULL; bh = next_bh )
1021 next_bh = bh->b_reqnext;
1022 bh->b_reqnext = NULL;
1023 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
1026 break;
1027 case BLKIF_OP_PROBE:
1028 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
1029 blkif_control_rsp_valid = 1;
1030 break;
1031 default:
1032 BUG();
1036 blk_ring.rsp_cons = i;
1038 kick_pending_request_queues();
1040 spin_unlock_irqrestore(&io_request_lock, flags);
1043 #endif
1045 /***************************** COMMON CODE *******************************/
1047 #ifdef CONFIG_XEN_BLKDEV_GRANT
/*
 * Grant the backend read access to the probe buffer page at 'address',
 * encode the grant ref into segment 0 (sectors 0..7), then issue the
 * control request synchronously via blkif_control_send().
 */
1048 void blkif_control_probe_send(blkif_request_t *req, blkif_response_t *rsp,
1049 unsigned long address)
1051 int ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
1052 ASSERT( ref != -ENOSPC );
1054 gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 );
1056 req->frame_and_sects[0] = (((u32) ref) << 16) | 7;
1058 blkif_control_send(req, rsp);
1060 #endif
/*
 * Synchronously send a control (e.g. probe) request: busy-wait for ring
 * space (re-checking under the lock), enqueue and flush, then poll until
 * blkif_int() marks blkif_control_rsp valid and copy the response out.
 * Must be called from a context that may sleep.
 */
1062 void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
1064 unsigned long flags, id;
1065 blkif_request_t *req_d;
1067 retry:
1068 while ( RING_FULL(&blk_ring) )
1070 set_current_state(TASK_INTERRUPTIBLE);
1071 schedule_timeout(1);
1074 spin_lock_irqsave(&blkif_io_lock, flags);
1075 if ( RING_FULL(&blk_ring) )
1077 spin_unlock_irqrestore(&blkif_io_lock, flags);
1078 goto retry;
1081 DISABLE_SCATTERGATHER();
1082 req_d = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
1083 *req_d = *req;
1085 id = GET_ID_FROM_FREELIST();
1086 req_d->id = id;
1087 blk_shadow[id].request = (unsigned long)req;
1089 pickle_request(&blk_shadow[id], req);
1091 blk_ring.req_prod_pvt++;
1092 flush_requests();
1094 spin_unlock_irqrestore(&blkif_io_lock, flags);
/* Poll for the response; blkif_int() sets blkif_control_rsp_valid. */
1096 while ( !blkif_control_rsp_valid )
1098 set_current_state(TASK_INTERRUPTIBLE);
1099 schedule_timeout(1);
1102 memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
1103 blkif_control_rsp_valid = 0;
1107 /* Send a driver status notification to the domain controller. */
/* Blocks until the control message has been delivered. */
1108 static void send_driver_status(int ok)
1110 ctrl_msg_t cmsg = {
1111 .type = CMSG_BLKIF_FE,
1112 .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
1113 .length = sizeof(blkif_fe_driver_status_t),
1114 };
1115 blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
1117 msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
1119 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1122 /* Tell the controller to bring up the interface. */
/* Passes the machine frame of the shared ring so the backend can map it. */
1123 static void blkif_send_interface_connect(void)
1125 ctrl_msg_t cmsg = {
1126 .type = CMSG_BLKIF_FE,
1127 .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
1128 .length = sizeof(blkif_fe_interface_connect_t),
1129 };
1130 blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
1132 msg->handle = 0;
1133 msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);
1135 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
/*
 * Tear down the device channel: mark the interface disconnected (and
 * recovery pending) under the I/O lock, then release the shared ring
 * page, the irq and the event channel.
 */
1138 static void blkif_free(void)
1140 /* Prevent new requests being issued until we fix things up. */
1141 spin_lock_irq(&blkif_io_lock);
1142 recovery = 1;
1143 blkif_state = BLKIF_STATE_DISCONNECTED;
1144 spin_unlock_irq(&blkif_io_lock);
1146 /* Free resources associated with old device channel. */
1147 if ( blk_ring.sring != NULL )
1149 free_page((unsigned long)blk_ring.sring);
1150 blk_ring.sring = NULL;
1152 free_irq(blkif_irq, NULL);
1153 blkif_irq = 0;
1155 unbind_evtchn_from_irq(blkif_evtchn);
1156 blkif_evtchn = 0;
1159 static void blkif_close(void)
1163 /* Move from CLOSED to DISCONNECTED state. */
/* Allocates a fresh shared ring and asks the backend to (re)connect. */
1164 static void blkif_disconnect(void)
1166 blkif_sring_t *sring;
1168 if ( blk_ring.sring != NULL )
1169 free_page((unsigned long)blk_ring.sring)/* discard any previous ring */;
1171 sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
1172 SHARED_RING_INIT(sring);
1173 FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE);
1174 blkif_state = BLKIF_STATE_DISCONNECTED;
1175 blkif_send_interface_connect();
/* Full reset: tear the channel down, then request a reconnect. */
1178 static void blkif_reset(void)
1180 blkif_free();
1181 blkif_disconnect();
/*
 * Replay outstanding requests after suspend/resume: snapshot the shadow
 * table, rebuild the free list, unpickle each in-flight request onto the
 * new ring (re-granting any invalidated grant references), then flush and
 * mark the interface connected again.
 */
1184 static void blkif_recover(void)
1186 int i;
1187 blkif_request_t *req;
1188 struct blk_shadow *copy;
1189 #ifdef CONFIG_XEN_BLKDEV_GRANT
1190 int j;
1191 #endif
1193 /* Stage 1: Make a safe copy of the shadow state. */
1194 copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL);
1195 BUG_ON(copy == NULL);
1196 memcpy(copy, blk_shadow, sizeof(blk_shadow));
1198 /* Stage 2: Set up free list. */
1199 memset(&blk_shadow, 0, sizeof(blk_shadow));
1200 for ( i = 0; i < BLK_RING_SIZE; i++ )
1201 blk_shadow[i].req.id = i+1;
1202 blk_shadow_free = blk_ring.req_prod_pvt;
1203 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1205 /* Stage 3: Find pending requests and requeue them. */
1206 for ( i = 0; i < BLK_RING_SIZE; i++ )
1208 /* Not in use? */
1209 if ( copy[i].request == 0 )
1210 continue;
1212 /* Grab a request slot and unpickle shadow state into it. */
1213 req = RING_GET_REQUEST(
1214 &blk_ring, blk_ring.req_prod_pvt);
1215 unpickle_request(req, &copy[i]);
1217 /* We get a new request id, and must reset the shadow state. */
1218 req->id = GET_ID_FROM_FREELIST();
1219 memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i]));
1221 #ifdef CONFIG_XEN_BLKDEV_GRANT
1222 /* Rewrite any grant references invalidated by suspend/resume. */
1223 for ( j = 0; j < req->nr_segments; j++ )
1225 if ( req->frame_and_sects[j] & GRANTREF_INVALID )
1226 gnttab_grant_foreign_access_ref(
1227 blkif_gref_from_fas(req->frame_and_sects[j]),
1228 rdomid,
1229 blk_shadow[req->id].frame[j],
1230 rq_data_dir((struct request *)
1231 blk_shadow[req->id].request));
1232 req->frame_and_sects[j] &= ~GRANTREF_INVALID;
1234 blk_shadow[req->id].req = *req;
1235 #endif
1237 blk_ring.req_prod_pvt++;
1240 kfree(copy);
1242 recovery = 0;
1244 /* blk_ring->req_prod will be set when we flush_requests().*/
1245 wmb();
1247 /* Kicks things back into life. */
1248 flush_requests();
1250 /* Now safe to let other people use the interface. */
1251 blkif_state = BLKIF_STATE_CONNECTED;
1254 static void blkif_connect(blkif_fe_interface_status_t *status)
1256 int err = 0;
1258 blkif_evtchn = status->evtchn;
1259 blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
1260 #ifdef CONFIG_XEN_BLKDEV_GRANT
1261 rdomid = status->domid;
1262 #endif
1264 err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
1265 if ( err )
1267 printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
1268 return;
1271 if ( recovery )
1273 blkif_recover();
1275 else
1277 /* Transition to connected in case we need to do
1278 * a partition probe on a whole disk. */
1279 blkif_state = BLKIF_STATE_CONNECTED;
1281 /* Probe for discs attached to the interface. */
1282 xlvbd_init();
1285 /* Kick pending requests. */
1286 spin_lock_irq(&blkif_io_lock);
1287 kick_pending_request_queues();
1288 spin_unlock_irq(&blkif_io_lock);
1291 static void unexpected(blkif_fe_interface_status_t *status)
1293 DPRINTK(" Unexpected blkif status %u in state %u\n",
1294 status->status, blkif_state);
1297 static void blkif_status(blkif_fe_interface_status_t *status)
1299 if ( status->handle != blkif_handle )
1301 WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
1302 unexpected(status);
1303 return;
1306 switch ( status->status )
1308 case BLKIF_INTERFACE_STATUS_CLOSED:
1309 switch ( blkif_state )
1311 case BLKIF_STATE_CLOSED:
1312 unexpected(status);
1313 break;
1314 case BLKIF_STATE_DISCONNECTED:
1315 case BLKIF_STATE_CONNECTED:
1316 unexpected(status);
1317 blkif_close();
1318 break;
1320 break;
1322 case BLKIF_INTERFACE_STATUS_DISCONNECTED:
1323 switch ( blkif_state )
1325 case BLKIF_STATE_CLOSED:
1326 blkif_disconnect();
1327 break;
1328 case BLKIF_STATE_DISCONNECTED:
1329 case BLKIF_STATE_CONNECTED:
1330 /* unexpected(status); */ /* occurs during suspend/resume */
1331 blkif_reset();
1332 break;
1334 break;
1336 case BLKIF_INTERFACE_STATUS_CONNECTED:
1337 switch ( blkif_state )
1339 case BLKIF_STATE_CLOSED:
1340 unexpected(status);
1341 blkif_disconnect();
1342 blkif_connect(status);
1343 break;
1344 case BLKIF_STATE_DISCONNECTED:
1345 blkif_connect(status);
1346 break;
1347 case BLKIF_STATE_CONNECTED:
1348 unexpected(status);
1349 blkif_connect(status);
1350 break;
1352 break;
1354 case BLKIF_INTERFACE_STATUS_CHANGED:
1355 switch ( blkif_state )
1357 case BLKIF_STATE_CLOSED:
1358 case BLKIF_STATE_DISCONNECTED:
1359 unexpected(status);
1360 break;
1361 case BLKIF_STATE_CONNECTED:
1362 vbd_update();
1363 break;
1365 break;
1367 default:
1368 WPRINTK(" Invalid blkif status: %d\n", status->status);
1369 break;
1374 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1376 switch ( msg->subtype )
1378 case CMSG_BLKIF_FE_INTERFACE_STATUS:
1379 blkif_status((blkif_fe_interface_status_t *)
1380 &msg->msg[0]);
1381 break;
1382 default:
1383 msg->length = 0;
1384 break;
1387 ctrl_if_send_response(msg);
1390 int wait_for_blkif(void)
1392 int err = 0;
1393 int i;
1394 send_driver_status(1);
1396 /*
1397 * We should read 'nr_interfaces' from response message and wait
1398 * for notifications before proceeding. For now we assume that we
1399 * will be notified of exactly one interface.
1400 */
1401 for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
1403 set_current_state(TASK_INTERRUPTIBLE);
1404 schedule_timeout(1);
1407 if ( blkif_state != BLKIF_STATE_CONNECTED )
1409 printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
1410 err = -ENOSYS;
1412 return err;
1415 int __init xlblk_init(void)
1417 int i;
1419 #ifdef CONFIG_XEN_BLKDEV_GRANT
1420 if ( 0 > gnttab_alloc_grant_references( MAXIMUM_OUTSTANDING_BLOCK_REQS,
1421 &gref_head, &gref_terminal ))
1422 return 1;
1423 printk(KERN_ALERT "Blkif frontend is using grant tables.\n");
1424 #endif
1426 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1427 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1428 return 0;
1430 printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");
1432 blk_shadow_free = 0;
1433 memset(blk_shadow, 0, sizeof(blk_shadow));
1434 for ( i = 0; i < BLK_RING_SIZE; i++ )
1435 blk_shadow[i].req.id = i+1;
1436 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1438 (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
1439 CALLBACK_IN_BLOCKING_CONTEXT);
1441 wait_for_blkif();
1443 return 0;
1446 void blkdev_suspend(void)
/*
 * Called on domain resume.  Mark every shadowed grant reference invalid
 * (they must be re-granted by blkif_recover) and re-announce the driver.
 */
void blkdev_resume(void)
{
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int slot, seg;

    for (slot = 0; slot < BLK_RING_SIZE; slot++)
        for (seg = 0; seg < BLKIF_MAX_SEGMENTS_PER_REQUEST; seg++)
            blk_shadow[slot].req.frame_and_sects[seg] |= GRANTREF_INVALID;
#endif
    send_driver_status(1);
}
1461 static void blkif_completion(struct blk_shadow *s)
1463 int i;
1464 #ifdef CONFIG_XEN_BLKDEV_GRANT
1465 for ( i = 0; i < s->req.nr_segments; i++ )
1466 gnttab_release_grant_reference(
1467 &gref_head, blkif_gref_from_fas(s->req.frame_and_sects[i]));
1468 #else
1469 /* This is a hack to get the dirty logging bits set */
1470 if ( s->req.operation == BLKIF_OP_READ )
1472 for ( i = 0; i < s->req.nr_segments; i++ )
1474 unsigned long pfn = s->req.frame_and_sects[i] >> PAGE_SHIFT;
1475 unsigned long mfn = phys_to_machine_mapping[pfn];
1476 xen_machphys_update(mfn, pfn);
1479 #endif