
view linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c @ 6649:f59e0163540e

Updates to blktap driver and user code.

Mostly this makes the tap code work again with all of the changes that
have happened to the block drivers recently. We now use a shared page
per VBD (to the driver), and handle control information through the
store. The taplib interfaces have changed to be based around per-VBD
data structures to which you can attach arbitrary handlers.

There is also initial code for a user-level blockback driver, which
aims to avoid the use of loopback devices for file-based VBDs. There is
still plenty of work to do here -- this is a working incremental checkin
and I'm away from this for the next four weeks.

Signed-off-by: Andrew Warfield <andrew.warfield@cl.cam.ac.uk>
author akw27@arcadians.cl.cam.ac.uk
date Sun Sep 04 21:19:44 2005 +0000 (2005-09-04)
parents dd668f7527cb
children 79658ef58925 1f460d0fd6c6
line source
/******************************************************************************
 * arch/xen/drivers/blkif/blktap/blktap.c
 *
 * This is a modified version of the block backend driver that remaps requests
 * to a user-space memory region.  It is intended to be used to write
 * application-level servers that provide block interfaces to client VMs.
 *
 */

#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <asm-xen/balloon.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/tlbflush.h>
#include "common.h"

/* Only one process may open /dev/xen/blktap at any time. */
static unsigned long blktap_dev_inuse;
unsigned long blktap_ring_ok; /* make this ring->state */

/* Rings up to user space. */
static blkif_front_ring_t blktap_ufe_ring;

/* for poll: */
static wait_queue_head_t blktap_wait;

/* current switching mode */
static unsigned long blktap_mode;

/* local prototypes */
static int blktap_read_ufe_ring(void);


/* /dev/xen/blktap resides at device number major=10, minor=202 */
#define BLKTAP_MINOR 202

/* blktap IOCTLs: */
#define BLKTAP_IOCTL_KICK_FE         1
#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
#define BLKTAP_IOCTL_SETMODE         3
#define BLKTAP_IOCTL_PRINT_IDXS      100

/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
#define BLKTAP_MODE_COPY_FE          0x00000004  /* unimp.             */
#define BLKTAP_MODE_COPY_BE          0x00000008  /* unimp.             */
#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010  /* unimp.             */
#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020  /* unimp.             */

#define BLKTAP_MODE_INTERPOSE \
    (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)

#define BLKTAP_MODE_COPY_BOTH \
    (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)

#define BLKTAP_MODE_COPY_BOTH_PAGES \
    (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)

static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
    return ( ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
             ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
             ( arg == BLKTAP_MODE_INTERPOSE    ) );
/*
    return (
        ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
        ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
        ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
        ( arg == BLKTAP_MODE_INTERPOSE    ) ||
        ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
        ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
        ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
        );
*/
}
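/*
 * Illustrative sketch (not part of this driver): how a user-space tap
 * process might select a switching mode.  Only the minor number, the
 * ioctl numbers and the mode flags above come from this file; the device
 * node path and the error handling are assumptions.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int tap_open_and_set_mode(void)
{
    int fd = open("/dev/xen/blktap", O_RDWR);   /* path is an assumption */
    if (fd < 0)
        return -1;
    /* Ask the driver to intercept frontend requests and hand them to us. */
    if (ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}
#endif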
/******************************************************************
 * MMAP REGION
 */

/*
 * We use a big chunk of address space to map in-flight requests into,
 * and export this region up to user-space.  See the comments in blkback
 * about this -- the two must be kept in sync if the tap is used as a
 * passthrough.
 */

#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

/* immediately before the mmap area, we have a bunch of pages reserved
 * for shared memory rings.
 */
#define RING_PAGES 1 /* Front */

/* Where things are inside the device mapping. */
struct vm_area_struct *blktap_vma = NULL;
unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
unsigned long rings_vstart; /* start of mmaped vma               */
unsigned long user_vstart;  /* start of user mappings            */

#define MMAP_PAGES                                              \
    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)                          \
    (_start +                                                   \
     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
     ((_seg) * PAGE_SIZE))
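/*
 * Worked example of the layout above: each in-flight request owns a
 * contiguous run of BLKIF_MAX_SEGMENTS_PER_REQUEST pages, so with 4KB
 * pages and 11 segments per request (the usual values), segment 2 of
 * request 3 sits at user_vstart + 3*11*4096 + 2*4096 = user_vstart +
 * 0x23000, i.e. MMAP_VADDR(user_vstart, 3, 2).  The helper below is an
 * illustrative inverse of the macro, not used by the driver; it is the
 * same arithmetic used further down to turn a user vaddr into an offset.
 */
#if 0
static inline void mmap_vaddr_to_idx(unsigned long start, unsigned long vaddr,
                                     unsigned int *req, unsigned int *seg)
{
    unsigned long page = (vaddr - start) >> PAGE_SHIFT;
    *req = page / BLKIF_MAX_SEGMENTS_PER_REQUEST;
    *seg = page % BLKIF_MAX_SEGMENTS_PER_REQUEST;
}
#endif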
/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
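/*
 * Illustrative sketch (not used by the driver) of how indices move through
 * the allocation ring: dispatch_rw_block_io() below consumes an entry at
 * pending_cons, and blktap_read_ufe_ring() returns it at pending_prod under
 * pend_prod_lock.  These helpers simply restate that pattern in one place.
 */
#if 0
static inline int alloc_pending_idx(void)
{
    if (NR_PENDING_REQS == MAX_PENDING_REQS)
        return -1;                 /* ring empty: everything is in flight */
    return pending_ring[MASK_PEND_IDX(pending_cons++)];
}

static inline void free_pending_idx(int idx)
{
    unsigned long flags;
    spin_lock_irqsave(&pend_prod_lock, flags);
    pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;
    spin_unlock_irqrestore(&pend_prod_lock, flags);
}
#endif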
/* Requests passing through the tap to the backend hijack the id field
 * in the request message.  In it we put the AR index _AND_ the fe domid.
 * The domid is used by the backend to map the pages properly.
 */

static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
{
    return ( (fe_dom << 16) | MASK_PEND_IDX(idx) );
}

static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
{
    return (PEND_RING_IDX)( id & 0x0000ffff );
}

static inline domid_t ID_TO_DOM(unsigned long id)
{
    return (domid_t)(id >> 16);
}
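/*
 * Round-trip example for the id encoding above (values illustrative only):
 * MAKE_ID(5, 70) packs frontend domid 5 into the high bits and the masked
 * ring index 70 & (MAX_PENDING_REQS-1) == 6 into the low 16 bits, giving
 * 0x00050006; ID_TO_DOM() and ID_TO_IDX() then recover 5 and 6 respectively.
 */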
/******************************************************************
 * GRANT HANDLES
 */

/* When grant tables are used to map a frame for device access, the
 * handle returned must be used to unmap the frame later.  This is needed
 * to drop the ref count on the frame.
 */
struct grant_handle_pair
{
    u16 kernel;
    u16 user;
};
static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKTAP_INVALID_HANDLE(_g) \
    ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
} while(0)
/******************************************************************
 * BLKTAP VM OPS
 */

static struct page *blktap_nopage(struct vm_area_struct *vma,
                                  unsigned long address,
                                  int *type)
{
    /*
     * if the page has not been mapped in by the driver then generate
     * a SIGBUS to the domain.
     */
    force_sig(SIGBUS, current);

    return 0;
}

struct vm_operations_struct blktap_vm_ops = {
    nopage:   blktap_nopage,
};

/******************************************************************
 * BLKTAP FILE OPS
 */

static int blktap_open(struct inode *inode, struct file *filp)
{
    blkif_sring_t *sring;

    if ( test_and_set_bit(0, &blktap_dev_inuse) )
        return -EBUSY;

    /* Allocate the fe ring. */
    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
    if (sring == NULL)
        goto fail_nomem;

    SetPageReserved(virt_to_page(sring));

    SHARED_RING_INIT(sring);
    FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);

    return 0;

 fail_nomem:
    return -ENOMEM;
}

static int blktap_release(struct inode *inode, struct file *filp)
{
    blktap_dev_inuse = 0;
    blktap_ring_ok = 0;

    /* Free the ring page. */
    ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
    free_page((unsigned long)blktap_ufe_ring.sring);

    /* Clear any active mappings and free foreign map table */
    if (blktap_vma != NULL) {
        zap_page_range(blktap_vma, blktap_vma->vm_start,
                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
        blktap_vma = NULL;
    }

    return 0;
}


/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them.  This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page into kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space.  This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other vms.  vma->vm_private_data is set up as a mapping
 * from pages to actual page structs.  There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
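/*
 * Minimal sketch of the lookup that the VM_FOREIGN-aware get_user_pages
 * clause relies on (the real clause lives in the modified Linux mm code,
 * not in this file): for a foreign vma the struct page is taken straight
 * from the vm_private_data array that blktap_mmap() and
 * dispatch_rw_block_io() populate below, indexed by the page offset into
 * the vma.  Purely illustrative; the helper name is an assumption.
 */
#if 0
static inline struct page *foreign_vma_lookup(struct vm_area_struct *vma,
                                              unsigned long uvaddr)
{
    struct page **map = vma->vm_private_data;
    unsigned long offset = (uvaddr - vma->vm_start) >> PAGE_SHIFT;
    return map ? map[offset] : NULL;   /* NULL => not currently mapped */
}
#endif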
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
    int size;
    struct page **map;
    int i;

    DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
            vma->vm_start, vma->vm_end);

    vma->vm_flags |= VM_RESERVED;
    vma->vm_ops = &blktap_vm_ops;

    size = vma->vm_end - vma->vm_start;
    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
        printk(KERN_INFO
               "blktap: you _must_ map exactly %d pages!\n",
               MMAP_PAGES + RING_PAGES);
        return -EAGAIN;
    }

    size >>= PAGE_SHIFT;
    DPRINTK(KERN_INFO "blktap: %d ring pages + %d data pages.\n",
            RING_PAGES, size - RING_PAGES);

    rings_vstart = vma->vm_start;
    user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);

    /* Map the ring pages to the start of the region and reserve it. */

    /* not sure if I really need to do this... */
    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

    if (remap_pfn_range(vma, vma->vm_start,
                        __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
                        PAGE_SIZE, vma->vm_page_prot))
    {
        WPRINTK("Mapping user ring failed!\n");
        goto fail;
    }

    /* Mark this VM as containing foreign pages, and set up mappings. */
    map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
                  * sizeof(struct page *),
                  GFP_KERNEL);
    if (map == NULL)
    {
        WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
        goto fail;
    }

    for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
        map[i] = NULL;

    vma->vm_private_data = map;
    vma->vm_flags |= VM_FOREIGN;

    blktap_vma = vma;
    blktap_ring_ok = 1;

    return 0;

 fail:
    /* Clear any active mappings. */
    zap_page_range(vma, vma->vm_start,
                   vma->vm_end - vma->vm_start, NULL);

    return -ENOMEM;
}

static int blktap_ioctl(struct inode *inode, struct file *filp,
                        unsigned int cmd, unsigned long arg)
{
    switch(cmd) {
    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
        return blktap_read_ufe_ring();

    case BLKTAP_IOCTL_SETMODE:
        if (BLKTAP_MODE_VALID(arg)) {
            blktap_mode = arg;
            /* XXX: may need to flush rings here. */
            printk(KERN_INFO "blktap: set mode to %lx\n", arg);
            return 0;
        }
        break;

    case BLKTAP_IOCTL_PRINT_IDXS:
    {
        //print_fe_ring_idxs();
        WPRINTK("User Rings: \n-----------\n");
        WPRINTK("UF: rsp_cons: %2d, req_prod_pvt: %2d "
                "| req_prod: %2d, rsp_prod: %2d\n",
                blktap_ufe_ring.rsp_cons,
                blktap_ufe_ring.req_prod_pvt,
                blktap_ufe_ring.sring->req_prod,
                blktap_ufe_ring.sring->rsp_prod);
        break;
    }
    }
    return -ENOIOCTLCMD;
}

static unsigned int blktap_poll(struct file *file, poll_table *wait)
{
    poll_wait(file, &blktap_wait, wait);
    if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) )
    {
        flush_tlb_all();

        RING_PUSH_REQUESTS(&blktap_ufe_ring);
        return POLLIN | POLLRDNORM;
    }

    return 0;
}

void blktap_kick_user(void)
{
    /* blktap_ring->req_prod = blktap_req_prod; */
    wake_up_interruptible(&blktap_wait);
}

static struct file_operations blktap_fops = {
    owner:    THIS_MODULE,
    poll:     blktap_poll,
    ioctl:    blktap_ioctl,
    open:     blktap_open,
    release:  blktap_release,
    mmap:     blktap_mmap,
};
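/*
 * Illustrative sketch (not part of this driver) of the user-space side
 * that these file operations serve: mmap the ring page plus the data
 * area, poll() for requests (blktap_poll() pushes them onto the shared
 * ring), and issue BLKTAP_IOCTL_KICK_FE after queueing responses so that
 * blktap_read_ufe_ring() runs.  User space would need its own copies of
 * MMAP_PAGES, RING_PAGES, PAGE_SHIFT and the ioctl numbers; the actual
 * ring handling belongs to the user-level tap library and is only hinted
 * at here.
 */
#if 0
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void tap_event_loop(int fd)
{
    size_t len = (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT; /* exact size required */
    void *region = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    struct pollfd pfd = { .fd = fd, .events = POLLIN };

    if (region == MAP_FAILED)
        return;

    for (;;) {
        if (poll(&pfd, 1, -1) <= 0)
            continue;
        /* ... consume requests from the front ring at 'region',
         *     service them, queue responses ... */
        ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);  /* tell the driver to collect them */
    }
}
#endif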
static int  do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);


static void fast_flush_area(int idx, int nr_pages)
{
    struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
    unsigned int i, op = 0;
    struct grant_handle_pair *handle;
    unsigned long ptep;

    for (i = 0; i < nr_pages; i++)
    {
        handle = &pending_handle(idx, i);
        if (!BLKTAP_INVALID_HANDLE(handle))
        {
            unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
            unmap[op].dev_bus_addr = 0;
            unmap[op].handle = handle->kernel;
            op++;

            if (create_lookup_pte_addr(blktap_vma->vm_mm,
                                       MMAP_VADDR(user_vstart, idx, i),
                                       &ptep) != 0) {
                DPRINTK("Couldn't get a pte addr!\n");
                return;
            }
            unmap[op].host_addr = ptep;
            unmap[op].dev_bus_addr = 0;
            unmap[op].handle = handle->user;
            op++;

            BLKTAP_INVALIDATE_HANDLE(handle);
        }
    }
    if ( unlikely(HYPERVISOR_grant_table_op(
        GNTTABOP_unmap_grant_ref, unmap, op)))
        BUG();

    if (blktap_vma != NULL)
        zap_page_range(blktap_vma,
                       MMAP_VADDR(user_vstart, idx, 0),
                       nr_pages << PAGE_SHIFT, NULL);
}
/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

static int blkio_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    daemonize("xenblkd");

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&blkio_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
             list_empty(&blkio_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&blkio_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&blkio_schedule_list) )
        {
            ent = blkio_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }
    }
}

static void maybe_trigger_blkio_schedule(void)
{
    /*
     * Needed so that two processes, who together make the following predicate
     * true, don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&blkio_schedule_list) )
        wake_up(&blkio_schedule_wait);
}
/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */


static int blktap_read_ufe_ring(void)
{
    /* This is called to read responses from the UFE ring. */

    RING_IDX i, j, rp;
    blkif_response_t *resp;
    blkif_t *blkif;
    int pending_idx;
    pending_req_t *pending_req;
    unsigned long flags;

    /* if we are forwarding from the UFE ring to the FE ring */
    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {

        /* for each outstanding message on the UFE ring */
        rp = blktap_ufe_ring.sring->rsp_prod;
        rmb();

        for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
        {
            resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
            pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
            pending_req = &pending_reqs[pending_idx];

            blkif = pending_req->blkif;
            for (j = 0; j < pending_req->nr_pages; j++) {
                unsigned long vaddr;
                struct page **map = blktap_vma->vm_private_data;
                int offset;

                vaddr  = MMAP_VADDR(user_vstart, pending_idx, j);
                offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;

                //ClearPageReserved(virt_to_page(vaddr));
                ClearPageReserved(map[offset]);
                map[offset] = NULL;
            }

            fast_flush_area(pending_idx, pending_req->nr_pages);
            make_response(blkif, pending_req->id, resp->operation,
                          resp->status);
            blkif_put(pending_req->blkif);
            spin_lock_irqsave(&pend_prod_lock, flags);
            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
            spin_unlock_irqrestore(&pend_prod_lock, flags);
        }
        blktap_ufe_ring.rsp_cons = i;
        maybe_trigger_blkio_schedule();
    }
    return 0;
}
/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_blkio_schedule();
    return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
    blkif_request_t *req;
    RING_IDX i, rp;
    int more_to_do = 0;

    rp = blk_ring->sring->req_prod;
    rmb(); /* Ensure we see queued requests up to 'rp'. */

    for ( i = blk_ring->req_cons;
          (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = RING_GET_REQUEST(blk_ring, i);
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blk_ring->req_cons = i;
    blktap_kick_user();

    return more_to_do;
}
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    blkif_request_t *target;
    int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
    int op, ret;
    unsigned int nseg;

    /* Check that number of segments is sane. */
    nseg = req->nr_segments;
    if ( unlikely(nseg == 0) ||
         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", nseg);
        goto bad_descriptor;
    }

    /* Make sure userspace is ready. */
    if (!blktap_ring_ok) {
        DPRINTK("blktap: ring not ready for requests!\n");
        goto bad_descriptor;
    }


    if ( RING_FULL(&blktap_ufe_ring) ) {
        WPRINTK("blktap: fe_ring is full, can't add (very broken!).\n");
        goto bad_descriptor;
    }

    flush_cache_all(); /* a noop on intel... */

    /* Map the foreign pages directly into the application. */
    op = 0;
    for (i = 0; i < req->nr_segments; i++) {

        unsigned long uvaddr;
        unsigned long kvaddr;
        unsigned long ptep;

        uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);

        /* Map the remote page to kernel. */
        map[op].host_addr = kvaddr;
        map[op].dom   = blkif->domid;
        map[op].ref   = blkif_gref_from_fas(req->frame_and_sects[i]);
        map[op].flags = GNTMAP_host_map;
        /* This needs a bit more thought in terms of interposition:
         * If we want to be able to modify pages during write using
         * grant table mappings, the guest will either need to allow
         * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
        if (req->operation == BLKIF_OP_WRITE)
            map[op].flags |= GNTMAP_readonly;
        op++;

        /* Now map it to user. */
        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
        if (ret)
        {
            DPRINTK("Couldn't get a pte addr!\n");
            fast_flush_area(pending_idx, req->nr_segments);
            goto bad_descriptor;
        }

        map[op].host_addr = ptep;
        map[op].dom   = blkif->domid;
        map[op].ref   = blkif_gref_from_fas(req->frame_and_sects[i]);
        map[op].flags = GNTMAP_host_map | GNTMAP_application_map
                        | GNTMAP_contains_pte;
        /* Above interposition comment applies here as well. */
        if (req->operation == BLKIF_OP_WRITE)
            map[op].flags |= GNTMAP_readonly;
        op++;
    }

    if ( unlikely(HYPERVISOR_grant_table_op(
        GNTTABOP_map_grant_ref, map, op)))
        BUG();

    op = 0;
    for (i = 0; i < (req->nr_segments*2); i += 2) {
        unsigned long uvaddr;
        unsigned long kvaddr;
        unsigned long offset;
        int cancel = 0;

        uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);

        if ( unlikely(map[i].handle < 0) )
        {
            DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
            ret = map[i].handle;
            cancel = 1;
        }

        if ( unlikely(map[i+1].handle < 0) )
        {
            DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
            ret = map[i+1].handle;
            cancel = 1;
        }

        if (cancel)
        {
            fast_flush_area(pending_idx, req->nr_segments);
            goto bad_descriptor;
        }

        /* Set the necessary mappings in p2m and in the VM_FOREIGN
         * vm_area_struct to allow user vaddr -> struct page lookups
         * to work.  This is needed for direct IO to foreign pages. */
        phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] =
            FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);

        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
        ((struct page **)blktap_vma->vm_private_data)[offset] =
            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);

        /* Save handles for unmapping later. */
        pending_handle(pending_idx, i/2).kernel = map[i].handle;
        pending_handle(pending_idx, i/2).user   = map[i+1].handle;
    }

    /* Mark mapped pages as reserved: */
    for ( i = 0; i < req->nr_segments; i++ )
    {
        unsigned long kvaddr;

        kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
        SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
    }

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = req->operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nseg;
    req->id = MAKE_ID(blkif->domid, pending_idx);
    //atomic_set(&pending_req->pendcnt, nbio);
    pending_cons++;
    blkif_get(blkif);

    /* Finally, write the request message to the user ring. */
    target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
    memcpy(target, req, sizeof(*req));
    blktap_ufe_ring.req_prod_pvt++;
    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}
/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t  *resp;
    unsigned long      flags;
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb(); /* Ensure other side can see the response fields. */
    blk_ring->rsp_prod_pvt++;
    RING_PUSH_RESPONSES(blk_ring);
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

static struct miscdevice blktap_miscdev = {
    .minor        = BLKTAP_MINOR,
    .name         = "blktap",
    .fops         = &blktap_fops,
    .devfs_name   = "misc/blktap",
};

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

static int __init blkif_init(void)
{
    int i, j, err;
    struct page *page;
/*
    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;
*/
    blkif_interface_init();

    page = balloon_alloc_empty_page_range(MMAP_PAGES);
    BUG_ON(page == NULL);
    mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;

    spin_lock_init(&blkio_schedule_list_lock);
    INIT_LIST_HEAD(&blkio_schedule_list);

    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

    blkif_xenbus_init();

    for (i = 0; i < MAX_PENDING_REQS ; i++)
        for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
            BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));

    err = misc_register(&blktap_miscdev);
    if ( err != 0 )
    {
        printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
        return err;
    }

    init_waitqueue_head(&blktap_wait);

    return 0;
}

__initcall(blkif_init);