
view xen/drivers/block/xen_block.c @ 665:a74ec9013abb

bitkeeper revision 1.349 (3f132695Orgv2nzuhXag1iNmNOy53Q)

Merge labyrinth.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into labyrinth.cl.cam.ac.uk:/auto/anfs/scratch/labyrinth/iap10/xeno-clone/xeno.bk
author iap10@labyrinth.cl.cam.ac.uk
date Mon Jul 14 21:54:29 2003 +0000 (2003-07-14)
parents 01725801761a 384fbe1ed716
children d2aad5afa33c
1 /*
2 * xen_block.c
3 *
4 * Process incoming block I/O requests from guest OSes.
5 */
7 #include <xeno/config.h>
8 #include <xeno/types.h>
9 #include <xeno/lib.h>
10 #include <xeno/sched.h>
11 #include <xeno/blkdev.h>
12 #include <xeno/event.h>
13 #include <hypervisor-ifs/block.h>
14 #include <hypervisor-ifs/hypervisor-if.h>
15 #include <asm-i386/io.h>
16 #include <xeno/spinlock.h>
17 #include <xeno/keyhandler.h>
18 #include <xeno/interrupt.h>
19 #include <xeno/segment.h>
20 #include <xeno/slab.h>
21 #include <xeno/physdisk.h>
23 #if 0
24 #define DPRINTK(_f, _a...) printk( _f , ## _a )
25 #else
26 #define DPRINTK(_f, _a...) ((void)0)
27 #endif
29 /*
30 * These are rather arbitrary. They are fairly large because adjacent
31 * requests pulled from a communication ring are quite likely to end
32 * up being part of the same scatter/gather request at the disc.
33 *
34 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
35 * This will increase the chances of being able to write whole tracks.
36 * '64' should be enough to keep us competitive with Linux.
37 */
38 #define MAX_PENDING_REQS 64
39 #define BATCH_PER_DOMAIN 16
41 /*
42 * Each outstanding request which we've passed to the lower device layers
43 * has a 'pending_req' allocated to it. Each buffer_head that completes
44 * decrements the pendcnt towards zero. When it hits zero, the specified
45 * domain has a response queued for it, with the saved 'id' passed back.
46 *
47 * We can't allocate pending_req's in order, since they may complete out
48 * of order. We therefore maintain an allocation ring. This ring also
49 * indicates when enough work has been passed down -- at that point the
50 * allocation ring will be empty.
51 */
52 static pending_req_t pending_reqs[MAX_PENDING_REQS];
53 static unsigned char pending_ring[MAX_PENDING_REQS];
54 static unsigned int pending_prod, pending_cons;
55 static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
56 #define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))
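/*
 * Free-slot indices circulate through 'pending_ring': dispatch_rw_block_io()
 * takes a slot at 'pending_cons' and end_block_io_op() returns it at
 * 'pending_prod'. The mask in PENDREQ_IDX_INC assumes MAX_PENDING_REQS is a
 * power of two.
 */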
58 static kmem_cache_t *buffer_head_cachep;
59 static atomic_t nr_pending;
61 #define NR_IDE_DEVS 20
62 #define NR_SCSI_DEVS 16
64 static kdev_t ide_devs[NR_IDE_DEVS] = {
65 MKDEV(IDE0_MAJOR, 0), MKDEV(IDE0_MAJOR, 64), /* hda, hdb */
66 MKDEV(IDE1_MAJOR, 0), MKDEV(IDE1_MAJOR, 64), /* hdc, hdd */
67 MKDEV(IDE2_MAJOR, 0), MKDEV(IDE2_MAJOR, 64), /* hde, hdf */
68 MKDEV(IDE3_MAJOR, 0), MKDEV(IDE3_MAJOR, 64), /* hdg, hdh */
69 MKDEV(IDE4_MAJOR, 0), MKDEV(IDE4_MAJOR, 64), /* hdi, hdj */
70 MKDEV(IDE5_MAJOR, 0), MKDEV(IDE5_MAJOR, 64), /* hdk, hdl */
71 MKDEV(IDE6_MAJOR, 0), MKDEV(IDE6_MAJOR, 64), /* hdm, hdn */
72 MKDEV(IDE7_MAJOR, 0), MKDEV(IDE7_MAJOR, 64), /* hdo, hdp */
73 MKDEV(IDE8_MAJOR, 0), MKDEV(IDE8_MAJOR, 64), /* hdq, hdr */
74 MKDEV(IDE9_MAJOR, 0), MKDEV(IDE9_MAJOR, 64) /* hds, hdt */
75 };
77 static kdev_t scsi_devs[NR_SCSI_DEVS] = {
78 MKDEV(SCSI_DISK0_MAJOR, 0), MKDEV(SCSI_DISK0_MAJOR, 16), /* sda, sdb */
79 MKDEV(SCSI_DISK0_MAJOR, 32), MKDEV(SCSI_DISK0_MAJOR, 48), /* sdc, sdd */
80 MKDEV(SCSI_DISK0_MAJOR, 64), MKDEV(SCSI_DISK0_MAJOR, 80), /* sde, sdf */
81 MKDEV(SCSI_DISK0_MAJOR, 96), MKDEV(SCSI_DISK0_MAJOR, 112), /* sdg, sdh */
82 MKDEV(SCSI_DISK0_MAJOR, 128), MKDEV(SCSI_DISK0_MAJOR, 144), /* sdi, sdj */
83 MKDEV(SCSI_DISK0_MAJOR, 160), MKDEV(SCSI_DISK0_MAJOR, 176), /* sdk, sdl */
84 MKDEV(SCSI_DISK0_MAJOR, 192), MKDEV(SCSI_DISK0_MAJOR, 208), /* sdm, sdn */
85 MKDEV(SCSI_DISK0_MAJOR, 224), MKDEV(SCSI_DISK0_MAJOR, 240), /* sdo, sdp */
86 };
88 static int __buffer_is_valid(struct task_struct *p,
89 unsigned long buffer,
90 unsigned short size,
91 int writeable_buffer);
92 static void __lock_buffer(unsigned long buffer,
93 unsigned short size,
94 int writeable_buffer);
95 static void unlock_buffer(struct task_struct *p,
96 unsigned long buffer,
97 unsigned short size,
98 int writeable_buffer);
100 static void io_schedule(unsigned long unused);
101 static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
102 static void dispatch_rw_block_io(struct task_struct *p, int index);
103 static void dispatch_probe_blk(struct task_struct *p, int index);
104 static void dispatch_probe_seg(struct task_struct *p, int index);
105 static void dispatch_probe_seg_all(struct task_struct *p, int index);
106 static void dispatch_debug_block_io(struct task_struct *p, int index);
107 static void dispatch_create_segment(struct task_struct *p, int index);
108 static void dispatch_delete_segment(struct task_struct *p, int index);
109 static void dispatch_grant_physdev(struct task_struct *p, int index);
110 static void dispatch_probe_physdev(struct task_struct *p, int index);
111 static void make_response(struct task_struct *p, unsigned long id,
112 unsigned short op, unsigned long st);
115 /******************************************************************
116 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
117 */
119 static struct list_head io_schedule_list;
120 static spinlock_t io_schedule_list_lock;
122 static int __on_blkdev_list(struct task_struct *p)
123 {
124 return p->blkdev_list.next != NULL;
125 }
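/*
 * The list-membership test below is made twice: once unlocked to keep the
 * common path cheap, and again under io_schedule_list_lock before the list
 * is actually modified. A domain on the list holds a task_struct reference.
 */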
127 static void remove_from_blkdev_list(struct task_struct *p)
128 {
129 unsigned long flags;
130 if ( !__on_blkdev_list(p) ) return;
131 spin_lock_irqsave(&io_schedule_list_lock, flags);
132 if ( __on_blkdev_list(p) )
133 {
134 list_del(&p->blkdev_list);
135 p->blkdev_list.next = NULL;
136 put_task_struct(p);
137 }
138 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
139 }
141 static void add_to_blkdev_list_tail(struct task_struct *p)
142 {
143 unsigned long flags;
144 if ( __on_blkdev_list(p) ) return;
145 spin_lock_irqsave(&io_schedule_list_lock, flags);
146 if ( !__on_blkdev_list(p) )
147 {
148 list_add_tail(&p->blkdev_list, &io_schedule_list);
149 get_task_struct(p);
150 }
151 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
152 }
155 /******************************************************************
156 * SCHEDULER FUNCTIONS
157 */
159 static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
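/*
 * Round-robin scheduler: pull up to BATCH_PER_DOMAIN requests from the domain
 * at the head of io_schedule_list, re-queueing it at the tail if it still has
 * work, until the pending-request pool is exhausted or the list empties.
 */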
161 static void io_schedule(unsigned long unused)
162 {
163 struct task_struct *p;
164 struct list_head *ent;
166 /* Queue up a batch of requests. */
167 while ( (atomic_read(&nr_pending) < MAX_PENDING_REQS) &&
168 !list_empty(&io_schedule_list) )
169 {
170 ent = io_schedule_list.next;
171 p = list_entry(ent, struct task_struct, blkdev_list);
172 get_task_struct(p);
173 remove_from_blkdev_list(p);
174 if ( do_block_io_op_domain(p, BATCH_PER_DOMAIN) )
175 add_to_blkdev_list_tail(p);
176 put_task_struct(p);
177 }
179 /* Push the batch through to disc. */
180 run_task_queue(&tq_disk);
181 }
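/*
 * Kick the io_schedule tasklet only once the pending pool has drained below
 * half and there is still queued work; called from the guest hypercall and
 * from the completion path.
 */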
183 static void maybe_trigger_io_schedule(void)
184 {
185 /*
186 * Needed so that two processes that together make the following predicate
187 * true don't both read stale values and evaluate the predicate
188 * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
189 */
190 smp_mb();
192 if ( (atomic_read(&nr_pending) < (MAX_PENDING_REQS/2)) &&
193 !list_empty(&io_schedule_list) )
194 {
195 tasklet_schedule(&io_schedule_tasklet);
196 }
197 }
201 /******************************************************************
202 * COMPLETION CALLBACK -- Called as bh->b_end_io()
203 */
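/*
 * Runs once per buffer_head. Each completion drops the page references taken
 * at dispatch time; the last completion for a request queues the response on
 * the guest's ring, recycles the pending_req slot and may re-kick the
 * scheduler.
 */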
205 static void end_block_io_op(struct buffer_head *bh, int uptodate)
206 {
207 unsigned long flags;
208 pending_req_t *pending_req = bh->pending_req;
210 /* An error fails the entire request. */
211 if ( !uptodate )
212 {
213 DPRINTK("Buffer not up-to-date at end of operation\n");
214 pending_req->status = 2;
215 }
217 unlock_buffer(pending_req->domain,
218 virt_to_phys(bh->b_data),
219 bh->b_size,
220 (pending_req->operation==READ));
222 if ( atomic_dec_and_test(&pending_req->pendcnt) )
223 {
224 make_response(pending_req->domain, pending_req->id,
225 pending_req->operation, pending_req->status);
226 put_task_struct(pending_req->domain);
227 spin_lock_irqsave(&pend_prod_lock, flags);
228 pending_ring[pending_prod] = pending_req - pending_reqs;
229 PENDREQ_IDX_INC(pending_prod);
230 spin_unlock_irqrestore(&pend_prod_lock, flags);
231 atomic_dec(&nr_pending);
232 maybe_trigger_io_schedule();
233 }
235 kmem_cache_free(buffer_head_cachep, bh);
236 }
240 /******************************************************************
241 * GUEST-OS SYSCALL -- Indicates there are requests outstanding.
242 */
244 long do_block_io_op(void)
245 {
246 add_to_blkdev_list_tail(current);
247 maybe_trigger_io_schedule();
248 return 0L;
249 }
253 /******************************************************************
254 * DOWNWARD CALLS -- These interface with the block-device layer proper.
255 */
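/*
 * Returns 1 iff every page frame spanned by [buffer, buffer+size) lies within
 * machine memory, belongs to the requesting domain and, for writeable
 * buffers, is not pinned with a conflicting type. Caller holds p->page_lock.
 */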
257 static int __buffer_is_valid(struct task_struct *p,
258 unsigned long buffer,
259 unsigned short size,
260 int writeable_buffer)
261 {
262 unsigned long pfn;
263 struct pfn_info *page;
264 int rc = 0;
266 /* A request may span multiple page frames. Each must be checked. */
267 for ( pfn = buffer >> PAGE_SHIFT;
268 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
269 pfn++ )
270 {
271 /* Each frame must be within bounds of machine memory. */
272 if ( pfn >= max_page )
273 {
274 DPRINTK("pfn out of range: %08lx\n", pfn);
275 goto out;
276 }
278 page = frame_table + pfn;
280 /* Each frame must belong to the requesting domain. */
281 if ( (page->flags & PG_domain_mask) != p->domain )
282 {
283 DPRINTK("bad domain: expected %d, got %ld\n",
284 p->domain, page->flags & PG_domain_mask);
285 goto out;
286 }
288 /* If reading into the frame, the frame must be writeable. */
289 if ( writeable_buffer &&
290 ((page->flags & PG_type_mask) != PGT_writeable_page) &&
291 (page->type_count != 0) )
292 {
293 DPRINTK("non-writeable page passed for block read\n");
294 goto out;
295 }
296 }
298 rc = 1;
299 out:
300 return rc;
301 }
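/*
 * Takes a reference on every frame spanned by the buffer so the guest cannot
 * reuse it while I/O is in flight; frames that will be written into (block
 * reads) additionally take a PGT_writeable_page type reference.
 */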
303 static void __lock_buffer(unsigned long buffer,
304 unsigned short size,
305 int writeable_buffer)
306 {
307 unsigned long pfn;
308 struct pfn_info *page;
310 for ( pfn = buffer >> PAGE_SHIFT;
311 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
312 pfn++ )
313 {
314 page = frame_table + pfn;
315 if ( writeable_buffer )
316 {
317 if ( page->type_count == 0 )
318 {
319 page->flags &= ~(PG_type_mask | PG_need_flush);
320 /* NB. This ref alone won't cause a TLB flush. */
321 page->flags |= PGT_writeable_page;
322 }
323 get_page_type(page);
324 }
325 get_page_tot(page);
326 }
327 }
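/*
 * Drops the references taken by __lock_buffer(). When the last writeable
 * type reference goes and the frame is marked PG_need_flush, the TLB is
 * flushed and the flag cleared.
 */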
329 static void unlock_buffer(struct task_struct *p,
330 unsigned long buffer,
331 unsigned short size,
332 int writeable_buffer)
333 {
334 unsigned long pfn, flags;
335 struct pfn_info *page;
337 spin_lock_irqsave(&p->page_lock, flags);
338 for ( pfn = buffer >> PAGE_SHIFT;
339 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
340 pfn++ )
341 {
342 page = frame_table + pfn;
343 if ( writeable_buffer &&
344 (put_page_type(page) == 0) &&
345 (page->flags & PG_need_flush) )
346 {
347 __flush_tlb();
348 page->flags &= ~PG_need_flush;
349 }
350 put_page_tot(page);
351 }
352 spin_unlock_irqrestore(&p->page_lock, flags);
353 }
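/*
 * Consumes up to 'max_to_do' requests from the domain's shared ring and
 * dispatches each by type. Returns nonzero if requests remain, so that the
 * scheduler re-queues the domain.
 */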
355 static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
356 {
357 blk_ring_t *blk_ring = p->blk_ring_base;
358 int i, more_to_do = 0;
360 /*
361 * Take items off the comms ring, taking care not to catch up
362 * with the response-producer index.
363 */
364 for ( i = p->blk_req_cons;
365 (i != blk_ring->req_prod) &&
366 (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1);
367 i = BLK_RING_INC(i) )
368 {
369 if ( (max_to_do-- == 0) ||
370 (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
371 {
372 more_to_do = 1;
373 break;
374 }
376 switch ( blk_ring->ring[i].req.operation )
377 {
378 case XEN_BLOCK_READ:
379 case XEN_BLOCK_WRITE:
380 dispatch_rw_block_io(p, i);
381 break;
383 case XEN_BLOCK_PROBE_BLK:
384 dispatch_probe_blk(p, i);
385 break;
387 case XEN_BLOCK_PROBE_SEG:
388 dispatch_probe_seg(p, i);
389 break;
391 case XEN_BLOCK_PROBE_SEG_ALL:
392 dispatch_probe_seg_all(p, i);
393 break;
395 case XEN_BLOCK_DEBUG:
396 dispatch_debug_block_io(p, i);
397 break;
399 case XEN_BLOCK_SEG_CREATE:
400 dispatch_create_segment(p, i);
401 break;
403 case XEN_BLOCK_SEG_DELETE:
404 dispatch_delete_segment(p, i);
405 break;
407 case XEN_BLOCK_PHYSDEV_GRANT:
408 dispatch_grant_physdev(p, i);
409 break;
411 case XEN_BLOCK_PHYSDEV_PROBE:
412 dispatch_probe_physdev(p, i);
413 break;
415 default:
416 DPRINTK("error: unknown block io operation [%d]\n",
417 blk_ring->ring[i].req.operation);
418 make_response(p, blk_ring->ring[i].req.id,
419 blk_ring->ring[i].req.operation, 1);
420 break;
421 }
422 }
424 p->blk_req_cons = i;
425 return more_to_do;
426 }
428 static void dispatch_debug_block_io(struct task_struct *p, int index)
429 {
430 DPRINTK("dispatch_debug_block_io: unimplemented\n");
431 }
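/*
 * The probe/control handlers below share one pattern: take the sector-aligned
 * buffer address from the request, validate and lock the frames it spans
 * under page_lock, operate on the buffer through the hypervisor's 1:1
 * mapping (phys_to_virt), unlock, and queue a response.
 */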
433 static void dispatch_probe_physdev(struct task_struct *p, int index)
434 {
435 blk_ring_t *blk_ring = p->blk_ring_base;
436 unsigned long flags, buffer;
437 physdisk_probebuf_t *buf;
438 int result;
440 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
442 spin_lock_irqsave(&p->page_lock, flags);
443 if ( !__buffer_is_valid(p, buffer, sizeof(*buf), 1) )
444 {
445 spin_unlock_irqrestore(&p->page_lock, flags);
446 result = 1;
447 goto out;
448 }
449 __lock_buffer(buffer, sizeof(*buf), 1);
450 spin_unlock_irqrestore(&p->page_lock, flags);
452 buf = phys_to_virt(buffer);
453 result = xen_physdisk_probe(p, buf);
455 unlock_buffer(p, buffer, sizeof(*buf), 1);
457 out:
458 make_response(p, blk_ring->ring[index].req.id,
459 XEN_BLOCK_PHYSDEV_PROBE, result);
460 }
462 static void dispatch_grant_physdev(struct task_struct *p, int index)
463 {
464 blk_ring_t *blk_ring = p->blk_ring_base;
465 unsigned long flags, buffer;
466 xp_disk_t *xpd;
467 int result;
469 if ( p->domain != 0 )
470 {
471 DPRINTK("dispatch_grant_physdev called by dom%d\n", p->domain);
472 result = 1;
473 goto out;
474 }
476 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
478 spin_lock_irqsave(&p->page_lock, flags);
479 if ( !__buffer_is_valid(p, buffer, sizeof(xp_disk_t), 1) )
480 {
481 DPRINTK("Bad buffer in dispatch_grant_physdev\n");
482 spin_unlock_irqrestore(&p->page_lock, flags);
483 result = 1;
484 goto out;
485 }
486 __lock_buffer(buffer, sizeof(xp_disk_t), 1);
487 spin_unlock_irqrestore(&p->page_lock, flags);
489 xpd = phys_to_virt(buffer);
490 result = xen_physdisk_grant(xpd);
492 unlock_buffer(p, buffer, sizeof(xp_disk_t), 1);
494 out:
495 make_response(p, blk_ring->ring[index].req.id,
496 XEN_BLOCK_PHYSDEV_GRANT, result);
497 }
499 static void dispatch_create_segment(struct task_struct *p, int index)
500 {
501 blk_ring_t *blk_ring = p->blk_ring_base;
502 unsigned long flags, buffer;
503 xv_disk_t *xvd;
504 int result;
506 if ( p->domain != 0 )
507 {
508 DPRINTK("dispatch_create_segment called by dom%d\n", p->domain);
509 result = 1;
510 goto out;
511 }
513 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
515 spin_lock_irqsave(&p->page_lock, flags);
516 if ( !__buffer_is_valid(p, buffer, sizeof(xv_disk_t), 1) )
517 {
518 DPRINTK("Bad buffer in dispatch_create_segment\n");
519 spin_unlock_irqrestore(&p->page_lock, flags);
520 result = 1;
521 goto out;
522 }
523 __lock_buffer(buffer, sizeof(xv_disk_t), 1);
524 spin_unlock_irqrestore(&p->page_lock, flags);
526 xvd = phys_to_virt(buffer);
527 result = xen_segment_create(xvd);
529 unlock_buffer(p, buffer, sizeof(xv_disk_t), 1);
531 out:
532 make_response(p, blk_ring->ring[index].req.id,
533 XEN_BLOCK_SEG_CREATE, result);
534 }
536 static void dispatch_delete_segment(struct task_struct *p, int index)
537 {
538 DPRINTK("dispatch_delete_segment: unimplemented\n");
539 }
541 static void dispatch_probe_blk(struct task_struct *p, int index)
542 {
543 extern void ide_probe_devices(xen_disk_info_t *xdi);
544 extern void scsi_probe_devices(xen_disk_info_t *xdi);
546 blk_ring_t *blk_ring = p->blk_ring_base;
547 xen_disk_info_t *xdi;
548 unsigned long flags, buffer;
549 int rc = 0;
551 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
553 spin_lock_irqsave(&p->page_lock, flags);
554 if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
555 {
556 DPRINTK("Bad buffer in dispatch_probe_blk\n");
557 spin_unlock_irqrestore(&p->page_lock, flags);
558 rc = 1;
559 goto out;
560 }
561 __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
562 spin_unlock_irqrestore(&p->page_lock, flags);
564 xdi = phys_to_virt(buffer);
565 ide_probe_devices(xdi);
566 scsi_probe_devices(xdi);
568 unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
570 out:
571 make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, rc);
572 }
574 static void dispatch_probe_seg(struct task_struct *p,
575 int index)
576 {
577 extern void xen_segment_probe(struct task_struct *, xen_disk_info_t *);
579 blk_ring_t *blk_ring = p->blk_ring_base;
580 xen_disk_info_t *xdi;
581 unsigned long flags, buffer;
582 int rc = 0;
584 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
586 spin_lock_irqsave(&p->page_lock, flags);
587 if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
588 {
589 DPRINTK("Bad buffer in dispatch_probe_seg\n");
590 spin_unlock_irqrestore(&p->page_lock, flags);
591 rc = 1;
592 goto out;
593 }
594 __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
595 spin_unlock_irqrestore(&p->page_lock, flags);
597 xdi = phys_to_virt(buffer);
598 xen_segment_probe(p, xdi);
600 unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
602 out:
603 make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, rc);
604 }
606 static void dispatch_probe_seg_all(struct task_struct *p, int index)
607 {
608 extern void xen_segment_probe_all(xen_segment_info_t *);
610 blk_ring_t *blk_ring = p->blk_ring_base;
611 xen_segment_info_t *xsi;
612 unsigned long flags, buffer;
613 int rc = 0;
615 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
617 spin_lock_irqsave(&p->page_lock, flags);
618 if ( !__buffer_is_valid(p, buffer, sizeof(xen_segment_info_t), 1) )
619 {
620 DPRINTK("Bad buffer in dispatch_probe_seg_all\n");
621 spin_unlock_irqrestore(&p->page_lock, flags);
622 rc = 1;
623 goto out;
624 }
625 __lock_buffer(buffer, sizeof(xen_segment_info_t), 1);
626 spin_unlock_irqrestore(&p->page_lock, flags);
628 xsi = phys_to_virt(buffer);
629 xen_segment_probe_all(xsi);
631 unlock_buffer(p, buffer, sizeof(xen_segment_info_t), 1);
633 out:
634 make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG_ALL, rc);
635 }
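/*
 * READ/WRITE path: validate each virtual scatter/gather segment, map
 * XENDEV_VIRTUAL devices to physical extents, apply the physical-disk access
 * checks to non-privileged domains, lock the frames, then emit one
 * buffer_head per physical segment with end_block_io_op() as the completion
 * callback.
 */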
637 static void dispatch_rw_block_io(struct task_struct *p, int index)
638 {
639 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
640 blk_ring_t *blk_ring = p->blk_ring_base;
641 blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
642 struct buffer_head *bh;
643 int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
644 unsigned short nr_sects;
645 unsigned long buffer, flags;
646 int i, tot_sects;
647 pending_req_t *pending_req;
649 /* We map virtual scatter/gather segments to physical segments. */
650 int new_segs, nr_psegs = 0;
651 phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
653 spin_lock_irqsave(&p->page_lock, flags);
655 /* Check that number of segments is sane. */
656 if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
657 {
658 DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
659 goto bad_descriptor;
660 }
662 /*
663 * Check each address/size pair is sane, and convert into a
664 * physical device and block offset. Note that if the offset and size
665 * crosses a virtual extent boundary, we may end up with more
666 * physical scatter/gather segments than virtual segments.
667 */
668 for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
669 {
670 buffer = req->buffer_and_sects[i] & ~0x1FF;
671 nr_sects = req->buffer_and_sects[i] & 0x1FF;
673 if ( nr_sects == 0 )
674 {
675 DPRINTK("zero-sized data request\n");
676 goto bad_descriptor;
677 }
679 if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
680 {
681 DPRINTK("invalid buffer\n");
682 goto bad_descriptor;
683 }
685 /* Get the physical device and block index. */
686 if ( (req->device & XENDEV_TYPE_MASK) == XENDEV_VIRTUAL )
687 {
688 new_segs = xen_segment_map_request(
689 &phys_seg[nr_psegs], p, operation,
690 req->device,
691 req->sector_number + tot_sects,
692 buffer, nr_sects);
693 if ( new_segs <= 0 )
694 {
695 DPRINTK("bogus xen_segment_map_request\n");
696 goto bad_descriptor;
697 }
698 }
699 else
700 {
701 phys_seg[nr_psegs].dev = req->device;
702 phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
703 phys_seg[nr_psegs].buffer = buffer;
704 phys_seg[nr_psegs].nr_sects = nr_sects;
705 if (p->domain != 0 &&
706 !xen_physdisk_access_okay(&phys_seg[nr_psegs], p, operation)) {
707 DPRINTK("access denied: dev=%04x off=%ld nr=%ld\n",
708 req->device, req->sector_number + tot_sects, nr_sects);
709 goto bad_descriptor;
710 }
711 phys_seg[nr_psegs].dev = xendev_to_physdev(req->device);
712 if ( phys_seg[nr_psegs].dev == 0 )
713 {
714 DPRINTK("bad device: %04x\n", req_device);
715 goto bad_descriptor;
716 }
717 new_segs = 1;
718 }
720 nr_psegs += new_segs;
721 if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
722 }
724 /* Lock pages associated with each buffer head. */
725 for ( i = 0; i < nr_psegs; i++ )
726 __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
727 (operation==READ));
728 spin_unlock_irqrestore(&p->page_lock, flags);
730 atomic_inc(&nr_pending);
731 pending_req = pending_reqs + pending_ring[pending_cons];
732 PENDREQ_IDX_INC(pending_cons);
733 pending_req->domain = p;
734 pending_req->id = req->id;
735 pending_req->operation = operation;
736 pending_req->status = 0;
737 atomic_set(&pending_req->pendcnt, nr_psegs);
739 get_task_struct(p);
741 /* Now we pass each segment down to the real blkdev layer. */
742 for ( i = 0; i < nr_psegs; i++ )
743 {
744 bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
745 if ( bh == NULL ) panic("bh is null\n");
746 memset(bh, 0, sizeof(struct buffer_head));
748 bh->b_size = phys_seg[i].nr_sects << 9;
749 bh->b_dev = phys_seg[i].dev;
750 bh->b_rsector = phys_seg[i].sector_number;
751 bh->b_data = phys_to_virt(phys_seg[i].buffer);
752 bh->b_end_io = end_block_io_op;
753 bh->pending_req = pending_req;
755 if ( operation == WRITE )
756 {
757 bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
758 (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
759 }
760 else
761 {
762 bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
763 }
765 /* Dispatch a single request. We'll flush it to disc later. */
766 ll_rw_block(operation, 1, &bh);
767 }
769 return;
771 bad_descriptor:
772 spin_unlock_irqrestore(&p->page_lock, flags);
773 make_response(p, req->id, req->operation, 1);
774 }
778 /******************************************************************
779 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
780 */
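/*
 * Translates a XENDEV_IDE/XENDEV_SCSI device handle into the corresponding
 * kdev_t; returns 0 for out-of-range, virtual or unknown devices.
 */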
782 kdev_t xendev_to_physdev(unsigned short xendev)
783 {
784 switch ( (xendev & XENDEV_TYPE_MASK) )
785 {
786 case XENDEV_IDE:
787 xendev &= XENDEV_IDX_MASK;
788 if ( xendev >= NR_IDE_DEVS )
789 {
790 DPRINTK("IDE device number out of range %d\n", xendev);
791 goto fail;
792 }
793 return ide_devs[xendev];
795 case XENDEV_SCSI:
796 xendev &= XENDEV_IDX_MASK;
797 if ( xendev >= NR_SCSI_DEVS )
798 {
799 DPRINTK("SCSI device number out of range %d\n", xendev);
800 goto fail;
801 }
802 return scsi_devs[xendev];
804 case XENDEV_VIRTUAL:
805 default:
806 DPRINTK("xendev_to_physdev: unknown device %d\n", xendev);
807 }
809 fail:
810 return (kdev_t)0;
811 }
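/*
 * Places a response on the domain's shared ring under blk_ring_lock, advances
 * the response producer, and notifies the domain via _EVENT_BLK_RESP.
 */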
813 static void make_response(struct task_struct *p, unsigned long id,
814 unsigned short op, unsigned long st)
815 {
816 unsigned long cpu_mask, flags;
817 int position;
818 blk_ring_t *blk_ring;
820 /* Place on the response ring for the relevant domain. */
821 spin_lock_irqsave(&p->blk_ring_lock, flags);
822 blk_ring = p->blk_ring_base;
823 position = p->blk_resp_prod;
824 blk_ring->ring[position].resp.id = id;
825 blk_ring->ring[position].resp.operation = op;
826 blk_ring->ring[position].resp.status = st;
827 p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
828 spin_unlock_irqrestore(&p->blk_ring_lock, flags);
830 /* Kick the relevant domain. */
831 cpu_mask = mark_guest_event(p, _EVENT_BLK_RESP);
832 guest_event_notify(cpu_mask);
833 }
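/* 'b' key handler: dump the pending-request counters and per-domain ring state. */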
835 static void dump_blockq(u_char key, void *dev_id, struct pt_regs *regs)
836 {
837 struct task_struct *p;
838 blk_ring_t *blk_ring ;
840 printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
841 atomic_read(&nr_pending), pending_prod, pending_cons);
843 p = current->next_task;
844 do
845 {
846 if ( !is_idle_task(p) )
847 {
848 printk("Domain: %d\n", p->domain);
849 blk_ring = p->blk_ring_base;
851 printk(" req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
852 blk_ring->req_prod, p->blk_req_cons,
853 blk_ring->resp_prod, p->blk_resp_prod,
854 __on_blkdev_list(p));
855 }
856 p = p->next_task;
857 } while (p != current);
858 }
860 /* Start-of-day initialisation for a new domain. */
861 void init_blkdev_info(struct task_struct *p)
862 {
863 if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
864 p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
865 clear_page(p->blk_ring_base);
866 SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
867 p->blkdev_list.next = NULL;
869 memset(p->segment_list, 0, sizeof(p->segment_list));
871 /* Get any previously created segments. */
872 xen_refresh_segment_list(p);
873 }
875 /* End-of-day teardown for a domain. */
876 void destroy_blkdev_info(struct task_struct *p)
877 {
878 ASSERT(!__on_blkdev_list(p));
879 UNSHARE_PFN(virt_to_page(p->blk_ring_base));
880 free_page((unsigned long)p->blk_ring_base);
881 }
883 void unlink_blkdev_info(struct task_struct *p)
884 {
885 unsigned long flags;
887 spin_lock_irqsave(&io_schedule_list_lock, flags);
888 if ( __on_blkdev_list(p) )
889 {
890 list_del(&p->blkdev_list);
891 p->blkdev_list.next = (void *)0xdeadbeef; /* prevent reinsertion */
892 put_task_struct(p);
893 }
894 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
895 }
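/*
 * One-off initialisation: reset the pending-request pool, create the
 * buffer_head slab cache, initialise the virtual-segment code and register
 * the 'b' debug key.
 */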
897 void initialize_block_io(void)
898 {
899 int i;
901 atomic_set(&nr_pending, 0);
902 pending_prod = pending_cons = 0;
903 memset(pending_reqs, 0, sizeof(pending_reqs));
904 for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
906 spin_lock_init(&io_schedule_list_lock);
907 INIT_LIST_HEAD(&io_schedule_list);
909 buffer_head_cachep = kmem_cache_create(
910 "buffer_head_cache", sizeof(struct buffer_head),
911 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
913 xen_segment_initialize();
915 add_key_handler('b', dump_blockq, "dump xen blkdev statistics");
916 }