debuggers.hg

view tools/blktap2/drivers/block-cache.c @ 22848:6341fe0f4e5a

Added tag 4.1.0-rc2 for changeset 9dca60d88c63
author Keir Fraser <keir@xen.org>
date Tue Jan 25 14:06:55 2011 +0000 (2011-01-25)
parents 1c627434605e
children
line source
1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <sys/mman.h>
34 #include "tapdisk.h"
35 #include "tapdisk-utils.h"
36 #include "tapdisk-driver.h"
37 #include "tapdisk-server.h"
38 #include "tapdisk-interface.h"
40 #ifdef DEBUG
41 #define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
42 #else
43 #define DBG(_f, _a...) ((void)0)
44 #endif
46 #define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
48 #define RADIX_TREE_PAGE_SHIFT 12 /* 4K pages */
49 #define RADIX_TREE_PAGE_SIZE (1 << RADIX_TREE_PAGE_SHIFT)
51 #define RADIX_TREE_NODE_SHIFT 9 /* 512B nodes */
52 #define RADIX_TREE_NODE_SIZE (1 << RADIX_TREE_NODE_SHIFT)
53 #define RADIX_TREE_NODE_MASK (RADIX_TREE_NODE_SIZE - 1)
55 #define BLOCK_CACHE_NODES_PER_PAGE (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
57 #define BLOCK_CACHE_MAX_SIZE (10 << 20) /* 100MB cache */
58 #define BLOCK_CACHE_REQUESTS (TAPDISK_DATA_REQUESTS << 3)
59 #define BLOCK_CACHE_PAGE_IDLETIME 60
61 typedef struct radix_tree radix_tree_t;
62 typedef struct radix_tree_node radix_tree_node_t;
63 typedef struct radix_tree_link radix_tree_link_t;
64 typedef struct radix_tree_leaf radix_tree_leaf_t;
65 typedef struct radix_tree_page radix_tree_page_t;
67 typedef struct block_cache block_cache_t;
68 typedef struct block_cache_request block_cache_request_t;
69 typedef struct block_cache_stats block_cache_stats_t;
71 struct radix_tree_page {
72 char *buf;
73 size_t size;
74 uint64_t sec;
75 radix_tree_link_t *owners[BLOCK_CACHE_NODES_PER_PAGE];
76 };
78 struct radix_tree_leaf {
79 radix_tree_page_t *page;
80 char *buf;
81 };
83 struct radix_tree_link {
84 uint32_t time;
85 union {
86 radix_tree_node_t *next;
87 radix_tree_leaf_t leaf;
88 } u;
89 };
91 struct radix_tree_node {
92 int height;
93 radix_tree_link_t links[RADIX_TREE_NODE_SIZE];
94 };
96 struct radix_tree {
97 int height;
98 uint64_t size;
99 uint32_t nodes;
100 radix_tree_node_t *root;
102 block_cache_t *cache;
103 };
105 struct block_cache_request {
106 int err;
107 char *buf;
108 uint64_t secs;
109 td_request_t treq;
110 block_cache_t *cache;
111 };
113 struct block_cache_stats {
114 uint64_t reads;
115 uint64_t hits;
116 uint64_t misses;
117 uint64_t prunes;
118 };
120 struct block_cache {
121 int ptype;
122 char *name;
124 uint64_t sectors;
126 block_cache_request_t requests[BLOCK_CACHE_REQUESTS];
127 block_cache_request_t *request_free_list[BLOCK_CACHE_REQUESTS];
128 int requests_free;
130 event_id_t timeout_id;
132 radix_tree_t tree;
134 block_cache_stats_t stats;
135 };
137 static inline uint64_t
138 radix_tree_calculate_size(int height)
139 {
140 return (uint64_t)RADIX_TREE_NODE_SIZE <<
141 (height * RADIX_TREE_NODE_SHIFT);
142 }
144 static inline int
145 radix_tree_calculate_height(uint64_t sectors)
146 {
147 int height;
148 uint64_t tree_size;
150 height = 1; /* always allocate root node */
151 tree_size = radix_tree_calculate_size(height);
152 while (sectors > tree_size)
153 tree_size = radix_tree_calculate_size(++height);
155 return height;
156 }
158 static inline int
159 radix_tree_index(radix_tree_node_t *node, uint64_t sector)
160 {
161 return ((sector >> (node->height * RADIX_TREE_NODE_SHIFT)) &
162 RADIX_TREE_NODE_MASK);
163 }
165 static inline int
166 radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
167 {
168 return (node->height == 0);
169 }
171 static inline int
172 radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
173 {
174 return (node->height == tree->height);
175 }
177 static inline uint64_t
178 radix_tree_size(radix_tree_t *tree)
179 {
180 return tree->size + tree->nodes * sizeof(radix_tree_node_t);
181 }
183 static inline void
184 radix_tree_clear_link(radix_tree_link_t *link)
185 {
186 if (link)
187 memset(link, 0, sizeof(radix_tree_link_t));
188 }
190 static inline radix_tree_node_t *
191 radix_tree_allocate_node(radix_tree_t *tree, int height)
192 {
193 radix_tree_node_t *node;
195 node = calloc(1, sizeof(radix_tree_node_t));
196 if (!node)
197 return NULL;
199 node->height = height;
200 tree->nodes++;
202 return node;
203 }
205 static inline radix_tree_node_t *
206 radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
207 {
208 return radix_tree_allocate_node(tree, parent->height - 1);
209 }
211 void
212 radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
213 {
214 if (!node)
215 return;
217 free(node);
218 tree->nodes--;
219 }
221 static inline radix_tree_page_t *
222 radix_tree_allocate_page(radix_tree_t *tree,
223 char *buf, uint64_t sec, size_t size)
224 {
225 radix_tree_page_t *page;
227 page = calloc(1, sizeof(radix_tree_page_t));
228 if (!page)
229 return NULL;
231 page->buf = buf;
232 page->sec = sec;
233 page->size = size;
234 tree->size += size;
236 return page;
237 }
239 static inline void
240 radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
241 {
242 int i;
244 for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++)
245 DBG("%s: ejecting sector 0x%llx\n",
246 tree->cache->name, page->sec + i);
248 tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT);
249 tree->size -= page->size;
250 free(page->buf);
251 free(page);
252 }
254 /*
255 * remove a leaf and the shared radix_tree_page_t containing its buffer.
256 * leaves are deleted, nodes are not; gc will reap the nodes later.
257 */
258 static void
259 radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
260 {
261 int i;
263 if (!page)
264 return;
266 for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++)
267 radix_tree_clear_link(page->owners[i]);
269 radix_tree_free_page(tree, page);
270 }
272 static void
273 radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
274 radix_tree_page_t *page, off_t off)
275 {
276 int i;
278 if (off + RADIX_TREE_NODE_SIZE > page->size)
279 return;
281 for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) {
282 if (page->owners[i])
283 continue;
285 page->owners[i] = link;
286 link->u.leaf.page = page;
287 link->u.leaf.buf = page->buf + off;
289 break;
290 }
291 }
293 static char *
294 radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
295 {
296 int idx;
297 struct timeval now;
298 radix_tree_link_t *link;
299 radix_tree_node_t *node;
301 node = tree->root;
302 gettimeofday(&now, NULL);
304 do {
305 idx = radix_tree_index(node, sector);
306 link = node->links + idx;
307 link->time = now.tv_sec;
309 if (radix_tree_node_contains_leaves(tree, node))
310 return link->u.leaf.buf;
312 if (!link->u.next)
313 return NULL;
315 node = link->u.next;
316 } while (1);
317 }
319 static char *
320 radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
321 radix_tree_page_t *page, off_t off)
322 {
323 int idx;
324 struct timeval now;
325 radix_tree_link_t *link;
326 radix_tree_node_t *node;
328 node = tree->root;
329 gettimeofday(&now, NULL);
331 do {
332 idx = radix_tree_index(node, sector);
333 link = node->links + idx;
334 link->time = now.tv_sec;
336 if (radix_tree_node_contains_leaves(tree, node)) {
337 radix_tree_remove_page(tree, link->u.leaf.page);
338 radix_tree_insert_leaf(tree, link, page, off);
339 return link->u.leaf.buf;
340 }
342 if (!link->u.next) {
343 link->u.next = radix_tree_allocate_child_node(tree,
344 node);
345 if (!link->u.next)
346 return NULL;
347 }
349 node = link->u.next;
350 } while (1);
351 }
353 static int
354 radix_tree_add_leaves(radix_tree_t *tree, char *buf,
355 uint64_t sector, uint64_t sectors)
356 {
357 int i;
358 radix_tree_page_t *page;
360 page = radix_tree_allocate_page(tree, buf, sector,
361 sectors << RADIX_TREE_NODE_SHIFT);
362 if (!page)
363 return -ENOMEM;
365 for (i = 0; i < sectors; i++)
366 if (!radix_tree_add_leaf(tree, sector + i,
367 page, (i << RADIX_TREE_NODE_SHIFT)))
368 goto fail;
370 return 0;
372 fail:
373 page->buf = NULL;
374 radix_tree_remove_page(tree, page);
375 return -ENOMEM;
376 }
378 static void
379 radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
380 {
381 int i;
382 radix_tree_link_t *link;
384 if (!node)
385 return;
387 for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
388 link = node->links + i;
390 if (radix_tree_node_contains_leaves(tree, node))
391 radix_tree_remove_page(tree, link->u.leaf.page);
392 else
393 radix_tree_delete_branch(tree, link->u.next);
395 radix_tree_clear_link(link);
396 }
398 radix_tree_free_node(tree, node);
399 }
401 static inline void
402 radix_tree_destroy(radix_tree_t *tree)
403 {
404 radix_tree_delete_branch(tree, tree->root);
405 tree->root = NULL;
406 }
408 /*
409 * returns 1 if @node is empty after pruning, 0 otherwise
410 */
411 static int
412 radix_tree_prune_branch(radix_tree_t *tree,
413 radix_tree_node_t *node, uint32_t now)
414 {
415 int i, empty;
416 radix_tree_link_t *link;
418 empty = 1;
419 if (!node)
420 return empty;
422 for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
423 link = node->links + i;
425 if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
426 if (radix_tree_node_contains_leaves(tree, node)) {
427 empty = 0;
428 continue;
429 }
431 if (radix_tree_prune_branch(tree, link->u.next, now))
432 radix_tree_clear_link(link);
433 else
434 empty = 0;
436 continue;
437 }
439 if (radix_tree_node_contains_leaves(tree, node))
440 radix_tree_remove_page(tree, link->u.leaf.page);
441 else
442 radix_tree_delete_branch(tree, link->u.next);
444 radix_tree_clear_link(link);
445 }
447 if (empty && !radix_tree_node_is_root(tree, node))
448 radix_tree_free_node(tree, node);
450 return empty;
451 }
453 /*
454 * walk tree and free any node that has been idle for too long
455 */
456 static void
457 radix_tree_prune(radix_tree_t *tree)
458 {
459 struct timeval now;
461 if (!tree->root)
462 return;
464 DPRINTF("tree %s has %"PRIu64" bytes\n",
465 tree->cache->name, tree->size);
467 gettimeofday(&now, NULL);
468 radix_tree_prune_branch(tree, tree->root, now.tv_sec);
470 DPRINTF("tree %s now has %"PRIu64" bytes\n",
471 tree->cache->name, tree->size);
472 }
474 static inline int
475 radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
476 {
477 tree->height = radix_tree_calculate_height(sectors);
478 tree->root = radix_tree_allocate_node(tree, tree->height);
479 if (!tree->root)
480 return -ENOMEM;
482 return 0;
483 }
485 static inline void
486 radix_tree_free(radix_tree_t *tree)
487 {
488 radix_tree_destroy(tree);
489 }
491 static void
492 block_cache_prune_event(event_id_t id, char mode, void *private)
493 {
494 radix_tree_t *tree;
495 block_cache_t *cache;
497 cache = (block_cache_t *)private;
498 tree = &cache->tree;
500 radix_tree_prune(tree);
501 }
503 static inline block_cache_request_t *
504 block_cache_get_request(block_cache_t *cache)
505 {
506 if (!cache->requests_free)
507 return NULL;
509 return cache->request_free_list[--cache->requests_free];
510 }
512 static inline void
513 block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
514 {
515 memset(breq, 0, sizeof(block_cache_request_t));
516 cache->request_free_list[cache->requests_free++] = breq;
517 }
519 static int
520 block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
521 {
522 int i, err;
523 radix_tree_t *tree;
524 block_cache_t *cache;
526 if (!td_flag_test(flags, TD_OPEN_RDONLY))
527 return -EINVAL;
529 if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
530 return -EINVAL;
532 cache = (block_cache_t *)driver->data;
533 err = tapdisk_namedup(&cache->name, (char *)name);
534 if (err)
535 return -ENOMEM;
537 cache->sectors = driver->info.size;
539 tree = &cache->tree;
540 err = radix_tree_initialize(tree, cache->sectors);
541 if (err)
542 goto fail;
544 tree->cache = cache;
545 cache->requests_free = BLOCK_CACHE_REQUESTS;
546 for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
547 cache->request_free_list[i] = cache->requests + i;
549 cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
550 -1, /* dummy fd */
551 BLOCK_CACHE_PAGE_IDLETIME << 1,
552 block_cache_prune_event,
553 cache);
554 if (cache->timeout_id < 0)
555 goto fail;
557 DPRINTF("opening cache for %s, sectors: %"PRIu64", "
558 "tree: %p, height: %d\n",
559 cache->name, cache->sectors, tree, tree->height);
561 if (mlockall(MCL_CURRENT | MCL_FUTURE))
562 DPRINTF("mlockall failed: %d\n", -errno);
564 return 0;
566 fail:
567 free(cache->name);
568 radix_tree_free(&cache->tree);
569 return err;
570 }
572 static int
573 block_cache_close(td_driver_t *driver)
574 {
575 radix_tree_t *tree;
576 block_cache_t *cache;
578 cache = (block_cache_t *)driver->data;
579 tree = &cache->tree;
581 DPRINTF("closing cache for %s\n", cache->name);
583 tapdisk_server_unregister_event(cache->timeout_id);
584 radix_tree_free(tree);
585 free(cache->name);
587 return 0;
588 }
590 static inline uint64_t
591 block_cache_hash(block_cache_t *cache, char *buf)
592 {
593 int i, n;
594 uint64_t cksm, *data;
596 return 0;
598 cksm = 0;
599 data = (uint64_t *)buf;
600 n = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
602 for (i = 0; i < n; i++)
603 cksm += data[i];
605 return ~cksm;
606 }
608 static void
609 block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
610 {
611 int i;
612 off_t off;
614 cache->stats.hits += treq.secs;
616 for (i = 0; i < treq.secs; i++) {
617 DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
618 cache->name, treq.sec + i, block_cache_hash(cache, iov[i]));
620 off = i << RADIX_TREE_NODE_SHIFT;
621 memcpy(treq.buf + off, iov[i], RADIX_TREE_NODE_SIZE);
622 }
624 td_complete_request(treq, 0);
625 }
627 static void
628 block_cache_populate_cache(td_request_t clone, int err)
629 {
630 int i;
631 radix_tree_t *tree;
632 block_cache_t *cache;
633 block_cache_request_t *breq;
635 breq = (block_cache_request_t *)clone.cb_data;
636 cache = breq->cache;
637 tree = &cache->tree;
638 breq->secs -= clone.secs;
639 breq->err = (breq->err ? breq->err : err);
641 if (breq->secs)
642 return;
644 if (breq->err) {
645 free(breq->buf);
646 goto out;
647 }
649 for (i = 0; i < breq->treq.secs; i++) {
650 off_t off = i << RADIX_TREE_NODE_SHIFT;
651 DBG("%s: populating sec 0x%08llx\n",
652 cache->name, breq->treq.sec + i);
653 memcpy(breq->treq.buf + off,
654 breq->buf + off, RADIX_TREE_NODE_SIZE);
655 }
657 if (radix_tree_add_leaves(tree, breq->buf,
658 breq->treq.sec, breq->treq.secs))
659 free(breq->buf);
661 out:
662 td_complete_request(breq->treq, breq->err);
663 block_cache_put_request(cache, breq);
664 }
666 static void
667 block_cache_miss(block_cache_t *cache, td_request_t treq)
668 {
669 char *buf;
670 size_t size;
671 td_request_t clone;
672 radix_tree_t *tree;
673 block_cache_request_t *breq;
675 DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
677 clone = treq;
678 tree = &cache->tree;
679 size = treq.secs << RADIX_TREE_NODE_SHIFT;
681 cache->stats.misses += treq.secs;
683 if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE)
684 goto out;
686 breq = block_cache_get_request(cache);
687 if (!breq)
688 goto out;
690 if (posix_memalign((void **)&buf, RADIX_TREE_NODE_SIZE, size)) {
691 block_cache_put_request(cache, breq);
692 goto out;
693 }
695 breq->treq = treq;
696 breq->secs = treq.secs;
697 breq->err = 0;
698 breq->buf = buf;
699 breq->cache = cache;
701 clone.buf = buf;
702 clone.cb = block_cache_populate_cache;
703 clone.cb_data = breq;
705 out:
706 td_forward_request(clone);
707 }
709 static void
710 block_cache_queue_read(td_driver_t *driver, td_request_t treq)
711 {
712 int i;
713 radix_tree_t *tree;
714 block_cache_t *cache;
715 char *iov[BLOCK_CACHE_NODES_PER_PAGE];
717 cache = (block_cache_t *)driver->data;
718 tree = &cache->tree;
720 cache->stats.reads += treq.secs;
722 if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE)
723 return td_forward_request(treq);
725 for (i = 0; i < treq.secs; i++) {
726 iov[i] = radix_tree_find_leaf(tree, treq.sec + i);
727 if (!iov[i])
728 return block_cache_miss(cache, treq);
729 }
731 return block_cache_hit(cache, treq, iov);
732 }
734 static void
735 block_cache_queue_write(td_driver_t *driver, td_request_t treq)
736 {
737 td_complete_request(treq, -EPERM);
738 }
740 static int
741 block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
742 {
743 return -EINVAL;
744 }
746 static int
747 block_cache_validate_parent(td_driver_t *driver,
748 td_driver_t *pdriver, td_flag_t flags)
749 {
750 block_cache_t *cache;
752 if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
753 return -EINVAL;
755 cache = (block_cache_t *)driver->data;
756 if (strcmp(driver->name, pdriver->name))
757 return -EINVAL;
759 return 0;
760 }
762 static void
763 block_cache_debug(td_driver_t *driver)
764 {
765 block_cache_t *cache;
766 block_cache_stats_t *stats;
768 cache = (block_cache_t *)driver->data;
769 stats = &cache->stats;
771 WARN("BLOCK CACHE %s\n", cache->name);
772 WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
773 stats->reads, stats->hits, stats->misses, stats->prunes);
774 }
776 struct tap_disk tapdisk_block_cache = {
777 .disk_type = "tapdisk_block_cache",
778 .flags = 0,
779 .private_data_size = sizeof(block_cache_t),
780 .td_open = block_cache_open,
781 .td_close = block_cache_close,
782 .td_queue_read = block_cache_queue_read,
783 .td_queue_write = block_cache_queue_write,
784 .td_get_parent_id = block_cache_get_parent_id,
785 .td_validate_parent = block_cache_validate_parent,
786 .td_debug = block_cache_debug,
787 };