debuggers.hg

view xen/common/tmem.c @ 22571:01f3b3509023

tmem: two wrongs (or three lefts and a wrong) make a right

These two bugs apparently complement each other enough that
they escaped problems in my testing, but eventually gum
up the works and are obviously horribly wrong.

Found while developing tmem for native Linux.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir@xen.org>
date Wed Dec 15 10:27:18 2010 +0000 (2010-12-15)
parents 0353037c6b95
children
line source
1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
9 /* TODO list: 090129 (updated 100318)
10 - any better reclamation policy?
11 - use different tlsf pools for each client (maybe each pool)
12 - test shared access more completely (ocfs2)
13 - add feedback-driven compression (not for persistent pools though!)
14 - add data-structure total bytes overhead stats
15 */
17 #ifdef __XEN__
18 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
19 #endif
21 #include <xen/tmem.h>
22 #include <xen/rbtree.h>
23 #include <xen/radix-tree.h>
24 #include <xen/list.h>
26 #define EXPORT /* indicates code other modules are dependent upon */
27 #define FORWARD
29 #define TMEM_SPEC_VERSION 1
31 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
33 #define CLI_ID_NULL TMH_CLI_ID_NULL
34 #define cli_id_str tmh_cli_id_str
35 #define client_str tmh_client_str
37 /************ DEBUG and STATISTICS (+ some compression testing) *******/
39 #ifndef NDEBUG
40 #define SENTINELS
41 #define NOINLINE noinline
42 #else
43 #define NOINLINE
44 #endif
46 #ifdef SENTINELS
47 #define DECL_SENTINEL unsigned long sentinel;
48 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
49 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
50 #define ASSERT_SENTINEL(_x,_y) \
51 ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
52 #ifdef __i386__
53 #define POOL_SENTINEL 0x87658765
54 #define OBJ_SENTINEL 0x12345678
55 #define OBJNODE_SENTINEL 0xfedcba09
56 #define PGD_SENTINEL 0x43214321
57 #else
58 #define POOL_SENTINEL 0x8765876587658765
59 #define OBJ_SENTINEL 0x1234567812345678
60 #define OBJNODE_SENTINEL 0xfedcba0987654321
61 #define PGD_SENTINEL 0x4321432143214321
62 #endif
63 #else
64 #define DECL_SENTINEL
65 #define SET_SENTINEL(_x,_y) do { } while (0)
66 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
67 #define INVERT_SENTINEL(_x,_y) do { } while (0)
68 #endif
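The sentinel machinery above is the debug build's use-after-free tripwire: every tracked structure carries a magic word that SET_SENTINEL installs at allocation and INVERT_SENTINEL flips at free, so a stale pointer fails ASSERT_SENTINEL deterministically instead of corrupting memory silently. A standalone sketch of the pattern (simplified type and illustrative flow, not part of the original file):

    #include <stdio.h>

    #define POOL_SENTINEL 0x87658765UL

    struct pool_sketch {
        unsigned long sentinel;
        /* ... payload ... */
    };

    static void pool_ctor(struct pool_sketch *p) { p->sentinel = POOL_SENTINEL; }
    static void pool_dtor(struct pool_sketch *p) { p->sentinel = ~POOL_SENTINEL; }

    int main(void)
    {
        struct pool_sketch p;
        pool_ctor(&p);
        printf("live check: %s\n",
               p.sentinel == POOL_SENTINEL ? "ok" : "CORRUPT");
        pool_dtor(&p);
        /* a dangling user would now trip the inverted-sentinel check */
        printf("stale check: %s\n",
               p.sentinel == POOL_SENTINEL ? "ok" : "stale/freed detected");
        return 0;
    }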
70 /* global statistics (none need to be locked) */
71 static unsigned long total_tmem_ops = 0;
72 static unsigned long errored_tmem_ops = 0;
73 static unsigned long total_flush_pool = 0;
74 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
75 static unsigned long evicted_pgs = 0, evict_attempts = 0;
76 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
77 static unsigned long max_evicts_per_relinq = 0;
78 static unsigned long low_on_memory = 0;
79 static unsigned long deduped_puts = 0;
80 static unsigned long tot_good_eph_puts = 0;
81 static int global_obj_count_max = 0;
82 static int global_pgp_count_max = 0;
83 static int global_pcd_count_max = 0;
84 static int global_page_count_max = 0;
85 static int global_rtree_node_count_max = 0;
86 static long global_eph_count_max = 0;
87 static unsigned long failed_copies;
88 static unsigned long pcd_tot_tze_size = 0;
89 static unsigned long pcd_tot_csize = 0;
91 DECL_CYC_COUNTER(succ_get);
92 DECL_CYC_COUNTER(succ_put);
93 DECL_CYC_COUNTER(non_succ_get);
94 DECL_CYC_COUNTER(non_succ_put);
95 DECL_CYC_COUNTER(flush);
96 DECL_CYC_COUNTER(flush_obj);
97 #ifdef COMPARE_COPY_PAGE_SSE2
98 EXTERN_CYC_COUNTER(pg_copy1);
99 EXTERN_CYC_COUNTER(pg_copy2);
100 EXTERN_CYC_COUNTER(pg_copy3);
101 EXTERN_CYC_COUNTER(pg_copy4);
102 #else
103 EXTERN_CYC_COUNTER(pg_copy);
104 #endif
105 DECL_CYC_COUNTER(compress);
106 DECL_CYC_COUNTER(decompress);
108 /************ CORE DATA STRUCTURES ************************************/
110 #define MAX_POOLS_PER_DOMAIN 16
111 #define MAX_GLOBAL_SHARED_POOLS 16
113 struct tm_pool;
114 struct tmem_page_descriptor;
115 struct tmem_page_content_descriptor;
116 struct client {
117 struct list_head client_list;
118 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
119 tmh_client_t *tmh;
120 struct list_head ephemeral_page_list;
121 long eph_count, eph_count_max;
122 cli_id_t cli_id;
123 uint32_t weight;
124 uint32_t cap;
125 bool_t compress;
126 bool_t frozen;
127 bool_t shared_auth_required;
128 /* for save/restore/migration */
129 bool_t live_migrating;
130 bool_t was_frozen;
131 struct list_head persistent_invalidated_list;
132 struct tmem_page_descriptor *cur_pgp;
133 /* statistics collection */
134 unsigned long compress_poor, compress_nomem;
135 unsigned long compressed_pages;
136 uint64_t compressed_sum_size;
137 uint64_t total_cycles;
138 unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
139 /* shared pool authentication */
140 uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
141 };
142 typedef struct client client_t;
144 struct share_list {
145 struct list_head share_list;
146 client_t *client;
147 };
148 typedef struct share_list sharelist_t;
150 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
151 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
153 struct tm_pool {
154 bool_t shared;
155 bool_t persistent;
156 bool_t is_dying;
157 int pageshift; /* 0 == 2**12 */
158 struct list_head pool_list;
159 client_t *client;
160 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
161 uint32_t pool_id;
162 rwlock_t pool_rwlock;
163 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
164 struct list_head share_list; /* valid if shared */
165 int shared_count; /* valid if shared */
166 /* for save/restore/migration */
167 struct list_head persistent_page_list;
168 struct tmem_page_descriptor *cur_pgp;
169 /* statistics collection */
170 atomic_t pgp_count;
171 int pgp_count_max;
172 long obj_count; /* atomicity depends on pool_rwlock held for write */
173 long obj_count_max;
174 unsigned long objnode_count, objnode_count_max;
175 uint64_t sum_life_cycles;
176 uint64_t sum_evicted_cycles;
177 unsigned long puts, good_puts, no_mem_puts;
178 unsigned long dup_puts_flushed, dup_puts_replaced;
179 unsigned long gets, found_gets;
180 unsigned long flushs, flushs_found;
181 unsigned long flush_objs, flush_objs_found;
182 DECL_SENTINEL
183 };
184 typedef struct tm_pool pool_t;
186 #define is_persistent(_p) (_p->persistent)
187 #define is_ephemeral(_p) (!(_p->persistent))
188 #define is_shared(_p) (_p->shared)
189 #define is_private(_p) (!(_p->shared))
191 struct oid {
192 uint64_t oid[3];
193 };
194 typedef struct oid OID;
196 struct tmem_object_root {
197 DECL_SENTINEL
198 OID oid;
199 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
200 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
201 long pgp_count; /* atomicity depends on obj_spinlock */
202 struct radix_tree_root tree_root; /* tree of pages within object */
203 pool_t *pool;
204 cli_id_t last_client;
205 spinlock_t obj_spinlock;
206 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
207 };
208 typedef struct tmem_object_root obj_t;
210 typedef struct radix_tree_node rtn_t;
211 struct tmem_object_node {
212 obj_t *obj;
213 DECL_SENTINEL
214 rtn_t rtn;
215 };
216 typedef struct tmem_object_node objnode_t;
218 struct tmem_page_descriptor {
219 union {
220 struct list_head global_eph_pages;
221 struct list_head client_inv_pages;
222 };
223 union {
224 struct {
225 union {
226 struct list_head client_eph_pages;
227 struct list_head pool_pers_pages;
228 };
229 obj_t *obj;
230 } us;
231 OID inv_oid; /* used for invalid list only */
232 };
233 pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
234 else compressed data (cdata) */
235 uint32_t index;
236 /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
237 uint16_t firstbyte; /* NOT_SHAREABLE->pfp otherwise->pcd */
238 bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */
239 struct list_head pcd_siblings;
240 union {
241 pfp_t *pfp; /* page frame pointer */
242 char *cdata; /* compressed data */
243 struct tmem_page_content_descriptor *pcd; /* page dedup */
244 };
245 union {
246 uint64_t timestamp;
247 uint32_t pool_id; /* used for invalid list only */
248 };
249 DECL_SENTINEL
250 };
251 typedef struct tmem_page_descriptor pgp_t;
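In pgp_t above, the size field doubles as the tag for the pfp/cdata/pcd union: 0 means a full uncompressed page via pfp, -1 means no valid data, and any value in between is the byte length of compressed data. A minimal tagged-union sketch of that convention (hypothetical names, standalone):

    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_PAGE_SIZE 4096

    struct page_desc_sketch {
        int32_t size;          /* 0 = full page, -1 = invalid, else compressed bytes */
        union {
            void *page;        /* valid when size == 0 */
            char *cdata;       /* valid when 0 < size < SKETCH_PAGE_SIZE */
        } u;
    };

    static const char *describe(const struct page_desc_sketch *d)
    {
        if (d->size == -1) return "invalid";
        if (d->size == 0)  return "full uncompressed page";
        return "compressed data";
    }

    int main(void)
    {
        struct page_desc_sketch d = { .size = -1 };
        printf("%s\n", describe(&d));   /* invalid */
        d.size = 0;                     /* real code would set u.page here */
        printf("%s\n", describe(&d));   /* full page */
        d.size = 1024;                  /* real code would set u.cdata here */
        printf("%s\n", describe(&d));   /* compressed */
        return 0;
    }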
253 #define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
255 struct tmem_page_content_descriptor {
256 union {
257 pfp_t *pfp; /* page frame pointer */
258 char *cdata; /* if compression_enabled */
259 char *tze; /* if !compression_enabled, trailing zeroes eliminated */
260 };
261 struct list_head pgp_list;
262 struct rb_node pcd_rb_tree_node;
263 uint32_t pgp_ref_count;
264 pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
265 * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
266 * else PAGE_SIZE -> *pfp */
267 };
268 typedef struct tmem_page_content_descriptor pcd_t;
269 struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
270 rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
272 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
274 static LIST_HEAD(global_client_list);
275 static LIST_HEAD(global_pool_list);
277 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
278 static bool_t global_shared_auth = 0;
279 static atomic_t client_weight_total = ATOMIC_INIT(0);
280 static int tmem_initialized = 0;
282 /************ CONCURRENCY ***********************************************/
284 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
285 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
286 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
287 static DEFINE_SPINLOCK(pers_lists_spinlock);
289 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
290 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
291 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
292 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
293 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
294 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
295 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
296 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
298 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
299 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
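The wrappers above implement a two-mode concurrency scheme: when tmh_lock_all is set, the single tmem_spinlock serializes everything and all the fine-grained lock macros compile to no-ops; otherwise every list, pool, and object takes its own lock. The same conditional-locking idiom reduced to a standalone pthread sketch (names hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    static int lock_all = 1;               /* boot-time policy switch */
    static pthread_mutex_t big_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t fine_lock = PTHREAD_MUTEX_INITIALIZER;

    /* no-ops in coarse mode, like tmem_spin_lock/unlock above */
    #define cond_lock(l)   do { if (!lock_all) pthread_mutex_lock(l); } while (0)
    #define cond_unlock(l) do { if (!lock_all) pthread_mutex_unlock(l); } while (0)

    int main(void)
    {
        if (lock_all)
            pthread_mutex_lock(&big_lock); /* coarse mode: one global lock */
        cond_lock(&fine_lock);             /* fine mode only */
        puts("critical section");
        cond_unlock(&fine_lock);
        if (lock_all)
            pthread_mutex_unlock(&big_lock);
        return 0;
    }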
301 /* global counters (should use long_atomic_t access) */
302 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
303 static atomic_t global_obj_count = ATOMIC_INIT(0);
304 static atomic_t global_pgp_count = ATOMIC_INIT(0);
305 static atomic_t global_pcd_count = ATOMIC_INIT(0);
306 static atomic_t global_page_count = ATOMIC_INIT(0);
307 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
309 #define atomic_inc_and_max(_c) do { \
310 atomic_inc(&_c); \
311 if ( _atomic_read(_c) > _c##_max ) \
312 _c##_max = _atomic_read(_c); \
313 } while (0)
315 #define atomic_dec_and_assert(_c) do { \
316 atomic_dec(&_c); \
317 ASSERT(_atomic_read(_c) >= 0); \
318 } while (0)
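atomic_inc_and_max keeps a high-water mark beside each counter; the increment and the read-back are separate operations, so in the fine-grained mode the recorded maximum is best-effort, which is fine for the statistics it feeds. The pattern in miniature (plain longs, single-threaded for clarity):

    #include <stdio.h>

    static long count, count_max;

    static void inc_and_max(void)
    {
        if (++count > count_max)   /* non-atomic pair: max is approximate */
            count_max = count;
    }

    int main(void)
    {
        for (int i = 0; i < 5; i++) inc_and_max();
        count -= 3;                              /* some objects freed */
        for (int i = 0; i < 2; i++) inc_and_max();
        printf("count=%ld max=%ld\n", count, count_max);  /* count=4 max=5 */
        return 0;
    }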
321 /************ MEMORY ALLOCATION INTERFACE *****************************/
323 #define tmem_malloc(_type,_pool) \
324 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
326 #define tmem_malloc_bytes(_size,_pool) \
327 _tmem_malloc(_size, 1, _pool)
329 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
330 {
331 void *v;
333 if ( (pool != NULL) && is_persistent(pool) )
334 v = tmh_alloc_subpage_thispool(pool,size,align);
335 else
336 v = tmh_alloc_subpage(pool, size, align);
337 if ( v == NULL )
338 alloc_failed++;
339 return v;
340 }
342 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
343 {
344 if ( pool == NULL || !is_persistent(pool) )
345 tmh_free_subpage(p,size);
346 else
347 tmh_free_subpage_thispool(pool,p,size);
348 }
350 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
351 {
352 pfp_t *pfp = NULL;
354 if ( pool != NULL && is_persistent(pool) )
355 pfp = tmh_alloc_page_thispool(pool);
356 else
357 pfp = tmh_alloc_page(pool,0);
358 if ( pfp == NULL )
359 alloc_page_failed++;
360 else
361 atomic_inc_and_max(global_page_count);
362 return pfp;
363 }
365 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
366 {
367 ASSERT(pfp);
368 if ( pool == NULL || !is_persistent(pool) )
369 tmh_free_page(pfp);
370 else
371 tmh_free_page_thispool(pool,pfp);
372 atomic_dec_and_assert(global_page_count);
373 }
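tmem_malloc/tmem_free and tmem_page_alloc/tmem_page_free above route persistent-pool memory through per-pool allocators and everything else through the common heap; the free side must repeat the same persistent/ephemeral test as the allocation side, or memory would be handed back to the wrong allocator. A toy version of the routing (the allocator stand-ins are hypothetical):

    #include <stdlib.h>

    struct pool { int persistent; /* ... */ };

    /* stand-ins for tmh_alloc_subpage_thispool()/tmh_alloc_subpage() etc. */
    static void *alloc_from_pool(struct pool *p, size_t n) { (void)p; return malloc(n); }
    static void *alloc_from_heap(size_t n)                 { return malloc(n); }
    static void free_to_pool(struct pool *p, void *v)      { (void)p; free(v); }
    static void free_to_heap(void *v)                      { free(v); }

    static void *route_alloc(struct pool *p, size_t n)
    {
        return (p != NULL && p->persistent) ? alloc_from_pool(p, n)
                                            : alloc_from_heap(n);
    }

    static void route_free(struct pool *p, void *v)
    {
        if (p == NULL || !p->persistent)   /* must mirror route_alloc exactly */
            free_to_heap(v);
        else
            free_to_pool(p, v);
    }

    int main(void)
    {
        struct pool pers = { 1 };
        void *a = route_alloc(&pers, 64);
        void *b = route_alloc(NULL, 64);
        route_free(&pers, a);
        route_free(NULL, b);
        return 0;
    }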
375 /************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
377 #define NOT_SHAREABLE ((uint16_t)-1UL)
379 static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp)
380 {
381 uint8_t firstbyte = pgp->firstbyte;
382 pcd_t *pcd;
383 int ret;
385 ASSERT(tmh_dedup_enabled());
386 tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
387 pcd = pgp->pcd;
388 if ( pgp->size < PAGE_SIZE && pgp->size != 0 &&
389 pcd->size < PAGE_SIZE && pcd->size != 0 )
390 ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL);
391 else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE )
392 ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size);
393 else
394 ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL);
395 tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
396 return ret;
397 }
399 /* ensure pgp no longer points to pcd, nor vice-versa */
400 /* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
401 static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
402 {
403 pcd_t *pcd = pgp->pcd;
404 pfp_t *pfp = pgp->pcd->pfp;
405 uint16_t firstbyte = pgp->firstbyte;
406 char *pcd_tze = pgp->pcd->tze;
407 pagesize_t pcd_size = pcd->size;
408 pagesize_t pgp_size = pgp->size;
409 char *pcd_cdata = pgp->pcd->cdata;
410 pagesize_t pcd_csize = pgp->pcd->size;
412 ASSERT(tmh_dedup_enabled());
413 ASSERT(firstbyte != NOT_SHAREABLE);
414 ASSERT(firstbyte < 256);
416 if ( have_pcd_rwlock )
417 ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
418 else
419 tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
420 list_del_init(&pgp->pcd_siblings);
421 pgp->pcd = NULL;
422 pgp->firstbyte = NOT_SHAREABLE;
423 pgp->size = -1;
424 if ( --pcd->pgp_ref_count )
425 {
426 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
427 return;
428 }
430 /* no more references to this pcd, recycle it and the physical page */
431 ASSERT(list_empty(&pcd->pgp_list));
432 pcd->pfp = NULL;
433 /* remove pcd from rbtree */
434 rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
435 /* reinit the struct for safety for now */
436 RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
437 /* now free up the pcd memory */
438 tmem_free(pcd,sizeof(pcd_t),NULL);
439 atomic_dec_and_assert(global_pcd_count);
440 if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
441 {
442 /* compressed data */
443 tmem_free(pcd_cdata,pcd_csize,pool);
444 pcd_tot_csize -= pcd_csize;
445 }
446 else if ( pcd_size != PAGE_SIZE )
447 {
448 /* trailing zero data */
449 pcd_tot_tze_size -= pcd_size;
450 if ( pcd_size )
451 tmem_free(pcd_tze,pcd_size,pool);
452 } else {
453 /* real physical page */
454 if ( tmh_tze_enabled() )
455 pcd_tot_tze_size -= PAGE_SIZE;
456 if ( tmh_compression_enabled() )
457 pcd_tot_csize -= PAGE_SIZE;
458 tmem_page_free(pool,pfp);
459 }
460 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
461 }
464 static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize)
465 {
466 struct rb_node **new, *parent = NULL;
467 struct rb_root *root;
468 pcd_t *pcd;
469 int cmp;
470 pagesize_t pfp_size = 0;
471 uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) : *cdata;
472 int ret = 0;
474 if ( !tmh_dedup_enabled() )
475 return 0;
476 ASSERT(pgp->us.obj != NULL);
477 ASSERT(pgp->us.obj->pool != NULL);
478 ASSERT(!pgp->us.obj->pool->persistent);
479 if ( cdata == NULL )
480 {
481 ASSERT(pgp->pfp != NULL);
482 pfp_size = PAGE_SIZE;
483 if ( tmh_tze_enabled() )
484 {
485 pfp_size = tmh_tze_pfp_scan(pgp->pfp);
486 if ( pfp_size > PCD_TZE_MAX_SIZE )
487 pfp_size = PAGE_SIZE;
488 }
489 ASSERT(pfp_size <= PAGE_SIZE);
490 ASSERT(!(pfp_size & (sizeof(uint64_t)-1)));
491 }
492 tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
494 /* look for page match */
495 root = &pcd_tree_roots[firstbyte];
496 new = &(root->rb_node);
497 while ( *new )
498 {
499 pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
500 parent = *new;
501 /* compare new entry and rbtree entry, set cmp accordingly */
502 if ( cdata != NULL )
503 {
504 if ( pcd->size < PAGE_SIZE )
505 /* both new entry and rbtree entry are compressed */
506 cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
507 else
508 /* new entry is compressed, rbtree entry is not */
509 cmp = -1;
510 } else if ( pcd->size < PAGE_SIZE )
511 /* rbtree entry is compressed, new entry is not */
512 cmp = 1;
513 else if ( tmh_tze_enabled() ) {
514 if ( pcd->size < PAGE_SIZE )
515 /* both new entry and rbtree entry are trailing zero */
516 cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
517 else
518 /* new entry is trailing zero, rbtree entry is not */
519 cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
520 } else {
521 /* both new entry and rbtree entry are full physical pages */
522 ASSERT(pgp->pfp != NULL);
523 ASSERT(pcd->pfp != NULL);
524 cmp = tmh_page_cmp(pgp->pfp,pcd->pfp);
525 }
527 /* walk tree or match depending on cmp */
528 if ( cmp < 0 )
529 new = &((*new)->rb_left);
530 else if ( cmp > 0 )
531 new = &((*new)->rb_right);
532 else
533 {
534 /* match! if not compressed, free the no-longer-needed page */
535 /* but if compressed, data is assumed static so don't free! */
536 if ( cdata == NULL )
537 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
538 deduped_puts++;
539 goto match;
540 }
541 }
543 /* exited while loop with no match, so alloc a pcd and put it in the tree */
544 if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
545 {
546 ret = -ENOMEM;
547 goto unlock;
548 } else if ( cdata != NULL ) {
549 if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->us.obj->pool)) == NULL )
550 {
551 tmem_free(pcd,sizeof(pcd_t),NULL);
552 ret = -ENOMEM;
553 goto unlock;
554 }
555 }
556 atomic_inc_and_max(global_pcd_count);
557 RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */
558 INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */
559 pcd->pgp_ref_count = 0;
560 if ( cdata != NULL )
561 {
562 memcpy(pcd->cdata,cdata,csize);
563 pcd->size = csize;
564 pcd_tot_csize += csize;
565 } else if ( pfp_size == 0 ) {
566 ASSERT(tmh_tze_enabled());
567 pcd->size = 0;
568 pcd->tze = NULL;
569 } else if ( pfp_size < PAGE_SIZE &&
570 ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->us.obj->pool)) != NULL) ) {
571 tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
572 pcd->size = pfp_size;
573 pcd_tot_tze_size += pfp_size;
574 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
575 } else {
576 pcd->pfp = pgp->pfp;
577 pcd->size = PAGE_SIZE;
578 if ( tmh_tze_enabled() )
579 pcd_tot_tze_size += PAGE_SIZE;
580 if ( tmh_compression_enabled() )
581 pcd_tot_csize += PAGE_SIZE;
582 }
583 rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
584 rb_insert_color(&pcd->pcd_rb_tree_node, root);
586 match:
587 pcd->pgp_ref_count++;
588 list_add(&pgp->pcd_siblings,&pcd->pgp_list);
589 pgp->firstbyte = firstbyte;
590 pgp->eviction_attempted = 0;
591 pgp->pcd = pcd;
593 unlock:
594 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
595 return ret;
596 }
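pcd_associate and pcd_disassociate together give reference-counted deduplication: identical content is kept once in a pcd, every matching pgp takes a reference, and the backing data is reclaimed only when the last reference drops. The lifecycle reduced to a standalone sketch (no trees, locks, or first-byte buckets):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct pcd_sketch {
        char *data;
        unsigned refs;
    };

    static struct pcd_sketch *pcd_take(struct pcd_sketch *pcd)
    {
        pcd->refs++;                   /* pcd_associate: match found */
        return pcd;
    }

    static void pcd_drop(struct pcd_sketch *pcd)
    {
        if (--pcd->refs == 0) {        /* pcd_disassociate: last user */
            free(pcd->data);
            free(pcd);
        }
    }

    int main(void)
    {
        struct pcd_sketch *pcd = malloc(sizeof(*pcd));
        pcd->data = strdup("page content");
        pcd->refs = 0;
        struct pcd_sketch *a = pcd_take(pcd);   /* first pgp */
        struct pcd_sketch *b = pcd_take(pcd);   /* dedup hit: second pgp */
        printf("refs=%u (one copy, two users)\n", pcd->refs);
        pcd_drop(a);
        pcd_drop(b);                            /* content freed here */
        return 0;
    }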
598 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
600 /* allocate a pgp_t and associate it with an object */
601 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
602 {
603 pgp_t *pgp;
604 pool_t *pool;
606 ASSERT(obj != NULL);
607 ASSERT(obj->pool != NULL);
608 pool = obj->pool;
609 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
610 return NULL;
611 pgp->us.obj = obj;
612 INIT_LIST_HEAD(&pgp->global_eph_pages);
613 INIT_LIST_HEAD(&pgp->us.client_eph_pages);
614 pgp->pfp = NULL;
615 if ( tmh_dedup_enabled() )
616 {
617 pgp->firstbyte = NOT_SHAREABLE;
618 pgp->eviction_attempted = 0;
619 INIT_LIST_HEAD(&pgp->pcd_siblings);
620 }
621 pgp->size = -1;
622 pgp->index = -1;
623 pgp->timestamp = get_cycles();
624 SET_SENTINEL(pgp,PGD);
625 atomic_inc_and_max(global_pgp_count);
626 atomic_inc_and_max(pool->pgp_count);
627 return pgp;
628 }
630 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
631 {
632 ASSERT(obj != NULL);
633 ASSERT_SPINLOCK(&obj->obj_spinlock);
634 ASSERT_SENTINEL(obj,OBJ);
635 ASSERT(obj->pool != NULL);
636 ASSERT_SENTINEL(obj->pool,POOL);
637 return radix_tree_lookup(&obj->tree_root, index);
638 }
640 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
641 {
642 pagesize_t pgp_size = pgp->size;
644 if ( pgp->pfp == NULL )
645 return;
646 if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
647 pcd_disassociate(pgp,pool,0); /* pgp->size lost */
648 else if ( pgp_size )
649 tmem_free(pgp->cdata,pgp_size,pool);
650 else
651 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
652 if ( pool != NULL && pgp_size )
653 {
654 pool->client->compressed_pages--;
655 pool->client->compressed_sum_size -= pgp_size;
656 }
657 pgp->pfp = NULL;
658 pgp->size = -1;
659 }
661 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
662 {
663 pool_t *pool = NULL;
665 ASSERT_SENTINEL(pgp,PGD);
666 ASSERT(pgp->us.obj != NULL);
667 ASSERT_SENTINEL(pgp->us.obj,OBJ);
668 ASSERT_SENTINEL(pgp->us.obj->pool,POOL);
669 ASSERT(pgp->us.obj->pool->client != NULL);
670 if ( from_delete )
671 ASSERT(pgp_lookup_in_obj(pgp->us.obj,pgp->index) == NULL);
672 ASSERT(pgp->us.obj->pool != NULL);
673 pool = pgp->us.obj->pool;
674 if ( is_ephemeral(pool) )
675 {
676 ASSERT(list_empty(&pgp->global_eph_pages));
677 ASSERT(list_empty(&pgp->us.client_eph_pages));
678 }
679 pgp_free_data(pgp, pool);
680 atomic_dec_and_assert(global_pgp_count);
681 atomic_dec_and_assert(pool->pgp_count);
682 pgp->size = -1;
683 if ( is_persistent(pool) && pool->client->live_migrating )
684 {
685 pgp->inv_oid = pgp->us.obj->oid;
686 pgp->pool_id = pool->pool_id;
687 return;
688 }
689 INVERT_SENTINEL(pgp,PGD);
690 pgp->us.obj = NULL;
691 pgp->index = -1;
692 tmem_free(pgp,sizeof(pgp_t),pool);
693 }
695 static NOINLINE void pgp_free_from_inv_list(client_t *client, pgp_t *pgp)
696 {
697 pool_t *pool = client->pools[pgp->pool_id];
699 ASSERT_SENTINEL(pool,POOL);
700 ASSERT_SENTINEL(pgp,PGD);
701 INVERT_SENTINEL(pgp,PGD);
702 pgp->us.obj = NULL;
703 pgp->index = -1;
704 tmem_free(pgp,sizeof(pgp_t),pool);
705 }
707 /* remove the page from appropriate lists but not from parent object */
708 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
709 {
710 client_t *client;
712 ASSERT(pgp != NULL);
713 ASSERT(pgp->us.obj != NULL);
714 ASSERT(pgp->us.obj->pool != NULL);
715 client = pgp->us.obj->pool->client;
716 ASSERT(client != NULL);
717 if ( is_ephemeral(pgp->us.obj->pool) )
718 {
719 if ( !no_eph_lock )
720 tmem_spin_lock(&eph_lists_spinlock);
721 if ( !list_empty(&pgp->us.client_eph_pages) )
722 client->eph_count--;
723 ASSERT(client->eph_count >= 0);
724 list_del_init(&pgp->us.client_eph_pages);
725 if ( !list_empty(&pgp->global_eph_pages) )
726 global_eph_count--;
727 ASSERT(global_eph_count >= 0);
728 list_del_init(&pgp->global_eph_pages);
729 if ( !no_eph_lock )
730 tmem_spin_unlock(&eph_lists_spinlock);
731 } else {
732 if ( client->live_migrating )
733 {
734 tmem_spin_lock(&pers_lists_spinlock);
735 list_add_tail(&pgp->client_inv_pages,
736 &client->persistent_invalidated_list);
737 if ( pgp != pgp->us.obj->pool->cur_pgp )
738 list_del_init(&pgp->us.pool_pers_pages);
739 tmem_spin_unlock(&pers_lists_spinlock);
740 } else {
741 tmem_spin_lock(&pers_lists_spinlock);
742 list_del_init(&pgp->us.pool_pers_pages);
743 tmem_spin_unlock(&pers_lists_spinlock);
744 }
745 }
746 }
748 /* remove page from lists (but not from parent object) and free it */
749 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
750 {
751 uint64_t life;
753 ASSERT(pgp != NULL);
754 ASSERT(pgp->us.obj != NULL);
755 ASSERT(pgp->us.obj->pool != NULL);
756 life = get_cycles() - pgp->timestamp;
757 pgp->us.obj->pool->sum_life_cycles += life;
758 pgp_delist(pgp, no_eph_lock);
759 pgp_free(pgp,1);
760 }
762 /* called only indirectly by radix_tree_destroy */
763 static NOINLINE void pgp_destroy(void *v)
764 {
765 pgp_t *pgp = (pgp_t *)v;
767 ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
768 pgp_delist(pgp,0);
769 ASSERT(pgp->us.obj != NULL);
770 pgp->us.obj->pgp_count--;
771 ASSERT(pgp->us.obj->pgp_count >= 0);
772 pgp_free(pgp,0);
773 }
775 FORWARD static rtn_t *rtn_alloc(void *arg);
776 FORWARD static void rtn_free(rtn_t *rtn);
778 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
779 {
780 int ret;
782 ASSERT_SPINLOCK(&obj->obj_spinlock);
783 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
784 if ( !ret )
785 obj->pgp_count++;
786 return ret;
787 }
789 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
790 {
791 pgp_t *pgp;
793 ASSERT(obj != NULL);
794 ASSERT_SPINLOCK(&obj->obj_spinlock);
795 ASSERT_SENTINEL(obj,OBJ);
796 ASSERT(obj->pool != NULL);
797 ASSERT_SENTINEL(obj->pool,POOL);
798 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
799 if ( pgp != NULL )
800 obj->pgp_count--;
801 ASSERT(obj->pgp_count >= 0);
803 return pgp;
804 }
806 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
808 /* called only indirectly from radix_tree_insert */
809 static NOINLINE rtn_t *rtn_alloc(void *arg)
810 {
811 objnode_t *objnode;
812 obj_t *obj = (obj_t *)arg;
814 ASSERT_SENTINEL(obj,OBJ);
815 ASSERT(obj->pool != NULL);
816 ASSERT_SENTINEL(obj->pool,POOL);
817 objnode = tmem_malloc(objnode_t,obj->pool);
818 if (objnode == NULL)
819 return NULL;
820 objnode->obj = obj;
821 SET_SENTINEL(objnode,OBJNODE);
822 memset(&objnode->rtn, 0, sizeof(rtn_t));
823 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
824 obj->pool->objnode_count_max = obj->pool->objnode_count;
825 atomic_inc_and_max(global_rtree_node_count);
826 obj->objnode_count++;
827 return &objnode->rtn;
828 }
830 /* called only indirectly from radix_tree_delete/destroy */
831 static void rtn_free(rtn_t *rtn)
832 {
833 pool_t *pool;
834 objnode_t *objnode;
835 int i;
837 ASSERT(rtn != NULL);
838 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
839 ASSERT(rtn->slots[i] == NULL);
840 objnode = container_of(rtn,objnode_t,rtn);
841 ASSERT_SENTINEL(objnode,OBJNODE);
842 INVERT_SENTINEL(objnode,OBJNODE);
843 ASSERT(objnode->obj != NULL);
844 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
845 ASSERT_SENTINEL(objnode->obj,OBJ);
846 pool = objnode->obj->pool;
847 ASSERT(pool != NULL);
848 ASSERT_SENTINEL(pool,POOL);
849 pool->objnode_count--;
850 objnode->obj->objnode_count--;
851 objnode->obj = NULL;
852 tmem_free(objnode,sizeof(objnode_t),pool);
853 atomic_dec_and_assert(global_rtree_node_count);
854 }
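rtn_free above recovers the enclosing objnode_t from a bare radix-tree-node pointer via container_of, which subtracts the member's offset from the member's address; that is what lets generic radix-tree code call back into tmem without knowing about objnode_t. A standalone illustration:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct node   { int slot; };
    struct holder { long tag; struct node n; };

    int main(void)
    {
        struct holder h = { .tag = 42, .n = { 7 } };
        struct node *np = &h.n;                 /* only the member escapes */
        struct holder *back = container_of(np, struct holder, n);
        printf("tag=%ld slot=%d\n", back->tag, back->n.slot);  /* tag=42 slot=7 */
        return 0;
    }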
856 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
858 int oid_compare(OID *left, OID *right)
859 {
860 if ( left->oid[2] == right->oid[2] )
861 {
862 if ( left->oid[1] == right->oid[1] )
863 {
864 if ( left->oid[0] == right->oid[0] )
865 return 0;
866 else if ( left->oid[0] < right->oid[0] )
867 return -1;
868 else
869 return 1;
870 }
871 else if ( left->oid[1] < right->oid[1] )
872 return -1;
873 else
874 return 1;
875 }
876 else if ( left->oid[2] < right->oid[2] )
877 return -1;
878 else
879 return 1;
880 }
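oid_compare is a lexicographic compare over the three OID words, most significant word (oid[2]) first, returning -1/0/1; oid_hash picks the bucket, but ordering inside each bucket's rbtree comes from this. A compact loop equivalent with a worked case:

    #include <stdint.h>
    #include <stdio.h>

    static int oid_cmp_sketch(const uint64_t l[3], const uint64_t r[3])
    {
        for (int i = 2; i >= 0; i--) {    /* most significant word first */
            if (l[i] < r[i]) return -1;
            if (l[i] > r[i]) return 1;
        }
        return 0;
    }

    int main(void)
    {
        uint64_t a[3] = { 9, 5, 1 };
        uint64_t b[3] = { 0, 7, 1 };
        /* words [2] are equal (1), word [1] decides: 5 < 7, so a < b */
        printf("%d\n", oid_cmp_sketch(a, b));   /* -1 */
        return 0;
    }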
882 void oid_set_invalid(OID *oidp)
883 {
884 oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
885 }
887 unsigned oid_hash(OID *oidp)
888 {
889 return (tmh_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
890 BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
891 }
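oid_hash XOR-folds the three OID words, hashes the result, and masks with OBJ_HASH_BUCKETS-1; the mask can replace a modulo only because OBJ_HASH_BUCKETS is a power of two, which is exactly what the definition at line 150 insists on. A standalone sketch (the mixing function is a stand-in for tmh_hash):

    #include <stdint.h>
    #include <stdio.h>

    #define BUCKETS      256u              /* must be a power of two */
    #define BUCKETS_MASK (BUCKETS - 1)

    static unsigned bucket_of(const uint64_t oid[3])
    {
        uint64_t folded = oid[0] ^ oid[1] ^ oid[2];
        /* any decent 64->64 mixer works here; this one is a common finalizer */
        folded ^= folded >> 33;
        folded *= 0xff51afd7ed558ccdULL;
        folded ^= folded >> 33;
        return (unsigned)(folded & BUCKETS_MASK);   /* == folded % BUCKETS */
    }

    int main(void)
    {
        uint64_t oid[3] = { 0x1234, 0x5678, 0x9abc };
        printf("bucket %u of %u\n", bucket_of(oid), BUCKETS);
        return 0;
    }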
893 /* searches for object==oid in pool, returns locked object if found */
894 static NOINLINE obj_t * obj_find(pool_t *pool, OID *oidp)
895 {
896 struct rb_node *node;
897 obj_t *obj;
899 restart_find:
900 tmem_read_lock(&pool->pool_rwlock);
901 node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
902 while ( node )
903 {
904 obj = container_of(node, obj_t, rb_tree_node);
905 switch ( oid_compare(&obj->oid, oidp) )
906 {
907 case 0: /* equal */
908 if ( tmh_lock_all )
909 obj->no_evict = 1;
910 else
911 {
912 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
913 {
914 tmem_read_unlock(&pool->pool_rwlock);
915 goto restart_find;
916 }
917 tmem_read_unlock(&pool->pool_rwlock);
918 }
919 return obj;
920 case -1:
921 node = node->rb_left;
922 break;
923 case 1:
924 node = node->rb_right;
925 }
926 }
927 tmem_read_unlock(&pool->pool_rwlock);
928 return NULL;
929 }
931 /* free an object that has no more pgps in it */
932 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
933 {
934 pool_t *pool;
935 OID old_oid;
937 ASSERT_SPINLOCK(&obj->obj_spinlock);
938 ASSERT(obj != NULL);
939 ASSERT_SENTINEL(obj,OBJ);
940 ASSERT(obj->pgp_count == 0);
941 pool = obj->pool;
942 ASSERT(pool != NULL);
943 ASSERT(pool->client != NULL);
944 ASSERT_WRITELOCK(&pool->pool_rwlock);
945 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
946 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
947 ASSERT((long)obj->objnode_count == 0);
948 ASSERT(obj->tree_root.rnode == NULL);
949 pool->obj_count--;
950 ASSERT(pool->obj_count >= 0);
951 INVERT_SENTINEL(obj,OBJ);
952 obj->pool = NULL;
953 old_oid = obj->oid;
954 oid_set_invalid(&obj->oid);
955 obj->last_client = CLI_ID_NULL;
956 atomic_dec_and_assert(global_obj_count);
957 /* use no_rebalance only if all objects are being destroyed anyway */
958 if ( !no_rebalance )
959 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[oid_hash(&old_oid)]);
960 tmem_free(obj,sizeof(obj_t),pool);
961 }
963 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
964 {
965 struct rb_node **new, *parent = NULL;
966 obj_t *this;
968 new = &(root->rb_node);
969 while ( *new )
970 {
971 this = container_of(*new, obj_t, rb_tree_node);
972 parent = *new;
973 switch ( oid_compare(&this->oid, &obj->oid) )
974 {
975 case 0:
976 return 0;
977 case -1:
978 new = &((*new)->rb_left);
979 break;
980 case 1:
981 new = &((*new)->rb_right);
982 break;
983 }
984 }
985 rb_link_node(&obj->rb_tree_node, parent, new);
986 rb_insert_color(&obj->rb_tree_node, root);
987 return 1;
988 }
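obj_rb_insert, like the walk in pcd_associate, uses the standard Linux-rbtree insertion idiom: descend from the root remembering the parent and the child-link slot, then splice the new node into that slot and let rb_insert_color rebalance. The same descend-and-link shape on a plain unbalanced BST (rebalancing deliberately omitted):

    #include <stdio.h>
    #include <stdlib.h>

    struct bst { int key; struct bst *left, *right; };

    /* returns 0 if key already present, 1 if inserted, the same
       contract as obj_rb_insert above */
    static int bst_insert(struct bst **root, struct bst *node)
    {
        struct bst **link = root;          /* the slot we will fill */
        while (*link) {
            if (node->key == (*link)->key)
                return 0;
            link = (node->key < (*link)->key) ? &(*link)->left
                                              : &(*link)->right;
        }
        node->left = node->right = NULL;
        *link = node;                      /* rb_link_node() analogue */
        /* a red-black tree would rebalance here (rb_insert_color) */
        return 1;
    }

    int main(void)
    {
        struct bst *root = NULL;
        int keys[] = { 5, 2, 8, 2 };       /* final 2 is a duplicate */
        for (int i = 0; i < 4; i++) {
            struct bst *n = malloc(sizeof(*n));
            n->key = keys[i];
            if (!bst_insert(&root, n)) {
                printf("key %d already present\n", n->key);
                free(n);
            }
        }
        return 0;
    }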
990 /*
991 * allocate, initialize, and insert a tmem_object_root
992 * (should be called only if find failed)
993 */
994 static NOINLINE obj_t * obj_new(pool_t *pool, OID *oidp)
995 {
996 obj_t *obj;
998 ASSERT(pool != NULL);
999 ASSERT_WRITELOCK(&pool->pool_rwlock);
1000 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
1001 return NULL;
1002 pool->obj_count++;
1003 if (pool->obj_count > pool->obj_count_max)
1004 pool->obj_count_max = pool->obj_count;
1005 atomic_inc_and_max(global_obj_count);
1006 INIT_RADIX_TREE(&obj->tree_root,0);
1007 spin_lock_init(&obj->obj_spinlock);
1008 obj->pool = pool;
1009 obj->oid = *oidp;
1010 obj->objnode_count = 0;
1011 obj->pgp_count = 0;
1012 obj->last_client = CLI_ID_NULL;
1013 SET_SENTINEL(obj,OBJ);
1014 tmem_spin_lock(&obj->obj_spinlock);
1015 obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj);
1016 obj->no_evict = 1;
1017 ASSERT_SPINLOCK(&obj->obj_spinlock);
1018 return obj;
1019 }
1021 /* free an object after destroying any pgps in it */
1022 static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
1023 {
1024 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
1025 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
1026 obj_free(obj,no_rebalance);
1027 }
1029 /* destroys all objs in a pool, or only if obj->last_client matches cli_id */
1030 static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
1031 {
1032 struct rb_node *node;
1033 obj_t *obj;
1034 int i;
1036 tmem_write_lock(&pool->pool_rwlock);
1037 pool->is_dying = 1;
1038 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
1039 {
1040 node = rb_first(&pool->obj_rb_root[i]);
1041 while ( node != NULL )
1042 {
1043 obj = container_of(node, obj_t, rb_tree_node);
1044 tmem_spin_lock(&obj->obj_spinlock);
1045 node = rb_next(node);
1046 ASSERT(obj->no_evict == 0);
1047 if ( !selective )
1048 /* FIXME: should be obj,1 but walking/erasing rbtree is racy */
1049 obj_destroy(obj,0);
1050 else if ( obj->last_client == cli_id )
1051 obj_destroy(obj,0);
1052 else
1053 tmem_spin_unlock(&obj->obj_spinlock);
1054 }
1055 }
1056 tmem_write_unlock(&pool->pool_rwlock);
1057 }
1060 /************ POOL MANIPULATION ROUTINES ******************************/
1062 static pool_t * pool_alloc(void)
1063 {
1064 pool_t *pool;
1065 int i;
1067 if ( (pool = tmh_alloc_infra(sizeof(pool_t),__alignof__(pool_t))) == NULL )
1068 return NULL;
1069 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
1070 pool->obj_rb_root[i] = RB_ROOT;
1071 INIT_LIST_HEAD(&pool->pool_list);
1072 INIT_LIST_HEAD(&pool->persistent_page_list);
1073 pool->cur_pgp = NULL;
1074 rwlock_init(&pool->pool_rwlock);
1075 pool->pgp_count_max = pool->obj_count_max = 0;
1076 pool->objnode_count = pool->objnode_count_max = 0;
1077 atomic_set(&pool->pgp_count,0);
1078 pool->obj_count = 0; pool->shared_count = 0;
1079 pool->pageshift = PAGE_SHIFT - 12;
1080 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
1081 pool->dup_puts_replaced = pool->no_mem_puts = 0;
1082 pool->found_gets = pool->gets = 0;
1083 pool->flushs_found = pool->flushs = 0;
1084 pool->flush_objs_found = pool->flush_objs = 0;
1085 pool->is_dying = 0;
1086 SET_SENTINEL(pool,POOL);
1087 return pool;
1088 }
1090 static NOINLINE void pool_free(pool_t *pool)
1091 {
1092 ASSERT_SENTINEL(pool,POOL);
1093 INVERT_SENTINEL(pool,POOL);
1094 pool->client = NULL;
1095 list_del(&pool->pool_list);
1096 tmh_free_infra(pool);
1097 }
1099 /* register new_client as a user of this shared pool and return new
1100 total number of registered users */
1101 static int shared_pool_join(pool_t *pool, client_t *new_client)
1103 sharelist_t *sl;
1105 ASSERT(is_shared(pool));
1106 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
1107 return -1;
1108 sl->client = new_client;
1109 list_add_tail(&sl->share_list, &pool->share_list);
1110 if ( new_client->cli_id != pool->client->cli_id )
1111 printk("adding new %s %d to shared pool owned by %s %d\n",
1112 client_str, new_client->cli_id, client_str, pool->client->cli_id);
1113 return ++pool->shared_count;
1116 /* reassign "ownership" of the pool to another client that shares this pool */
1117 static NOINLINE void shared_pool_reassign(pool_t *pool)
1119 sharelist_t *sl;
1120 int poolid;
1121 client_t *old_client = pool->client, *new_client;
1123 ASSERT(is_shared(pool));
1124 if ( list_empty(&pool->share_list) )
1126 ASSERT(pool->shared_count == 0);
1127 return;
1129 old_client->pools[pool->pool_id] = NULL;
1130 sl = list_entry(pool->share_list.next, sharelist_t, share_list);
1131 ASSERT(sl->client != old_client);
1132 pool->client = new_client = sl->client;
1133 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
1134 if (new_client->pools[poolid] == pool)
1135 break;
1136 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
1137 new_client->eph_count += _atomic_read(pool->pgp_count);
1138 old_client->eph_count -= _atomic_read(pool->pgp_count);
1139 list_splice_init(&old_client->ephemeral_page_list,
1140 &new_client->ephemeral_page_list);
1141 printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
1142 cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
1143 pool->pool_id = poolid;
1146 /* destroy all objects with last_client same as passed cli_id,
1147 remove pool's cli_id from list of sharers of this pool */
1148 static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
1150 sharelist_t *sl;
1151 int s_poolid;
1153 ASSERT(is_shared(pool));
1154 ASSERT(pool->client != NULL);
1156 ASSERT_WRITELOCK(&tmem_rwlock);
1157 pool_destroy_objs(pool,1,cli_id);
1158 list_for_each_entry(sl,&pool->share_list, share_list)
1160 if (sl->client->cli_id != cli_id)
1161 continue;
1162 list_del(&sl->share_list);
1163 tmem_free(sl,sizeof(sharelist_t),pool);
1164 --pool->shared_count;
1165 if (pool->client->cli_id == cli_id)
1166 shared_pool_reassign(pool);
1167 if (pool->shared_count)
1168 return pool->shared_count;
1169 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
1170 if ( (global_shared_pools[s_poolid]) == pool )
1172 global_shared_pools[s_poolid] = NULL;
1173 break;
1175 return 0;
1177 printk("tmem: no match unsharing pool, %s=%d\n",
1178 cli_id_str,pool->client->cli_id);
1179 return -1;
1182 /* flush all data (owned by cli_id) from a pool and, optionally, free it */
1183 static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
1185 ASSERT(pool != NULL);
1186 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
1188 printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
1189 cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
1190 return;
1192 printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
1193 is_persistent(pool) ? "persistent" : "ephemeral" ,
1194 is_shared(pool) ? "shared" : "private");
1195 printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
1196 if ( pool->client->live_migrating )
1198 printk("can't %s pool while %s is live-migrating\n",
1199 destroy?"destroy":"flush", client_str);
1200 return;
1202 pool_destroy_objs(pool,0,CLI_ID_NULL);
1203 if ( destroy )
1205 pool->client->pools[pool->pool_id] = NULL;
1206 pool_free(pool);
1210 /************ CLIENT MANIPULATION OPERATIONS **************************/
1212 static client_t *client_create(cli_id_t cli_id)
1214 client_t *client = tmh_alloc_infra(sizeof(client_t),__alignof__(client_t));
1215 int i;
1217 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
1218 if ( client == NULL )
1220 printk("failed... out of memory\n");
1221 goto fail;
1223 memset(client,0,sizeof(client_t));
1224 if ( (client->tmh = tmh_client_init(cli_id)) == NULL )
1226 printk("failed... can't allocate host-dependent part of client\n");
1227 goto fail;
1229 if ( !tmh_set_client_from_id(client, client->tmh, cli_id) )
1231 printk("failed... can't set client\n");
1232 goto fail;
1234 client->cli_id = cli_id;
1235 #ifdef __i386__
1236 client->compress = 0;
1237 #else
1238 client->compress = tmh_compression_enabled();
1239 #endif
1240 client->shared_auth_required = tmh_shared_auth();
1241 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1242 client->shared_auth_uuid[i][0] =
1243 client->shared_auth_uuid[i][1] = -1L;
1244 client->frozen = 0; client->live_migrating = 0;
1245 client->weight = 0; client->cap = 0;
1246 list_add_tail(&client->client_list, &global_client_list);
1247 INIT_LIST_HEAD(&client->ephemeral_page_list);
1248 INIT_LIST_HEAD(&client->persistent_invalidated_list);
1249 client->cur_pgp = NULL;
1250 client->eph_count = client->eph_count_max = 0;
1251 client->total_cycles = 0; client->succ_pers_puts = 0;
1252 client->succ_eph_gets = 0; client->succ_pers_gets = 0;
1253 printk("ok\n");
1254 return client;
1256 fail:
1257 tmh_free_infra(client);
1258 return NULL;
1261 static void client_free(client_t *client)
1262 {
1263 list_del(&client->client_list);
1264 tmh_client_destroy(client->tmh);
1265 tmh_free_infra(client);
1266 }
1268 /* flush all data from a client and, optionally, free it */
1269 static void client_flush(client_t *client, bool_t destroy)
1271 int i;
1272 pool_t *pool;
1274 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
1276 if ( (pool = client->pools[i]) == NULL )
1277 continue;
1278 pool_flush(pool,client->cli_id,destroy);
1279 if ( destroy )
1280 client->pools[i] = NULL;
1282 if ( destroy )
1283 client_free(client);
1286 static bool_t client_over_quota(client_t *client)
1287 {
1288 int total = _atomic_read(client_weight_total);
1290 ASSERT(client != NULL);
1291 if ( (total == 0) || (client->weight == 0) ||
1292 (client->eph_count == 0) )
1293 return 0;
1294 return ( ((global_eph_count*100L) / client->eph_count ) >
1295 ((total*100L) / client->weight) );
1296 }
1298 static void client_freeze(client_t *client, int freeze)
1299 {
1300 client->frozen = freeze;
1301 }
1303 /************ MEMORY REVOCATION ROUTINES *******************************/
1305 static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
1307 obj_t *obj = pgp->us.obj;
1308 pool_t *pool = obj->pool;
1309 client_t *client = pool->client;
1310 uint16_t firstbyte = pgp->firstbyte;
1312 if ( pool->is_dying )
1313 return 0;
1314 if ( tmh_lock_all && !obj->no_evict )
1315 return 1;
1316 if ( tmem_spin_trylock(&obj->obj_spinlock) )
1318 if ( tmh_dedup_enabled() )
1320 firstbyte = pgp->firstbyte;
1321 if ( firstbyte == NOT_SHAREABLE )
1322 goto obj_unlock;
1323 ASSERT(firstbyte < 256);
1324 if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
1325 goto obj_unlock;
1326 if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted )
1328 pgp->eviction_attempted++;
1329 list_del(&pgp->global_eph_pages);
1330 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1331 list_del(&pgp->us.client_eph_pages);
1332 list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1333 goto pcd_unlock;
1336 if ( obj->pgp_count > 1 )
1337 return 1;
1338 if ( tmem_write_trylock(&pool->pool_rwlock) )
1340 *hold_pool_rwlock = 1;
1341 return 1;
1343 pcd_unlock:
1344 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
1345 obj_unlock:
1346 tmem_spin_unlock(&obj->obj_spinlock);
1348 return 0;
1351 static int tmem_evict(void)
1353 client_t *client = tmh_client_from_current();
1354 pgp_t *pgp = NULL, *pgp2, *pgp_del;
1355 obj_t *obj;
1356 pool_t *pool;
1357 int ret = 0;
1358 bool_t hold_pool_rwlock = 0;
1360 evict_attempts++;
1361 tmem_spin_lock(&eph_lists_spinlock);
1362 if ( (client != NULL) && client_over_quota(client) &&
1363 !list_empty(&client->ephemeral_page_list) )
1365 list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,us.client_eph_pages)
1366 if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
1367 goto found;
1368 } else if ( list_empty(&global_ephemeral_page_list) ) {
1369 goto out;
1370 } else {
1371 list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
1372 if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
1373 goto found;
1376 ret = 0;
1377 goto out;
1379 found:
1380 ASSERT(pgp != NULL);
1381 ASSERT_SENTINEL(pgp,PGD);
1382 obj = pgp->us.obj;
1383 ASSERT(obj != NULL);
1384 ASSERT(obj->no_evict == 0);
1385 ASSERT(obj->pool != NULL);
1386 ASSERT_SENTINEL(obj,OBJ);
1387 pool = obj->pool;
1389 ASSERT_SPINLOCK(&obj->obj_spinlock);
1390 pgp_del = pgp_delete_from_obj(obj, pgp->index);
1391 ASSERT(pgp_del == pgp);
1392 if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
1394 ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted);
1395 pcd_disassociate(pgp,pool,1);
1397 pgp_delete(pgp,1);
1398 if ( obj->pgp_count == 0 )
1400 ASSERT_WRITELOCK(&pool->pool_rwlock);
1401 obj_free(obj,0);
1403 else
1404 tmem_spin_unlock(&obj->obj_spinlock);
1405 if ( hold_pool_rwlock )
1406 tmem_write_unlock(&pool->pool_rwlock);
1407 evicted_pgs++;
1408 ret = 1;
1410 out:
1411 tmem_spin_unlock(&eph_lists_spinlock);
1412 return ret;
1415 static unsigned long tmem_relinquish_npages(unsigned long n)
1416 {
1417 unsigned long avail_pages = 0;
1419 while ( (avail_pages = tmh_avail_pages()) < n )
1420 {
1421 if ( !tmem_evict() )
1422 break;
1423 }
1424 if ( avail_pages )
1425 tmh_release_avail_pages_to_host();
1426 return avail_pages;
1427 }
1429 /* Under certain conditions (e.g. if each client is putting pages for exactly
1430 * one object), once locks are held, freeing up memory may
1431 * result in livelocks and very long "put" times, so we try to ensure there
1432 * is a minimum amount of memory (1MB) available BEFORE any data structure
1433 * locks are held */
1434 static inline void tmem_ensure_avail_pages(void)
1435 {
1436 int failed_evict = 10;
1438 while ( !tmh_free_mb() )
1439 {
1440 if ( tmem_evict() )
1441 continue;
1442 else if ( failed_evict-- <= 0 )
1443 break;
1444 }
1445 }
1447 /************ TMEM CORE OPERATIONS ************************************/
1449 static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1450 void *cva)
1452 void *dst, *p;
1453 size_t size;
1454 int ret = 0;
1455 DECL_LOCAL_CYC_COUNTER(compress);
1457 ASSERT(pgp != NULL);
1458 ASSERT(pgp->us.obj != NULL);
1459 ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
1460 ASSERT(pgp->us.obj->pool != NULL);
1461 ASSERT(pgp->us.obj->pool->client != NULL);
1462 #ifdef __i386__
1463 return -ENOMEM;
1464 #endif
1466 if ( pgp->pfp != NULL )
1467 pgp_free_data(pgp, pgp->us.obj->pool);
1468 START_CYC_COUNTER(compress);
1469 ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
1470 if ( (ret == -EFAULT) || (ret == 0) )
1471 goto out;
1472 else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
1473 ret = 0;
1474 goto out;
1475 } else if ( tmh_dedup_enabled() && !is_persistent(pgp->us.obj->pool) ) {
1476 if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
1477 goto out;
1478 } else if ( (p = tmem_malloc_bytes(size,pgp->us.obj->pool)) == NULL ) {
1479 ret = -ENOMEM;
1480 goto out;
1481 } else {
1482 memcpy(p,dst,size);
1483 pgp->cdata = p;
1485 pgp->size = size;
1486 pgp->us.obj->pool->client->compressed_pages++;
1487 pgp->us.obj->pool->client->compressed_sum_size += size;
1488 ret = 1;
1490 out:
1491 END_CYC_COUNTER(compress);
1492 return ret;
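do_tmem_put_compress stores a compressed image only when it pays: returning 0 when the result is empty or at least tmem_subpage_maxsize() tells do_tmem_put to fall back to an uncompressed page and bump compress_poor. The decision in isolation (the threshold value below is a hypothetical stand-in):

    #include <stdio.h>

    #define SKETCH_PAGE_SIZE 4096u
    /* stand-in for tmem_subpage_maxsize(): largest worthwhile compressed size */
    #define SKETCH_MAXSIZE   (SKETCH_PAGE_SIZE * 15 / 16)

    /* 1 = keep compressed copy, 0 = caller should store the raw page */
    static int worth_compressing(unsigned compressed_size)
    {
        if (compressed_size == 0 || compressed_size >= SKETCH_MAXSIZE)
            return 0;                   /* counted as compress_poor upstream */
        return 1;
    }

    int main(void)
    {
        printf("%d\n", worth_compressing(1200));   /* 1: good ratio */
        printf("%d\n", worth_compressing(4000));   /* 0: barely smaller, skip */
        return 0;
    }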
1495 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1496 pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva)
1498 pool_t *pool;
1499 obj_t *obj;
1500 client_t *client;
1501 pgp_t *pgpfound = NULL;
1502 int ret;
1504 ASSERT(pgp != NULL);
1505 ASSERT(pgp->pfp != NULL);
1506 ASSERT(pgp->size != -1);
1507 obj = pgp->us.obj;
1508 ASSERT_SPINLOCK(&obj->obj_spinlock);
1509 ASSERT(obj != NULL);
1510 pool = obj->pool;
1511 ASSERT(pool != NULL);
1512 client = pool->client;
1513 if ( client->live_migrating )
1514 goto failed_dup; /* no dups allowed when migrating */
1515 /* can we successfully manipulate pgp to change out the data? */
1516 if ( len != 0 && client->compress && pgp->size != 0 )
1518 ret = do_tmem_put_compress(pgp,cmfn,cva);
1519 if ( ret == 1 )
1520 goto done;
1521 else if ( ret == 0 )
1522 goto copy_uncompressed;
1523 else if ( ret == -ENOMEM )
1524 goto failed_dup;
1525 else if ( ret == -EFAULT )
1526 goto bad_copy;
1529 copy_uncompressed:
1530 if ( pgp->pfp )
1531 pgp_free_data(pgp, pool);
1532 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1533 goto failed_dup;
1534 pgp->size = 0;
1535 /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
1536 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
1537 if ( ret == -EFAULT )
1538 goto bad_copy;
1539 if ( tmh_dedup_enabled() && !is_persistent(pool) )
1541 if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
1542 goto failed_dup;
1545 done:
1546 /* successfully replaced data, clean up and return success */
1547 if ( is_shared(pool) )
1548 obj->last_client = client->cli_id;
1549 obj->no_evict = 0;
1550 tmem_spin_unlock(&obj->obj_spinlock);
1551 pool->dup_puts_replaced++;
1552 pool->good_puts++;
1553 if ( is_persistent(pool) )
1554 client->succ_pers_puts++;
1555 return 1;
1557 bad_copy:
1558 /* this should only happen if the client passed a bad mfn */
1559 failed_copies++;
1560 ret = -EFAULT;
1561 goto cleanup;
1563 failed_dup:
1564 /* couldn't change out the data, flush the old data and return
1565 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
1566 ret = -ENOSPC;
1567 cleanup:
1568 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1569 ASSERT(pgpfound == pgp);
1570 pgp_delete(pgpfound,0);
1571 if ( obj->pgp_count == 0 )
1573 tmem_write_lock(&pool->pool_rwlock);
1574 obj_free(obj,0);
1575 tmem_write_unlock(&pool->pool_rwlock);
1576 } else {
1577 obj->no_evict = 0;
1578 tmem_spin_unlock(&obj->obj_spinlock);
1580 pool->dup_puts_flushed++;
1581 return ret;
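The failed_dup exit above deliberately returns -ENOSPC rather than -ENOMEM so a caller can tell "the replacement failed and the old copy was flushed" apart from a plain allocation failure, while -EFAULT signals a bad client frame. A caller-side sketch of that contract:

    #include <errno.h>
    #include <stdio.h>

    /* interpret the return value of a dup put, per do_tmem_dup_put above */
    static void report_dup_put(int ret)
    {
        if (ret == 1)
            puts("replaced in place");
        else if (ret == -ENOSPC)
            puts("replace failed; stale copy was flushed");
        else if (ret == -EFAULT)
            puts("bad client frame; nothing stored");
        else
            printf("unexpected rc %d\n", ret);
    }

    int main(void)
    {
        report_dup_put(1);
        report_dup_put(-ENOSPC);
        report_dup_put(-EFAULT);
        return 0;
    }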
1585 static NOINLINE int do_tmem_put(pool_t *pool,
1586 OID *oidp, uint32_t index,
1587 tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
1588 pagesize_t pfn_offset, pagesize_t len, void *cva)
1590 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
1591 pgp_t *pgp = NULL, *pgpdel = NULL;
1592 client_t *client = pool->client;
1593 int ret = client->frozen ? -EFROZEN : -ENOMEM;
1595 ASSERT(pool != NULL);
1596 pool->puts++;
1597 /* does page already exist (dup)? if so, handle specially */
1598 if ( (obj = objfound = obj_find(pool,oidp)) != NULL )
1600 ASSERT_SPINLOCK(&objfound->obj_spinlock);
1601 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
1602 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva);
1605 /* no puts allowed into a frozen pool (except dup puts) */
1606 if ( client->frozen )
1607 goto free;
1609 if ( (objfound == NULL) )
1611 tmem_write_lock(&pool->pool_rwlock);
1612 if ( (obj = objnew = obj_new(pool,oidp)) == NULL )
1614 tmem_write_unlock(&pool->pool_rwlock);
1615 return -ENOMEM;
1617 ASSERT_SPINLOCK(&objnew->obj_spinlock);
1618 tmem_write_unlock(&pool->pool_rwlock);
1621 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
1622 ASSERT_SPINLOCK(&obj->obj_spinlock);
1623 if ( (pgp = pgp_alloc(obj)) == NULL )
1624 goto free;
1626 ret = pgp_add_to_obj(obj, index, pgp);
1627 if ( ret == -ENOMEM )
1628 /* warning, may result in partially built radix tree ("stump") */
1629 goto free;
1630 ASSERT(ret != -EEXIST);
1631 pgp->index = index;
1632 pgp->size = 0;
1634 if ( len != 0 && client->compress )
1636 ASSERT(pgp->pfp == NULL);
1637 ret = do_tmem_put_compress(pgp,cmfn,cva);
1638 if ( ret == 1 )
1639 goto insert_page;
1640 if ( ret == -ENOMEM )
1642 client->compress_nomem++;
1643 goto delete_and_free;
1645 if ( ret == 0 )
1647 client->compress_poor++;
1648 goto copy_uncompressed;
1650 if ( ret == -EFAULT )
1651 goto bad_copy;
1654 copy_uncompressed:
1655 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1657 ret = -ENOMEM;
1658 goto delete_and_free;
1660 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
1661 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
1662 if ( ret == -EFAULT )
1663 goto bad_copy;
1664 if ( tmh_dedup_enabled() && !is_persistent(pool) )
1666 if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
1667 goto delete_and_free;
1670 insert_page:
1671 if ( is_ephemeral(pool) )
1673 tmem_spin_lock(&eph_lists_spinlock);
1674 list_add_tail(&pgp->global_eph_pages,
1675 &global_ephemeral_page_list);
1676 if (++global_eph_count > global_eph_count_max)
1677 global_eph_count_max = global_eph_count;
1678 list_add_tail(&pgp->us.client_eph_pages,
1679 &client->ephemeral_page_list);
1680 if (++client->eph_count > client->eph_count_max)
1681 client->eph_count_max = client->eph_count;
1682 tmem_spin_unlock(&eph_lists_spinlock);
1683 } else { /* is_persistent */
1684 tmem_spin_lock(&pers_lists_spinlock);
1685 list_add_tail(&pgp->us.pool_pers_pages,
1686 &pool->persistent_page_list);
1687 tmem_spin_unlock(&pers_lists_spinlock);
1689 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
1690 if ( is_shared(pool) )
1691 obj->last_client = client->cli_id;
1692 obj->no_evict = 0;
1693 tmem_spin_unlock(&obj->obj_spinlock);
1694 pool->good_puts++;
1695 if ( is_persistent(pool) )
1696 client->succ_pers_puts++;
1697 else
1698 tot_good_eph_puts++;
1699 return 1;
1701 bad_copy:
1702 /* this should only happen if the client passed a bad mfn */
1703 ret = -EFAULT;
1704 failed_copies++;
1706 delete_and_free:
1707 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1708 pgpdel = pgp_delete_from_obj(obj, pgp->index);
1709 ASSERT(pgp == pgpdel);
1711 free:
1712 if ( pgp )
1713 pgp_delete(pgp,0);
1714 if ( objfound )
1716 objfound->no_evict = 0;
1717 tmem_spin_unlock(&objfound->obj_spinlock);
1719 if ( objnew )
1721 tmem_write_lock(&pool->pool_rwlock);
1722 obj_free(objnew,0);
1723 tmem_write_unlock(&pool->pool_rwlock);
1725 pool->no_mem_puts++;
1726 return ret;
1729 static NOINLINE int do_tmem_get(pool_t *pool, OID *oidp, uint32_t index,
1730 tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
1731 pagesize_t pfn_offset, pagesize_t len, void *cva)
1733 obj_t *obj;
1734 pgp_t *pgp;
1735 client_t *client = pool->client;
1736 DECL_LOCAL_CYC_COUNTER(decompress);
1738 if ( !_atomic_read(pool->pgp_count) )
1739 return -EEMPTY;
1741 pool->gets++;
1742 obj = obj_find(pool,oidp);
1743 if ( obj == NULL )
1744 return 0;
1746 ASSERT_SPINLOCK(&obj->obj_spinlock);
1747 if (is_shared(pool) || is_persistent(pool) )
1748 pgp = pgp_lookup_in_obj(obj, index);
1749 else
1750 pgp = pgp_delete_from_obj(obj, index);
1751 if ( pgp == NULL )
1753 obj->no_evict = 0;
1754 tmem_spin_unlock(&obj->obj_spinlock);
1755 return 0;
1757 ASSERT(pgp->size != -1);
1758 if ( tmh_dedup_enabled() && !is_persistent(pool) &&
1759 pgp->firstbyte != NOT_SHAREABLE )
1761 if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT )
1762 goto bad_copy;
1763 } else if ( pgp->size != 0 ) {
1764 START_CYC_COUNTER(decompress);
1765 if ( tmh_decompress_to_client(cmfn, pgp->cdata,
1766 pgp->size, cva) == -EFAULT )
1767 goto bad_copy;
1768 END_CYC_COUNTER(decompress);
1769 } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
1770 pfn_offset, len, cva) == -EFAULT)
1771 goto bad_copy;
1772 if ( is_ephemeral(pool) )
1774 if ( is_private(pool) )
1776 pgp_delete(pgp,0);
1777 if ( obj->pgp_count == 0 )
1779 tmem_write_lock(&pool->pool_rwlock);
1780 obj_free(obj,0);
1781 obj = NULL;
1782 tmem_write_unlock(&pool->pool_rwlock);
1784 } else {
1785 tmem_spin_lock(&eph_lists_spinlock);
1786 list_del(&pgp->global_eph_pages);
1787 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1788 list_del(&pgp->us.client_eph_pages);
1789 list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1790 tmem_spin_unlock(&eph_lists_spinlock);
1791 ASSERT(obj != NULL);
1792 obj->last_client = tmh_get_cli_id_from_current();
1795 if ( obj != NULL )
1797 obj->no_evict = 0;
1798 tmem_spin_unlock(&obj->obj_spinlock);
1800 pool->found_gets++;
1801 if ( is_ephemeral(pool) )
1802 client->succ_eph_gets++;
1803 else
1804 client->succ_pers_gets++;
1805 return 1;
1807 bad_copy:
1808 /* this should only happen if the client passed a bad mfn */
1809 failed_copies++;
1810 return -EFAULT;
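For a private ephemeral pool the get above is exclusive: the page is deleted from the object as it is copied out, so repeating the same get misses; shared and persistent pools keep the page and only refresh its position on the ephemeral LRU lists. A toy cache showing the exclusive-get semantics:

    #include <stdio.h>

    #define SLOTS 4
    static char store[SLOTS][16];
    static int  present[SLOTS];

    static void put(int i, const char *s)
    {
        snprintf(store[i], sizeof(store[i]), "%s", s);
        present[i] = 1;
    }

    /* exclusive get: success removes the entry, like a private-ephemeral pool */
    static int get(int i, char *out, size_t n)
    {
        if (!present[i])
            return 0;
        snprintf(out, n, "%s", store[i]);
        present[i] = 0;
        return 1;
    }

    int main(void)
    {
        char buf[16];
        put(2, "hello");
        printf("first get: %d (%s)\n", get(2, buf, sizeof(buf)), buf);
        printf("second get: %d\n", get(2, buf, sizeof(buf)));   /* misses */
        return 0;
    }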
1814 static NOINLINE int do_tmem_flush_page(pool_t *pool, OID *oidp, uint32_t index)
1815 {
1816 obj_t *obj;
1817 pgp_t *pgp;
1819 pool->flushs++;
1820 obj = obj_find(pool,oidp);
1821 if ( obj == NULL )
1822 goto out;
1823 pgp = pgp_delete_from_obj(obj, index);
1824 if ( pgp == NULL )
1825 {
1826 obj->no_evict = 0;
1827 tmem_spin_unlock(&obj->obj_spinlock);
1828 goto out;
1829 }
1830 pgp_delete(pgp,0);
1831 if ( obj->pgp_count == 0 )
1832 {
1833 tmem_write_lock(&pool->pool_rwlock);
1834 obj_free(obj,0);
1835 tmem_write_unlock(&pool->pool_rwlock);
1836 } else {
1837 obj->no_evict = 0;
1838 tmem_spin_unlock(&obj->obj_spinlock);
1839 }
1840 pool->flushs_found++;
1842 out:
1843 if ( pool->client->frozen )
1844 return -EFROZEN;
1845 else
1846 return 1;
1847 }
1849 static NOINLINE int do_tmem_flush_object(pool_t *pool, OID *oidp)
1850 {
1851 obj_t *obj;
1853 pool->flush_objs++;
1854 obj = obj_find(pool,oidp);
1855 if ( obj == NULL )
1856 goto out;
1857 tmem_write_lock(&pool->pool_rwlock);
1858 obj_destroy(obj,0);
1859 pool->flush_objs_found++;
1860 tmem_write_unlock(&pool->pool_rwlock);
1862 out:
1863 if ( pool->client->frozen )
1864 return -EFROZEN;
1865 else
1866 return 1;
1867 }
1869 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
1870 {
1871 client_t *client = tmh_client_from_current();
1872 pool_t *pool;
1874 if ( client->pools == NULL )
1875 return 0;
1876 if ( (pool = client->pools[pool_id]) == NULL )
1877 return 0;
1878 client->pools[pool_id] = NULL;
1879 pool_flush(pool,client->cli_id,1);
1880 return 1;
1881 }
static NOINLINE int do_tmem_new_pool(cli_id_t this_cli_id,
                                     uint32_t d_poolid, uint32_t flags,
                                     uint64_t uuid_lo, uint64_t uuid_hi)
{
    client_t *client;
    cli_id_t cli_id;
    int persistent = flags & TMEM_POOL_PERSIST;
    int shared = flags & TMEM_POOL_SHARED;
    int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
         & TMEM_POOL_PAGESIZE_MASK;
    int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
         & TMEM_POOL_VERSION_MASK;
    pool_t *pool, *shpool;
    int s_poolid, first_unused_s_poolid;
    int i;

    if ( this_cli_id == CLI_ID_NULL )
        cli_id = tmh_get_cli_id_from_current();
    else
        cli_id = this_cli_id;
    printk("tmem: allocating %s-%s tmem pool for %s=%d...",
        persistent ? "persistent" : "ephemeral" ,
        shared ? "shared" : "private", cli_id_str, cli_id);
    if ( specversion != TMEM_SPEC_VERSION )
    {
        printk("failed... unsupported spec version\n");
        return -EPERM;
    }
    if ( pagebits != (PAGE_SHIFT - 12) )
    {
        printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
        return -EPERM;
    }
    if ( flags & TMEM_POOL_PRECOMPRESSED )
    {
        printk("failed... precompression flag set but unsupported\n");
        return -EPERM;
    }
    if ( flags & TMEM_POOL_RESERVED_BITS )
    {
        printk("failed... reserved bits must be zero\n");
        return -EPERM;
    }
    if ( (pool = pool_alloc()) == NULL )
    {
        printk("failed... out of memory\n");
        return -ENOMEM;
    }
    if ( this_cli_id != CLI_ID_NULL )
    {
        if ( (client = tmh_client_from_cli_id(this_cli_id)) == NULL
             || d_poolid >= MAX_POOLS_PER_DOMAIN
             || client->pools[d_poolid] != NULL )
            goto fail;
    }
    else
    {
        client = tmh_client_from_current();
        ASSERT(client != NULL);
        for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
            if ( client->pools[d_poolid] == NULL )
                break;
        if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
        {
            printk("failed... no more pool slots available for this %s\n",
                   client_str);
            goto fail;
        }
    }
    if ( shared )
    {
        if ( uuid_lo == -1L && uuid_hi == -1L )
            shared = 0;
        if ( client->shared_auth_required && !global_shared_auth )
        {
            for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
                if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
                     (client->shared_auth_uuid[i][1] == uuid_hi) )
                    break;
            if ( i == MAX_GLOBAL_SHARED_POOLS )
                shared = 0;
        }
    }
    pool->shared = shared;
    pool->client = client;
    if ( shared )
    {
        first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
        for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
        {
            if ( (shpool = global_shared_pools[s_poolid]) != NULL )
            {
                if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
                {
                    printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
                        uuid_hi, uuid_lo);
                    printk("pool_id=%d\n",d_poolid);
                    client->pools[d_poolid] = global_shared_pools[s_poolid];
                    shared_pool_join(global_shared_pools[s_poolid], client);
                    pool_free(pool);
                    return d_poolid;
                }
            }
            else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
                first_unused_s_poolid = s_poolid;
        }
        if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
        {
            printk("tmem: failed... no global shared pool slots available\n");
            goto fail;
        }
        else
        {
            INIT_LIST_HEAD(&pool->share_list);
            pool->shared_count = 0;
            global_shared_pools[first_unused_s_poolid] = pool;
            (void)shared_pool_join(pool,client);
        }
    }
    client->pools[d_poolid] = pool;
    list_add_tail(&pool->pool_list, &global_pool_list);
    pool->pool_id = d_poolid;
    pool->persistent = persistent;
    pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
    printk("pool_id=%d\n",d_poolid);
    return d_poolid;

fail:
    pool_free(pool);
    return -EPERM;
}

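/*
 * Illustrative example of the flags word decoded above (values assumed
 * for illustration, not mandated by this file): a client requesting a
 * private, persistent pool of 4K pages under spec version 1 would pass
 *   flags = (1 << TMEM_POOL_VERSION_SHIFT) | TMEM_POOL_PERSIST
 * since on a 4K-page host the pagesize field must be zero to satisfy
 * the pagebits == PAGE_SHIFT - 12 check.
 */
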
/************ TMEM CONTROL OPERATIONS ************************************/

/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
static int tmemc_freeze_pools(cli_id_t cli_id, int arg)
{
    client_t *client;
    bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
    bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
    char *s;

    s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
    if ( cli_id == CLI_ID_NULL )
    {
        list_for_each_entry(client,&global_client_list,client_list)
            client_freeze(client,freeze);
        printk("tmem: all pools %s for all %ss\n",s,client_str);
    }
    else
    {
        if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
            return -1;
        client_freeze(client,freeze);
        printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
    }
    return 0;
}

static int tmemc_flush_mem(cli_id_t cli_id, uint32_t kb)
{
    uint32_t npages, flushed_pages, flushed_kb;

    if ( cli_id != CLI_ID_NULL )
    {
        printk("tmem: %s-specific flush not supported yet, use --all\n",
               client_str);
        return -1;
    }
    /* convert kb to pages, rounding up if necessary */
    npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
    flushed_pages = tmem_relinquish_npages(npages);
    flushed_kb = flushed_pages << (PAGE_SHIFT-10);
    return flushed_kb;
}

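/*
 * Worked example of the rounding in tmemc_flush_mem() above, assuming
 * 4K pages (PAGE_SHIFT == 12): PAGE_SHIFT-10 == 2, so
 * npages = (kb + 3) >> 2, and a request of kb == 5 yields npages == 2,
 * i.e. a partial page always rounds up to one more whole page.
 */
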
/*
 * These tmemc_list* routines output lots of stats in a format that is
 * intended to be program-parseable, not human-readable. Further, by
 * tying each group of stats to a line format indicator (e.g. G= for
 * global stats) and each individual stat to a two-letter specifier
 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
 * global ephemeral pool), it should allow the stats reported to remain
 * forward and backward compatible as tmem evolves.
 */
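/*
 * For example (illustrative values only), a short-form global line and
 * client line, as produced by tmemc_list_global() and tmemc_list_client()
 * below, might look like:
 *   G=Tt:4021,Te:3,Cf:0,Af:0,Pf:0,Ta:16384,Lm:0,Et:120,Ea:145,Rt:0,Ra:0,Rx:0,Fp:0
 *   C=CI:1,ww:256,ca:0,co:0,fr:0,Tc:987654,Ge:100,Pp:50,Gp:25
 */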
#define BSIZE 1024

static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
                             uint32_t len, bool_t use_long)
{
    char info[BSIZE];
    int i, n = 0, sum = 0;
    pool_t *p;
    bool_t s;

    n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d,"
        "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c",
        c->cli_id, c->weight, c->cap, c->compress, c->frozen,
        c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets,
        use_long ? ',' : '\n');
    if (use_long)
        n += scnprintf(info+n,BSIZE-n,
             "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
             c->eph_count, c->eph_count_max,
             c->compressed_pages, c->compressed_sum_size,
             c->compress_poor, c->compress_nomem);
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
    {
        if ( (p = c->pools[i]) == NULL )
            continue;
        s = is_shared(p);
        n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
            "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
            c->cli_id, p->pool_id,
            is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
            (uint64_t)(s ? p->uuid[0] : 0),
            (uint64_t)(s ? p->uuid[1] : 0LL),
            use_long ? ',' : '\n');
        if (use_long)
            n += scnprintf(info+n,BSIZE-n,
             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
             _atomic_read(p->pgp_count), p->pgp_count_max,
             p->obj_count, p->obj_count_max,
             p->objnode_count, p->objnode_count_max,
             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
             p->no_mem_puts,
             p->found_gets, p->gets,
             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
        if ( sum + n >= len )
            return sum;
        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
        sum += n;
    }
    return sum;
}

static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
                             bool_t use_long)
{
    char info[BSIZE];
    int i, n = 0, sum = 0;
    pool_t *p;
    sharelist_t *sl;

    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
    {
        if ( (p = global_shared_pools[i]) == NULL )
            continue;
        n = scnprintf(info,BSIZE,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
            i, is_persistent(p) ? 'P' : 'E',
            is_shared(p) ? 'S' : 'P',
            p->uuid[0], p->uuid[1]);
        list_for_each_entry(sl,&p->share_list, share_list)
            n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
        n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
        if (use_long)
            n += scnprintf(info+n,BSIZE-n,
             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
             _atomic_read(p->pgp_count), p->pgp_count_max,
             p->obj_count, p->obj_count_max,
             p->objnode_count, p->objnode_count_max,
             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
             p->no_mem_puts,
             p->found_gets, p->gets,
             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
        if ( sum + n >= len )
            return sum;
        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
        sum += n;
    }
    return sum;
}

#ifdef TMEM_PERF
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                                  bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info,BSIZE,"T=");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
#else
#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
#endif

static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
                             bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info,BSIZE,"G="
      "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
      "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
      total_tmem_ops, errored_tmem_ops, failed_copies,
      alloc_failed, alloc_page_failed, tmh_avail_pages(),
      low_on_memory, evicted_pgs,
      evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
      total_flush_pool, use_long ? ',' : '\n');
    if (use_long)
        n += scnprintf(info+n,BSIZE-n,
          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
          "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n",
          global_eph_count, global_eph_count_max,
          _atomic_read(global_obj_count), global_obj_count_max,
          _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
          _atomic_read(global_pgp_count), global_pgp_count_max,
          _atomic_read(global_page_count), global_page_count_max,
          _atomic_read(global_pcd_count), global_pcd_count_max,
          tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize);
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}

static int tmemc_list(cli_id_t cli_id, tmem_cli_va_t buf, uint32_t len,
                      bool_t use_long)
{
    client_t *client;
    int off = 0;

    if ( cli_id == CLI_ID_NULL ) {
        off = tmemc_list_global(buf,0,len,use_long);
        off += tmemc_list_shared(buf,off,len-off,use_long);
        list_for_each_entry(client,&global_client_list,client_list)
            off += tmemc_list_client(client, buf, off, len-off, use_long);
        off += tmemc_list_global_perf(buf,off,len-off,use_long);
    }
    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
        return -1;
    else
        off = tmemc_list_client(client, buf, 0, len, use_long);

    return 0;
}

static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
{
    cli_id_t cli_id = client->cli_id;
    uint32_t old_weight;

    switch (subop)
    {
    case TMEMC_SET_WEIGHT:
        old_weight = client->weight;
        client->weight = arg1;
        printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
        atomic_sub(old_weight,&client_weight_total);
        atomic_add(client->weight,&client_weight_total);
        break;
    case TMEMC_SET_CAP:
        client->cap = arg1;
        printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
        break;
    case TMEMC_SET_COMPRESS:
#ifdef __i386__
        return -1;
#endif
        if ( tmh_dedup_enabled() )
        {
            printk("tmem: compression %s for all %ss, cannot be changed "
                   "when tmem_dedup is enabled\n",
                   tmh_compression_enabled() ? "enabled" : "disabled",
                   client_str);
            return -1;
        }
        client->compress = arg1 ? 1 : 0;
        printk("tmem: compression %s for %s=%d\n",
               arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
        break;
    default:
        printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
        return -1;
    }
    return 0;
}

static int tmemc_set_var(cli_id_t cli_id, uint32_t subop, uint32_t arg1)
{
    client_t *client;

    if ( cli_id == CLI_ID_NULL )
        list_for_each_entry(client,&global_client_list,client_list)
            tmemc_set_var_one(client, subop, arg1);
    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
        return -1;
    else
        tmemc_set_var_one(client, subop, arg1);
    return 0;
}

static NOINLINE int tmemc_shared_pool_auth(cli_id_t cli_id, uint64_t uuid_lo,
                                           uint64_t uuid_hi, bool_t auth)
{
    client_t *client;
    int i, free = -1;

    if ( cli_id == CLI_ID_NULL )
    {
        global_shared_auth = auth;
        return 1;
    }
    client = tmh_client_from_cli_id(cli_id);
    if ( client == NULL )
        return -EINVAL;
    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
    {
        if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
             (client->shared_auth_uuid[i][1] == uuid_hi) )
        {
            if ( auth == 0 )
                client->shared_auth_uuid[i][0] =
                    client->shared_auth_uuid[i][1] = -1L;
            return 1;
        }
        if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) &&
             (client->shared_auth_uuid[i][1] == -1L) && (free == -1) )
            free = i;
    }
    if ( auth == 0 )
        return 0;
    if ( auth == 1 && free == -1 )
        return -ENOMEM;
    client->shared_auth_uuid[free][0] = uuid_lo;
    client->shared_auth_uuid[free][1] = uuid_hi;
    return 1;
}

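/*
 * Note on the table walked above: a shared_auth_uuid slot holding the
 * pair (-1L,-1L) is treated as empty, both here and by the authorization
 * check in do_tmem_new_pool(); calling with cli_id == CLI_ID_NULL flips
 * the global_shared_auth override instead of touching per-client slots.
 */
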
static NOINLINE int tmemc_save_subop(int cli_id, uint32_t pool_id,
                        uint32_t subop, tmem_cli_va_t buf, uint32_t arg1)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];
    uint32_t p;
    uint64_t *uuid;
    pgp_t *pgp, *pgp2;
    int rc = -1;

    switch(subop)
    {
    case TMEMC_SAVE_BEGIN:
        if ( client == NULL )
            return 0;
        for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
            if ( client->pools[p] != NULL )
                break;
        if ( p == MAX_POOLS_PER_DOMAIN )
        {
            rc = 0;
            break;
        }
        client->was_frozen = client->frozen;
        client->frozen = 1;
        if ( arg1 != 0 )
            client->live_migrating = 1;
        rc = 1;
        break;
    case TMEMC_RESTORE_BEGIN:
        if ( client == NULL && (client = client_create(cli_id)) != NULL )
            return 1;
        break;
    case TMEMC_SAVE_GET_VERSION:
        rc = TMEM_SPEC_VERSION;
        break;
    case TMEMC_SAVE_GET_MAXPOOLS:
        rc = MAX_POOLS_PER_DOMAIN;
        break;
    case TMEMC_SAVE_GET_CLIENT_WEIGHT:
        rc = client->weight == -1 ? -2 : client->weight;
        break;
    case TMEMC_SAVE_GET_CLIENT_CAP:
        rc = client->cap == -1 ? -2 : client->cap;
        break;
    case TMEMC_SAVE_GET_CLIENT_FLAGS:
        rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
             (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
        break;
    case TMEMC_SAVE_GET_POOL_FLAGS:
        if ( pool == NULL )
            break;
        rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) |
             (pool->shared ? TMEM_POOL_SHARED : 0) |
             (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT);
        break;
    case TMEMC_SAVE_GET_POOL_NPAGES:
        if ( pool == NULL )
            break;
        rc = _atomic_read(pool->pgp_count);
        break;
    case TMEMC_SAVE_GET_POOL_UUID:
        if ( pool == NULL )
            break;
        uuid = (uint64_t *)buf.p;
        *uuid++ = pool->uuid[0];
        *uuid = pool->uuid[1];
        rc = 0;
        break;
    case TMEMC_SAVE_END:
        client->live_migrating = 0;
        if ( !list_empty(&client->persistent_invalidated_list) )
            list_for_each_entry_safe(pgp,pgp2,
              &client->persistent_invalidated_list, client_inv_pages)
                pgp_free_from_inv_list(client,pgp);
        client->frozen = client->was_frozen;
        rc = 0;
    }
    return rc;
}

static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id,
                        tmem_cli_va_t buf, uint32_t bufsize)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];
    pgp_t *pgp;
    OID oid;
    int ret = 0;
    struct tmem_handle *h;
    unsigned int pagesize;

    if ( pool == NULL || is_ephemeral(pool) )
        return -1;
    pagesize = 1 << (pool->pageshift+12);
    if ( bufsize < pagesize + sizeof(struct tmem_handle) )
        return -ENOMEM;

    tmem_spin_lock(&pers_lists_spinlock);
    if ( list_empty(&pool->persistent_page_list) )
    {
        ret = -1;
        goto out;
    }
    /* note: pool->cur_pgp is the pgp last returned by get_next_page */
    if ( pool->cur_pgp == NULL )
    {
        /* process the first one */
        pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
                         pgp_t,us.pool_pers_pages);
    } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
                             &pool->persistent_page_list) )
    {
        /* already processed the last one in the list */
        ret = -1;
        goto out;
    } else {
        pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
                         pgp_t,us.pool_pers_pages);
        pool->cur_pgp = pgp;
    }
    oid = pgp->us.obj->oid;
    h = (struct tmem_handle *)buf.p;
    *(OID *)&h->oid[0] = oid;
    h->index = pgp->index;
    buf.p = (void *)(h+1);
    ret = do_tmem_get(pool, &oid, h->index,0,0,0,pagesize,buf.p);

out:
    tmem_spin_unlock(&pers_lists_spinlock);
    return ret;
}

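/*
 * Sketch of the save-side protocol this iterator supports (hypothetical
 * tool-side sequence, not defined in this file): after TMEMC_SAVE_BEGIN,
 * call TMEMC_SAVE_GET_NEXT_PAGE repeatedly -- each call deposits one
 * (tmem_handle, page data) pair in buf -- until it returns -1, then
 * finish with TMEMC_SAVE_END; pool->cur_pgp is the resume cursor.
 */
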
static NOINLINE int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_t buf,
                        uint32_t bufsize)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pgp_t *pgp;
    struct tmem_handle *h;
    int ret = 0;

    if ( client == NULL )
        return 0;
    if ( bufsize < sizeof(struct tmem_handle) )
        return 0;
    tmem_spin_lock(&pers_lists_spinlock);
    if ( list_empty(&client->persistent_invalidated_list) )
        goto out;
    if ( client->cur_pgp == NULL )
    {
        pgp = list_entry((&client->persistent_invalidated_list)->next,
                         pgp_t,client_inv_pages);
        client->cur_pgp = pgp;
    } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
                             &client->persistent_invalidated_list) )
    {
        client->cur_pgp = NULL;
        ret = 0;
        goto out;
    } else {
        pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
                         pgp_t,client_inv_pages);
        client->cur_pgp = pgp;
    }
    h = (struct tmem_handle *)buf.p;
    h->pool_id = pgp->pool_id;
    *(OID *)&h->oid = pgp->inv_oid;
    h->index = pgp->index;
    ret = 1;
out:
    tmem_spin_unlock(&pers_lists_spinlock);
    return ret;
}

static int tmemc_restore_put_page(int cli_id, int pool_id, OID *oidp,
                      uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];

    if ( pool == NULL )
        return -1;
    return do_tmem_put(pool,oidp,index,0,0,0,bufsize,buf.p);
}

static int tmemc_restore_flush_page(int cli_id, int pool_id, OID *oidp,
                      uint32_t index)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];

    if ( pool == NULL )
        return -1;
    return do_tmem_flush_page(pool,oidp,index);
}

static NOINLINE int do_tmem_control(struct tmem_op *op)
{
    int ret;
    uint32_t pool_id = op->pool_id;
    uint32_t subop = op->u.ctrl.subop;
    OID *oidp = (OID *)(&op->u.ctrl.oid[0]);

    if (!tmh_current_is_privileged())
    {
        /* don't fail... mystery: sometimes dom0 fails here */
        /* return -EPERM; */
    }
    switch(subop)
    {
    case TMEMC_THAW:
    case TMEMC_FREEZE:
    case TMEMC_DESTROY:
        ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
        break;
    case TMEMC_FLUSH:
        ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
        break;
    case TMEMC_LIST:
        ret = tmemc_list(op->u.ctrl.cli_id,op->u.ctrl.buf,
                         op->u.ctrl.arg1,op->u.ctrl.arg2);
        break;
    case TMEMC_SET_WEIGHT:
    case TMEMC_SET_CAP:
    case TMEMC_SET_COMPRESS:
        ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
        break;
    case TMEMC_QUERY_FREEABLE_MB:
        ret = tmh_freeable_pages() >> (20 - PAGE_SHIFT);
        break;
    case TMEMC_SAVE_BEGIN:
    case TMEMC_RESTORE_BEGIN:
    case TMEMC_SAVE_GET_VERSION:
    case TMEMC_SAVE_GET_MAXPOOLS:
    case TMEMC_SAVE_GET_CLIENT_WEIGHT:
    case TMEMC_SAVE_GET_CLIENT_CAP:
    case TMEMC_SAVE_GET_CLIENT_FLAGS:
    case TMEMC_SAVE_GET_POOL_FLAGS:
    case TMEMC_SAVE_GET_POOL_NPAGES:
    case TMEMC_SAVE_GET_POOL_UUID:
    case TMEMC_SAVE_END:
        ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
                               op->u.ctrl.buf,op->u.ctrl.arg1);
        break;
    case TMEMC_SAVE_GET_NEXT_PAGE:
        ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
                                       op->u.ctrl.buf, op->u.ctrl.arg1);
        break;
    case TMEMC_SAVE_GET_NEXT_INV:
        ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, op->u.ctrl.buf,
                                      op->u.ctrl.arg1);
        break;
    case TMEMC_RESTORE_PUT_PAGE:
        ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
                                     oidp, op->u.ctrl.arg2,
                                     op->u.ctrl.buf, op->u.ctrl.arg1);
        break;
    case TMEMC_RESTORE_FLUSH_PAGE:
        ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
                                       oidp, op->u.ctrl.arg2);
        break;
    default:
        ret = -1;
    }
    return ret;
}

/************ EXPORTed FUNCTIONS **************************************/

EXPORT long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    client_t *client = tmh_client_from_current();
    pool_t *pool = NULL;
    OID *oidp;
    int rc = 0;
    bool_t succ_get = 0, succ_put = 0;
    bool_t non_succ_get = 0, non_succ_put = 0;
    bool_t flush = 0, flush_obj = 0;
    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
    DECL_LOCAL_CYC_COUNTER(succ_get);
    DECL_LOCAL_CYC_COUNTER(succ_put);
    DECL_LOCAL_CYC_COUNTER(non_succ_get);
    DECL_LOCAL_CYC_COUNTER(non_succ_put);
    DECL_LOCAL_CYC_COUNTER(flush);
    DECL_LOCAL_CYC_COUNTER(flush_obj);

    if ( !tmem_initialized )
        return -ENODEV;

    total_tmem_ops++;

    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_lock_irq(&tmem_spinlock);
        else
            spin_lock(&tmem_spinlock);
    }

    START_CYC_COUNTER(succ_get);
    DUP_START_CYC_COUNTER(succ_put,succ_get);
    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
    DUP_START_CYC_COUNTER(flush,succ_get);
    DUP_START_CYC_COUNTER(flush_obj,succ_get);

    if ( client != NULL && tmh_client_is_dying(client) )
    {
        rc = -ENODEV;
        goto out;
    }

    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
    {
        printk("tmem: can't get tmem struct from %s\n",client_str);
        rc = -EFAULT;
        goto out;
    }

    if ( op.cmd == TMEM_CONTROL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_control(&op);
        goto out;
    } else if ( op.cmd == TMEM_AUTH ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = tmemc_shared_pool_auth(op.u.creat.arg1,op.u.creat.uuid[0],
                           op.u.creat.uuid[1],op.u.creat.flags);
        goto out;
    } else if ( op.cmd == TMEM_RESTORE_NEW ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_new_pool(op.u.creat.arg1, op.pool_id, op.u.creat.flags,
                         op.u.creat.uuid[0], op.u.creat.uuid[1]);
        goto out;
    }

    /* create per-client tmem structure dynamically on first use by client */
    if ( client == NULL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL )
        {
            printk("tmem: can't create tmem structure for %s\n",client_str);
            rc = -ENOMEM;
            goto out;
        }
    }

    if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
    {
        if ( !tmem_write_lock_set )
        {
            tmem_write_lock(&tmem_rwlock);
            tmem_write_lock_set = 1;
        }
    }
    else
    {
        if ( !tmem_write_lock_set )
        {
            tmem_read_lock(&tmem_rwlock);
            tmem_read_lock_set = 1;
        }
        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
             ((pool = client->pools[op.pool_id]) == NULL) )
        {
            rc = -ENODEV;
            printk("tmem: operation requested on uncreated pool\n");
            goto out;
        }
        ASSERT_SENTINEL(pool,POOL);
    }

    oidp = (OID *)&op.u.gen.oid[0];
    switch ( op.cmd )
    {
    case TMEM_NEW_POOL:
        rc = do_tmem_new_pool(CLI_ID_NULL, 0, op.u.creat.flags,
                              op.u.creat.uuid[0], op.u.creat.uuid[1]);
        break;
    case TMEM_NEW_PAGE:
        tmem_ensure_avail_pages();
        rc = do_tmem_put(pool, oidp,
                         op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
        break;
    case TMEM_PUT_PAGE:
        tmem_ensure_avail_pages();
        rc = do_tmem_put(pool, oidp,
                         op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
        if (rc == 1) succ_put = 1;
        else non_succ_put = 1;
        break;
    case TMEM_GET_PAGE:
        rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, PAGE_SIZE, 0);
        if (rc == 1) succ_get = 1;
        else non_succ_get = 1;
        break;
    case TMEM_FLUSH_PAGE:
        flush = 1;
        rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
        break;
    case TMEM_FLUSH_OBJECT:
        rc = do_tmem_flush_object(pool, oidp);
        flush_obj = 1;
        break;
    case TMEM_DESTROY_POOL:
        flush = 1;
        rc = do_tmem_destroy_pool(op.pool_id);
        break;
    case TMEM_READ:
        rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len,0);
        break;
    case TMEM_WRITE:
        rc = do_tmem_put(pool, oidp,
                         op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len, NULL);
        break;
    case TMEM_XCHG:
        /* need to hold global lock to ensure xchg is atomic */
        printk("tmem_xchg op not implemented yet\n");
        rc = 0;
        break;
    default:
        printk("tmem: op %d not implemented\n", op.cmd);
        rc = 0;
        break;
    }

out:
    if ( rc < 0 )
        errored_tmem_ops++;
    if ( succ_get )
        END_CYC_COUNTER_CLI(succ_get,client);
    else if ( succ_put )
        END_CYC_COUNTER_CLI(succ_put,client);
    else if ( non_succ_get )
        END_CYC_COUNTER_CLI(non_succ_get,client);
    else if ( non_succ_put )
        END_CYC_COUNTER_CLI(non_succ_put,client);
    else if ( flush )
        END_CYC_COUNTER_CLI(flush,client);
    else if ( flush_obj )
        END_CYC_COUNTER_CLI(flush_obj,client);

    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_unlock_irq(&tmem_spinlock);
        else
            spin_unlock(&tmem_spinlock);
    } else {
        if ( tmem_write_lock_set )
            write_unlock(&tmem_rwlock);
        else if ( tmem_read_lock_set )
            read_unlock(&tmem_rwlock);
        else
            ASSERT(0);
    }

    return rc;
}

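/*
 * Locking summary for do_tmem_op() above: with tmh_lock_all set, one
 * global spinlock serializes all tmem operations; otherwise control,
 * auth, pool creation/destruction and first-use client creation take
 * tmem_rwlock for writing, while ordinary put/get/flush operations take
 * it for reading and rely on per-pool and per-object locks.
 */
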
/* this should be called when the host is destroying a client */
EXPORT void tmem_destroy(void *v)
{
    client_t *client = (client_t *)v;

    if ( client == NULL )
        return;

    if ( !tmh_client_is_dying(client) )
    {
        printk("tmem: tmem_destroy can only destroy dying client\n");
        return;
    }

    if ( tmh_lock_all )
        spin_lock(&tmem_spinlock);
    else
        write_lock(&tmem_rwlock);

    printk("tmem: flushing tmem pools for %s=%d\n",
           cli_id_str, client->cli_id);
    client_flush(client, 1);

    if ( tmh_lock_all )
        spin_unlock(&tmem_spinlock);
    else
        write_unlock(&tmem_rwlock);
}

/* freezing all pools guarantees that no additional memory will be consumed */
EXPORT void tmem_freeze_all(unsigned char key)
{
    static int freeze = 0;

    if ( tmh_lock_all )
        spin_lock(&tmem_spinlock);
    else
        write_lock(&tmem_rwlock);

    freeze = !freeze; /* alternately freeze and thaw on each invocation */
    tmemc_freeze_pools(CLI_ID_NULL,freeze);

    if ( tmh_lock_all )
        spin_unlock(&tmem_spinlock);
    else
        write_unlock(&tmem_rwlock);
}

#define MAX_EVICTS 10  /* should be variable or set via TMEMC_ ?? */

EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
{
    pfp_t *pfp;
    unsigned long evicts_per_relinq = 0;
    int max_evictions = MAX_EVICTS;

    if (!tmh_enabled() || !tmh_freeable_pages())
        return NULL;
#ifdef __i386__
    return NULL;
#endif

    relinq_attempts++;
    if ( order > 0 )
    {
#ifndef NDEBUG
        printk("tmem_relinquish_page: failing order=%d\n", order);
#endif
        return NULL;
    }

    if ( tmh_called_from_tmem(memflags) )
    {
        if ( tmh_lock_all )
            spin_lock(&tmem_spinlock);
        else
            read_lock(&tmem_rwlock);
    }

    while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
    {
        if ( (max_evictions-- <= 0) || !tmem_evict())
            break;
        evicts_per_relinq++;
    }
    if ( evicts_per_relinq > max_evicts_per_relinq )
        max_evicts_per_relinq = evicts_per_relinq;
    tmh_scrub_page(pfp, memflags);
    if ( pfp != NULL )
        relinq_pgs++;

    if ( tmh_called_from_tmem(memflags) )
    {
        if ( tmh_lock_all )
            spin_unlock(&tmem_spinlock);
        else
            read_unlock(&tmem_rwlock);
    }

    return pfp;
}

EXPORT unsigned long tmem_freeable_pages(void)
{
    return tmh_freeable_pages();
}

/* called at hypervisor startup */
static int __init init_tmem(void)
{
    int i;

    if ( !tmh_enabled() )
        return 0;

    radix_tree_init();
    if ( tmh_dedup_enabled() )
        for (i = 0; i < 256; i++ )
        {
            pcd_tree_roots[i] = RB_ROOT;
            rwlock_init(&pcd_tree_rwlocks[i]);
        }

    if ( tmh_init() )
    {
        printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n",
               tmh_compression_enabled(), tmh_dedup_enabled(),
               tmh_tze_enabled(), tmh_lock_all);
        if ( tmh_dedup_enabled() && tmh_compression_enabled() &&
             tmh_tze_enabled() )
        {
            tmh_tze_disable();
            printk("tmem: tze and compression not compatible, disabling tze\n");
        }
        tmem_initialized = 1;
    }
    else
        printk("tmem: initialization FAILED\n");

    return 0;
}
__initcall(init_tmem);

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */