debuggers.hg: view of xen/common/tmem.c @ 22906:700ac6445812

changeset: Now add KDB to the non-kdb tree
author:    Mukesh Rathor
date:      Thu Feb 03 15:42:41 2011 -0800
parent:    01f3b3509023

line source
1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
9 /* TODO list: 090129 (updated 100318)
10 - any better reclamation policy?
11 - use different tlsf pools for each client (maybe each pool)
12 - test shared access more completely (ocfs2)
13 - add feedback-driven compression (not for persistent pools though!)
14 - add data-structure total bytes overhead stats
15 */
17 #ifdef __XEN__
18 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
19 #endif
21 #include <xen/tmem.h>
22 #include <xen/rbtree.h>
23 #include <xen/radix-tree.h>
24 #include <xen/list.h>
26 #define EXPORT /* indicates code other modules are dependent upon */
27 #define FORWARD
29 #define TMEM_SPEC_VERSION 1
31 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
33 #define CLI_ID_NULL TMH_CLI_ID_NULL
34 #define cli_id_str tmh_cli_id_str
35 #define client_str tmh_client_str
37 /************ DEBUG and STATISTICS (+ some compression testing) *******/
39 #ifndef NDEBUG
40 #define SENTINELS
41 #define NOINLINE noinline
42 #else
43 #define NOINLINE
44 #endif
46 #ifdef SENTINELS
47 #define DECL_SENTINEL unsigned long sentinel;
48 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
49 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
50 #define ASSERT_SENTINEL(_x,_y) \
51 ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
52 #ifdef __i386__
53 #define POOL_SENTINEL 0x87658765
54 #define OBJ_SENTINEL 0x12345678
55 #define OBJNODE_SENTINEL 0xfedcba09
56 #define PGD_SENTINEL 0x43214321
57 #else
58 #define POOL_SENTINEL 0x8765876587658765
59 #define OBJ_SENTINEL 0x1234567812345678
60 #define OBJNODE_SENTINEL 0xfedcba0987654321
61 #define PGD_SENTINEL 0x4321432143214321
62 #endif
63 #else
64 #define DECL_SENTINEL
65 #define SET_SENTINEL(_x,_y) do { } while (0)
66 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
67 #define INVERT_SENTINEL(_x,_y) do { } while (0)
68 #endif
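/* Editorial sketch, not part of the original file: typical use of the
 * sentinel machinery above. A structure embeds DECL_SENTINEL, is stamped
 * on allocation, checked before each use, and poisoned on free so that a
 * stale or corrupted pointer trips an ASSERT instead of silently
 * corrupting memory. The struct, helper and DEMO_SENTINEL value below are
 * hypothetical. */
#ifdef SENTINELS
#define DEMO_SENTINEL 0x5e171e15
struct demo {
    DECL_SENTINEL /* expands to: unsigned long sentinel; */
    int payload;
};
static inline void demo_lifetime(struct demo *d)
{
    SET_SENTINEL(d,DEMO);    /* d->sentinel = DEMO_SENTINEL */
    ASSERT_SENTINEL(d,DEMO); /* cheap validity check before use */
    INVERT_SENTINEL(d,DEMO); /* poison on free to catch reuse */
}
#endif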
70 /* global statistics (none need to be locked) */
71 static unsigned long total_tmem_ops = 0;
72 static unsigned long errored_tmem_ops = 0;
73 static unsigned long total_flush_pool = 0;
74 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
75 static unsigned long evicted_pgs = 0, evict_attempts = 0;
76 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
77 static unsigned long max_evicts_per_relinq = 0;
78 static unsigned long low_on_memory = 0;
79 static unsigned long deduped_puts = 0;
80 static unsigned long tot_good_eph_puts = 0;
81 static int global_obj_count_max = 0;
82 static int global_pgp_count_max = 0;
83 static int global_pcd_count_max = 0;
84 static int global_page_count_max = 0;
85 static int global_rtree_node_count_max = 0;
86 static long global_eph_count_max = 0;
87 static unsigned long failed_copies;
88 static unsigned long pcd_tot_tze_size = 0;
89 static unsigned long pcd_tot_csize = 0;
91 DECL_CYC_COUNTER(succ_get);
92 DECL_CYC_COUNTER(succ_put);
93 DECL_CYC_COUNTER(non_succ_get);
94 DECL_CYC_COUNTER(non_succ_put);
95 DECL_CYC_COUNTER(flush);
96 DECL_CYC_COUNTER(flush_obj);
97 #ifdef COMPARE_COPY_PAGE_SSE2
98 EXTERN_CYC_COUNTER(pg_copy1);
99 EXTERN_CYC_COUNTER(pg_copy2);
100 EXTERN_CYC_COUNTER(pg_copy3);
101 EXTERN_CYC_COUNTER(pg_copy4);
102 #else
103 EXTERN_CYC_COUNTER(pg_copy);
104 #endif
105 DECL_CYC_COUNTER(compress);
106 DECL_CYC_COUNTER(decompress);
108 /************ CORE DATA STRUCTURES ************************************/
110 #define MAX_POOLS_PER_DOMAIN 16
111 #define MAX_GLOBAL_SHARED_POOLS 16
113 struct tm_pool;
114 struct tmem_page_descriptor;
115 struct tmem_page_content_descriptor;
116 struct client {
117 struct list_head client_list;
118 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
119 tmh_client_t *tmh;
120 struct list_head ephemeral_page_list;
121 long eph_count, eph_count_max;
122 cli_id_t cli_id;
123 uint32_t weight;
124 uint32_t cap;
125 bool_t compress;
126 bool_t frozen;
127 bool_t shared_auth_required;
128 /* for save/restore/migration */
129 bool_t live_migrating;
130 bool_t was_frozen;
131 struct list_head persistent_invalidated_list;
132 struct tmem_page_descriptor *cur_pgp;
133 /* statistics collection */
134 unsigned long compress_poor, compress_nomem;
135 unsigned long compressed_pages;
136 uint64_t compressed_sum_size;
137 uint64_t total_cycles;
138 unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
139 /* shared pool authentication */
140 uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
141 };
142 typedef struct client client_t;
144 struct share_list {
145 struct list_head share_list;
146 client_t *client;
147 };
148 typedef struct share_list sharelist_t;
150 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
151 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
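/* Editorial note, not part of the original file: the mask only works
 * because the bucket count is a power of two; for unsigned h,
 * h & (OBJ_HASH_BUCKETS-1) == h % OBJ_HASH_BUCKETS, e.g.
 * 0x1234 & 0xff == 0x34. */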
153 struct tm_pool {
154 bool_t shared;
155 bool_t persistent;
156 bool_t is_dying;
157 int pageshift; /* 0 == 2**12 */
158 struct list_head pool_list;
159 client_t *client;
160 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
161 uint32_t pool_id;
162 rwlock_t pool_rwlock;
163 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
164 struct list_head share_list; /* valid if shared */
165 int shared_count; /* valid if shared */
166 /* for save/restore/migration */
167 struct list_head persistent_page_list;
168 struct tmem_page_descriptor *cur_pgp;
169 /* statistics collection */
170 atomic_t pgp_count;
171 int pgp_count_max;
172 long obj_count; /* atomicity depends on pool_rwlock held for write */
173 long obj_count_max;
174 unsigned long objnode_count, objnode_count_max;
175 uint64_t sum_life_cycles;
176 uint64_t sum_evicted_cycles;
177 unsigned long puts, good_puts, no_mem_puts;
178 unsigned long dup_puts_flushed, dup_puts_replaced;
179 unsigned long gets, found_gets;
180 unsigned long flushs, flushs_found;
181 unsigned long flush_objs, flush_objs_found;
182 DECL_SENTINEL
183 };
184 typedef struct tm_pool pool_t;
186 #define is_persistent(_p) (_p->persistent)
187 #define is_ephemeral(_p) (!(_p->persistent))
188 #define is_shared(_p) (_p->shared)
189 #define is_private(_p) (!(_p->shared))
191 struct oid {
192 uint64_t oid[3];
193 };
194 typedef struct oid OID;
196 struct tmem_object_root {
197 DECL_SENTINEL
198 OID oid;
199 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
200 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
201 long pgp_count; /* atomicity depends on obj_spinlock */
202 struct radix_tree_root tree_root; /* tree of pages within object */
203 pool_t *pool;
204 cli_id_t last_client;
205 spinlock_t obj_spinlock;
206 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
207 };
208 typedef struct tmem_object_root obj_t;
210 typedef struct radix_tree_node rtn_t;
211 struct tmem_object_node {
212 obj_t *obj;
213 DECL_SENTINEL
214 rtn_t rtn;
215 };
216 typedef struct tmem_object_node objnode_t;
218 struct tmem_page_descriptor {
219 union {
220 struct list_head global_eph_pages;
221 struct list_head client_inv_pages;
222 };
223 union {
224 struct {
225 union {
226 struct list_head client_eph_pages;
227 struct list_head pool_pers_pages;
228 };
229 obj_t *obj;
230 } us;
231 OID inv_oid; /* used for invalid list only */
232 };
233 pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
234 else compressed data (cdata) */
235 uint32_t index;
236 /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
237 uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd */
238 bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */
239 struct list_head pcd_siblings;
240 union {
241 pfp_t *pfp; /* page frame pointer */
242 char *cdata; /* compressed data */
243 struct tmem_page_content_descriptor *pcd; /* page dedup */
244 };
245 union {
246 uint64_t timestamp;
247 uint32_t pool_id; /* used for invalid list only */
248 };
249 DECL_SENTINEL
250 };
251 typedef struct tmem_page_descriptor pgp_t;
253 #define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
255 struct tmem_page_content_descriptor {
256 union {
257 pfp_t *pfp; /* page frame pointer */
258 char *cdata; /* if compression_enabled */
259 char *tze; /* if !compression_enabled, trailing zeroes eliminated */
260 };
261 struct list_head pgp_list;
262 struct rb_node pcd_rb_tree_node;
263 uint32_t pgp_ref_count;
264 pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
265 * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
266 * else PAGE_SIZE -> *pfp */
267 };
268 typedef struct tmem_page_content_descriptor pcd_t;
269 struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
270 rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
272 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
274 static LIST_HEAD(global_client_list);
275 static LIST_HEAD(global_pool_list);
277 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
278 static bool_t global_shared_auth = 0;
279 static atomic_t client_weight_total = ATOMIC_INIT(0);
280 static int tmem_initialized = 0;
282 /************ CONCURRENCY ***********************************************/
284 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
285 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
286 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
287 static DEFINE_SPINLOCK(pers_lists_spinlock);
289 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
290 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
291 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
292 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
293 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
294 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
295 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
296 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
298 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
299 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
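/* Editorial sketch, not part of the original file: the wrappers above turn
 * every fine-grained lock into a no-op when tmh_lock_all is set, because in
 * that mode the single tmem_spinlock taken at the hypercall boundary already
 * serializes all of tmem; the two ASSERT macros accept either discipline.
 * A minimal illustration (hypothetical helper name): */
static inline void locking_sketch(rwlock_t *rw)
{
    tmem_write_lock(rw);  /* real write_lock() only if !tmh_lock_all */
    ASSERT_WRITELOCK(rw); /* holds under either locking regime */
    tmem_write_unlock(rw);
}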
301 /* global counters (should use long_atomic_t access) */
302 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
303 static atomic_t global_obj_count = ATOMIC_INIT(0);
304 static atomic_t global_pgp_count = ATOMIC_INIT(0);
305 static atomic_t global_pcd_count = ATOMIC_INIT(0);
306 static atomic_t global_page_count = ATOMIC_INIT(0);
307 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
309 #define atomic_inc_and_max(_c) do { \
310 atomic_inc(&_c); \
311 if ( _atomic_read(_c) > _c##_max ) \
312 _c##_max = _atomic_read(_c); \
313 } while (0)
315 #define atomic_dec_and_assert(_c) do { \
316 atomic_dec(&_c); \
317 ASSERT(_atomic_read(_c) >= 0); \
318 } while (0)
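/* Editorial sketch, not part of the original file: the two macros above
 * maintain a counter together with its high-water mark, relying on the
 * convention that counter _c has a companion named _c_max, e.g.: */
static inline void counter_sketch(void)
{
    atomic_inc_and_max(global_obj_count);    /* updates global_obj_count_max on a new peak */
    atomic_dec_and_assert(global_obj_count); /* the count must never go negative */
}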
321 /************ MEMORY ALLOCATION INTERFACE *****************************/
323 #define tmem_malloc(_type,_pool) \
324 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
326 #define tmem_malloc_bytes(_size,_pool) \
327 _tmem_malloc(_size, 1, _pool)
329 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
330 {
331 void *v;
333 if ( (pool != NULL) && is_persistent(pool) )
334 v = tmh_alloc_subpage_thispool(pool,size,align);
335 else
336 v = tmh_alloc_subpage(pool, size, align);
337 if ( v == NULL )
338 alloc_failed++;
339 return v;
340 }
342 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
343 {
344 if ( pool == NULL || !is_persistent(pool) )
345 tmh_free_subpage(p,size);
346 else
347 tmh_free_subpage_thispool(pool,p,size);
348 }
350 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
351 {
352 pfp_t *pfp = NULL;
354 if ( pool != NULL && is_persistent(pool) )
355 pfp = tmh_alloc_page_thispool(pool);
356 else
357 pfp = tmh_alloc_page(pool,0);
358 if ( pfp == NULL )
359 alloc_page_failed++;
360 else
361 atomic_inc_and_max(global_page_count);
362 return pfp;
363 }
365 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
366 {
367 ASSERT(pfp);
368 if ( pool == NULL || !is_persistent(pool) )
369 tmh_free_page(pfp);
370 else
371 tmh_free_page_thispool(pool,pfp);
372 atomic_dec_and_assert(global_page_count);
373 }
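/* Editorial note, not part of the original file: all four helpers above
 * route on pool persistence. Persistent pools draw from their own accounted
 * store (the tmh_*_thispool variants), so a client cannot consume more
 * persistent memory than it was granted, while ephemeral allocations come
 * from the general tmem heap and remain reclaimable through eviction. */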
375 /************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
377 #define NOT_SHAREABLE ((uint16_t)-1UL)
379 static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp)
380 {
381 uint8_t firstbyte = pgp->firstbyte;
382 pcd_t *pcd;
383 int ret;
385 ASSERT(tmh_dedup_enabled());
386 tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
387 pcd = pgp->pcd;
388 if ( pgp->size < PAGE_SIZE && pgp->size != 0 &&
389 pcd->size < PAGE_SIZE && pcd->size != 0 )
390 ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL);
391 else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE )
392 ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size);
393 else
394 ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL);
395 tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
396 return ret;
397 }
399 /* ensure pgp no longer points to pcd, nor vice-versa */
400 /* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
401 static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
402 {
403 pcd_t *pcd = pgp->pcd;
404 pfp_t *pfp = pgp->pcd->pfp;
405 uint16_t firstbyte = pgp->firstbyte;
406 char *pcd_tze = pgp->pcd->tze;
407 pagesize_t pcd_size = pcd->size;
408 pagesize_t pgp_size = pgp->size;
409 char *pcd_cdata = pgp->pcd->cdata;
410 pagesize_t pcd_csize = pgp->pcd->size;
412 ASSERT(tmh_dedup_enabled());
413 ASSERT(firstbyte != NOT_SHAREABLE);
414 ASSERT(firstbyte < 256);
416 if ( have_pcd_rwlock )
417 ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
418 else
419 tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
420 list_del_init(&pgp->pcd_siblings);
421 pgp->pcd = NULL;
422 pgp->firstbyte = NOT_SHAREABLE;
423 pgp->size = -1;
424 if ( --pcd->pgp_ref_count )
425 {
426 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
427 return;
428 }
430 /* no more references to this pcd, recycle it and the physical page */
431 ASSERT(list_empty(&pcd->pgp_list));
432 pcd->pfp = NULL;
433 /* remove pcd from rbtree */
434 rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
435 /* reinit the struct for safety for now */
436 RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
437 /* now free up the pcd memory */
438 tmem_free(pcd,sizeof(pcd_t),NULL);
439 atomic_dec_and_assert(global_pcd_count);
440 if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
441 {
442 /* compressed data */
443 tmem_free(pcd_cdata,pcd_csize,pool);
444 pcd_tot_csize -= pcd_csize;
445 }
446 else if ( pcd_size != PAGE_SIZE )
447 {
448 /* trailing zero data */
449 pcd_tot_tze_size -= pcd_size;
450 if ( pcd_size )
451 tmem_free(pcd_tze,pcd_size,pool);
452 } else {
453 /* real physical page */
454 if ( tmh_tze_enabled() )
455 pcd_tot_tze_size -= PAGE_SIZE;
456 if ( tmh_compression_enabled() )
457 pcd_tot_csize -= PAGE_SIZE;
458 tmem_page_free(pool,pfp);
459 }
460 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
461 }
464 static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize)
465 {
466 struct rb_node **new, *parent = NULL;
467 struct rb_root *root;
468 pcd_t *pcd;
469 int cmp;
470 pagesize_t pfp_size = 0;
471 uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) : *cdata;
472 int ret = 0;
474 if ( !tmh_dedup_enabled() )
475 return 0;
476 ASSERT(pgp->us.obj != NULL);
477 ASSERT(pgp->us.obj->pool != NULL);
478 ASSERT(!pgp->us.obj->pool->persistent);
479 if ( cdata == NULL )
480 {
481 ASSERT(pgp->pfp != NULL);
482 pfp_size = PAGE_SIZE;
483 if ( tmh_tze_enabled() )
484 {
485 pfp_size = tmh_tze_pfp_scan(pgp->pfp);
486 if ( pfp_size > PCD_TZE_MAX_SIZE )
487 pfp_size = PAGE_SIZE;
488 }
489 ASSERT(pfp_size <= PAGE_SIZE);
490 ASSERT(!(pfp_size & (sizeof(uint64_t)-1)));
491 }
492 tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
494 /* look for page match */
495 root = &pcd_tree_roots[firstbyte];
496 new = &(root->rb_node);
497 while ( *new )
498 {
499 pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
500 parent = *new;
501 /* compare new entry and rbtree entry, set cmp accordingly */
502 if ( cdata != NULL )
503 {
504 if ( pcd->size < PAGE_SIZE )
505 /* both new entry and rbtree entry are compressed */
506 cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
507 else
508 /* new entry is compressed, rbtree entry is not */
509 cmp = -1;
510 } else if ( pcd->size < PAGE_SIZE )
511 /* rbtree entry is compressed, new entry is not */
512 cmp = 1;
513 else if ( tmh_tze_enabled() ) {
514 if ( pcd->size < PAGE_SIZE )
515 /* both new entry and rbtree entry are trailing zero */
516 cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
517 else
518 /* new entry is trailing zero, rbtree entry is not */
519 cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
520 } else {
521 /* both new entry and rbtree entry are full physical pages */
522 ASSERT(pgp->pfp != NULL);
523 ASSERT(pcd->pfp != NULL);
524 cmp = tmh_page_cmp(pgp->pfp,pcd->pfp);
525 }
527 /* walk tree or match depending on cmp */
528 if ( cmp < 0 )
529 new = &((*new)->rb_left);
530 else if ( cmp > 0 )
531 new = &((*new)->rb_right);
532 else
533 {
534 /* match! if not compressed, free the no-longer-needed page */
535 /* but if compressed, data is assumed static so don't free! */
536 if ( cdata == NULL )
537 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
538 deduped_puts++;
539 goto match;
540 }
541 }
543 /* exited while loop with no match, so alloc a pcd and put it in the tree */
544 if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
545 {
546 ret = -ENOMEM;
547 goto unlock;
548 } else if ( cdata != NULL ) {
549 if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->us.obj->pool)) == NULL )
550 {
551 tmem_free(pcd,sizeof(pcd_t),NULL);
552 ret = -ENOMEM;
553 goto unlock;
554 }
555 }
556 atomic_inc_and_max(global_pcd_count);
557 RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */
558 INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */
559 pcd->pgp_ref_count = 0;
560 if ( cdata != NULL )
561 {
562 memcpy(pcd->cdata,cdata,csize);
563 pcd->size = csize;
564 pcd_tot_csize += csize;
565 } else if ( pfp_size == 0 ) {
566 ASSERT(tmh_tze_enabled());
567 pcd->size = 0;
568 pcd->tze = NULL;
569 } else if ( pfp_size < PAGE_SIZE &&
570 ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->us.obj->pool)) != NULL) ) {
571 tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
572 pcd->size = pfp_size;
573 pcd_tot_tze_size += pfp_size;
574 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
575 } else {
576 pcd->pfp = pgp->pfp;
577 pcd->size = PAGE_SIZE;
578 if ( tmh_tze_enabled() )
579 pcd_tot_tze_size += PAGE_SIZE;
580 if ( tmh_compression_enabled() )
581 pcd_tot_csize += PAGE_SIZE;
582 }
583 rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
584 rb_insert_color(&pcd->pcd_rb_tree_node, root);
586 match:
587 pcd->pgp_ref_count++;
588 list_add(&pgp->pcd_siblings,&pcd->pgp_list);
589 pgp->firstbyte = firstbyte;
590 pgp->eviction_attempted = 0;
591 pgp->pcd = pcd;
593 unlock:
594 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
595 return ret;
596 }
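/* Editorial sketch, not part of the original file: deduplication keys each
 * sharable page by its first byte, so every one of the 256 pcd_tree_roots[]
 * rbtrees only compares pages that already agree on that byte, and the
 * matching pcd_tree_rwlocks[] entry provides per-bucket concurrency
 * (hypothetical helper name): */
static inline rwlock_t *pcd_bucket_lock_sketch(pgp_t *pgp)
{
    ASSERT(pgp->firstbyte != NOT_SHAREABLE); /* only dedup'ed pages have a bucket */
    return &pcd_tree_rwlocks[pgp->firstbyte];
}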
598 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
600 /* allocate a pgp_t and associate it with an object */
601 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
602 {
603 pgp_t *pgp;
604 pool_t *pool;
606 ASSERT(obj != NULL);
607 ASSERT(obj->pool != NULL);
608 pool = obj->pool;
609 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
610 return NULL;
611 pgp->us.obj = obj;
612 INIT_LIST_HEAD(&pgp->global_eph_pages);
613 INIT_LIST_HEAD(&pgp->us.client_eph_pages);
614 pgp->pfp = NULL;
615 if ( tmh_dedup_enabled() )
616 {
617 pgp->firstbyte = NOT_SHAREABLE;
618 pgp->eviction_attempted = 0;
619 INIT_LIST_HEAD(&pgp->pcd_siblings);
620 }
621 pgp->size = -1;
622 pgp->index = -1;
623 pgp->timestamp = get_cycles();
624 SET_SENTINEL(pgp,PGD);
625 atomic_inc_and_max(global_pgp_count);
626 atomic_inc_and_max(pool->pgp_count);
627 return pgp;
628 }
630 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
631 {
632 ASSERT(obj != NULL);
633 ASSERT_SPINLOCK(&obj->obj_spinlock);
634 ASSERT_SENTINEL(obj,OBJ);
635 ASSERT(obj->pool != NULL);
636 ASSERT_SENTINEL(obj->pool,POOL);
637 return radix_tree_lookup(&obj->tree_root, index);
638 }
640 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
641 {
642 pagesize_t pgp_size = pgp->size;
644 if ( pgp->pfp == NULL )
645 return;
646 if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
647 pcd_disassociate(pgp,pool,0); /* pgp->size lost */
648 else if ( pgp_size )
649 tmem_free(pgp->cdata,pgp_size,pool);
650 else
651 tmem_page_free(pgp->us.obj->pool,pgp->pfp);
652 if ( pool != NULL && pgp_size )
653 {
654 pool->client->compressed_pages--;
655 pool->client->compressed_sum_size -= pgp_size;
656 }
657 pgp->pfp = NULL;
658 pgp->size = -1;
659 }
661 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
662 {
663 pool_t *pool = NULL;
665 ASSERT_SENTINEL(pgp,PGD);
666 ASSERT(pgp->us.obj != NULL);
667 ASSERT_SENTINEL(pgp->us.obj,OBJ);
668 ASSERT_SENTINEL(pgp->us.obj->pool,POOL);
669 ASSERT(pgp->us.obj->pool->client != NULL);
670 if ( from_delete )
671 ASSERT(pgp_lookup_in_obj(pgp->us.obj,pgp->index) == NULL);
672 ASSERT(pgp->us.obj->pool != NULL);
673 pool = pgp->us.obj->pool;
674 if ( is_ephemeral(pool) )
675 {
676 ASSERT(list_empty(&pgp->global_eph_pages));
677 ASSERT(list_empty(&pgp->us.client_eph_pages));
678 }
679 pgp_free_data(pgp, pool);
680 atomic_dec_and_assert(global_pgp_count);
681 atomic_dec_and_assert(pool->pgp_count);
682 pgp->size = -1;
683 if ( is_persistent(pool) && pool->client->live_migrating )
684 {
685 pgp->inv_oid = pgp->us.obj->oid;
686 pgp->pool_id = pool->pool_id;
687 return;
688 }
689 INVERT_SENTINEL(pgp,PGD);
690 pgp->us.obj = NULL;
691 pgp->index = -1;
692 tmem_free(pgp,sizeof(pgp_t),pool);
693 }
695 static NOINLINE void pgp_free_from_inv_list(client_t *client, pgp_t *pgp)
696 {
697 pool_t *pool = client->pools[pgp->pool_id];
699 ASSERT_SENTINEL(pool,POOL);
700 ASSERT_SENTINEL(pgp,PGD);
701 INVERT_SENTINEL(pgp,PGD);
702 pgp->us.obj = NULL;
703 pgp->index = -1;
704 tmem_free(pgp,sizeof(pgp_t),pool);
705 }
707 /* remove the page from appropriate lists but not from parent object */
708 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
709 {
710 client_t *client;
712 ASSERT(pgp != NULL);
713 ASSERT(pgp->us.obj != NULL);
714 ASSERT(pgp->us.obj->pool != NULL);
715 client = pgp->us.obj->pool->client;
716 ASSERT(client != NULL);
717 if ( is_ephemeral(pgp->us.obj->pool) )
718 {
719 if ( !no_eph_lock )
720 tmem_spin_lock(&eph_lists_spinlock);
721 if ( !list_empty(&pgp->us.client_eph_pages) )
722 client->eph_count--;
723 ASSERT(client->eph_count >= 0);
724 list_del_init(&pgp->us.client_eph_pages);
725 if ( !list_empty(&pgp->global_eph_pages) )
726 global_eph_count--;
727 ASSERT(global_eph_count >= 0);
728 list_del_init(&pgp->global_eph_pages);
729 if ( !no_eph_lock )
730 tmem_spin_unlock(&eph_lists_spinlock);
731 } else {
732 if ( client->live_migrating )
733 {
734 tmem_spin_lock(&pers_lists_spinlock);
735 list_add_tail(&pgp->client_inv_pages,
736 &client->persistent_invalidated_list);
737 if ( pgp != pgp->us.obj->pool->cur_pgp )
738 list_del_init(&pgp->us.pool_pers_pages);
739 tmem_spin_unlock(&pers_lists_spinlock);
740 } else {
741 tmem_spin_lock(&pers_lists_spinlock);
742 list_del_init(&pgp->us.pool_pers_pages);
743 tmem_spin_unlock(&pers_lists_spinlock);
744 }
745 }
746 }
748 /* remove page from lists (but not from parent object) and free it */
749 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
750 {
751 uint64_t life;
753 ASSERT(pgp != NULL);
754 ASSERT(pgp->us.obj != NULL);
755 ASSERT(pgp->us.obj->pool != NULL);
756 life = get_cycles() - pgp->timestamp;
757 pgp->us.obj->pool->sum_life_cycles += life;
758 pgp_delist(pgp, no_eph_lock);
759 pgp_free(pgp,1);
760 }
762 /* called only indirectly by radix_tree_destroy */
763 static NOINLINE void pgp_destroy(void *v)
764 {
765 pgp_t *pgp = (pgp_t *)v;
767 ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
768 pgp_delist(pgp,0);
769 ASSERT(pgp->us.obj != NULL);
770 pgp->us.obj->pgp_count--;
771 ASSERT(pgp->us.obj->pgp_count >= 0);
772 pgp_free(pgp,0);
773 }
775 FORWARD static rtn_t *rtn_alloc(void *arg);
776 FORWARD static void rtn_free(rtn_t *rtn);
778 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
779 {
780 int ret;
782 ASSERT_SPINLOCK(&obj->obj_spinlock);
783 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
784 if ( !ret )
785 obj->pgp_count++;
786 return ret;
787 }
789 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
790 {
791 pgp_t *pgp;
793 ASSERT(obj != NULL);
794 ASSERT_SPINLOCK(&obj->obj_spinlock);
795 ASSERT_SENTINEL(obj,OBJ);
796 ASSERT(obj->pool != NULL);
797 ASSERT_SENTINEL(obj->pool,POOL);
798 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
799 if ( pgp != NULL )
800 obj->pgp_count--;
801 ASSERT(obj->pgp_count >= 0);
803 return pgp;
804 }
806 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
808 /* called only indirectly from radix_tree_insert */
809 static NOINLINE rtn_t *rtn_alloc(void *arg)
810 {
811 objnode_t *objnode;
812 obj_t *obj = (obj_t *)arg;
814 ASSERT_SENTINEL(obj,OBJ);
815 ASSERT(obj->pool != NULL);
816 ASSERT_SENTINEL(obj->pool,POOL);
817 objnode = tmem_malloc(objnode_t,obj->pool);
818 if (objnode == NULL)
819 return NULL;
820 objnode->obj = obj;
821 SET_SENTINEL(objnode,OBJNODE);
822 memset(&objnode->rtn, 0, sizeof(rtn_t));
823 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
824 obj->pool->objnode_count_max = obj->pool->objnode_count;
825 atomic_inc_and_max(global_rtree_node_count);
826 obj->objnode_count++;
827 return &objnode->rtn;
828 }
830 /* called only indirectly from radix_tree_delete/destroy */
831 static void rtn_free(rtn_t *rtn)
832 {
833 pool_t *pool;
834 objnode_t *objnode;
835 int i;
837 ASSERT(rtn != NULL);
838 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
839 ASSERT(rtn->slots[i] == NULL);
840 objnode = container_of(rtn,objnode_t,rtn);
841 ASSERT_SENTINEL(objnode,OBJNODE);
842 INVERT_SENTINEL(objnode,OBJNODE);
843 ASSERT(objnode->obj != NULL);
844 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
845 ASSERT_SENTINEL(objnode->obj,OBJ);
846 pool = objnode->obj->pool;
847 ASSERT(pool != NULL);
848 ASSERT_SENTINEL(pool,POOL);
849 pool->objnode_count--;
850 objnode->obj->objnode_count--;
851 objnode->obj = NULL;
852 tmem_free(objnode,sizeof(objnode_t),pool);
853 atomic_dec_and_assert(global_rtree_node_count);
854 }
856 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
858 int oid_compare(OID *left, OID *right)
859 {
860 if ( left->oid[2] == right->oid[2] )
861 {
862 if ( left->oid[1] == right->oid[1] )
863 {
864 if ( left->oid[0] == right->oid[0] )
865 return 0;
866 else if ( left->oid[0] < right->oid[0] )
867 return -1;
868 else
869 return 1;
870 }
871 else if ( left->oid[1] < right->oid[1] )
872 return -1;
873 else
874 return 1;
875 }
876 else if ( left->oid[2] < right->oid[2] )
877 return -1;
878 else
879 return 1;
880 }
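/* Editorial note, not part of the original file: oid_compare orders OIDs
 * lexicographically with oid[2] most significant. Worked example, writing
 * an OID as {oid[0],oid[1],oid[2]}: left = {5,9,7}, right = {8,3,7} --
 * oid[2] ties (7 == 7), oid[1] decides, and 9 > 3 yields 1. */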
882 void oid_set_invalid(OID *oidp)
883 {
884 oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
885 }
887 unsigned oid_hash(OID *oidp)
888 {
889 return (tmh_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
890 BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
891 }
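/* Editorial sketch, not part of the original file: the hash above XOR-folds
 * the three 64-bit OID words and masks the result, mapping each object to
 * one of the OBJ_HASH_BUCKETS per-pool rbtrees. obj_find() and obj_new()
 * below compute the same index inline (hypothetical helper name): */
static inline struct rb_root *oid_bucket_sketch(pool_t *pool, OID *oidp)
{
    return &pool->obj_rb_root[oid_hash(oidp)];
}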
893 /* searches for object==oid in pool, returns locked object if found */
894 static NOINLINE obj_t * obj_find(pool_t *pool, OID *oidp)
895 {
896 struct rb_node *node;
897 obj_t *obj;
899 restart_find:
900 tmem_read_lock(&pool->pool_rwlock);
901 node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
902 while ( node )
903 {
904 obj = container_of(node, obj_t, rb_tree_node);
905 switch ( oid_compare(&obj->oid, oidp) )
906 {
907 case 0: /* equal */
908 if ( tmh_lock_all )
909 obj->no_evict = 1;
910 else
911 {
912 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
913 {
914 tmem_read_unlock(&pool->pool_rwlock);
915 goto restart_find;
916 }
917 tmem_read_unlock(&pool->pool_rwlock);
918 }
919 return obj;
920 case -1:
921 node = node->rb_left;
922 break;
923 case 1:
924 node = node->rb_right;
925 }
926 }
927 tmem_read_unlock(&pool->pool_rwlock);
928 return NULL;
929 }
931 /* free an object that has no more pgps in it */
932 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
933 {
934 pool_t *pool;
935 OID old_oid;
937 ASSERT_SPINLOCK(&obj->obj_spinlock);
938 ASSERT(obj != NULL);
939 ASSERT_SENTINEL(obj,OBJ);
940 ASSERT(obj->pgp_count == 0);
941 pool = obj->pool;
942 ASSERT(pool != NULL);
943 ASSERT(pool->client != NULL);
944 ASSERT_WRITELOCK(&pool->pool_rwlock);
945 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
946 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
947 ASSERT((long)obj->objnode_count == 0);
948 ASSERT(obj->tree_root.rnode == NULL);
949 pool->obj_count--;
950 ASSERT(pool->obj_count >= 0);
951 INVERT_SENTINEL(obj,OBJ);
952 obj->pool = NULL;
953 old_oid = obj->oid;
954 oid_set_invalid(&obj->oid);
955 obj->last_client = CLI_ID_NULL;
956 atomic_dec_and_assert(global_obj_count);
957 /* use no_rebalance only if all objects are being destroyed anyway */
958 if ( !no_rebalance )
959 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[oid_hash(&old_oid)]);
960 tmem_free(obj,sizeof(obj_t),pool);
961 }
963 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
964 {
965 struct rb_node **new, *parent = NULL;
966 obj_t *this;
968 new = &(root->rb_node);
969 while ( *new )
970 {
971 this = container_of(*new, obj_t, rb_tree_node);
972 parent = *new;
973 switch ( oid_compare(&this->oid, &obj->oid) )
974 {
975 case 0:
976 return 0;
977 case -1:
978 new = &((*new)->rb_left);
979 break;
980 case 1:
981 new = &((*new)->rb_right);
982 break;
983 }
984 }
985 rb_link_node(&obj->rb_tree_node, parent, new);
986 rb_insert_color(&obj->rb_tree_node, root);
987 return 1;
988 }
990 /*
991 * allocate, initialize, and insert a tmem_object_root
992 * (should be called only if find failed)
993 */
994 static NOINLINE obj_t * obj_new(pool_t *pool, OID *oidp)
995 {
996 obj_t *obj;
998 ASSERT(pool != NULL);
999 ASSERT_WRITELOCK(&pool->pool_rwlock);
1000 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
1001 return NULL;
1002 pool->obj_count++;
1003 if (pool->obj_count > pool->obj_count_max)
1004 pool->obj_count_max = pool->obj_count;
1005 atomic_inc_and_max(global_obj_count);
1006 INIT_RADIX_TREE(&obj->tree_root,0);
1007 spin_lock_init(&obj->obj_spinlock);
1008 obj->pool = pool;
1009 obj->oid = *oidp;
1010 obj->objnode_count = 0;
1011 obj->pgp_count = 0;
1012 obj->last_client = CLI_ID_NULL;
1013 SET_SENTINEL(obj,OBJ);
1014 tmem_spin_lock(&obj->obj_spinlock);
1015 obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj);
1016 obj->no_evict = 1;
1017 ASSERT_SPINLOCK(&obj->obj_spinlock);
1018 return obj;
1019 }
1021 /* free an object after destroying any pgps in it */
1022 static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
1023 {
1024 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
1025 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
1026 obj_free(obj,no_rebalance);
1027 }
1029 /* destroys all objs in a pool, or only if obj->last_client matches cli_id */
1030 static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
1031 {
1032 struct rb_node *node;
1033 obj_t *obj;
1034 int i;
1036 tmem_write_lock(&pool->pool_rwlock);
1037 pool->is_dying = 1;
1038 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
1039 {
1040 node = rb_first(&pool->obj_rb_root[i]);
1041 while ( node != NULL )
1042 {
1043 obj = container_of(node, obj_t, rb_tree_node);
1044 tmem_spin_lock(&obj->obj_spinlock);
1045 node = rb_next(node);
1046 ASSERT(obj->no_evict == 0);
1047 if ( !selective )
1048 /* FIXME: should be obj,1 but walking/erasing rbtree is racy */
1049 obj_destroy(obj,0);
1050 else if ( obj->last_client == cli_id )
1051 obj_destroy(obj,0);
1052 else
1053 tmem_spin_unlock(&obj->obj_spinlock);
1054 }
1055 }
1056 tmem_write_unlock(&pool->pool_rwlock);
1057 }
1060 /************ POOL MANIPULATION ROUTINES ******************************/
1062 static pool_t * pool_alloc(void)
1063 {
1064 pool_t *pool;
1065 int i;
1067 if ( (pool = tmh_alloc_infra(sizeof(pool_t),__alignof__(pool_t))) == NULL )
1068 return NULL;
1069 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
1070 pool->obj_rb_root[i] = RB_ROOT;
1071 INIT_LIST_HEAD(&pool->pool_list);
1072 INIT_LIST_HEAD(&pool->persistent_page_list);
1073 pool->cur_pgp = NULL;
1074 rwlock_init(&pool->pool_rwlock);
1075 pool->pgp_count_max = pool->obj_count_max = 0;
1076 pool->objnode_count = pool->objnode_count_max = 0;
1077 atomic_set(&pool->pgp_count,0);
1078 pool->obj_count = 0; pool->shared_count = 0;
1079 pool->pageshift = PAGE_SHIFT - 12;
1080 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
1081 pool->dup_puts_replaced = pool->no_mem_puts = 0;
1082 pool->found_gets = pool->gets = 0;
1083 pool->flushs_found = pool->flushs = 0;
1084 pool->flush_objs_found = pool->flush_objs = 0;
1085 pool->is_dying = 0;
1086 SET_SENTINEL(pool,POOL);
1087 return pool;
1088 }
1090 static NOINLINE void pool_free(pool_t *pool)
1091 {
1092 ASSERT_SENTINEL(pool,POOL);
1093 INVERT_SENTINEL(pool,POOL);
1094 pool->client = NULL;
1095 list_del(&pool->pool_list);
1096 tmh_free_infra(pool);
1097 }
1099 /* register new_client as a user of this shared pool and return new
1100 total number of registered users */
1101 static int shared_pool_join(pool_t *pool, client_t *new_client)
1102 {
1103 sharelist_t *sl;
1105 ASSERT(is_shared(pool));
1106 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
1107 return -1;
1108 sl->client = new_client;
1109 list_add_tail(&sl->share_list, &pool->share_list);
1110 if ( new_client->cli_id != pool->client->cli_id )
1111 printk("adding new %s %d to shared pool owned by %s %d\n",
1112 client_str, new_client->cli_id, client_str, pool->client->cli_id);
1113 return ++pool->shared_count;
1114 }
1116 /* reassign "ownership" of the pool to another client that shares this pool */
1117 static NOINLINE void shared_pool_reassign(pool_t *pool)
1118 {
1119 sharelist_t *sl;
1120 int poolid;
1121 client_t *old_client = pool->client, *new_client;
1123 ASSERT(is_shared(pool));
1124 if ( list_empty(&pool->share_list) )
1125 {
1126 ASSERT(pool->shared_count == 0);
1127 return;
1128 }
1129 old_client->pools[pool->pool_id] = NULL;
1130 sl = list_entry(pool->share_list.next, sharelist_t, share_list);
1131 ASSERT(sl->client != old_client);
1132 pool->client = new_client = sl->client;
1133 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
1134 if (new_client->pools[poolid] == pool)
1135 break;
1136 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
1137 new_client->eph_count += _atomic_read(pool->pgp_count);
1138 old_client->eph_count -= _atomic_read(pool->pgp_count);
1139 list_splice_init(&old_client->ephemeral_page_list,
1140 &new_client->ephemeral_page_list);
1141 printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
1142 cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
1143 pool->pool_id = poolid;
1144 }
1146 /* destroy all objects with last_client same as passed cli_id,
1147 remove pool's cli_id from list of sharers of this pool */
1148 static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
1149 {
1150 sharelist_t *sl;
1151 int s_poolid;
1153 ASSERT(is_shared(pool));
1154 ASSERT(pool->client != NULL);
1156 ASSERT_WRITELOCK(&tmem_rwlock);
1157 pool_destroy_objs(pool,1,cli_id);
1158 list_for_each_entry(sl,&pool->share_list, share_list)
1159 {
1160 if (sl->client->cli_id != cli_id)
1161 continue;
1162 list_del(&sl->share_list);
1163 tmem_free(sl,sizeof(sharelist_t),pool);
1164 --pool->shared_count;
1165 if (pool->client->cli_id == cli_id)
1166 shared_pool_reassign(pool);
1167 if (pool->shared_count)
1168 return pool->shared_count;
1169 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
1170 if ( (global_shared_pools[s_poolid]) == pool )
1171 {
1172 global_shared_pools[s_poolid] = NULL;
1173 break;
1174 }
1175 return 0;
1176 }
1177 printk("tmem: no match unsharing pool, %s=%d\n",
1178 cli_id_str,pool->client->cli_id);
1179 return -1;
1180 }
1182 /* flush all data (owned by cli_id) from a pool and, optionally, free it */
1183 static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
1184 {
1185 ASSERT(pool != NULL);
1186 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
1187 {
1188 printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
1189 cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
1190 return;
1191 }
1192 printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
1193 is_persistent(pool) ? "persistent" : "ephemeral" ,
1194 is_shared(pool) ? "shared" : "private");
1195 printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
1196 if ( pool->client->live_migrating )
1197 {
1198 printk("can't %s pool while %s is live-migrating\n",
1199 destroy?"destroy":"flush", client_str);
1200 return;
1201 }
1202 pool_destroy_objs(pool,0,CLI_ID_NULL);
1203 if ( destroy )
1204 {
1205 pool->client->pools[pool->pool_id] = NULL;
1206 pool_free(pool);
1207 }
1208 }
1210 /************ CLIENT MANIPULATION OPERATIONS **************************/
1212 static client_t *client_create(cli_id_t cli_id)
1213 {
1214 client_t *client = tmh_alloc_infra(sizeof(client_t),__alignof__(client_t));
1215 int i;
1217 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
1218 if ( client == NULL )
1219 {
1220 printk("failed... out of memory\n");
1221 goto fail;
1222 }
1223 memset(client,0,sizeof(client_t));
1224 if ( (client->tmh = tmh_client_init(cli_id)) == NULL )
1225 {
1226 printk("failed... can't allocate host-dependent part of client\n");
1227 goto fail;
1228 }
1229 if ( !tmh_set_client_from_id(client, client->tmh, cli_id) )
1230 {
1231 printk("failed... can't set client\n");
1232 goto fail;
1233 }
1234 client->cli_id = cli_id;
1235 #ifdef __i386__
1236 client->compress = 0;
1237 #else
1238 client->compress = tmh_compression_enabled();
1239 #endif
1240 client->shared_auth_required = tmh_shared_auth();
1241 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1242 client->shared_auth_uuid[i][0] =
1243 client->shared_auth_uuid[i][1] = -1L;
1244 client->frozen = 0; client->live_migrating = 0;
1245 client->weight = 0; client->cap = 0;
1246 list_add_tail(&client->client_list, &global_client_list);
1247 INIT_LIST_HEAD(&client->ephemeral_page_list);
1248 INIT_LIST_HEAD(&client->persistent_invalidated_list);
1249 client->cur_pgp = NULL;
1250 client->eph_count = client->eph_count_max = 0;
1251 client->total_cycles = 0; client->succ_pers_puts = 0;
1252 client->succ_eph_gets = 0; client->succ_pers_gets = 0;
1253 printk("ok\n");
1254 return client;
1256 fail:
1257 tmh_free_infra(client);
1258 return NULL;
1259 }
1261 static void client_free(client_t *client)
1262 {
1263 list_del(&client->client_list);
1264 tmh_client_destroy(client->tmh);
1265 tmh_free_infra(client);
1266 }
1268 /* flush all data from a client and, optionally, free it */
1269 static void client_flush(client_t *client, bool_t destroy)
1270 {
1271 int i;
1272 pool_t *pool;
1274 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
1275 {
1276 if ( (pool = client->pools[i]) == NULL )
1277 continue;
1278 pool_flush(pool,client->cli_id,destroy);
1279 if ( destroy )
1280 client->pools[i] = NULL;
1281 }
1282 if ( destroy )
1283 client_free(client);
1284 }
1286 static bool_t client_over_quota(client_t *client)
1287 {
1288 int total = _atomic_read(client_weight_total);
1290 ASSERT(client != NULL);
1291 if ( (total == 0) || (client->weight == 0) ||
1292 (client->eph_count == 0) )
1293 return 0;
1294 return ( ((global_eph_count*100L) / client->eph_count ) >
1295 ((total*100L) / client->weight) );
1296 }
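/* Editorial note, not part of the original file: worked example of the
 * integer arithmetic above. With client_weight_total = 200, weight = 50,
 * global_eph_count = 1000 and eph_count = 100: LHS = 1000*100/100 = 1000,
 * RHS = 200*100/50 = 400, so the function returns 1. Rearranged, the test
 * fires exactly when eph_count < global_eph_count * weight / total. */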
1298 static void client_freeze(client_t *client, int freeze)
1299 {
1300 client->frozen = freeze;
1301 }
1303 /************ MEMORY REVOCATION ROUTINES *******************************/
1305 static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
1306 {
1307 obj_t *obj = pgp->us.obj;
1308 pool_t *pool = obj->pool;
1309 client_t *client = pool->client;
1310 uint16_t firstbyte = pgp->firstbyte;
1312 if ( pool->is_dying )
1313 return 0;
1314 if ( tmh_lock_all && !obj->no_evict )
1315 return 1;
1316 if ( tmem_spin_trylock(&obj->obj_spinlock) )
1317 {
1318 if ( tmh_dedup_enabled() )
1319 {
1320 firstbyte = pgp->firstbyte;
1321 if ( firstbyte == NOT_SHAREABLE )
1322 goto obj_unlock;
1323 ASSERT(firstbyte < 256);
1324 if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
1325 goto obj_unlock;
1326 if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted )
1327 {
1328 pgp->eviction_attempted++;
1329 list_del(&pgp->global_eph_pages);
1330 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1331 list_del(&pgp->us.client_eph_pages);
1332 list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1333 goto pcd_unlock;
1334 }
1335 }
1336 if ( obj->pgp_count > 1 )
1337 return 1;
1338 if ( tmem_write_trylock(&pool->pool_rwlock) )
1339 {
1340 *hold_pool_rwlock = 1;
1341 return 1;
1342 }
1343 pcd_unlock:
1344 tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
1345 obj_unlock:
1346 tmem_spin_unlock(&obj->obj_spinlock);
1347 }
1348 return 0;
1349 }
1351 static int tmem_evict(void)
1352 {
1353 client_t *client = tmh_client_from_current();
1354 pgp_t *pgp = NULL, *pgp2, *pgp_del;
1355 obj_t *obj;
1356 pool_t *pool;
1357 int ret = 0;
1358 bool_t hold_pool_rwlock = 0;
1360 evict_attempts++;
1361 tmem_spin_lock(&eph_lists_spinlock);
1362 if ( (client != NULL) && client_over_quota(client) &&
1363 !list_empty(&client->ephemeral_page_list) )
1364 {
1365 list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,us.client_eph_pages)
1366 if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
1367 goto found;
1368 } else if ( list_empty(&global_ephemeral_page_list) ) {
1369 goto out;
1370 } else {
1371 list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
1372 if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
1373 goto found;
1374 }
1376 ret = 0;
1377 goto out;
1379 found:
1380 ASSERT(pgp != NULL);
1381 ASSERT_SENTINEL(pgp,PGD);
1382 obj = pgp->us.obj;
1383 ASSERT(obj != NULL);
1384 ASSERT(obj->no_evict == 0);
1385 ASSERT(obj->pool != NULL);
1386 ASSERT_SENTINEL(obj,OBJ);
1387 pool = obj->pool;
1389 ASSERT_SPINLOCK(&obj->obj_spinlock);
1390 pgp_del = pgp_delete_from_obj(obj, pgp->index);
1391 ASSERT(pgp_del == pgp);
1392 if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
1393 {
1394 ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted);
1395 pcd_disassociate(pgp,pool,1);
1396 }
1397 pgp_delete(pgp,1);
1398 if ( obj->pgp_count == 0 )
1399 {
1400 ASSERT_WRITELOCK(&pool->pool_rwlock);
1401 obj_free(obj,0);
1402 }
1403 else
1404 tmem_spin_unlock(&obj->obj_spinlock);
1405 if ( hold_pool_rwlock )
1406 tmem_write_unlock(&pool->pool_rwlock);
1407 evicted_pgs++;
1408 ret = 1;
1410 out:
1411 tmem_spin_unlock(&eph_lists_spinlock);
1412 return ret;
1413 }
1415 static unsigned long tmem_relinquish_npages(unsigned long n)
1416 {
1417 unsigned long avail_pages = 0;
1419 while ( (avail_pages = tmh_avail_pages()) < n )
1420 {
1421 if ( !tmem_evict() )
1422 break;
1423 }
1424 if ( avail_pages )
1425 tmh_release_avail_pages_to_host();
1426 return avail_pages;
1427 }
1429 /* Under certain conditions (e.g. if each client is putting pages for exactly
1430 * one object), once locks are held, freeing up memory may
1431 * result in livelocks and very long "put" times, so we try to ensure there
1432 * is a minimum amount of memory (1MB) available BEFORE any data structure
1433 * locks are held */
1434 static inline void tmem_ensure_avail_pages(void)
1435 {
1436 int failed_evict = 10;
1438 while ( !tmh_free_mb() )
1439 {
1440 if ( tmem_evict() )
1441 continue;
1442 else if ( failed_evict-- <= 0 )
1443 break;
1444 }
1445 }
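/* Editorial note, not part of the original file: per the comment above,
 * callers are expected to run tmem_ensure_avail_pages() on a put path
 * before taking any tmem data-structure lock, so that eviction, which may
 * need those same locks, never has to execute while they are held. */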
1447 /************ TMEM CORE OPERATIONS ************************************/
1449 static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1450 void *cva)
1451 {
1452 void *dst, *p;
1453 size_t size;
1454 int ret = 0;
1455 DECL_LOCAL_CYC_COUNTER(compress);
1457 ASSERT(pgp != NULL);
1458 ASSERT(pgp->us.obj != NULL);
1459 ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
1460 ASSERT(pgp->us.obj->pool != NULL);
1461 ASSERT(pgp->us.obj->pool->client != NULL);
1462 #ifdef __i386__
1463 return -ENOMEM;
1464 #endif
1466 if ( pgp->pfp != NULL )
1467 pgp_free_data(pgp, pgp->us.obj->pool);
1468 START_CYC_COUNTER(compress);
1469 ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
1470 if ( (ret == -EFAULT) || (ret == 0) )
1471 goto out;
1472 else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
1473 ret = 0;
1474 goto out;
1475 } else if ( tmh_dedup_enabled() && !is_persistent(pgp->us.obj->pool) ) {
1476 if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
1477 goto out;
1478 } else if ( (p = tmem_malloc_bytes(size,pgp->us.obj->pool)) == NULL ) {
1479 ret = -ENOMEM;
1480 goto out;
1481 } else {
1482 memcpy(p,dst,size);
1483 pgp->cdata = p;
1484 }
1485 pgp->size = size;
1486 pgp->us.obj->pool->client->compressed_pages++;
1487 pgp->us.obj->pool->client->compressed_sum_size += size;
1488 ret = 1;
1490 out:
1491 END_CYC_COUNTER(compress);
1492 return ret;
1493 }
1495 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1496 pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva)
1497 {
1498 pool_t *pool;
1499 obj_t *obj;
1500 client_t *client;
1501 pgp_t *pgpfound = NULL;
1502 int ret;
1504 ASSERT(pgp != NULL);
1505 ASSERT(pgp->pfp != NULL);
1506 ASSERT(pgp->size != -1);
1507 obj = pgp->us.obj;
1508 ASSERT_SPINLOCK(&obj->obj_spinlock);
1509 ASSERT(obj != NULL);
1510 pool = obj->pool;
1511 ASSERT(pool != NULL);
1512 client = pool->client;
1513 if ( client->live_migrating )
1514 goto failed_dup; /* no dups allowed when migrating */
1515 /* can we successfully manipulate pgp to change out the data? */
1516 if ( len != 0 && client->compress && pgp->size != 0 )
1517 {
1518 ret = do_tmem_put_compress(pgp,cmfn,cva);
1519 if ( ret == 1 )
1520 goto done;
1521 else if ( ret == 0 )
1522 goto copy_uncompressed;
1523 else if ( ret == -ENOMEM )
1524 goto failed_dup;
1525 else if ( ret == -EFAULT )
1526 goto bad_copy;
1527 }
1529 copy_uncompressed:
1530 if ( pgp->pfp )
1531 pgp_free_data(pgp, pool);
1532 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1533 goto failed_dup;
1534 pgp->size = 0;
1535 /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
1536 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
1537 if ( ret == -EFAULT )
1538 goto bad_copy;
1539 if ( tmh_dedup_enabled() && !is_persistent(pool) )
1540 {
1541 if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
1542 goto failed_dup;
1543 }
1545 done:
1546 /* successfully replaced data, clean up and return success */
1547 if ( is_shared(pool) )
1548 obj->last_client = client->cli_id;
1549 obj->no_evict = 0;
1550 tmem_spin_unlock(&obj->obj_spinlock);
1551 pool->dup_puts_replaced++;
1552 pool->good_puts++;
1553 if ( is_persistent(pool) )
1554 client->succ_pers_puts++;
1555 return 1;
1557 bad_copy:
1558 /* this should only happen if the client passed a bad mfn */
1559 failed_copies++;
1560 ret = -EFAULT;
1561 goto cleanup;
1563 failed_dup:
1564 /* couldn't change out the data, flush the old data and return
1565 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
1566 ret = -ENOSPC;
1567 cleanup:
1568 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1569 ASSERT(pgpfound == pgp);
1570 pgp_delete(pgpfound,0);
1571 if ( obj->pgp_count == 0 )
1572 {
1573 tmem_write_lock(&pool->pool_rwlock);
1574 obj_free(obj,0);
1575 tmem_write_unlock(&pool->pool_rwlock);
1576 } else {
1577 obj->no_evict = 0;
1578 tmem_spin_unlock(&obj->obj_spinlock);
1579 }
1580 pool->dup_puts_flushed++;
1581 return ret;
1582 }
1585 static NOINLINE int do_tmem_put(pool_t *pool,
1586 OID *oidp, uint32_t index,
1587 tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
1588 pagesize_t pfn_offset, pagesize_t len, void *cva)
1589 {
1590 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
1591 pgp_t *pgp = NULL, *pgpdel = NULL;
1592 client_t *client = pool->client;
1593 int ret = client->frozen ? -EFROZEN : -ENOMEM;
1595 ASSERT(pool != NULL);
1596 pool->puts++;
1597 /* does page already exist (dup)? if so, handle specially */
1598 if ( (obj = objfound = obj_find(pool,oidp)) != NULL )
1599 {
1600 ASSERT_SPINLOCK(&objfound->obj_spinlock);
1601 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
1602 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva);
1603 }
1605 /* no puts allowed into a frozen pool (except dup puts) */
1606 if ( client->frozen )
1607 goto free;
1609 if ( (objfound == NULL) )
1610 {
1611 tmem_write_lock(&pool->pool_rwlock);
1612 if ( (obj = objnew = obj_new(pool,oidp)) == NULL )
1613 {
1614 tmem_write_unlock(&pool->pool_rwlock);
1615 return -ENOMEM;
1616 }
1617 ASSERT_SPINLOCK(&objnew->obj_spinlock);
1618 tmem_write_unlock(&pool->pool_rwlock);
1619 }
1621 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
1622 ASSERT_SPINLOCK(&obj->obj_spinlock);
1623 if ( (pgp = pgp_alloc(obj)) == NULL )
1624 goto free;
1626 ret = pgp_add_to_obj(obj, index, pgp);
1627 if ( ret == -ENOMEM )
1628 /* warning, may result in partially built radix tree ("stump") */
1629 goto free;
1630 ASSERT(ret != -EEXIST);
1631 pgp->index = index;
1632 pgp->size = 0;
1634 if ( len != 0 && client->compress )
1635 {
1636 ASSERT(pgp->pfp == NULL);
1637 ret = do_tmem_put_compress(pgp,cmfn,cva);
1638 if ( ret == 1 )
1639 goto insert_page;
1640 if ( ret == -ENOMEM )
1641 {
1642 client->compress_nomem++;
1643 goto delete_and_free;
1644 }
1645 if ( ret == 0 )
1646 {
1647 client->compress_poor++;
1648 goto copy_uncompressed;
1649 }
1650 if ( ret == -EFAULT )
1651 goto bad_copy;
1652 }
1654 copy_uncompressed:
1655 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1656 {
1657 ret = -ENOMEM;
1658 goto delete_and_free;
1659 }
1660 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
1661 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
1662 if ( ret == -EFAULT )
1663 goto bad_copy;
1664 if ( tmh_dedup_enabled() && !is_persistent(pool) )
1665 {
1666 if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
1667 goto delete_and_free;
1668 }
1670 insert_page:
1671 if ( is_ephemeral(pool) )
1672 {
1673 tmem_spin_lock(&eph_lists_spinlock);
1674 list_add_tail(&pgp->global_eph_pages,
1675 &global_ephemeral_page_list);
1676 if (++global_eph_count > global_eph_count_max)
1677 global_eph_count_max = global_eph_count;
1678 list_add_tail(&pgp->us.client_eph_pages,
1679 &client->ephemeral_page_list);
1680 if (++client->eph_count > client->eph_count_max)
1681 client->eph_count_max = client->eph_count;
1682 tmem_spin_unlock(&eph_lists_spinlock);
1683 } else { /* is_persistent */
1684 tmem_spin_lock(&pers_lists_spinlock);
1685 list_add_tail(&pgp->us.pool_pers_pages,
1686 &pool->persistent_page_list);
1687 tmem_spin_unlock(&pers_lists_spinlock);
1688 }
1689 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
1690 if ( is_shared(pool) )
1691 obj->last_client = client->cli_id;
1692 obj->no_evict = 0;
1693 tmem_spin_unlock(&obj->obj_spinlock);
1694 pool->good_puts++;
1695 if ( is_persistent(pool) )
1696 client->succ_pers_puts++;
1697 else
1698 tot_good_eph_puts++;
1699 return 1;
1701 bad_copy:
1702 /* this should only happen if the client passed a bad mfn */
1703 ret = -EFAULT;
1704 failed_copies++;
1706 delete_and_free:
1707 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1708 pgpdel = pgp_delete_from_obj(obj, pgp->index);
1709 ASSERT(pgp == pgpdel);
1711 free:
1712 if ( pgp )
1713 pgp_delete(pgp,0);
1714 if ( objfound )
1715 {
1716 objfound->no_evict = 0;
1717 tmem_spin_unlock(&objfound->obj_spinlock);
1718 }
1719 if ( objnew )
1720 {
1721 tmem_write_lock(&pool->pool_rwlock);
1722 obj_free(objnew,0);
1723 tmem_write_unlock(&pool->pool_rwlock);
1724 }
1725 pool->no_mem_puts++;
1726 return ret;
1727 }
1729 static NOINLINE int do_tmem_get(pool_t *pool, OID *oidp, uint32_t index,
1730 tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
1731 pagesize_t pfn_offset, pagesize_t len, void *cva)
1732 {
1733 obj_t *obj;
1734 pgp_t *pgp;
1735 client_t *client = pool->client;
1736 DECL_LOCAL_CYC_COUNTER(decompress);
1738 if ( !_atomic_read(pool->pgp_count) )
1739 return -EEMPTY;
1741 pool->gets++;
1742 obj = obj_find(pool,oidp);
1743 if ( obj == NULL )
1744 return 0;
1746 ASSERT_SPINLOCK(&obj->obj_spinlock);
1747 if (is_shared(pool) || is_persistent(pool) )
1748 pgp = pgp_lookup_in_obj(obj, index);
1749 else
1750 pgp = pgp_delete_from_obj(obj, index);
1751 if ( pgp == NULL )
1752 {
1753 obj->no_evict = 0;
1754 tmem_spin_unlock(&obj->obj_spinlock);
1755 return 0;
1756 }
1757 ASSERT(pgp->size != -1);
1758 if ( tmh_dedup_enabled() && !is_persistent(pool) &&
1759 pgp->firstbyte != NOT_SHAREABLE )
1760 {
1761 if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT )
1762 goto bad_copy;
1763 } else if ( pgp->size != 0 ) {
1764 START_CYC_COUNTER(decompress);
1765 if ( tmh_decompress_to_client(cmfn, pgp->cdata,
1766 pgp->size, cva) == -EFAULT )
1767 goto bad_copy;
1768 END_CYC_COUNTER(decompress);
1769 } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
1770 pfn_offset, len, cva) == -EFAULT)
1771 goto bad_copy;
1772 if ( is_ephemeral(pool) )
1773 {
1774 if ( is_private(pool) )
1775 {
1776 pgp_delete(pgp,0);
1777 if ( obj->pgp_count == 0 )
1778 {
1779 tmem_write_lock(&pool->pool_rwlock);
1780 obj_free(obj,0);
1781 obj = NULL;
1782 tmem_write_unlock(&pool->pool_rwlock);
1783 }
1784 } else {
1785 tmem_spin_lock(&eph_lists_spinlock);
1786 list_del(&pgp->global_eph_pages);
1787 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1788 list_del(&pgp->us.client_eph_pages);
1789 list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1790 tmem_spin_unlock(&eph_lists_spinlock);
1791 ASSERT(obj != NULL);
1792 obj->last_client = tmh_get_cli_id_from_current();
1793 }
1794 }
1795 if ( obj != NULL )
1796 {
1797 obj->no_evict = 0;
1798 tmem_spin_unlock(&obj->obj_spinlock);
1799 }
1800 pool->found_gets++;
1801 if ( is_ephemeral(pool) )
1802 client->succ_eph_gets++;
1803 else
1804 client->succ_pers_gets++;
1805 return 1;
1807 bad_copy:
1808 /* this should only happen if the client passed a bad mfn */
1809 failed_copies++;
1810 return -EFAULT;
1811 }
1814 static NOINLINE int do_tmem_flush_page(pool_t *pool, OID *oidp, uint32_t index)
1815 {
1816 obj_t *obj;
1817 pgp_t *pgp;
1819 pool->flushs++;
1820 obj = obj_find(pool,oidp);
1821 if ( obj == NULL )
1822 goto out;
1823 pgp = pgp_delete_from_obj(obj, index);
1824 if ( pgp == NULL )
1825 {
1826 obj->no_evict = 0;
1827 tmem_spin_unlock(&obj->obj_spinlock);
1828 goto out;
1829 }
1830 pgp_delete(pgp,0);
1831 if ( obj->pgp_count == 0 )
1832 {
1833 tmem_write_lock(&pool->pool_rwlock);
1834 obj_free(obj,0);
1835 tmem_write_unlock(&pool->pool_rwlock);
1836 } else {
1837 obj->no_evict = 0;
1838 tmem_spin_unlock(&obj->obj_spinlock);
1839 }
1840 pool->flushs_found++;
1842 out:
1843 if ( pool->client->frozen )
1844 return -EFROZEN;
1845 else
1846 return 1;
1847 }
1849 static NOINLINE int do_tmem_flush_object(pool_t *pool, OID *oidp)
1850 {
1851 obj_t *obj;
1853 pool->flush_objs++;
1854 obj = obj_find(pool,oidp);
1855 if ( obj == NULL )
1856 goto out;
1857 tmem_write_lock(&pool->pool_rwlock);
1858 obj_destroy(obj,0);
1859 pool->flush_objs_found++;
1860 tmem_write_unlock(&pool->pool_rwlock);
1862 out:
1863 if ( pool->client->frozen )
1864 return -EFROZEN;
1865 else
1866 return 1;
1867 }
1869 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
1870 {
1871 client_t *client = tmh_client_from_current();
1872 pool_t *pool;
1874 if ( client->pools == NULL )
1875 return 0;
1876 if ( (pool = client->pools[pool_id]) == NULL )
1877 return 0;
1878 client->pools[pool_id] = NULL;
1879 pool_flush(pool,client->cli_id,1);
1880 return 1;
1881 }
1883 static NOINLINE int do_tmem_new_pool(cli_id_t this_cli_id,
1884 uint32_t d_poolid, uint32_t flags,
1885 uint64_t uuid_lo, uint64_t uuid_hi)
1886 {
1887 client_t *client;
1888 cli_id_t cli_id;
1889 int persistent = flags & TMEM_POOL_PERSIST;
1890 int shared = flags & TMEM_POOL_SHARED;
1891 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1892 & TMEM_POOL_PAGESIZE_MASK;
1893 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1894 & TMEM_POOL_VERSION_MASK;
1895 pool_t *pool, *shpool;
1896 int s_poolid, first_unused_s_poolid;
1897 int i;
1899 if ( this_cli_id == CLI_ID_NULL )
1900 cli_id = tmh_get_cli_id_from_current();
1901 else
1902 cli_id = this_cli_id;
1903 printk("tmem: allocating %s-%s tmem pool for %s=%d...",
1904 persistent ? "persistent" : "ephemeral" ,
1905 shared ? "shared" : "private", cli_id_str, cli_id);
1906 if ( specversion != TMEM_SPEC_VERSION )
1907 {
1908 printk("failed... unsupported spec version\n");
1909 return -EPERM;
1910 }
1911 if ( pagebits != (PAGE_SHIFT - 12) )
1912 {
1913 printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
1914 return -EPERM;
1915 }
1916 if ( flags & TMEM_POOL_PRECOMPRESSED )
1917 {
1918 printk("failed... precompression flag set but unsupported\n");
1919 return -EPERM;
1920 }
1921 if ( flags & TMEM_POOL_RESERVED_BITS )
1922 {
1923 printk("failed... reserved bits must be zero\n");
1924 return -EPERM;
1925 }
1926 if ( (pool = pool_alloc()) == NULL )
1927 {
1928 printk("failed... out of memory\n");
1929 return -ENOMEM;
1931 if ( this_cli_id != CLI_ID_NULL )
1933 if ( (client = tmh_client_from_cli_id(this_cli_id)) == NULL
1934 || d_poolid >= MAX_POOLS_PER_DOMAIN
1935 || client->pools[d_poolid] != NULL )
1936 goto fail;
1938 else
1940 client = tmh_client_from_current();
1941 ASSERT(client != NULL);
1942 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1943 if ( client->pools[d_poolid] == NULL )
1944 break;
1945 if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
1947 printk("failed... no more pool slots available for this %s\n",
1948 client_str);
1949 goto fail;
1952 if ( shared )
1954 if ( uuid_lo == -1L && uuid_hi == -1L )
1955 shared = 0;
1956 if ( client->shared_auth_required && !global_shared_auth )
1958 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1959 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1960 (client->shared_auth_uuid[i][1] == uuid_hi) )
1961 break;
1962 if ( i == MAX_GLOBAL_SHARED_POOLS )
1963 shared = 0;
1966 pool->shared = shared;
1967 pool->client = client;
1968 if ( shared )
1970 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1971 for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
1973 if ( (shpool = global_shared_pools[s_poolid]) != NULL )
1975 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1977 printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
1978 uuid_hi, uuid_lo);
1979 printk("pool_id=%d\n",d_poolid);
1980 client->pools[d_poolid] = global_shared_pools[s_poolid];
1981 shared_pool_join(global_shared_pools[s_poolid], client);
1982 pool_free(pool);
1983 return d_poolid;
1986 else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1987 first_unused_s_poolid = s_poolid;
1989 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1991 printk("tmem: failed... no global shared pool slots available\n");
1992 goto fail;
1994 else
1996 INIT_LIST_HEAD(&pool->share_list);
1997 pool->shared_count = 0;
1998 global_shared_pools[first_unused_s_poolid] = pool;
1999 (void)shared_pool_join(pool,client);
2002 client->pools[d_poolid] = pool;
2003 list_add_tail(&pool->pool_list, &global_pool_list);
2004 pool->pool_id = d_poolid;
2005 pool->persistent = persistent;
2006 pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
2007 printk("pool_id=%d\n",d_poolid);
2008 return d_poolid;
2010 fail:
2011 pool_free(pool);
2012 return -EPERM;
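/*
 * The flags word decoded above packs persistence, sharing, page size and
 * spec version into one uint32_t. A caller creating a persistent private
 * pool at the current page size would (illustratively) build it as:
 *
 *   uint32_t flags = TMEM_POOL_PERSIST
 *       | ((PAGE_SHIFT - 12) << TMEM_POOL_PAGESIZE_SHIFT)
 *       | (TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT);
 *   rc = do_tmem_new_pool(CLI_ID_NULL, 0, flags, -1L, -1L);
 */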
2015 /************ TMEM CONTROL OPERATIONS ************************************/
2017 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
2018 static int tmemc_freeze_pools(cli_id_t cli_id, int arg)
2020 client_t *client;
2021 bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
2022 bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
2023 char *s;
2025 s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
/* NB: for TMEMC_DESTROY, freeze is 0, so this path merely thaws and logs "destroyed"; nothing is actually torn down here */
2026 if ( cli_id == CLI_ID_NULL )
2028 list_for_each_entry(client,&global_client_list,client_list)
2029 client_freeze(client,freeze);
2030 printk("tmem: all pools %s for all %ss\n",s,client_str);
2032 else
2034 if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
2035 return -1;
2036 client_freeze(client,freeze);
2037 printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
2039 return 0;
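/*
 * Example (illustrative): arg selects the action, so freezing and then
 * thawing every client's pools is
 *
 *   tmemc_freeze_pools(CLI_ID_NULL, TMEMC_FREEZE);
 *   tmemc_freeze_pools(CLI_ID_NULL, TMEMC_THAW);
 *
 * A frozen client's pools cannot grow, which is what tmem_freeze_all
 * below relies on.
 */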
2042 static int tmemc_flush_mem(cli_id_t cli_id, uint32_t kb)
2044 uint32_t npages, flushed_pages, flushed_kb;
2046 if ( cli_id != CLI_ID_NULL )
2048 printk("tmem: %s-specific flush not supported yet, use --all\n",
2049 client_str);
2050 return -1;
2052 /* convert kb to pages, rounding up if necessary */
2053 npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
2054 flushed_pages = tmem_relinquish_npages(npages);
2055 flushed_kb = flushed_pages << (PAGE_SHIFT-10);
2056 return flushed_kb;
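/*
 * Worked example of the rounding above, assuming 4KiB pages (PAGE_SHIFT
 * == 12, so PAGE_SHIFT-10 == 2): kb = 5 gives npages = (5 + 3) >> 2 = 2,
 * and if both pages are relinquished the caller sees flushed_kb =
 * 2 << 2 = 8, i.e. the request is rounded up to whole pages.
 */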
2059 /*
2060 * These tmemc_list* routines output lots of stats in a format that is
2061 * intended to be program-parseable, not human-readable. Further, by
2062 * tying each group of stats to a line format indicator (e.g. G= for
2063 * global stats) and each individual stat to a two-letter specifier
2064 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
2065 global ephemeral pool), the reported stats should remain
2066 forward- and backward-compatible as tmem evolves.
2067 */
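/*
 * Illustrative sample only (the numbers are made up): a short-form
 * global line produced by tmemc_list_global below might read
 *
 *   G=Tt:1000,Te:3,Cf:0,Af:0,Pf:0,Ta:52000,Lm:0,Et:900,Ea:950,Rt:0,Ra:0,Rx:0,Fp:0
 *
 * and a consumer could pull out one two-letter stat with, e.g.:
 *
 *   unsigned long tt = 0;
 *   char *p = strstr(line, "Tt:");
 *   if ( p != NULL )
 *       tt = simple_strtoul(p + 3, NULL, 10);
 */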
2068 #define BSIZE 1024
2070 static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
2071 uint32_t len, bool_t use_long)
2073 char info[BSIZE];
2074 int i, n = 0, sum = 0;
2075 pool_t *p;
2076 bool_t s;
2078 n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d,"
2079 "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c",
2080 c->cli_id, c->weight, c->cap, c->compress, c->frozen,
2081 c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets,
2082 use_long ? ',' : '\n');
2083 if (use_long)
2084 n += scnprintf(info+n,BSIZE-n,
2085 "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
2086 c->eph_count, c->eph_count_max,
2087 c->compressed_pages, c->compressed_sum_size,
2088 c->compress_poor, c->compress_nomem);
2089 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
2090 sum += n;
2091 for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
2093 if ( (p = c->pools[i]) == NULL )
2094 continue;
2095 s = is_shared(p);
2096 n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
2097 "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
2098 c->cli_id, p->pool_id,
2099 is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
2100 (uint64_t)(s ? p->uuid[0] : 0),
2101 (uint64_t)(s ? p->uuid[1] : 0LL),
2102 use_long ? ',' : '\n');
2103 if (use_long)
2104 n += scnprintf(info+n,BSIZE-n,
2105 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
2106 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
2107 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
2108 _atomic_read(p->pgp_count), p->pgp_count_max,
2109 p->obj_count, p->obj_count_max,
2110 p->objnode_count, p->objnode_count_max,
2111 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
2112 p->no_mem_puts,
2113 p->found_gets, p->gets,
2114 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
2115 if ( sum + n >= len )
2116 return sum;
2117 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
2118 sum += n;
2120 return sum;
2123 static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
2124 bool_t use_long)
2126 char info[BSIZE];
2127 int i, n = 0, sum = 0;
2128 pool_t *p;
2129 sharelist_t *sl;
2131 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
2133 if ( (p = global_shared_pools[i]) == NULL )
2134 continue;
/* restart at info[0]: n may still hold the previous iteration's length */
2135 n = scnprintf(info,BSIZE,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
2136 i, is_persistent(p) ? 'P' : 'E',
2137 is_shared(p) ? 'S' : 'P',
2138 p->uuid[0], p->uuid[1]);
2139 list_for_each_entry(sl,&p->share_list, share_list)
2140 n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
2141 n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
2142 if (use_long)
2143 n += scnprintf(info+n,BSIZE-n,
2144 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
2145 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
2146 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
2147 _atomic_read(p->pgp_count), p->pgp_count_max,
2148 p->obj_count, p->obj_count_max,
2149 p->objnode_count, p->objnode_count_max,
2150 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
2151 p->no_mem_puts,
2152 p->found_gets, p->gets,
2153 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
2154 if ( sum + n >= len )
2155 return sum;
2156 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
2157 sum += n;
2159 return sum;
2162 #ifdef TMEM_PERF
2163 static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
2164 bool_t use_long)
2166 char info[BSIZE];
2167 int n = 0, sum = 0;
2169 n = scnprintf(info+n,BSIZE-n,"T=");
2170 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
2171 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
2172 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
2173 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
2174 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
2175 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
2176 #ifdef COMPARE_COPY_PAGE_SSE2
2177 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
2178 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
2179 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
2180 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
2181 #else
2182 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
2183 #endif
2184 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
2185 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
2186 n--; /* overwrite trailing comma */
2187 n += scnprintf(info+n,BSIZE-n,"\n");
2188 if ( sum + n >= len )
2189 return sum;
2190 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
2191 sum += n;
2192 return sum;
2194 #else
2195 #define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
2196 #endif
2198 static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
2199 bool_t use_long)
2201 char info[BSIZE];
2202 int n = 0, sum = 0; /* sum is relative; the copy below already adds off */
2204 n += scnprintf(info,BSIZE,"G="
2205 "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
2206 "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
2207 total_tmem_ops, errored_tmem_ops, failed_copies,
2208 alloc_failed, alloc_page_failed, tmh_avail_pages(),
2209 low_on_memory, evicted_pgs,
2210 evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
2211 total_flush_pool, use_long ? ',' : '\n');
2212 if (use_long)
2213 n += scnprintf(info+n,BSIZE-n,
2214 "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
2215 "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n",
2216 global_eph_count, global_eph_count_max,
2217 _atomic_read(global_obj_count), global_obj_count_max,
2218 _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
2219 _atomic_read(global_pgp_count), global_pgp_count_max,
2220 _atomic_read(global_page_count), global_page_count_max,
2221 _atomic_read(global_pcd_count), global_pcd_count_max,
2222 tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize);
2223 if ( sum + n >= len )
2224 return sum;
2225 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
2226 sum += n;
2227 return sum;
2230 static int tmemc_list(cli_id_t cli_id, tmem_cli_va_t buf, uint32_t len,
2231 bool_t use_long)
2233 client_t *client;
2234 int off = 0;
2236 if ( cli_id == CLI_ID_NULL ) {
2237 off = tmemc_list_global(buf,0,len,use_long);
2238 off += tmemc_list_shared(buf,off,len-off,use_long);
2239 list_for_each_entry(client,&global_client_list,client_list)
2240 off += tmemc_list_client(client, buf, off, len-off, use_long);
2241 off += tmemc_list_global_perf(buf,off,len-off,use_long);
2243 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
2244 return -1;
2245 else
2246 off = tmemc_list_client(client, buf, 0, len, use_long);
2248 return 0;
2251 static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
2253 cli_id_t cli_id = client->cli_id;
2254 uint32_t old_weight;
2256 switch (subop)
2258 case TMEMC_SET_WEIGHT:
2259 old_weight = client->weight;
2260 client->weight = arg1;
2261 printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
2262 atomic_sub(old_weight,&client_weight_total);
2263 atomic_add(client->weight,&client_weight_total);
2264 break;
2265 case TMEMC_SET_CAP:
2266 client->cap = arg1;
2267 printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
2268 break;
2269 case TMEMC_SET_COMPRESS:
2270 #ifdef __i386__
2271 return -1;
2272 #endif
2273 if ( tmh_dedup_enabled() )
2275 printk("tmem: compression %s for all %ss, cannot be changed "
2276 "when tmem_dedup is enabled\n",
2277 tmh_compression_enabled() ? "enabled" : "disabled",client_str);
2278 return -1;
2280 client->compress = arg1 ? 1 : 0;
2281 printk("tmem: compression %s for %s=%d\n",
2282 arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
2283 break;
2284 default:
2285 printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
2286 return -1;
2288 return 0;
2291 static int tmemc_set_var(cli_id_t cli_id, uint32_t subop, uint32_t arg1)
2293 client_t *client;
2295 if ( cli_id == CLI_ID_NULL )
2296 list_for_each_entry(client,&global_client_list,client_list)
2297 tmemc_set_var_one(client, subop, arg1);
2298 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
2299 return -1;
2300 else
2301 tmemc_set_var_one(client, subop, arg1);
2302 return 0;
2305 static NOINLINE int tmemc_shared_pool_auth(cli_id_t cli_id, uint64_t uuid_lo,
2306 uint64_t uuid_hi, bool_t auth)
2308 client_t *client;
2309 int i, free = -1;
2311 if ( cli_id == CLI_ID_NULL )
2313 global_shared_auth = auth;
2314 return 1;
2316 client = tmh_client_from_cli_id(cli_id);
2317 if ( client == NULL )
2318 return -EINVAL;
2319 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
2321 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
2322 (client->shared_auth_uuid[i][1] == uuid_hi) )
2324 if ( auth == 0 )
2325 client->shared_auth_uuid[i][0] =
2326 client->shared_auth_uuid[i][1] = -1L;
2327 return 1;
2329 if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) &&
2330 (client->shared_auth_uuid[i][1] == -1L) && (free == -1) )
2331 free = i;
2333 if ( auth == 0 )
2334 return 0;
2335 if ( auth == 1 && free == -1 )
2336 return -ENOMEM;
2337 client->shared_auth_uuid[free][0] = uuid_lo;
2338 client->shared_auth_uuid[free][1] = uuid_hi;
2339 return 1;
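/*
 * Example calls (illustrative): auth == 1 records the uuid in the
 * client's table so a later TMEM_NEW_POOL with that uuid may join the
 * shared pool; auth == 0 resets the slot to -1. CLI_ID_NULL toggles the
 * global override instead.
 *
 *   tmemc_shared_pool_auth(cli_id, uuid_lo, uuid_hi, 1);      // grant
 *   tmemc_shared_pool_auth(cli_id, uuid_lo, uuid_hi, 0);      // revoke
 *   tmemc_shared_pool_auth(CLI_ID_NULL, 0, 0, 1);  // global_shared_auth on
 */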
2342 static NOINLINE int tmemc_save_subop(int cli_id, uint32_t pool_id,
2343 uint32_t subop, tmem_cli_va_t buf, uint32_t arg1)
2345 client_t *client = tmh_client_from_cli_id(cli_id);
2346 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2347 ? NULL : client->pools[pool_id];
2348 uint32_t p;
2349 uint64_t *uuid;
2350 pgp_t *pgp, *pgp2;
2351 int rc = -1;
2353 switch(subop)
2355 case TMEMC_SAVE_BEGIN:
2356 if ( client == NULL )
2357 return 0;
2358 for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
2359 if ( client->pools[p] != NULL )
2360 break;
2361 if ( p == MAX_POOLS_PER_DOMAIN )
2363 rc = 0;
2364 break;
2366 client->was_frozen = client->frozen;
2367 client->frozen = 1;
2368 if ( arg1 != 0 )
2369 client->live_migrating = 1;
2370 rc = 1;
2371 break;
2372 case TMEMC_RESTORE_BEGIN:
2373 if ( client == NULL && (client = client_create(cli_id)) != NULL )
2374 return 1;
2375 break;
2376 case TMEMC_SAVE_GET_VERSION:
2377 rc = TMEM_SPEC_VERSION;
2378 break;
2379 case TMEMC_SAVE_GET_MAXPOOLS:
2380 rc = MAX_POOLS_PER_DOMAIN;
2381 break;
2382 case TMEMC_SAVE_GET_CLIENT_WEIGHT:
if ( client == NULL ) /* guard: these subops dereference client */
break;
2383 rc = client->weight == -1 ? -2 : client->weight;
2384 break;
2385 case TMEMC_SAVE_GET_CLIENT_CAP:
if ( client == NULL )
break;
2386 rc = client->cap == -1 ? -2 : client->cap;
2387 break;
2388 case TMEMC_SAVE_GET_CLIENT_FLAGS:
if ( client == NULL )
break;
2389 rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
2390 (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
2391 break;
2392 case TMEMC_SAVE_GET_POOL_FLAGS:
2393 if ( pool == NULL )
2394 break;
2395 rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) |
2396 (pool->shared ? TMEM_POOL_SHARED : 0) |
2397 (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT);
2398 break;
2399 case TMEMC_SAVE_GET_POOL_NPAGES:
2400 if ( pool == NULL )
2401 break;
2402 rc = _atomic_read(pool->pgp_count);
2403 break;
2404 case TMEMC_SAVE_GET_POOL_UUID:
2405 if ( pool == NULL )
2406 break;
2407 uuid = (uint64_t *)buf.p;
2408 *uuid++ = pool->uuid[0];
2409 *uuid = pool->uuid[1];
2410 rc = 0;
break; /* without this, control fell through and ran TMEMC_SAVE_END too */
2411 case TMEMC_SAVE_END:
if ( client == NULL )
break;
2412 client->live_migrating = 0;
2413 if ( !list_empty(&client->persistent_invalidated_list) )
2414 list_for_each_entry_safe(pgp,pgp2,
2415 &client->persistent_invalidated_list, client_inv_pages)
2416 pgp_free_from_inv_list(client,pgp);
2417 client->frozen = client->was_frozen;
2418 rc = 0;
2420 return rc;
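/*
 * A minimal save-time call sequence (illustrative sketch only; the real
 * plumbing goes through do_tmem_control, and "cli"/"buf" here stand in
 * for the caller's arguments):
 *
 *   if ( tmemc_save_subop(cli, 0, TMEMC_SAVE_BEGIN, buf, live) <= 0 )
 *       return;                      // client absent or has no pools
 *   version  = tmemc_save_subop(cli, 0, TMEMC_SAVE_GET_VERSION, buf, 0);
 *   maxpools = tmemc_save_subop(cli, 0, TMEMC_SAVE_GET_MAXPOOLS, buf, 0);
 *   for ( p = 0; p < maxpools; p++ )
 *       flags = tmemc_save_subop(cli, p, TMEMC_SAVE_GET_POOL_FLAGS, buf, 0);
 *   // ... GET_NEXT_PAGE / GET_NEXT_INV loops, then:
 *   tmemc_save_subop(cli, 0, TMEMC_SAVE_END, buf, 0);
 */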
2423 static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id,
2424 tmem_cli_va_t buf, uint32_t bufsize)
2426 client_t *client = tmh_client_from_cli_id(cli_id);
2427 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2428 ? NULL : client->pools[pool_id];
2429 pgp_t *pgp;
2430 OID oid;
2431 int ret = 0;
2432 struct tmem_handle *h;
2433 unsigned int pagesize; /* set below, once pool is known to be non-NULL */
2435 if ( pool == NULL || is_ephemeral(pool) )
2436 return -1;
pagesize = 1 << (pool->pageshift + 12); /* safe: pool checked above */
2437 if ( bufsize < pagesize + sizeof(struct tmem_handle) )
2438 return -ENOMEM;
2440 tmem_spin_lock(&pers_lists_spinlock);
2441 if ( list_empty(&pool->persistent_page_list) )
2443 ret = -1;
2444 goto out;
2446 /* note: pool->cur_pgp is the pgp last returned by get_next_page */
2447 if ( pool->cur_pgp == NULL )
2449 /* process the first one */
2450 pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
2451 pgp_t,us.pool_pers_pages);
2452 } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
2453 &pool->persistent_page_list) )
2455 /* already processed the last one in the list */
2456 ret = -1;
2457 goto out;
2459 pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
2460 pgp_t,us.pool_pers_pages);
2461 pool->cur_pgp = pgp;
2462 oid = pgp->us.obj->oid;
2463 h = (struct tmem_handle *)buf.p;
2464 *(OID *)&h->oid[0] = oid;
2465 h->index = pgp->index;
2466 buf.p = (void *)(h+1);
2467 ret = do_tmem_get(pool, &oid, h->index,0,0,0,pagesize,buf.p);
2469 out:
2470 tmem_spin_unlock(&pers_lists_spinlock);
2471 return ret;
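/*
 * Illustrative consumer loop (a sketch, not part of the original code):
 * the save side sizes its buffer for one struct tmem_handle followed by
 * one page, then calls this subop until it returns a negative value,
 * with pool->cur_pgp acting as the iteration cursor. wrap() and
 * emit_record() are hypothetical helpers.
 *
 *   char buf[sizeof(struct tmem_handle) + PAGE_SIZE];
 *   while ( tmemc_save_get_next_page(cli_id, pool_id,
 *                                    wrap(buf), sizeof(buf)) > 0 )
 *       emit_record(buf);
 */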
2474 static NOINLINE int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_t buf,
2475 uint32_t bufsize)
2477 client_t *client = tmh_client_from_cli_id(cli_id);
2478 pgp_t *pgp;
2479 struct tmem_handle *h;
2480 int ret = 0;
2482 if ( client == NULL )
2483 return 0;
2484 if ( bufsize < sizeof(struct tmem_handle) )
2485 return 0;
2486 tmem_spin_lock(&pers_lists_spinlock);
2487 if ( list_empty(&client->persistent_invalidated_list) )
2488 goto out;
2489 if ( client->cur_pgp == NULL )
2491 pgp = list_entry((&client->persistent_invalidated_list)->next,
2492 pgp_t,client_inv_pages);
2493 client->cur_pgp = pgp;
2494 } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
2495 &client->persistent_invalidated_list) )
2497 client->cur_pgp = NULL;
2498 ret = 0;
2499 goto out;
2500 } else {
2501 pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
2502 pgp_t,client_inv_pages);
2503 client->cur_pgp = pgp;
2505 h = (struct tmem_handle *)buf.p;
2506 h->pool_id = pgp->pool_id;
2507 *(OID *)&h->oid = pgp->inv_oid;
2508 h->index = pgp->index;
2509 ret = 1;
2510 out:
2511 tmem_spin_unlock(&pers_lists_spinlock);
2512 return ret;
2515 static int tmemc_restore_put_page(int cli_id, int pool_id, OID *oidp,
2516 uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
2518 client_t *client = tmh_client_from_cli_id(cli_id);
2519 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2520 ? NULL : client->pools[pool_id];
2522 if ( pool == NULL )
2523 return -1;
2524 return do_tmem_put(pool,oidp,index,0,0,0,bufsize,buf.p);
2527 static int tmemc_restore_flush_page(int cli_id, int pool_id, OID *oidp,
2528 uint32_t index)
2530 client_t *client = tmh_client_from_cli_id(cli_id);
2531 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2532 ? NULL : client->pools[pool_id];
2534 if ( pool == NULL )
2535 return -1;
2536 return do_tmem_flush_page(pool,oidp,index);
2539 static NOINLINE int do_tmem_control(struct tmem_op *op)
2541 int ret;
2542 uint32_t pool_id = op->pool_id;
2543 uint32_t subop = op->u.ctrl.subop;
2544 OID *oidp = (OID *)(&op->u.ctrl.oid[0]);
2546 if (!tmh_current_is_privileged())
2548 /* don't fail... mystery: sometimes dom0 fails here */
2549 /* return -EPERM; */
2551 switch(subop)
2553 case TMEMC_THAW:
2554 case TMEMC_FREEZE:
2555 case TMEMC_DESTROY:
2556 ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
2557 break;
2558 case TMEMC_FLUSH:
2559 ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
2560 break;
2561 case TMEMC_LIST:
2562 ret = tmemc_list(op->u.ctrl.cli_id,op->u.ctrl.buf,
2563 op->u.ctrl.arg1,op->u.ctrl.arg2);
2564 break;
2565 case TMEMC_SET_WEIGHT:
2566 case TMEMC_SET_CAP:
2567 case TMEMC_SET_COMPRESS:
2568 ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
2569 break;
2570 case TMEMC_QUERY_FREEABLE_MB:
2571 ret = tmh_freeable_pages() >> (20 - PAGE_SHIFT);
2572 break;
2573 case TMEMC_SAVE_BEGIN:
2574 case TMEMC_RESTORE_BEGIN:
2575 case TMEMC_SAVE_GET_VERSION:
2576 case TMEMC_SAVE_GET_MAXPOOLS:
2577 case TMEMC_SAVE_GET_CLIENT_WEIGHT:
2578 case TMEMC_SAVE_GET_CLIENT_CAP:
2579 case TMEMC_SAVE_GET_CLIENT_FLAGS:
2580 case TMEMC_SAVE_GET_POOL_FLAGS:
2581 case TMEMC_SAVE_GET_POOL_NPAGES:
2582 case TMEMC_SAVE_GET_POOL_UUID:
2583 case TMEMC_SAVE_END:
2584 ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
2585 op->u.ctrl.buf,op->u.ctrl.arg1);
2586 break;
2587 case TMEMC_SAVE_GET_NEXT_PAGE:
2588 ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
2589 op->u.ctrl.buf, op->u.ctrl.arg1);
2590 break;
2591 case TMEMC_SAVE_GET_NEXT_INV:
2592 ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, op->u.ctrl.buf,
2593 op->u.ctrl.arg1);
2594 break;
2595 case TMEMC_RESTORE_PUT_PAGE:
2596 ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
2597 oidp, op->u.ctrl.arg2,
2598 op->u.ctrl.buf, op->u.ctrl.arg1);
2599 break;
2600 case TMEMC_RESTORE_FLUSH_PAGE:
2601 ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
2602 oidp, op->u.ctrl.arg2);
2603 break;
2604 default:
2605 ret = -1;
2607 return ret;
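/*
 * Worked example for TMEMC_QUERY_FREEABLE_MB above, assuming 4KiB pages
 * (PAGE_SHIFT == 12): MB = pages >> (20 - 12) = pages >> 8, i.e. 256
 * pages per MB, so tmh_freeable_pages() == 1000 reports 3 MB (rounding
 * down).
 */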
2610 /************ EXPORTed FUNCTIONS **************************************/
2612 EXPORT long do_tmem_op(tmem_cli_op_t uops)
2614 struct tmem_op op;
2615 client_t *client = tmh_client_from_current();
2616 pool_t *pool = NULL;
2617 OID *oidp;
2618 int rc = 0;
2619 bool_t succ_get = 0, succ_put = 0;
2620 bool_t non_succ_get = 0, non_succ_put = 0;
2621 bool_t flush = 0, flush_obj = 0;
2622 bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
2623 DECL_LOCAL_CYC_COUNTER(succ_get);
2624 DECL_LOCAL_CYC_COUNTER(succ_put);
2625 DECL_LOCAL_CYC_COUNTER(non_succ_get);
2626 DECL_LOCAL_CYC_COUNTER(non_succ_put);
2627 DECL_LOCAL_CYC_COUNTER(flush);
2628 DECL_LOCAL_CYC_COUNTER(flush_obj);
2630 if ( !tmem_initialized )
2631 return -ENODEV;
2633 total_tmem_ops++;
2635 if ( tmh_lock_all )
2637 if ( tmh_lock_all > 1 )
2638 spin_lock_irq(&tmem_spinlock);
2639 else
2640 spin_lock(&tmem_spinlock);
2643 START_CYC_COUNTER(succ_get);
2644 DUP_START_CYC_COUNTER(succ_put,succ_get);
2645 DUP_START_CYC_COUNTER(non_succ_get,succ_get);
2646 DUP_START_CYC_COUNTER(non_succ_put,succ_get);
2647 DUP_START_CYC_COUNTER(flush,succ_get);
2648 DUP_START_CYC_COUNTER(flush_obj,succ_get);
2650 if ( client != NULL && tmh_client_is_dying(client) )
2652 rc = -ENODEV;
2653 goto out;
2656 if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
2658 printk("tmem: can't get tmem struct from %s\n",client_str);
2659 rc = -EFAULT;
2660 goto out;
2663 if ( op.cmd == TMEM_CONTROL )
2665 tmem_write_lock(&tmem_rwlock);
2666 tmem_write_lock_set = 1;
2667 rc = do_tmem_control(&op);
2668 goto out;
2669 } else if ( op.cmd == TMEM_AUTH ) {
2670 tmem_write_lock(&tmem_rwlock);
2671 tmem_write_lock_set = 1;
2672 rc = tmemc_shared_pool_auth(op.u.creat.arg1,op.u.creat.uuid[0],
2673 op.u.creat.uuid[1],op.u.creat.flags);
2674 goto out;
2675 } else if ( op.cmd == TMEM_RESTORE_NEW ) {
2676 tmem_write_lock(&tmem_rwlock);
2677 tmem_write_lock_set = 1;
2678 rc = do_tmem_new_pool(op.u.creat.arg1, op.pool_id, op.u.creat.flags,
2679 op.u.creat.uuid[0], op.u.creat.uuid[1]);
2680 goto out;
2683 /* create per-client tmem structure dynamically on first use by client */
2684 if ( client == NULL )
2686 tmem_write_lock(&tmem_rwlock);
2687 tmem_write_lock_set = 1;
2688 if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL )
2690 printk("tmem: can't create tmem structure for %s\n",client_str);
2691 rc = -ENOMEM;
2692 goto out;
2696 if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
2698 if ( !tmem_write_lock_set )
2700 tmem_write_lock(&tmem_rwlock);
2701 tmem_write_lock_set = 1;
2704 else
2706 if ( !tmem_write_lock_set )
2708 tmem_read_lock(&tmem_rwlock);
2709 tmem_read_lock_set = 1;
2711 if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
2712 ((pool = client->pools[op.pool_id]) == NULL) )
2714 rc = -ENODEV;
2715 printk("tmem: operation requested on uncreated pool\n");
2716 goto out;
2718 ASSERT_SENTINEL(pool,POOL);
2721 oidp = (OID *)&op.u.gen.oid[0];
2722 switch ( op.cmd )
2724 case TMEM_NEW_POOL:
2725 rc = do_tmem_new_pool(CLI_ID_NULL, 0, op.u.creat.flags,
2726 op.u.creat.uuid[0], op.u.creat.uuid[1]);
2727 break;
2728 case TMEM_NEW_PAGE:
2729 tmem_ensure_avail_pages();
2730 rc = do_tmem_put(pool, oidp,
2731 op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
2732 break;
2733 case TMEM_PUT_PAGE:
2734 tmem_ensure_avail_pages();
2735 rc = do_tmem_put(pool, oidp,
2736 op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
2737 if (rc == 1) succ_put = 1;
2738 else non_succ_put = 1;
2739 break;
2740 case TMEM_GET_PAGE:
2741 rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
2742 0, 0, PAGE_SIZE, 0);
2743 if (rc == 1) succ_get = 1;
2744 else non_succ_get = 1;
2745 break;
2746 case TMEM_FLUSH_PAGE:
2747 flush = 1;
2748 rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
2749 break;
2750 case TMEM_FLUSH_OBJECT:
2751 rc = do_tmem_flush_object(pool, oidp);
2752 flush_obj = 1;
2753 break;
2754 case TMEM_DESTROY_POOL:
2755 flush = 1;
2756 rc = do_tmem_destroy_pool(op.pool_id);
2757 break;
2758 case TMEM_READ:
2759 rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
2760 op.u.gen.tmem_offset, op.u.gen.pfn_offset,
2761 op.u.gen.len,0);
2762 break;
2763 case TMEM_WRITE:
2764 rc = do_tmem_put(pool, oidp,
2765 op.u.gen.index, op.u.gen.cmfn,
2766 op.u.gen.tmem_offset, op.u.gen.pfn_offset,
2767 op.u.gen.len, NULL);
2768 break;
2769 case TMEM_XCHG:
2770 /* need to hold global lock to ensure xchg is atomic */
2771 printk("tmem_xchg op not implemented yet\n");
2772 rc = 0;
2773 break;
2774 default:
2775 printk("tmem: op %d not implemented\n", op.cmd);
2776 rc = 0;
2777 break;
2780 out:
2781 if ( rc < 0 )
2782 errored_tmem_ops++;
2783 if ( succ_get )
2784 END_CYC_COUNTER_CLI(succ_get,client);
2785 else if ( succ_put )
2786 END_CYC_COUNTER_CLI(succ_put,client);
2787 else if ( non_succ_get )
2788 END_CYC_COUNTER_CLI(non_succ_get,client);
2789 else if ( non_succ_put )
2790 END_CYC_COUNTER_CLI(non_succ_put,client);
2791 else if ( flush )
2792 END_CYC_COUNTER_CLI(flush,client);
2793 else if ( flush_obj )
2794 END_CYC_COUNTER_CLI(flush_obj,client);
2796 if ( tmh_lock_all )
2798 if ( tmh_lock_all > 1 )
2799 spin_unlock_irq(&tmem_spinlock);
2800 else
2801 spin_unlock(&tmem_spinlock);
2802 } else {
2803 if ( tmem_write_lock_set )
2804 write_unlock(&tmem_rwlock);
2805 else if ( tmem_read_lock_set )
2806 read_unlock(&tmem_rwlock);
2807 else
2808 ASSERT(rc < 0); /* only the early error exits arrive here with no lock held */
2811 return rc;
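/*
 * Locking summary for do_tmem_op above (descriptive note): with
 * tmh_lock_all, one global spinlock serializes everything (irq-safe when
 * tmh_lock_all > 1). Otherwise control/auth/pool-create/pool-destroy
 * paths take tmem_rwlock for writing, while put/get/flush data paths
 * take it for reading, so data operations from different clients can
 * proceed concurrently.
 */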
2814 /* this should be called when the host is destroying a client */
2815 EXPORT void tmem_destroy(void *v)
2817 client_t *client = (client_t *)v;
2819 if ( client == NULL )
2820 return;
2822 if ( !tmh_client_is_dying(client) )
2824 printk("tmem: tmem_destroy can only destroy dying client\n");
2825 return;
2828 if ( tmh_lock_all )
2829 spin_lock(&tmem_spinlock);
2830 else
2831 write_lock(&tmem_rwlock);
2833 printk("tmem: flushing tmem pools for %s=%d\n",
2834 cli_id_str, client->cli_id);
2835 client_flush(client, 1);
2837 if ( tmh_lock_all )
2838 spin_unlock(&tmem_spinlock);
2839 else
2840 write_unlock(&tmem_rwlock);
2843 /* freezing all pools guarantees that no additional memory will be consumed */
2844 EXPORT void tmem_freeze_all(unsigned char key)
2846 static int freeze = 0;
2848 if ( tmh_lock_all )
2849 spin_lock(&tmem_spinlock);
2850 else
2851 write_lock(&tmem_rwlock);
2853 freeze = !freeze;
2854 tmemc_freeze_pools(CLI_ID_NULL,freeze);
2856 if ( tmh_lock_all )
2857 spin_unlock(&tmem_spinlock);
2858 else
2859 write_unlock(&tmem_rwlock);
2862 #define MAX_EVICTS 10 /* should be tunable, e.g. via a TMEMC_ control op */
2864 EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2866 pfp_t *pfp;
2867 unsigned long evicts_per_relinq = 0;
2868 int max_evictions = MAX_EVICTS;
2870 if (!tmh_enabled() || !tmh_freeable_pages())
2871 return NULL;
2872 #ifdef __i386__
2873 return NULL;
2874 #endif
2876 relinq_attempts++;
2877 if ( order > 0 )
2879 #ifndef NDEBUG
2880 printk("tmem_relinquish_pages: failing order=%d\n", order);
2881 #endif
2882 return NULL;
2885 if ( tmh_called_from_tmem(memflags) )
2887 if ( tmh_lock_all )
2888 spin_lock(&tmem_spinlock);
2889 else
2890 read_lock(&tmem_rwlock);
2893 while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
2895 if ( (max_evictions-- <= 0) || !tmem_evict())
2896 break;
2897 evicts_per_relinq++;
2899 if ( evicts_per_relinq > max_evicts_per_relinq )
2900 max_evicts_per_relinq = evicts_per_relinq;
2901 tmh_scrub_page(pfp, memflags);
2902 if ( pfp != NULL )
2903 relinq_pgs++;
2905 if ( tmh_called_from_tmem(memflags) )
2907 if ( tmh_lock_all )
2908 spin_unlock(&tmem_spinlock);
2909 else
2910 read_unlock(&tmem_rwlock);
2913 return pfp;
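/*
 * Usage sketch (illustrative; the exact call site lives in the host
 * allocator, not here): a failed single-page allocation can fall back to
 * tmem, which evicts at most MAX_EVICTS ephemeral pages while trying to
 * free one page and always fails order > 0 requests.
 *
 *   if ( (pg = alloc_domheap_page(d, memflags)) == NULL )
 *       pg = tmem_relinquish_pages(0, memflags);   // order 0 only
 */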
2916 EXPORT unsigned long tmem_freeable_pages(void)
2918 return tmh_freeable_pages();
2921 /* called at hypervisor startup */
2922 static int __init init_tmem(void)
2924 int i;
2925 if ( !tmh_enabled() )
2926 return 0;
2928 radix_tree_init();
2929 if ( tmh_dedup_enabled() )
2930 for (i = 0; i < 256; i++ )
2932 pcd_tree_roots[i] = RB_ROOT;
2933 rwlock_init(&pcd_tree_rwlocks[i]);
2936 if ( tmh_init() )
2938 printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n",
2939 tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enabled(),
2940 tmh_lock_all);
2941 if ( tmh_dedup_enabled() && tmh_compression_enabled() && tmh_tze_enabled() )
2943 tmh_tze_disable();
2944 printk("tmem: tze and compression not compatible, disabling tze\n");
2946 tmem_initialized = 1;
2948 else
2949 printk("tmem: initialization FAILED\n");
2951 return 0;
2953 __initcall(init_tmem);
2955 /*
2956 * Local variables:
2957 * mode: C
2958 * c-set-style: "BSD"
2959 * c-basic-offset: 4
2960 * tab-width: 4
2961 * indent-tabs-mode: nil
2962 * End:
2963 */