debuggers.hg

view xen/common/tmem.c @ 21005:87f1e5b7660b

tmem: Quieten noisy printk in non-debug build

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Feb 22 10:02:17 2010 +0000 (2010-02-22)
parents a3fa6d444b25
children bfaafdddf31a
line source
1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
9 /* TODO list: 090129
10 - improve on reclamation policy
11 - use different tlsf pools for each client (maybe each pool)
12 - implement page accounting and minimal QoS limits
13 - test shared access more completely (need pv cluster fs)
14 - add feedback-driven compression (not for persistent pools though!)
15 - add data-structure total bytes overhead stats
16 */
18 #ifdef __XEN__
19 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
20 #endif
22 #include <xen/tmem.h>
23 #include <xen/rbtree.h>
24 #include <xen/radix-tree.h>
25 #include <xen/list.h>
27 #define EXPORT /* indicates code other modules are dependent upon */
28 #define FORWARD
/* Protocol version advertised by this implementation. */
30 #define TMEM_SPEC_VERSION 0
32 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
34 #define CLI_ID_NULL TMH_CLI_ID_NULL
35 #define cli_id_str tmh_cli_id_str
36 #define client_str tmh_client_str
38 /************ DEBUG and STATISTICS (+ some compression testing) *******/
/* Debug builds stamp a magic "sentinel" word into each major structure
 * (pool/obj/objnode/pgp) so stale or mistyped pointers trip an ASSERT;
 * release (NDEBUG) builds compile all of this away to no-ops. */
40 #ifndef NDEBUG
41 #define SENTINELS
42 #define NOINLINE noinline
43 #else
44 #define NOINLINE
45 #endif
47 #ifdef SENTINELS
48 #define DECL_SENTINEL unsigned long sentinel;
49 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
/* INVERT_SENTINEL marks a structure as freed; ASSERT_SENTINEL checks
 * both "not already freed" and "is the expected structure type". */
50 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
51 #define ASSERT_SENTINEL(_x,_y) \
52 ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
/* 32-bit builds get 32-bit magic values, 64-bit builds get 64-bit ones. */
53 #ifdef __i386__
54 #define POOL_SENTINEL 0x87658765
55 #define OBJ_SENTINEL 0x12345678
56 #define OBJNODE_SENTINEL 0xfedcba09
57 #define PGD_SENTINEL 0x43214321
58 #else
59 #define POOL_SENTINEL 0x8765876587658765
60 #define OBJ_SENTINEL 0x1234567812345678
61 #define OBJNODE_SENTINEL 0xfedcba0987654321
62 #define PGD_SENTINEL 0x4321432143214321
63 #endif
64 #else
65 #define DECL_SENTINEL
66 #define SET_SENTINEL(_x,_y) do { } while (0)
67 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
68 #define INVERT_SENTINEL(_x,_y) do { } while (0)
69 #endif
71 /* global statistics (none need to be locked) */
72 static unsigned long total_tmem_ops = 0;
73 static unsigned long errored_tmem_ops = 0;
74 static unsigned long total_flush_pool = 0;
75 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
76 static unsigned long evicted_pgs = 0, evict_attempts = 0;
77 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
78 static unsigned long max_evicts_per_relinq = 0;
79 static unsigned long low_on_memory = 0;
/* high-water marks for the corresponding global_* counters below */
80 static int global_obj_count_max = 0;
81 static int global_pgp_count_max = 0;
82 static int global_page_count_max = 0;
83 static int global_rtree_node_count_max = 0;
84 static long global_eph_count_max = 0;
85 static unsigned long failed_copies;
/* cycle counters profiling the main tmem operations */
87 DECL_CYC_COUNTER(succ_get);
88 DECL_CYC_COUNTER(succ_put);
89 DECL_CYC_COUNTER(non_succ_get);
90 DECL_CYC_COUNTER(non_succ_put);
91 DECL_CYC_COUNTER(flush);
92 DECL_CYC_COUNTER(flush_obj);
93 #ifdef COMPARE_COPY_PAGE_SSE2
94 EXTERN_CYC_COUNTER(pg_copy1);
95 EXTERN_CYC_COUNTER(pg_copy2);
96 EXTERN_CYC_COUNTER(pg_copy3);
97 EXTERN_CYC_COUNTER(pg_copy4);
98 #else
99 EXTERN_CYC_COUNTER(pg_copy);
100 #endif
101 DECL_CYC_COUNTER(compress);
102 DECL_CYC_COUNTER(decompress);
104 /************ CORE DATA STRUCTURES ************************************/
106 #define MAX_POOLS_PER_DOMAIN 16
107 #define MAX_GLOBAL_SHARED_POOLS 16
109 struct tm_pool;
110 struct tmem_page_descriptor;
/* Per-domain tmem state: the domain's pools, ephemeral-page accounting,
 * policy knobs (weight/cap/compress/frozen), save/restore state, and
 * statistics. */
111 struct client {
112 struct list_head client_list;
113 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
114 tmh_client_t *tmh;
115 struct list_head ephemeral_page_list;
116 long eph_count, eph_count_max;
117 cli_id_t cli_id;
118 uint32_t weight;
119 uint32_t cap;
120 bool_t compress;
121 bool_t frozen;
122 bool_t shared_auth_required;
123 /* for save/restore/migration */
124 bool_t live_migrating;
125 bool_t was_frozen;
126 struct list_head persistent_invalidated_list;
127 struct tmem_page_descriptor *cur_pgp;
128 /* statistics collection */
129 unsigned long compress_poor, compress_nomem;
130 unsigned long compressed_pages;
131 uint64_t compressed_sum_size;
132 uint64_t total_cycles;
133 unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
134 /* shared pool authentication */
135 uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
136 };
137 typedef struct client client_t;
/* one entry per client currently sharing a given shared pool */
139 struct share_list {
140 struct list_head share_list;
141 client_t *client;
142 };
143 typedef struct share_list sharelist_t;
145 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
146 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
147 #define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
/* A tmem pool: a collection of objects hashed into per-bucket rbtrees;
 * either private or shared, ephemeral or persistent. */
149 struct tm_pool {
150 bool_t shared;
151 bool_t persistent;
152 bool_t is_dying;
153 int pageshift; /* 0 == 2**12 */
154 struct list_head pool_list; /* FIXME do we need this anymore? */
155 client_t *client;
156 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
157 uint32_t pool_id;
158 rwlock_t pool_rwlock;
159 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
160 struct list_head share_list; /* valid if shared */
161 int shared_count; /* valid if shared */
162 /* for save/restore/migration */
163 struct list_head persistent_page_list;
164 struct tmem_page_descriptor *cur_pgp;
165 /* statistics collection */
166 atomic_t pgp_count;
167 int pgp_count_max;
168 long obj_count; /* atomicity depends on pool_rwlock held for write */
169 long obj_count_max;
170 unsigned long objnode_count, objnode_count_max;
171 uint64_t sum_life_cycles;
172 uint64_t sum_evicted_cycles;
173 unsigned long puts, good_puts, no_mem_puts;
174 unsigned long dup_puts_flushed, dup_puts_replaced;
175 unsigned long gets, found_gets;
176 unsigned long flushs, flushs_found;
177 unsigned long flush_objs, flush_objs_found;
178 DECL_SENTINEL
179 };
180 typedef struct tm_pool pool_t;
182 #define is_persistent(_p) (_p->persistent)
183 #define is_ephemeral(_p) (!(_p->persistent))
184 #define is_shared(_p) (_p->shared)
185 #define is_private(_p) (!(_p->shared))
/* An object: a radix tree of pages keyed by page index, identified by
 * a 64-bit oid within its pool. */
187 struct tmem_object_root {
188 DECL_SENTINEL
189 uint64_t oid;
190 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
191 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
192 long pgp_count; /* atomicity depends on obj_spinlock */
193 struct radix_tree_root tree_root; /* tree of pages within object */
194 pool_t *pool;
195 cli_id_t last_client;
196 spinlock_t obj_spinlock;
197 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
198 };
199 typedef struct tmem_object_root obj_t;
201 typedef struct radix_tree_node rtn_t;
/* wrapper giving each radix-tree node an owning object and a sentinel */
202 struct tmem_object_node {
203 obj_t *obj;
204 DECL_SENTINEL
205 rtn_t rtn;
206 };
207 typedef struct tmem_object_node objnode_t;
/* A page descriptor: one client page stored in tmem, held either as a
 * raw page frame (pfp) or as compressed data (cdata); the unions are
 * overlaid depending on list membership and pool type. */
209 struct tmem_page_descriptor {
210 union {
211 struct list_head global_eph_pages;
212 struct list_head client_inv_pages;
213 };
214 union {
215 struct list_head client_eph_pages;
216 struct list_head pool_pers_pages;
217 };
218 union {
219 obj_t *obj;
220 uint64_t inv_oid; /* used for invalid list only */
221 };
222 uint32_t index;
223 size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
224 else compressed data (cdata) */
225 union {
226 pfp_t *pfp; /* page frame pointer */
227 char *cdata; /* compressed data */
228 };
229 union {
230 uint64_t timestamp;
231 uint32_t pool_id; /* used for invalid list only */
232 };
233 DECL_SENTINEL
234 };
235 typedef struct tmem_page_descriptor pgp_t;
237 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
239 static LIST_HEAD(global_client_list);
240 static LIST_HEAD(global_pool_list);
/* table of shared pools visible across clients, indexed by shared pool id */
242 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
243 static bool_t global_shared_auth = 0;
244 static atomic_t client_weight_total = ATOMIC_INIT(0);
245 static int tmem_initialized = 0;
247 /************ CONCURRENCY ***********************************************/
/* Two locking modes: with tmh_lock_all, a single global spinlock
 * serialises all of tmem and the fine-grained lock macros below become
 * no-ops; otherwise the rwlock plus per-list spinlocks are used. */
249 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
250 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
251 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
252 static DEFINE_SPINLOCK(pers_lists_spinlock);
254 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
255 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
256 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
257 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
258 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
259 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
260 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
261 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
263 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
264 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
266 /* global counters (should use long_atomic_t access) */
267 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
268 static atomic_t global_obj_count = ATOMIC_INIT(0);
269 static atomic_t global_pgp_count = ATOMIC_INIT(0);
270 static atomic_t global_page_count = ATOMIC_INIT(0);
271 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
/* bump an atomic counter and track its high-water mark in <name>_max */
273 #define atomic_inc_and_max(_c) do { \
274 atomic_inc(&_c); \
275 if ( _atomic_read(_c) > _c##_max ) \
276 _c##_max = _atomic_read(_c); \
277 } while (0)
/* decrement an atomic counter, asserting it never goes negative */
279 #define atomic_dec_and_assert(_c) do { \
280 atomic_dec(&_c); \
281 ASSERT(_atomic_read(_c) >= 0); \
282 } while (0)
285 /************ MEMORY ALLOCATION INTERFACE *****************************/
/* typed allocation with natural alignment, from the given pool (or the
 * general heap when _pool is NULL) */
287 #define tmem_malloc(_type,_pool) \
288 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
290 #define tmem_malloc_bytes(_size,_pool) \
291 _tmem_malloc(_size, 1, _pool)
293 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
294 {
295 void *v;
297 if ( (pool != NULL) && is_persistent(pool) )
298 v = tmh_alloc_subpage_thispool(pool,size,align);
299 else
300 v = tmh_alloc_subpage(pool, size, align);
301 if ( v == NULL )
302 alloc_failed++;
303 return v;
304 }
306 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
307 {
308 if ( pool == NULL || !is_persistent(pool) )
309 tmh_free_subpage(p,size);
310 else
311 tmh_free_subpage_thispool(pool,p,size);
312 }
314 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
315 {
316 pfp_t *pfp = NULL;
318 if ( pool != NULL && is_persistent(pool) )
319 pfp = tmh_alloc_page_thispool(pool);
320 else
321 pfp = tmh_alloc_page(pool,0);
322 if ( pfp == NULL )
323 alloc_page_failed++;
324 else
325 atomic_inc_and_max(global_page_count);
326 return pfp;
327 }
329 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
330 {
331 ASSERT(pfp);
332 if ( pool == NULL || !is_persistent(pool) )
333 tmh_free_page(pfp);
334 else
335 tmh_free_page_thispool(pool,pfp);
336 atomic_dec_and_assert(global_page_count);
337 }
339 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
341 /* allocate a pgp_t and associate it with an object */
342 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
343 {
344 pgp_t *pgp;
345 pool_t *pool;
347 ASSERT(obj != NULL);
348 ASSERT(obj->pool != NULL);
349 pool = obj->pool;
/* allocate from the object's pool so persistent pools self-account */
350 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
351 return NULL;
352 pgp->obj = obj;
353 INIT_LIST_HEAD(&pgp->global_eph_pages);
354 INIT_LIST_HEAD(&pgp->client_eph_pages);
355 pgp->pfp = NULL;
/* size == -1 flags "no data attached yet" (see tmem_page_descriptor) */
356 pgp->size = -1;
357 pgp->index = -1;
358 pgp->timestamp = get_cycles();
359 SET_SENTINEL(pgp,PGD);
360 atomic_inc_and_max(global_pgp_count);
361 atomic_inc_and_max(pool->pgp_count);
362 return pgp;
363 }
/* look up the page descriptor stored at index in obj's radix tree;
 * returns NULL if no page is present; caller must hold obj_spinlock */
365 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
366 {
367 ASSERT(obj != NULL);
368 ASSERT_SPINLOCK(&obj->obj_spinlock);
369 ASSERT_SENTINEL(obj,OBJ);
370 ASSERT(obj->pool != NULL);
371 ASSERT_SENTINEL(obj->pool,POOL);
372 return radix_tree_lookup(&obj->tree_root, index);
373 }
/* release the data page (or compressed buffer) attached to pgp, updating
 * compression statistics; the descriptor itself is not freed */
375 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
376 {
377 if ( pgp->pfp == NULL )
378 return;
/* size == 0 means a whole uncompressed page; nonzero means size bytes
 * of compressed data in cdata.
 * NOTE(review): the page branch frees via pgp->obj->pool while the
 * compressed branch uses the pool argument — callers appear to pass
 * pgp->obj->pool, but confirm before relying on the distinction. */
379 if ( !pgp->size )
380 tmem_page_free(pgp->obj->pool,pgp->pfp);
381 else
382 {
383 tmem_free(pgp->cdata,pgp->size,pool);
384 if ( pool != NULL )
385 {
386 pool->client->compressed_pages--;
387 pool->client->compressed_sum_size -= pgp->size;
388 }
389 }
390 pgp->pfp = NULL;
391 pgp->size = -1;
392 }
/* release a page descriptor and its data; if the pool is persistent and
 * its client is live-migrating, the descriptor is kept (tagged with
 * oid/pool_id) so the migration code can later replay the invalidation */
394 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
395 {
396 pool_t *pool = NULL;
398 ASSERT_SENTINEL(pgp,PGD);
399 ASSERT(pgp->obj != NULL);
400 ASSERT_SENTINEL(pgp->obj,OBJ);
401 ASSERT_SENTINEL(pgp->obj->pool,POOL);
402 ASSERT(pgp->obj->pool->client != NULL);
403 if ( from_delete )
404 ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
405 ASSERT(pgp->obj->pool != NULL);
406 pool = pgp->obj->pool;
407 if ( is_ephemeral(pool) )
408 {
/* caller (pgp_delist) must already have removed us from the eph lists */
409 ASSERT(list_empty(&pgp->global_eph_pages));
410 ASSERT(list_empty(&pgp->client_eph_pages));
411 }
412 pgp_free_data(pgp, pool);
413 atomic_dec_and_assert(global_pgp_count);
414 atomic_dec_and_assert(pool->pgp_count);
415 pgp->size = -1;
416 if ( is_persistent(pool) && pool->client->live_migrating )
417 {
/* obj pointer is overlaid by inv_oid (union), so record the ids now;
 * final free happens later via pgp_free_from_inv_list */
418 pgp->inv_oid = pgp->obj->oid;
419 pgp->pool_id = pool->pool_id;
420 return;
421 }
422 INVERT_SENTINEL(pgp,PGD);
423 pgp->obj = NULL;
424 pgp->index = -1;
425 tmem_free(pgp,sizeof(pgp_t),pool);
426 }
/* final free of a descriptor parked on the persistent_invalidated_list
 * once live migration is done with it; pool is recovered via the
 * pool_id recorded by pgp_free */
428 static NOINLINE void pgp_free_from_inv_list(client_t *client, pgp_t *pgp)
429 {
430 pool_t *pool = client->pools[pgp->pool_id];
432 ASSERT_SENTINEL(pool,POOL);
433 ASSERT_SENTINEL(pgp,PGD);
434 INVERT_SENTINEL(pgp,PGD);
435 pgp->obj = NULL;
436 pgp->index = -1;
437 tmem_free(pgp,sizeof(pgp_t),pool);
438 }
440 /* remove the page from appropriate lists but not from parent object */
441 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
442 {
443 client_t *client;
445 ASSERT(pgp != NULL);
446 ASSERT(pgp->obj != NULL);
447 ASSERT(pgp->obj->pool != NULL);
448 client = pgp->obj->pool->client;
449 ASSERT(client != NULL);
450 if ( is_ephemeral(pgp->obj->pool) )
451 {
/* no_eph_lock is set when the caller (eviction path) already holds
 * eph_lists_spinlock */
452 if ( !no_eph_lock )
453 tmem_spin_lock(&eph_lists_spinlock);
454 if ( !list_empty(&pgp->client_eph_pages) )
455 client->eph_count--;
456 ASSERT(client->eph_count >= 0);
457 list_del_init(&pgp->client_eph_pages);
458 if ( !list_empty(&pgp->global_eph_pages) )
459 global_eph_count--;
460 ASSERT(global_eph_count >= 0);
461 list_del_init(&pgp->global_eph_pages);
462 if ( !no_eph_lock )
463 tmem_spin_unlock(&eph_lists_spinlock);
464 } else {
465 if ( client->live_migrating )
466 {
/* park invalidated persistent pages for the migration code; do not
 * unlink the pool's current iteration point (cur_pgp) */
467 tmem_spin_lock(&pers_lists_spinlock);
468 list_add_tail(&pgp->client_inv_pages,
469 &client->persistent_invalidated_list);
470 if ( pgp != pgp->obj->pool->cur_pgp )
471 list_del_init(&pgp->pool_pers_pages);
472 tmem_spin_unlock(&pers_lists_spinlock);
473 } else {
474 tmem_spin_lock(&pers_lists_spinlock);
475 list_del_init(&pgp->pool_pers_pages);
476 tmem_spin_unlock(&pers_lists_spinlock);
477 }
478 }
479 }
481 /* remove page from lists (but not from parent object) and free it */
482 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
483 {
484 uint64_t life;
486 ASSERT(pgp != NULL);
487 ASSERT(pgp->obj != NULL);
488 ASSERT(pgp->obj->pool != NULL);
489 life = get_cycles() - pgp->timestamp;
490 pgp->obj->pool->sum_life_cycles += life;
491 pgp_delist(pgp, no_eph_lock);
492 pgp_free(pgp,1);
493 }
495 /* called only indirectly by radix_tree_destroy */
496 static NOINLINE void pgp_destroy(void *v)
497 {
498 pgp_t *pgp = (pgp_t *)v;
500 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
501 pgp_delist(pgp,0);
502 ASSERT(pgp->obj != NULL);
/* parent object still counts this page; drop the count before freeing */
503 pgp->obj->pgp_count--;
504 ASSERT(pgp->obj->pgp_count >= 0);
505 pgp_free(pgp,0);
506 }
508 FORWARD static rtn_t *rtn_alloc(void *arg);
509 FORWARD static void rtn_free(rtn_t *rtn);
511 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
512 {
513 int ret;
515 ASSERT_SPINLOCK(&obj->obj_spinlock);
516 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
517 if ( !ret )
518 obj->pgp_count++;
519 return ret;
520 }
/* detach and return the page descriptor at index, or NULL if absent;
 * caller must hold obj_spinlock and takes ownership of the result */
522 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
523 {
524 pgp_t *pgp;
526 ASSERT(obj != NULL);
527 ASSERT_SPINLOCK(&obj->obj_spinlock);
528 ASSERT_SENTINEL(obj,OBJ);
529 ASSERT(obj->pool != NULL);
530 ASSERT_SENTINEL(obj->pool,POOL);
531 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
532 if ( pgp != NULL )
533 obj->pgp_count--;
534 ASSERT(obj->pgp_count >= 0);
536 return pgp;
537 }
539 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
541 /* called only indirectly from radix_tree_insert */
542 static NOINLINE rtn_t *rtn_alloc(void *arg)
543 {
544 objnode_t *objnode;
545 obj_t *obj = (obj_t *)arg;
547 ASSERT_SENTINEL(obj,OBJ);
548 ASSERT(obj->pool != NULL);
549 ASSERT_SENTINEL(obj->pool,POOL);
/* the tree node is embedded in an objnode_t so it can be accounted to
 * the owning object/pool and sentinel-checked in debug builds */
550 objnode = tmem_malloc(objnode_t,obj->pool);
551 if (objnode == NULL)
552 return NULL;
553 objnode->obj = obj;
554 SET_SENTINEL(objnode,OBJNODE);
555 memset(&objnode->rtn, 0, sizeof(rtn_t));
556 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
557 obj->pool->objnode_count_max = obj->pool->objnode_count;
558 atomic_inc_and_max(global_rtree_node_count);
559 obj->objnode_count++;
560 return &objnode->rtn;
561 }
563 /* called only indirectly from radix_tree_delete/destroy */
564 static void rtn_free(rtn_t *rtn)
565 {
566 pool_t *pool;
567 objnode_t *objnode;
568 int i;
570 ASSERT(rtn != NULL);
/* the tree code only frees empty nodes; verify that in debug builds */
571 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
572 ASSERT(rtn->slots[i] == NULL);
/* recover the enclosing objnode_t to unwind the accounting done in
 * rtn_alloc */
573 objnode = container_of(rtn,objnode_t,rtn);
574 ASSERT_SENTINEL(objnode,OBJNODE);
575 INVERT_SENTINEL(objnode,OBJNODE);
576 ASSERT(objnode->obj != NULL);
577 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
578 ASSERT_SENTINEL(objnode->obj,OBJ);
579 pool = objnode->obj->pool;
580 ASSERT(pool != NULL);
581 ASSERT_SENTINEL(pool,POOL);
582 pool->objnode_count--;
583 objnode->obj->objnode_count--;
584 objnode->obj = NULL;
585 tmem_free(objnode,sizeof(objnode_t),pool);
586 atomic_dec_and_assert(global_rtree_node_count);
587 }
589 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
591 /* searches for object==oid in pool, returns locked object if found */
592 static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
593 {
594 struct rb_node *node;
595 obj_t *obj;
597 restart_find:
598 tmem_read_lock(&pool->pool_rwlock);
/* binary search within the rbtree of the bucket this oid hashes to */
599 node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
600 while ( node )
601 {
602 obj = container_of(node, obj_t, rb_tree_node);
603 if ( obj->oid == oid )
604 {
/* under the global lock, no_evict pseudo-locks the object; otherwise
 * trylock obj_spinlock and restart from scratch on contention, so we
 * never spin on the object lock while holding the pool rwlock */
605 if ( tmh_lock_all )
606 obj->no_evict = 1;
607 else
608 {
609 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
610 {
611 tmem_read_unlock(&pool->pool_rwlock);
612 goto restart_find;
613 }
614 tmem_read_unlock(&pool->pool_rwlock);
615 }
616 return obj;
617 }
618 else if ( oid < obj->oid )
619 node = node->rb_left;
620 else
621 node = node->rb_right;
622 }
623 tmem_read_unlock(&pool->pool_rwlock);
624 return NULL;
625 }
627 /* free an object that has no more pgps in it */
628 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
629 {
630 pool_t *pool;
631 uint64_t old_oid;
633 ASSERT_SPINLOCK(&obj->obj_spinlock);
634 ASSERT(obj != NULL);
635 ASSERT_SENTINEL(obj,OBJ);
636 ASSERT(obj->pgp_count == 0);
637 pool = obj->pool;
638 ASSERT(pool != NULL);
639 ASSERT(pool->client != NULL);
640 ASSERT_WRITELOCK(&pool->pool_rwlock);
641 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
642 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
643 ASSERT((long)obj->objnode_count == 0);
644 ASSERT(obj->tree_root.rnode == NULL);
645 pool->obj_count--;
646 ASSERT(pool->obj_count >= 0);
647 INVERT_SENTINEL(obj,OBJ);
648 obj->pool = NULL;
/* remember the oid before clearing it: it selects the hash bucket for
 * rb_erase below */
649 old_oid = obj->oid;
650 obj->oid = -1;
651 obj->last_client = CLI_ID_NULL;
652 atomic_dec_and_assert(global_obj_count);
653 /* use no_rebalance only if all objects are being destroyed anyway */
654 if ( !no_rebalance )
655 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
656 tmem_free(obj,sizeof(obj_t),pool);
657 }
659 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
660 {
661 struct rb_node **new, *parent = NULL;
662 obj_t *this;
664 new = &(root->rb_node);
665 while ( *new )
666 {
667 this = container_of(*new, obj_t, rb_tree_node);
668 parent = *new;
669 if ( obj->oid < this->oid )
670 new = &((*new)->rb_left);
671 else if ( obj->oid > this->oid )
672 new = &((*new)->rb_right);
673 else
674 return 0;
675 }
676 rb_link_node(&obj->rb_tree_node, parent, new);
677 rb_insert_color(&obj->rb_tree_node, root);
678 return 1;
679 }
681 /*
682 * allocate, initialize, and insert an tmem_object_root
683 * (should be called only if find failed)
684 */
685 static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
686 {
687 obj_t *obj;
689 ASSERT(pool != NULL);
690 ASSERT_WRITELOCK(&pool->pool_rwlock);
691 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
692 return NULL;
693 pool->obj_count++;
694 if (pool->obj_count > pool->obj_count_max)
695 pool->obj_count_max = pool->obj_count;
696 atomic_inc_and_max(global_obj_count);
697 INIT_RADIX_TREE(&obj->tree_root,0);
698 spin_lock_init(&obj->obj_spinlock);
699 obj->pool = pool;
700 obj->oid = oid;
701 obj->objnode_count = 0;
702 obj->pgp_count = 0;
703 obj->last_client = CLI_ID_NULL;
704 SET_SENTINEL(obj,OBJ);
/* return with the object locked and pinned against eviction */
705 tmem_spin_lock(&obj->obj_spinlock);
706 obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
707 obj->no_evict = 1;
708 ASSERT_SPINLOCK(&obj->obj_spinlock);
709 return obj;
710 }
712 /* free an object after destroying any pgps in it */
713 static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
714 {
715 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
/* reclaim every page descriptor and radix-tree node this object holds */
716 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
717 obj_free(obj,no_rebalance);
718 }
720 /* destroys all objs in a pool, or only if obj->last_client matches cli_id */
721 static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
722 {
723 struct rb_node *node;
724 obj_t *obj;
725 int i;
727 tmem_write_lock(&pool->pool_rwlock);
/* mark the pool dying so concurrent eviction skips its pages */
728 pool->is_dying = 1;
729 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
730 {
731 node = rb_first(&pool->obj_rb_root[i]);
732 while ( node != NULL )
733 {
734 obj = container_of(node, obj_t, rb_tree_node);
735 tmem_spin_lock(&obj->obj_spinlock);
/* advance the iterator before obj_destroy frees the current node */
736 node = rb_next(node);
737 ASSERT(obj->no_evict == 0);
738 if ( !selective )
739 /* FIXME: should be obj,1 but walking/erasing rbtree is racy */
740 obj_destroy(obj,0);
741 else if ( obj->last_client == cli_id )
742 obj_destroy(obj,0);
743 else
744 tmem_spin_unlock(&obj->obj_spinlock);
745 }
746 }
747 tmem_write_unlock(&pool->pool_rwlock);
748 }
751 /************ POOL MANIPULATION ROUTINES ******************************/
/* allocate and initialize an empty pool structure (not yet bound to a
 * client, pool id, or shared/persistent flags); NULL on failure */
753 static pool_t * pool_alloc(void)
754 {
755 pool_t *pool;
756 int i;
758 if ( (pool = tmh_alloc_infra(sizeof(pool_t),__alignof__(pool_t))) == NULL )
759 return NULL;
760 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
761 pool->obj_rb_root[i] = RB_ROOT;
762 INIT_LIST_HEAD(&pool->pool_list);
763 INIT_LIST_HEAD(&pool->persistent_page_list);
764 pool->cur_pgp = NULL;
765 rwlock_init(&pool->pool_rwlock);
766 pool->pgp_count_max = pool->obj_count_max = 0;
767 pool->objnode_count = pool->objnode_count_max = 0;
768 atomic_set(&pool->pgp_count,0);
769 pool->obj_count = 0; pool->shared_count = 0;
/* pageshift is relative to a 4K (2**12) base page */
770 pool->pageshift = PAGE_SHIFT - 12;
771 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
772 pool->dup_puts_replaced = pool->no_mem_puts = 0;
773 pool->found_gets = pool->gets = 0;
774 pool->flushs_found = pool->flushs = 0;
775 pool->is_dying = 0;
776 SET_SENTINEL(pool,POOL);
777 return pool;
778 }
781 static NOINLINE void pool_free(pool_t *pool)
782 {
783 ASSERT_SENTINEL(pool,POOL);
784 INVERT_SENTINEL(pool,POOL);
785 pool->client = NULL;
786 list_del(&pool->pool_list);
787 tmh_free_infra(pool);
788 }
790 /* register new_client as a user of this shared pool and return new
791 total number of registered users */
/* returns -1 if the share-list entry cannot be allocated */
792 static int shared_pool_join(pool_t *pool, client_t *new_client)
793 {
794 sharelist_t *sl;
796 ASSERT(is_shared(pool));
797 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
798 return -1;
799 sl->client = new_client;
800 list_add_tail(&sl->share_list, &pool->share_list);
/* the owner joins its own pool silently; only log additional sharers */
801 if ( new_client->cli_id != pool->client->cli_id )
802 printk("adding new %s %d to shared pool owned by %s %d\n",
803 client_str, new_client->cli_id, client_str, pool->client->cli_id);
804 return ++pool->shared_count;
805 }
807 /* reassign "ownership" of the pool to another client that shares this pool */
808 static NOINLINE void shared_pool_reassign(pool_t *pool)
809 {
810 sharelist_t *sl;
811 int poolid;
812 client_t *old_client = pool->client, *new_client;
814 ASSERT(is_shared(pool));
815 if ( list_empty(&pool->share_list) )
816 {
817 ASSERT(pool->shared_count == 0);
818 return;
819 }
820 old_client->pools[pool->pool_id] = NULL;
/* the first remaining sharer becomes the new owner */
821 sl = list_entry(pool->share_list.next, sharelist_t, share_list);
822 ASSERT(sl->client != old_client);
823 pool->client = new_client = sl->client;
/* find the slot under which the new owner already references this pool */
824 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
825 if (new_client->pools[poolid] == pool)
826 break;
827 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
/* transfer ephemeral page accounting and list membership to new owner */
828 new_client->eph_count += _atomic_read(pool->pgp_count);
829 old_client->eph_count -= _atomic_read(pool->pgp_count);
830 list_splice_init(&old_client->ephemeral_page_list,
831 &new_client->ephemeral_page_list);
832 printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
833 cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
834 pool->pool_id = poolid;
835 }
837 /* destroy all objects with last_client same as passed cli_id,
838 remove pool's cli_id from list of sharers of this pool */
/* returns: >0 = sharers remaining, 0 = last sharer removed, -1 = cli_id
 * was not found on the pool's share list */
839 static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
840 {
841 sharelist_t *sl;
842 int s_poolid;
844 ASSERT(is_shared(pool));
845 ASSERT(pool->client != NULL);
847 ASSERT_WRITELOCK(&tmem_rwlock);
848 pool_destroy_objs(pool,1,cli_id);
849 list_for_each_entry(sl,&pool->share_list, share_list)
850 {
851 if (sl->client->cli_id != cli_id)
852 continue;
853 list_del(&sl->share_list);
854 tmem_free(sl,sizeof(sharelist_t),pool);
855 --pool->shared_count;
/* if the quitting client owned the pool, hand it to another sharer */
856 if (pool->client->cli_id == cli_id)
857 shared_pool_reassign(pool);
858 if (pool->shared_count)
859 return pool->shared_count;
/* last sharer gone: drop the pool from the global shared-pool table */
860 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
861 if ( (global_shared_pools[s_poolid]) == pool )
862 {
863 global_shared_pools[s_poolid] = NULL;
864 break;
865 }
866 return 0;
867 }
868 printk("tmem: no match unsharing pool, %s=%d\n",
869 cli_id_str,pool->client->cli_id);
870 return -1;
871 }
873 /* flush all data (owned by cli_id) from a pool and, optionally, free it */
874 static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
875 {
876 ASSERT(pool != NULL);
/* other clients still share this pool: just detach this client */
877 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
878 {
879 printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
880 cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
881 return;
882 }
883 printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
884 is_persistent(pool) ? "persistent" : "ephemeral" ,
885 is_shared(pool) ? "shared" : "private");
886 printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
/* migration code iterates pools; refuse to pull them out from under it */
887 if ( pool->client->live_migrating )
888 {
889 printk("can't %s pool while %s is live-migrating\n",
890 destroy?"destroy":"flush", client_str);
891 return;
892 }
893 pool_destroy_objs(pool,0,CLI_ID_NULL);
894 if ( destroy )
895 {
896 pool->client->pools[pool->pool_id] = NULL;
897 pool_free(pool);
898 }
899 }
901 /************ CLIENT MANIPULATION OPERATIONS **************************/
/* allocate and initialize per-client tmem state for cli_id and link it
 * onto the global client list; returns NULL on failure */
903 static client_t *client_create(cli_id_t cli_id)
904 {
905 client_t *client = tmh_alloc_infra(sizeof(client_t),__alignof__(client_t));
906 int i;
908 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
909 if ( client == NULL )
910 {
911 printk("failed... out of memory\n");
912 return NULL;
913 }
914 memset(client,0,sizeof(client_t));
915 if ( (client->tmh = tmh_client_init(cli_id)) == NULL )
916 {
917 printk("failed... can't allocate host-dependent part of client\n");
918 if ( client )
919 tmh_free_infra(client);
920 return NULL;
921 }
922 tmh_set_client_from_id(client, client->tmh, cli_id);
923 client->cli_id = cli_id;
/* compression forced off on 32-bit builds */
924 #ifdef __i386__
925 client->compress = 0;
926 #else
927 client->compress = tmh_compression_enabled();
928 #endif
929 client->shared_auth_required = tmh_shared_auth();
/* -1 marks an unused shared-auth uuid slot */
930 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
931 client->shared_auth_uuid[i][0] =
932 client->shared_auth_uuid[i][1] = -1L;
933 client->frozen = 0; client->live_migrating = 0;
934 client->weight = 0; client->cap = 0;
935 list_add_tail(&client->client_list, &global_client_list);
936 INIT_LIST_HEAD(&client->ephemeral_page_list);
937 INIT_LIST_HEAD(&client->persistent_invalidated_list);
938 client->cur_pgp = NULL;
939 client->eph_count = client->eph_count_max = 0;
940 client->total_cycles = 0; client->succ_pers_puts = 0;
941 client->succ_eph_gets = 0; client->succ_pers_gets = 0;
942 printk("ok\n");
943 return client;
944 }
/* unlink a client from the global list and free both its host-dependent
 * part and the client structure; caller (client_flush) has already
 * flushed and freed the client's pools */
946 static void client_free(client_t *client)
947 {
948 list_del(&client->client_list);
949 tmh_client_destroy(client->tmh);
950 tmh_free_infra(client);
951 }
953 /* flush all data from a client and, optionally, free it */
954 static void client_flush(client_t *client, bool_t destroy)
955 {
956 int i;
957 pool_t *pool;
959 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
960 {
961 if ( (pool = client->pools[i]) == NULL )
962 continue;
963 pool_flush(pool,client->cli_id,destroy);
964 if ( destroy )
965 client->pools[i] = NULL;
966 }
967 if ( destroy )
968 client_free(client);
969 }
971 static bool_t client_over_quota(client_t *client)
972 {
973 int total = _atomic_read(client_weight_total);
975 ASSERT(client != NULL);
976 if ( (total == 0) || (client->weight == 0) ||
977 (client->eph_count == 0) )
978 return 0;
979 return ( ((global_eph_count*100L) / client->eph_count ) >
980 ((total*100L) / client->weight) );
981 }
/* set or clear the client's frozen flag */
983 static void client_freeze(client_t *client, int freeze)
984 {
985 client->frozen = freeze;
986 }
988 /************ MEMORY REVOCATION ROUTINES *******************************/
/* Evict one ephemeral page to reclaim memory: prefer a page belonging
 * to the current client when it is over quota, else take the head of
 * the global ephemeral page list.  Returns 1 if a page was evicted,
 * 0 if nothing evictable was found.
 * NOTE(review): this hgweb rendering dropped blank and brace-only lines
 * (source line numbers are discontinuous), so the brace structure shown
 * below is incomplete — consult the original file before editing. */
990 static int tmem_evict(void)
992 client_t *client = tmh_client_from_current();
993 pgp_t *pgp = NULL, *pgp_del;
994 obj_t *obj;
995 pool_t *pool;
996 int ret = 0;
997 bool_t hold_pool_rwlock = 0;
999 evict_attempts++;
1000 tmem_spin_lock(&eph_lists_spinlock);
1001 if ( (client != NULL) && client_over_quota(client) &&
1002 !list_empty(&client->ephemeral_page_list) )
1004 list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
1006 obj = pgp->obj;
1007 pool = obj->pool;
/* dying pools are being torn down elsewhere; skip their pages */
1008 if ( pool->is_dying )
1009 continue;
1010 if ( tmh_lock_all && !obj->no_evict )
1011 goto found;
1012 if ( tmem_spin_trylock(&obj->obj_spinlock) )
1014 if ( obj->pgp_count > 1 )
1015 goto found;
/* last page in its object: also need the pool write lock so the
 * now-empty object can be freed */
1016 if ( tmem_write_trylock(&pool->pool_rwlock) )
1018 hold_pool_rwlock = 1;
1019 goto found;
1021 tmem_spin_unlock(&obj->obj_spinlock);
1024 } else if ( list_empty(&global_ephemeral_page_list) ) {
1025 goto out;
1026 } else {
/* same selection logic, but over the global ephemeral list */
1027 list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
1029 obj = pgp->obj;
1030 pool = obj->pool;
1031 if ( pool->is_dying )
1032 continue;
1033 if ( tmh_lock_all && !obj->no_evict )
1034 goto found;
1035 if ( tmem_spin_trylock(&obj->obj_spinlock) )
1037 if ( obj->pgp_count > 1 )
1038 goto found;
1039 if ( tmem_write_trylock(&pool->pool_rwlock) )
1041 hold_pool_rwlock = 1;
1042 goto found;
1044 tmem_spin_unlock(&obj->obj_spinlock);
1049 ret = 0;
1050 goto out;
1052 found:
1053 ASSERT(pgp != NULL);
1054 ASSERT_SENTINEL(pgp,PGD);
1055 obj = pgp->obj;
1056 ASSERT(obj != NULL);
1057 ASSERT(obj->no_evict == 0);
1058 ASSERT(obj->pool != NULL);
1059 ASSERT_SENTINEL(obj,OBJ);
1061 ASSERT_SPINLOCK(&obj->obj_spinlock);
/* detach the page from its object and free it; eph_lists_spinlock is
 * already held, hence the no_eph_lock=1 argument to pgp_delete */
1062 pgp_del = pgp_delete_from_obj(obj, pgp->index);
1063 ASSERT(pgp_del == pgp);
1064 pgp_delete(pgp,1);
1065 if ( obj->pgp_count == 0 )
1067 ASSERT_WRITELOCK(&pool->pool_rwlock);
1068 obj_free(obj,0);
1070 else
1071 tmem_spin_unlock(&obj->obj_spinlock);
1072 if ( hold_pool_rwlock )
1073 tmem_write_unlock(&pool->pool_rwlock);
1074 evicted_pgs++;
1075 ret = 1;
1077 out:
1078 tmem_spin_unlock(&eph_lists_spinlock);
1079 return ret;
/* Evict ephemeral pages until at least n pages are available to the
 * host, then release them.  Returns the number of pages actually made
 * available (which may be less than n if eviction stalls). */
static unsigned long tmem_relinquish_npages(unsigned long n)
{
    unsigned long avail_pages = 0;

    while ( (avail_pages = tmh_avail_pages()) < n )
    {
        /* Stop early if nothing more can be evicted. */
        if ( !tmem_evict() )
            break;
    }
    if ( avail_pages )
        tmh_release_avail_pages_to_host();
    return avail_pages;
}
/* Under certain conditions (e.g. if each client is putting pages for exactly
 * one object), once locks are held, freeing up memory may
 * result in livelocks and very long "put" times, so we try to ensure there
 * is a minimum amount of memory (1MB) available BEFORE any data structure
 * locks are held. */
static inline void tmem_ensure_avail_pages(void)
{
    int evict_budget = 10;

    /* Evict until >= 1MB free; give up after 10 consecutive failures
     * (the budget is only consumed by failed evictions). */
    while ( !tmh_free_mb() )
        if ( !tmem_evict() && evict_budget-- <= 0 )
            break;
}
1114 /************ TMEM CORE OPERATIONS ************************************/
1116 static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1117 void *cva)
1119 void *dst, *p;
1120 size_t size;
1121 int ret = 0;
1122 DECL_LOCAL_CYC_COUNTER(compress);
1124 ASSERT(pgp != NULL);
1125 ASSERT(pgp->obj != NULL);
1126 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
1127 ASSERT(pgp->obj->pool != NULL);
1128 ASSERT(pgp->obj->pool->client != NULL);
1129 #ifdef __i386__
1130 return -ENOMEM;
1131 #endif
1132 if ( pgp->pfp != NULL )
1133 pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
1134 START_CYC_COUNTER(compress);
1135 ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
1136 if ( (ret == -EFAULT) || (ret == 0) )
1137 goto out;
1138 else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
1139 ret = 0;
1140 else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
1141 ret = -ENOMEM;
1142 else
1144 memcpy(p,dst,size);
1145 pgp->cdata = p;
1146 pgp->size = size;
1147 pgp->obj->pool->client->compressed_pages++;
1148 pgp->obj->pool->client->compressed_sum_size += size;
1149 ret = 1;
1152 out:
1153 END_CYC_COUNTER(compress);
1154 return ret;
1157 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1158 uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
1160 pool_t *pool;
1161 obj_t *obj;
1162 client_t *client;
1163 pgp_t *pgpfound = NULL;
1164 int ret;
1166 ASSERT(pgp != NULL);
1167 ASSERT(pgp->pfp != NULL);
1168 ASSERT(pgp->size != -1);
1169 obj = pgp->obj;
1170 ASSERT_SPINLOCK(&obj->obj_spinlock);
1171 ASSERT(obj != NULL);
1172 pool = obj->pool;
1173 ASSERT(pool != NULL);
1174 client = pool->client;
1175 if ( client->live_migrating )
1176 goto failed_dup; /* no dups allowed when migrating */
1177 /* can we successfully manipulate pgp to change out the data? */
1178 if ( len != 0 && client->compress && pgp->size != 0 )
1180 ret = do_tmem_put_compress(pgp,cmfn,cva);
1181 if ( ret == 1 )
1182 goto done;
1183 else if ( ret == 0 )
1184 goto copy_uncompressed;
1185 else if ( ret == -ENOMEM )
1186 goto failed_dup;
1187 else if ( ret == -EFAULT )
1188 goto bad_copy;
1191 copy_uncompressed:
1192 if ( pgp->pfp )
1193 pgp_free_data(pgp, pool);
1194 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1195 goto failed_dup;
1196 /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
1197 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
1198 if ( ret == -EFAULT )
1199 goto bad_copy;
1200 pgp->size = 0;
1202 done:
1203 /* successfully replaced data, clean up and return success */
1204 if ( is_shared(pool) )
1205 obj->last_client = client->cli_id;
1206 obj->no_evict = 0;
1207 tmem_spin_unlock(&obj->obj_spinlock);
1208 pool->dup_puts_replaced++;
1209 pool->good_puts++;
1210 if ( is_persistent(pool) )
1211 client->succ_pers_puts++;
1212 return 1;
1214 bad_copy:
1215 /* this should only happen if the client passed a bad mfn */
1216 failed_copies++;
1217 ASSERT(0);
1218 return -EFAULT;
1220 failed_dup:
1221 /* couldn't change out the data, flush the old data and return
1222 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
1223 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1224 ASSERT(pgpfound == pgp);
1225 pgp_delete(pgpfound,0);
1226 if ( obj->pgp_count == 0 )
1228 tmem_write_lock(&pool->pool_rwlock);
1229 obj_free(obj,0);
1230 tmem_write_unlock(&pool->pool_rwlock);
1231 } else {
1232 obj->no_evict = 0;
1233 tmem_spin_unlock(&obj->obj_spinlock);
1235 pool->dup_puts_flushed++;
1236 return -ENOSPC;
1240 static NOINLINE int do_tmem_put(pool_t *pool,
1241 uint64_t oid, uint32_t index,
1242 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1243 uint32_t pfn_offset, uint32_t len, void *cva)
1245 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
1246 pgp_t *pgp = NULL, *pgpdel = NULL;
1247 client_t *client = pool->client;
1248 int ret = client->frozen ? -EFROZEN : -ENOMEM;
1250 ASSERT(pool != NULL);
1251 pool->puts++;
1252 /* does page already exist (dup)? if so, handle specially */
1253 if ( (obj = objfound = obj_find(pool,oid)) != NULL )
1255 ASSERT_SPINLOCK(&objfound->obj_spinlock);
1256 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
1257 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva);
1260 /* no puts allowed into a frozen pool (except dup puts) */
1261 if ( client->frozen )
1262 goto free;
1264 if ( (objfound == NULL) )
1266 tmem_write_lock(&pool->pool_rwlock);
1267 if ( (obj = objnew = obj_new(pool,oid)) == NULL )
1269 tmem_write_unlock(&pool->pool_rwlock);
1270 return -ENOMEM;
1272 ASSERT_SPINLOCK(&objnew->obj_spinlock);
1273 tmem_write_unlock(&pool->pool_rwlock);
1276 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
1277 ASSERT_SPINLOCK(&obj->obj_spinlock);
1278 if ( (pgp = pgp_alloc(obj)) == NULL )
1279 goto free;
1281 ret = pgp_add_to_obj(obj, index, pgp);
1282 if ( ret == -ENOMEM )
1283 /* warning, may result in partially built radix tree ("stump") */
1284 goto free;
1285 ASSERT(ret != -EEXIST);
1286 pgp->index = index;
1288 if ( len != 0 && client->compress )
1290 ASSERT(pgp->pfp == NULL);
1291 ret = do_tmem_put_compress(pgp,cmfn,cva);
1292 if ( ret == 1 )
1293 goto insert_page;
1294 if ( ret == -ENOMEM )
1296 client->compress_nomem++;
1297 goto delete_and_free;
1299 if ( ret == 0 )
1301 client->compress_poor++;
1302 goto copy_uncompressed;
1304 if ( ret == -EFAULT )
1305 goto bad_copy;
1308 copy_uncompressed:
1309 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1311 ret == -ENOMEM;
1312 goto delete_and_free;
1314 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
1315 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
1316 if ( ret == -EFAULT )
1317 goto bad_copy;
1318 pgp->size = 0;
1320 insert_page:
1321 if ( is_ephemeral(pool) )
1323 tmem_spin_lock(&eph_lists_spinlock);
1324 list_add_tail(&pgp->global_eph_pages,
1325 &global_ephemeral_page_list);
1326 if (++global_eph_count > global_eph_count_max)
1327 global_eph_count_max = global_eph_count;
1328 list_add_tail(&pgp->client_eph_pages,
1329 &client->ephemeral_page_list);
1330 if (++client->eph_count > client->eph_count_max)
1331 client->eph_count_max = client->eph_count;
1332 tmem_spin_unlock(&eph_lists_spinlock);
1333 } else { /* is_persistent */
1334 tmem_spin_lock(&pers_lists_spinlock);
1335 list_add_tail(&pgp->pool_pers_pages,
1336 &pool->persistent_page_list);
1337 tmem_spin_unlock(&pers_lists_spinlock);
1339 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
1340 if ( is_shared(pool) )
1341 obj->last_client = client->cli_id;
1342 obj->no_evict = 0;
1343 tmem_spin_unlock(&obj->obj_spinlock);
1344 pool->good_puts++;
1345 if ( is_persistent(pool) )
1346 client->succ_pers_puts++;
1347 return 1;
1349 delete_and_free:
1350 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1351 pgpdel = pgp_delete_from_obj(obj, pgp->index);
1352 ASSERT(pgp == pgpdel);
1354 free:
1355 if ( pgp )
1356 pgp_delete(pgp,0);
1357 if ( objfound )
1359 objfound->no_evict = 0;
1360 tmem_spin_unlock(&objfound->obj_spinlock);
1362 if ( objnew )
1364 tmem_write_lock(&pool->pool_rwlock);
1365 obj_free(objnew,0);
1366 tmem_write_unlock(&pool->pool_rwlock);
1368 pool->no_mem_puts++;
1369 return ret;
1371 bad_copy:
1372 /* this should only happen if the client passed a bad mfn */
1373 failed_copies++;
1374 ASSERT(0);
1375 goto free;
1378 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
1379 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1380 uint32_t pfn_offset, uint32_t len, void *cva)
1382 obj_t *obj;
1383 pgp_t *pgp;
1384 client_t *client = pool->client;
1385 DECL_LOCAL_CYC_COUNTER(decompress);
1387 if ( !_atomic_read(pool->pgp_count) )
1388 return -EEMPTY;
1390 pool->gets++;
1391 obj = obj_find(pool,oid);
1392 if ( obj == NULL )
1393 return 0;
1395 ASSERT_SPINLOCK(&obj->obj_spinlock);
1396 if (is_shared(pool) || is_persistent(pool) )
1397 pgp = pgp_lookup_in_obj(obj, index);
1398 else
1399 pgp = pgp_delete_from_obj(obj, index);
1400 if ( pgp == NULL )
1402 obj->no_evict = 0;
1403 tmem_spin_unlock(&obj->obj_spinlock);
1404 return 0;
1406 ASSERT(pgp->size != -1);
1407 if ( pgp->size != 0 )
1409 START_CYC_COUNTER(decompress);
1410 if ( tmh_decompress_to_client(cmfn, pgp->cdata,
1411 pgp->size, cva) == -EFAULT )
1412 goto bad_copy;
1413 END_CYC_COUNTER(decompress);
1415 else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
1416 pfn_offset, len, cva) == -EFAULT)
1417 goto bad_copy;
1418 if ( is_ephemeral(pool) )
1420 if ( is_private(pool) )
1422 pgp_delete(pgp,0);
1423 if ( obj->pgp_count == 0 )
1425 tmem_write_lock(&pool->pool_rwlock);
1426 obj_free(obj,0);
1427 obj = NULL;
1428 tmem_write_unlock(&pool->pool_rwlock);
1430 } else {
1431 tmem_spin_lock(&eph_lists_spinlock);
1432 list_del(&pgp->global_eph_pages);
1433 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1434 list_del(&pgp->client_eph_pages);
1435 list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
1436 tmem_spin_unlock(&eph_lists_spinlock);
1437 ASSERT(obj != NULL);
1438 obj->last_client = tmh_get_cli_id_from_current();
1441 if ( obj != NULL )
1443 obj->no_evict = 0;
1444 tmem_spin_unlock(&obj->obj_spinlock);
1446 pool->found_gets++;
1447 if ( is_ephemeral(pool) )
1448 client->succ_eph_gets++;
1449 else
1450 client->succ_pers_gets++;
1451 return 1;
1453 bad_copy:
1454 /* this should only happen if the client passed a bad mfn */
1455 failed_copies++;
1456 ASSERT(0);
1457 return -EFAULT;
1461 static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
1463 obj_t *obj;
1464 pgp_t *pgp;
1466 pool->flushs++;
1467 obj = obj_find(pool,oid);
1468 if ( obj == NULL )
1469 goto out;
1470 pgp = pgp_delete_from_obj(obj, index);
1471 if ( pgp == NULL )
1473 obj->no_evict = 0;
1474 tmem_spin_unlock(&obj->obj_spinlock);
1475 goto out;
1477 pgp_delete(pgp,0);
1478 if ( obj->pgp_count == 0 )
1480 tmem_write_lock(&pool->pool_rwlock);
1481 obj_free(obj,0);
1482 tmem_write_unlock(&pool->pool_rwlock);
1483 } else {
1484 obj->no_evict = 0;
1485 tmem_spin_unlock(&obj->obj_spinlock);
1487 pool->flushs_found++;
1489 out:
1490 if ( pool->client->frozen )
1491 return -EFROZEN;
1492 else
1493 return 1;
1496 static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
1498 obj_t *obj;
1500 pool->flush_objs++;
1501 obj = obj_find(pool,oid);
1502 if ( obj == NULL )
1503 goto out;
1504 tmem_write_lock(&pool->pool_rwlock);
1505 obj_destroy(obj,0);
1506 pool->flush_objs_found++;
1507 tmem_write_unlock(&pool->pool_rwlock);
1509 out:
1510 if ( pool->client->frozen )
1511 return -EFROZEN;
1512 else
1513 return 1;
1516 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
1518 client_t *client = tmh_client_from_current();
1519 pool_t *pool;
1521 if ( client->pools == NULL )
1522 return 0;
1523 if ( (pool = client->pools[pool_id]) == NULL )
1524 return 0;
1525 client->pools[pool_id] = NULL;
1526 pool_flush(pool,client->cli_id,1);
1527 return 1;
1530 static NOINLINE int do_tmem_new_pool(cli_id_t this_cli_id,
1531 uint32_t d_poolid, uint32_t flags,
1532 uint64_t uuid_lo, uint64_t uuid_hi)
1534 client_t *client;
1535 cli_id_t cli_id;
1536 int persistent = flags & TMEM_POOL_PERSIST;
1537 int shared = flags & TMEM_POOL_SHARED;
1538 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1539 & TMEM_POOL_PAGESIZE_MASK;
1540 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1541 & TMEM_POOL_VERSION_MASK;
1542 pool_t *pool, *shpool;
1543 int s_poolid, first_unused_s_poolid;
1544 int i;
1546 if ( this_cli_id == CLI_ID_NULL )
1547 cli_id = tmh_get_cli_id_from_current();
1548 else
1549 cli_id = this_cli_id;
1550 printk("tmem: allocating %s-%s tmem pool for %s=%d...",
1551 persistent ? "persistent" : "ephemeral" ,
1552 shared ? "shared" : "private", cli_id_str, cli_id);
1553 if ( specversion != TMEM_SPEC_VERSION )
1555 printk("failed... unsupported spec version\n");
1556 return -EPERM;
1558 if ( pagebits != (PAGE_SHIFT - 12) )
1560 printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
1561 return -EPERM;
1563 if ( (pool = pool_alloc()) == NULL )
1565 printk("failed... out of memory\n");
1566 return -ENOMEM;
1568 if ( this_cli_id != CLI_ID_NULL )
1570 if ( (client = tmh_client_from_cli_id(this_cli_id)) == NULL
1571 || d_poolid >= MAX_POOLS_PER_DOMAIN
1572 || client->pools[d_poolid] != NULL )
1573 goto fail;
1575 else
1577 client = tmh_client_from_current();
1578 ASSERT(client != NULL);
1579 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1580 if ( client->pools[d_poolid] == NULL )
1581 break;
1582 if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
1584 printk("failed... no more pool slots available for this %s\n",
1585 client_str);
1586 goto fail;
1589 if ( shared )
1591 if ( uuid_lo == -1L && uuid_hi == -1L )
1592 shared = 0;
1593 if ( client->shared_auth_required && !global_shared_auth )
1595 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1596 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1597 (client->shared_auth_uuid[i][1] == uuid_hi) )
1598 break;
1599 if ( i == MAX_GLOBAL_SHARED_POOLS )
1600 shared = 0;
1603 pool->shared = shared;
1604 pool->client = client;
1605 if ( shared )
1607 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1608 for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
1610 if ( (shpool = global_shared_pools[s_poolid]) != NULL )
1612 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1614 printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
1615 uuid_hi, uuid_lo);
1616 printk("pool_id=%d\n",d_poolid);
1617 client->pools[d_poolid] = global_shared_pools[s_poolid];
1618 shared_pool_join(global_shared_pools[s_poolid], client);
1619 pool_free(pool);
1620 if ( this_cli_id != CLI_ID_NULL )
1621 tmh_client_put(client->tmh);
1622 return d_poolid;
1625 else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1626 first_unused_s_poolid = s_poolid;
1628 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1630 printk("tmem: failed... no global shared pool slots available\n");
1631 goto fail;
1633 else
1635 INIT_LIST_HEAD(&pool->share_list);
1636 pool->shared_count = 0;
1637 global_shared_pools[first_unused_s_poolid] = pool;
1638 (void)shared_pool_join(pool,client);
1641 client->pools[d_poolid] = pool;
1642 if ( this_cli_id != CLI_ID_NULL )
1643 tmh_client_put(client->tmh);
1644 list_add_tail(&pool->pool_list, &global_pool_list);
1645 pool->pool_id = d_poolid;
1646 pool->persistent = persistent;
1647 pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
1648 printk("pool_id=%d\n",d_poolid);
1649 return d_poolid;
1651 fail:
1652 pool_free(pool);
1653 if ( this_cli_id != CLI_ID_NULL )
1654 tmh_client_put(client->tmh);
1655 return -EPERM;
1658 /************ TMEM CONTROL OPERATIONS ************************************/
1660 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
1661 static int tmemc_freeze_pools(cli_id_t cli_id, int arg)
1663 client_t *client;
1664 bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
1665 bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
1666 char *s;
1668 s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
1669 if ( cli_id == CLI_ID_NULL )
1671 list_for_each_entry(client,&global_client_list,client_list)
1672 client_freeze(client,freeze);
1673 printk("tmem: all pools %s for all %ss\n",s,client_str);
1675 else
1677 if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1678 return -1;
1679 client_freeze(client,freeze);
1680 tmh_client_put(client->tmh);
1681 printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
1683 return 0;
1686 static int tmemc_flush_mem(cli_id_t cli_id, uint32_t kb)
1688 uint32_t npages, flushed_pages, flushed_kb;
1690 if ( cli_id != CLI_ID_NULL )
1692 printk("tmem: %s-specific flush not supported yet, use --all\n",
1693 client_str);
1694 return -1;
1696 /* convert kb to pages, rounding up if necessary */
1697 npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
1698 flushed_pages = tmem_relinquish_npages(npages);
1699 flushed_kb = flushed_pages << (PAGE_SHIFT-10);
1700 return flushed_kb;
1703 /*
1704 * These tmemc_list* routines output lots of stats in a format that is
1705 * intended to be program-parseable, not human-readable. Further, by
1706 * tying each group of stats to a line format indicator (e.g. G= for
1707 * global stats) and each individual stat to a two-letter specifier
1708 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
1709 * global ephemeral pool), it should allow the stats reported to be
1710 * forward and backwards compatible as tmem evolves.
1711 */
1712 #define BSIZE 1024
1714 static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
1715 uint32_t len, bool_t use_long)
1717 char info[BSIZE];
1718 int i, n = 0, sum = 0;
1719 pool_t *p;
1720 bool_t s;
1722 n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d,"
1723 "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c",
1724 c->cli_id, c->weight, c->cap, c->compress, c->frozen,
1725 c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets,
1726 use_long ? ',' : '\n');
1727 if (use_long)
1728 n += scnprintf(info+n,BSIZE-n,
1729 "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
1730 c->eph_count, c->eph_count_max,
1731 c->compressed_pages, c->compressed_sum_size,
1732 c->compress_poor, c->compress_nomem);
1733 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1734 sum += n;
1735 for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
1737 if ( (p = c->pools[i]) == NULL )
1738 continue;
1739 s = is_shared(p);
1740 n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
1741 "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
1742 c->cli_id, p->pool_id,
1743 is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
1744 (uint64_t)(s ? p->uuid[0] : 0),
1745 (uint64_t)(s ? p->uuid[1] : 0LL),
1746 use_long ? ',' : '\n');
1747 if (use_long)
1748 n += scnprintf(info+n,BSIZE-n,
1749 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1750 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1751 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1752 _atomic_read(p->pgp_count), p->pgp_count_max,
1753 p->obj_count, p->obj_count_max,
1754 p->objnode_count, p->objnode_count_max,
1755 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1756 p->no_mem_puts,
1757 p->found_gets, p->gets,
1758 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1759 if ( sum + n >= len )
1760 return sum;
1761 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1762 sum += n;
1764 return sum;
1767 static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
1768 bool_t use_long)
1770 char info[BSIZE];
1771 int i, n = 0, sum = 0;
1772 pool_t *p;
1773 sharelist_t *sl;
1775 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1777 if ( (p = global_shared_pools[i]) == NULL )
1778 continue;
1779 n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
1780 i, is_persistent(p) ? 'P' : 'E',
1781 is_shared(p) ? 'S' : 'P',
1782 p->uuid[0], p->uuid[1]);
1783 list_for_each_entry(sl,&p->share_list, share_list)
1784 n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
1785 n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
1786 if (use_long)
1787 n += scnprintf(info+n,BSIZE-n,
1788 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1789 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1790 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1791 _atomic_read(p->pgp_count), p->pgp_count_max,
1792 p->obj_count, p->obj_count_max,
1793 p->objnode_count, p->objnode_count_max,
1794 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1795 p->no_mem_puts,
1796 p->found_gets, p->gets,
1797 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1798 if ( sum + n >= len )
1799 return sum;
1800 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1801 sum += n;
1803 return sum;
#ifdef TMEM_PERF
/* Emit the T= line of per-operation cycle-counter statistics.
 * Returns bytes emitted (0 if they would not fit in len). */
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                                  bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info+n,BSIZE-n,"T=");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
#else
#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
#endif
1842 static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
1843 bool_t use_long)
1845 char info[BSIZE];
1846 int n = 0, sum = off;
1848 n += scnprintf(info,BSIZE,"G="
1849 "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
1850 "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
1851 total_tmem_ops, errored_tmem_ops, failed_copies,
1852 alloc_failed, alloc_page_failed, tmh_avail_pages(),
1853 low_on_memory, evicted_pgs,
1854 evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
1855 total_flush_pool, use_long ? ',' : '\n');
1856 if (use_long)
1857 n += scnprintf(info+n,BSIZE-n,
1858 "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
1859 global_eph_count, global_eph_count_max,
1860 _atomic_read(global_obj_count), global_obj_count_max,
1861 _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
1862 _atomic_read(global_pgp_count), global_pgp_count_max);
1863 if ( sum + n >= len )
1864 return sum;
1865 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1866 sum += n;
1867 return sum;
1870 static int tmemc_list(cli_id_t cli_id, tmem_cli_va_t buf, uint32_t len,
1871 bool_t use_long)
1873 client_t *client;
1874 int off = 0;
1876 if ( cli_id == CLI_ID_NULL ) {
1877 off = tmemc_list_global(buf,0,len,use_long);
1878 off += tmemc_list_shared(buf,off,len-off,use_long);
1879 list_for_each_entry(client,&global_client_list,client_list)
1880 off += tmemc_list_client(client, buf, off, len-off, use_long);
1881 off += tmemc_list_global_perf(buf,off,len-off,use_long);
1883 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1884 return -1;
1885 else {
1886 off = tmemc_list_client(client, buf, 0, len, use_long);
1887 tmh_client_put(client->tmh);
1890 return 0;
1893 static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
1895 cli_id_t cli_id = client->cli_id;
1896 uint32_t old_weight;
1898 switch (subop)
1900 case TMEMC_SET_WEIGHT:
1901 old_weight = client->weight;
1902 client->weight = arg1;
1903 printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1904 atomic_sub(old_weight,&client_weight_total);
1905 atomic_add(client->weight,&client_weight_total);
1906 break;
1907 case TMEMC_SET_CAP:
1908 client->cap = arg1;
1909 printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1910 break;
1911 case TMEMC_SET_COMPRESS:
1912 #ifdef __i386__
1913 return -1;
1914 #endif
1915 client->compress = arg1 ? 1 : 0;
1916 printk("tmem: compression %s for %s=%d\n",
1917 arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
1918 break;
1919 default:
1920 printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
1921 return -1;
1923 return 0;
1926 static int tmemc_set_var(cli_id_t cli_id, uint32_t subop, uint32_t arg1)
1928 client_t *client;
1930 if ( cli_id == CLI_ID_NULL )
1931 list_for_each_entry(client,&global_client_list,client_list)
1932 tmemc_set_var_one(client, subop, arg1);
1933 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1934 return -1;
1935 else
1937 tmemc_set_var_one(client, subop, arg1);
1938 tmh_client_put(client->tmh);
1940 return 0;
1943 static NOINLINE int tmemc_shared_pool_auth(cli_id_t cli_id, uint64_t uuid_lo,
1944 uint64_t uuid_hi, bool_t auth)
1946 client_t *client;
1947 int i, free = -1;
1949 if ( cli_id == CLI_ID_NULL )
1951 global_shared_auth = auth;
1952 return 1;
1954 client = tmh_client_from_cli_id(cli_id);
1955 if ( client == NULL )
1956 return -EINVAL;
1957 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1959 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1960 (client->shared_auth_uuid[i][1] == uuid_hi) )
1962 if ( auth == 0 )
1963 client->shared_auth_uuid[i][0] =
1964 client->shared_auth_uuid[i][1] = -1L;
1965 tmh_client_put(client->tmh);
1966 return 1;
1968 if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) &&
1969 (client->shared_auth_uuid[i][1] == -1L) && (free == -1) )
1970 free = i;
1972 if ( auth == 0 )
1974 tmh_client_put(client->tmh);
1975 return 0;
1977 if ( auth == 1 && free == -1 )
1978 return -ENOMEM;
1979 client->shared_auth_uuid[free][0] = uuid_lo;
1980 client->shared_auth_uuid[free][1] = uuid_hi;
1981 tmh_client_put(client->tmh);
1982 return 1;
1985 static NOINLINE int tmemc_save_subop(int cli_id, uint32_t pool_id,
1986 uint32_t subop, tmem_cli_va_t buf, uint32_t arg1)
1988 client_t *client = tmh_client_from_cli_id(cli_id);
1989 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1990 ? NULL : client->pools[pool_id];
1991 uint32_t p;
1992 uint64_t *uuid;
1993 pgp_t *pgp, *pgp2;
1994 int rc = -1;
1996 switch(subop)
1998 case TMEMC_SAVE_BEGIN:
1999 if ( client == NULL )
2000 return 0;
2001 for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
2002 if ( client->pools[p] != NULL )
2003 break;
2004 if ( p == MAX_POOLS_PER_DOMAIN )
2006 rc = 0;
2007 break;
2009 client->was_frozen = client->frozen;
2010 client->frozen = 1;
2011 if ( arg1 != 0 )
2012 client->live_migrating = 1;
2013 rc = 1;
2014 break;
2015 case TMEMC_RESTORE_BEGIN:
2016 if ( client == NULL && (client = client_create(cli_id)) != NULL )
2017 return 1;
2018 break;
2019 case TMEMC_SAVE_GET_VERSION:
2020 rc = TMEM_SPEC_VERSION;
2021 break;
2022 case TMEMC_SAVE_GET_MAXPOOLS:
2023 rc = MAX_POOLS_PER_DOMAIN;
2024 break;
2025 case TMEMC_SAVE_GET_CLIENT_WEIGHT:
2026 rc = client->weight == -1 ? -2 : client->weight;
2027 break;
2028 case TMEMC_SAVE_GET_CLIENT_CAP:
2029 rc = client->cap == -1 ? -2 : client->cap;
2030 break;
2031 case TMEMC_SAVE_GET_CLIENT_FLAGS:
2032 rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
2033 (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
2034 break;
2035 case TMEMC_SAVE_GET_POOL_FLAGS:
2036 if ( pool == NULL )
2037 break;
2038 rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) |
2039 (pool->shared ? TMEM_POOL_SHARED : 0) |
2040 (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT);
2041 break;
2042 case TMEMC_SAVE_GET_POOL_NPAGES:
2043 if ( pool == NULL )
2044 break;
2045 rc = _atomic_read(pool->pgp_count);
2046 break;
2047 case TMEMC_SAVE_GET_POOL_UUID:
2048 if ( pool == NULL )
2049 break;
2050 uuid = (uint64_t *)buf.p;
2051 *uuid++ = pool->uuid[0];
2052 *uuid = pool->uuid[1];
2053 rc = 0;
2054 case TMEMC_SAVE_END:
2055 client->live_migrating = 0;
2056 if ( !list_empty(&client->persistent_invalidated_list) )
2057 list_for_each_entry_safe(pgp,pgp2,
2058 &client->persistent_invalidated_list, client_inv_pages)
2059 pgp_free_from_inv_list(client,pgp);
2060 client->frozen = client->was_frozen;
2061 rc = 0;
2063 if ( client )
2064 tmh_client_put(client->tmh);
2065 return rc;
2068 static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id,
2069 tmem_cli_va_t buf, uint32_t bufsize)
2071 client_t *client = tmh_client_from_cli_id(cli_id);
2072 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2073 ? NULL : client->pools[pool_id];
2074 pgp_t *pgp;
2075 int ret = 0;
2076 struct tmem_handle *h;
2077 unsigned int pagesize = 1 << (pool->pageshift+12);
2079 if ( pool == NULL || is_ephemeral(pool) )
2081 tmh_client_put(client->tmh);
2082 return -1;
2084 if ( bufsize < pagesize + sizeof(struct tmem_handle) )
2086 tmh_client_put(client->tmh);
2087 return -ENOMEM;
2090 tmem_spin_lock(&pers_lists_spinlock);
2091 if ( list_empty(&pool->persistent_page_list) )
2093 ret = -1;
2094 goto out;
2096 /* note: pool->cur_pgp is the pgp last returned by get_next_page */
2097 if ( pool->cur_pgp == NULL )
2099 /* process the first one */
2100 pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
2101 pgp_t,pool_pers_pages);
2102 } else if ( list_is_last(&pool->cur_pgp->pool_pers_pages,
2103 &pool->persistent_page_list) )
2105 /* already processed the last one in the list */
2106 ret = -1;
2107 goto out;
2109 pgp = list_entry((&pool->cur_pgp->pool_pers_pages)->next,
2110 pgp_t,pool_pers_pages);
2111 pool->cur_pgp = pgp;
2112 h = (struct tmem_handle *)buf.p;
2113 h->oid = pgp->obj->oid;
2114 h->index = pgp->index;
2115 buf.p = (void *)(h+1);
2116 ret = do_tmem_get(pool, h->oid, h->index,0,0,0,pagesize,buf.p);
2118 out:
2119 tmem_spin_unlock(&pers_lists_spinlock);
2120 tmh_client_put(client->tmh);
2121 return ret;
2124 static NOINLINE int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_t buf,
2125 uint32_t bufsize)
2127 client_t *client = tmh_client_from_cli_id(cli_id);
2128 pgp_t *pgp;
2129 struct tmem_handle *h;
2130 int ret = 0;
2132 if ( client == NULL )
2133 return 0;
2134 if ( bufsize < sizeof(struct tmem_handle) )
2136 tmh_client_put(client->tmh);
2137 return 0;
2139 tmem_spin_lock(&pers_lists_spinlock);
2140 if ( list_empty(&client->persistent_invalidated_list) )
2141 goto out;
2142 if ( client->cur_pgp == NULL )
2144 pgp = list_entry((&client->persistent_invalidated_list)->next,
2145 pgp_t,client_inv_pages);
2146 client->cur_pgp = pgp;
2147 } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
2148 &client->persistent_invalidated_list) )
2150 client->cur_pgp = NULL;
2151 ret = 0;
2152 goto out;
2153 } else {
2154 pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
2155 pgp_t,client_inv_pages);
2156 client->cur_pgp = pgp;
2158 h = (struct tmem_handle *)buf.p;
2159 h->pool_id = pgp->pool_id;
2160 h->oid = pgp->inv_oid;
2161 h->index = pgp->index;
2162 ret = 1;
2163 out:
2164 tmem_spin_unlock(&pers_lists_spinlock);
2165 tmh_client_put(client->tmh);
2166 return ret;
2169 static int tmemc_restore_put_page(int cli_id, int pool_id, uint64_t oid,
2170 uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
2172 client_t *client = tmh_client_from_cli_id(cli_id);
2173 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2174 ? NULL : client->pools[pool_id];
2175 int rc = pool ? do_tmem_put(pool,oid,index,0,0,0,bufsize,buf.p) : -1;
2177 if ( client )
2178 tmh_client_put(client->tmh);
2179 return rc;
2182 static int tmemc_restore_flush_page(int cli_id, int pool_id, uint64_t oid,
2183 uint32_t index)
2185 client_t *client = tmh_client_from_cli_id(cli_id);
2186 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
2187 ? NULL : client->pools[pool_id];
2188 int rc = pool ? do_tmem_flush_page(pool, oid, index) : -1;
2190 if ( client )
2191 tmh_client_put(client->tmh);
2192 return rc;
/*
 * Dispatch a TMEM_CONTROL subop to its handler.  Called from do_tmem_op
 * with the tmem write lock held.  Returns the handler's result, or -1
 * for an unrecognized subop.
 */
static NOINLINE int do_tmem_control(struct tmem_op *op)
{
    int ret;
    uint32_t pool_id = op->pool_id;
    uint32_t subop = op->u.ctrl.subop;

    /* NOTE(review): the privilege check is deliberately toothless — an
     * unprivileged caller falls through and is still dispatched.  The
     * author's comment below records why; confirm before re-enabling. */
    if (!tmh_current_is_privileged())
    {
        /* don't fail... mystery: sometimes dom0 fails here */
        /* return -EPERM; */
    }
    switch(subop)
    {
    case TMEMC_THAW:
    case TMEMC_FREEZE:
    case TMEMC_DESTROY:
        ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
        break;
    case TMEMC_FLUSH:
        ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
        break;
    case TMEMC_LIST:
        ret = tmemc_list(op->u.ctrl.cli_id,op->u.ctrl.buf,
                         op->u.ctrl.arg1,op->u.ctrl.arg2);
        break;
    case TMEMC_SET_WEIGHT:
    case TMEMC_SET_CAP:
    case TMEMC_SET_COMPRESS:
        ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
        break;
    case TMEMC_QUERY_FREEABLE_MB:
        /* convert freeable page count to MiB */
        ret = tmh_freeable_pages() >> (20 - PAGE_SHIFT);
        break;
    /* save/restore bookkeeping subops all share one handler */
    case TMEMC_SAVE_BEGIN:
    case TMEMC_RESTORE_BEGIN:
    case TMEMC_SAVE_GET_VERSION:
    case TMEMC_SAVE_GET_MAXPOOLS:
    case TMEMC_SAVE_GET_CLIENT_WEIGHT:
    case TMEMC_SAVE_GET_CLIENT_CAP:
    case TMEMC_SAVE_GET_CLIENT_FLAGS:
    case TMEMC_SAVE_GET_POOL_FLAGS:
    case TMEMC_SAVE_GET_POOL_NPAGES:
    case TMEMC_SAVE_GET_POOL_UUID:
    case TMEMC_SAVE_END:
        ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
                               op->u.ctrl.buf,op->u.ctrl.arg1);
        break;
    case TMEMC_SAVE_GET_NEXT_PAGE:
        ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
                                       op->u.ctrl.buf, op->u.ctrl.arg1);
        break;
    case TMEMC_SAVE_GET_NEXT_INV:
        ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, op->u.ctrl.buf,
                                      op->u.ctrl.arg1);
        break;
    case TMEMC_RESTORE_PUT_PAGE:
        /* arg3 carries the oid, arg2 the index */
        ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
                                     op->u.ctrl.arg3, op->u.ctrl.arg2,
                                     op->u.ctrl.buf, op->u.ctrl.arg1);
        break;
    case TMEMC_RESTORE_FLUSH_PAGE:
        ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
                                       op->u.ctrl.arg3, op->u.ctrl.arg2);
        break;
    default:
        ret = -1;
    }
    return ret;
}
2265 /************ EXPORTed FUNCTIONS **************************************/
/*
 * Main tmem hypercall entry point.  Copies the tmem_op from the guest,
 * takes the appropriate global lock (a single spinlock when tmh_lock_all
 * is set, otherwise the rwlock in read or write mode depending on the
 * op), lazily creates the per-client structure on first use, and
 * dispatches to the matching do_tmem_* handler.  Returns the handler's
 * result, or a negative errno for setup failures.
 */
EXPORT long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    client_t *client = tmh_client_from_current();
    pool_t *pool = NULL;
    int rc = 0;
    /* exactly one of these flags is set per data-path op, selecting
     * which cycle counter is stopped at "out:" */
    bool_t succ_get = 0, succ_put = 0;
    bool_t non_succ_get = 0, non_succ_put = 0;
    bool_t flush = 0, flush_obj = 0;
    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
    DECL_LOCAL_CYC_COUNTER(succ_get);
    DECL_LOCAL_CYC_COUNTER(succ_put);
    DECL_LOCAL_CYC_COUNTER(non_succ_get);
    DECL_LOCAL_CYC_COUNTER(non_succ_put);
    DECL_LOCAL_CYC_COUNTER(flush);
    DECL_LOCAL_CYC_COUNTER(flush_obj);

    if ( !tmem_initialized )
        return -ENODEV;

    total_tmem_ops++;

    /* global-spinlock mode: serialize everything up front
     * (irq-disabling variant when tmh_lock_all > 1) */
    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_lock_irq(&tmem_spinlock);
        else
            spin_lock(&tmem_spinlock);
    }

    START_CYC_COUNTER(succ_get);
    DUP_START_CYC_COUNTER(succ_put,succ_get);
    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
    DUP_START_CYC_COUNTER(flush,succ_get);
    DUP_START_CYC_COUNTER(flush_obj,succ_get);

    /* refuse new work for a client being torn down */
    if ( client != NULL && tmh_client_is_dying(client) )
    {
        rc = -ENODEV;
        goto out;
    }

    /* copy the op descriptor in from guest memory */
    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
    {
        printk("tmem: can't get tmem struct from %s\n",client_str);
        rc = -EFAULT;
        goto out;
    }

    /* control, auth and restore-new ops always take the write lock
     * and return directly from here */
    if ( op.cmd == TMEM_CONTROL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_control(&op);
        goto out;
    } else if ( op.cmd == TMEM_AUTH ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = tmemc_shared_pool_auth(op.u.new.arg1,op.u.new.uuid[0],
                           op.u.new.uuid[1],op.u.new.flags);
        goto out;
    } else if ( op.cmd == TMEM_RESTORE_NEW ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_new_pool(op.u.new.arg1, op.pool_id, op.u.new.flags,
                         op.u.new.uuid[0], op.u.new.uuid[1]);
        goto out;
    }

    /* create per-client tmem structure dynamically on first use by client */
    if ( client == NULL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL )
        {
            printk("tmem: can't create tmem structure for %s\n",client_str);
            rc = -ENOMEM;
            goto out;
        }
    }

    /* pool create/destroy need the write lock; everything else runs
     * under the read lock and requires an existing pool */
    if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
    {
        if ( !tmem_write_lock_set )
        {
            tmem_write_lock(&tmem_rwlock);
            tmem_write_lock_set = 1;
        }
    }
    else
    {
        if ( !tmem_write_lock_set )
        {
            tmem_read_lock(&tmem_rwlock);
            tmem_read_lock_set = 1;
        }
        /* cast guards against a negative pool_id from the guest */
        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
             ((pool = client->pools[op.pool_id]) == NULL) )
        {
            rc = -ENODEV;
            printk("tmem: operation requested on uncreated pool\n");
            goto out;
        }
        ASSERT_SENTINEL(pool,POOL);
    }

    switch ( op.cmd )
    {
    case TMEM_NEW_POOL:
        rc = do_tmem_new_pool(CLI_ID_NULL, 0, op.u.new.flags,
                              op.u.new.uuid[0], op.u.new.uuid[1]);
        break;
    case TMEM_NEW_PAGE:
        tmem_ensure_avail_pages();
        /* zero-length put: creates the page without copying data */
        rc = do_tmem_put(pool, op.u.gen.object,
                         op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
        break;
    case TMEM_PUT_PAGE:
        tmem_ensure_avail_pages();
        rc = do_tmem_put(pool, op.u.gen.object,
                         op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
        if (rc == 1) succ_put = 1;
        else non_succ_put = 1;
        break;
    case TMEM_GET_PAGE:
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, PAGE_SIZE, 0);
        if (rc == 1) succ_get = 1;
        else non_succ_get = 1;
        break;
    case TMEM_FLUSH_PAGE:
        flush = 1;
        rc = do_tmem_flush_page(pool, op.u.gen.object, op.u.gen.index);
        break;
    case TMEM_FLUSH_OBJECT:
        rc = do_tmem_flush_object(pool, op.u.gen.object);
        flush_obj = 1;
        break;
    case TMEM_DESTROY_POOL:
        flush = 1;
        rc = do_tmem_destroy_pool(op.pool_id);
        break;
    case TMEM_READ:
        /* partial-page read at (tmem_offset, pfn_offset, len) */
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len,0);
        break;
    case TMEM_WRITE:
        /* partial-page write at (tmem_offset, pfn_offset, len) */
        rc = do_tmem_put(pool, op.u.gen.object,
                         op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len, NULL);
        break;
    case TMEM_XCHG:
        /* need to hold global lock to ensure xchg is atomic */
        printk("tmem_xchg op not implemented yet\n");
        rc = 0;
        break;
    default:
        printk("tmem: op %d not implemented\n", op.cmd);
        rc = 0;
        break;
    }

out:
    if ( rc < 0 )
        errored_tmem_ops++;
    /* stop exactly one cycle counter, matching the op that ran */
    if ( succ_get )
        END_CYC_COUNTER_CLI(succ_get,client);
    else if ( succ_put )
        END_CYC_COUNTER_CLI(succ_put,client);
    else if ( non_succ_get )
        END_CYC_COUNTER_CLI(non_succ_get,client);
    else if ( non_succ_put )
        END_CYC_COUNTER_CLI(non_succ_put,client);
    else if ( flush )
        END_CYC_COUNTER_CLI(flush,client);
    else if ( flush_obj )
        END_CYC_COUNTER_CLI(flush_obj,client);

    /* drop whichever lock was taken above; one of them must be held */
    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_unlock_irq(&tmem_spinlock);
        else
            spin_unlock(&tmem_spinlock);
    } else {
        if ( tmem_write_lock_set )
            write_unlock(&tmem_rwlock);
        else if ( tmem_read_lock_set )
            read_unlock(&tmem_rwlock);
        else
            ASSERT(0);
    }

    return rc;
}
2467 /* this should be called when the host is destroying a client */
2468 EXPORT void tmem_destroy(void *v)
2470 client_t *client = (client_t *)v;
2472 if ( client == NULL )
2473 return;
2475 if ( !tmh_client_is_dying(client) )
2477 printk("tmem: tmem_destroy can only destroy dying client\n");
2478 return;
2481 if ( tmh_lock_all )
2482 spin_lock(&tmem_spinlock);
2483 else
2484 write_lock(&tmem_rwlock);
2486 printk("tmem: flushing tmem pools for %s=%d\n",
2487 cli_id_str, client->cli_id);
2488 client_flush(client, 1);
2490 if ( tmh_lock_all )
2491 spin_unlock(&tmem_spinlock);
2492 else
2493 write_unlock(&tmem_rwlock);
2496 /* freezing all pools guarantees that no additional memory will be consumed */
2497 EXPORT void tmem_freeze_all(unsigned char key)
2499 static int freeze = 0;
2501 if ( tmh_lock_all )
2502 spin_lock(&tmem_spinlock);
2503 else
2504 write_lock(&tmem_rwlock);
2506 freeze = !freeze;
2507 tmemc_freeze_pools(CLI_ID_NULL,freeze);
2509 if ( tmh_lock_all )
2510 spin_unlock(&tmem_spinlock);
2511 else
2512 write_unlock(&tmem_rwlock);
2515 #define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
2517 EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2519 pfp_t *pfp;
2520 unsigned long evicts_per_relinq = 0;
2521 int max_evictions = 10;
2523 if (!tmh_enabled() || !tmh_freeable_pages())
2524 return NULL;
2525 #ifdef __i386__
2526 return NULL;
2527 #endif
2529 relinq_attempts++;
2530 if ( order > 0 )
2532 #ifndef NDEBUG
2533 printk("tmem_relinquish_page: failing order=%d\n", order);
2534 #endif
2535 return NULL;
2538 if ( tmh_called_from_tmem(memflags) )
2540 if ( tmh_lock_all )
2541 spin_lock(&tmem_spinlock);
2542 else
2543 read_lock(&tmem_rwlock);
2546 while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
2548 if ( (max_evictions-- <= 0) || !tmem_evict())
2549 break;
2550 evicts_per_relinq++;
2552 if ( evicts_per_relinq > max_evicts_per_relinq )
2553 max_evicts_per_relinq = evicts_per_relinq;
2554 tmh_scrub_page(pfp, memflags);
2555 if ( pfp != NULL )
2556 relinq_pgs++;
2558 if ( tmh_called_from_tmem(memflags) )
2560 if ( tmh_lock_all )
2561 spin_unlock(&tmem_spinlock);
2562 else
2563 read_unlock(&tmem_rwlock);
2566 return pfp;
2569 /* called at hypervisor startup */
2570 EXPORT void init_tmem(void)
2572 if ( !tmh_enabled() )
2573 return;
2575 radix_tree_init();
2576 if ( tmh_init() )
2578 printk("tmem: initialized comp=%d global-lock=%d\n",
2579 tmh_compression_enabled(), tmh_lock_all);
2580 tmem_initialized = 1;
2582 else
2583 printk("tmem: initialization FAILED\n");
2586 /*
2587 * Local variables:
2588 * mode: C
2589 * c-set-style: "BSD"
2590 * c-basic-offset: 4
2591 * tab-width: 4
2592 * indent-tabs-mode: nil
2593 * End:
2594 */