
view xen/common/tmem.c @ 19990:1033c6cdec62

tmem: No noise when disabled and not configured

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author   Keir Fraser <keir.fraser@citrix.com>
date     Thu Jul 16 08:30:23 2009 +0100
parents  ef67f5916453
children c98fd816db85
1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
9 /* TODO list: 090129
10 - improve on reclamation policy
11 - use different tlsf pools for each client (maybe each pool)
12 - implement page accounting and minimal QoS limits
13 - test shared access more completely (need pv cluster fs)
14 - add feedback-driven compression (not for persistent pools though!)
15 - add data-structure total bytes overhead stats
16 */
18 #ifdef __XEN__
19 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
20 #endif
22 #include <xen/tmem.h>
23 #include <xen/rbtree.h>
24 #include <xen/radix-tree.h>
25 #include <xen/list.h>
27 #define EXPORT /* indicates code other modules are dependent upon */
28 #define FORWARD
30 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
32 #define CLI_ID_NULL TMH_CLI_ID_NULL
33 #define cli_id_str tmh_cli_id_str
34 #define client_str tmh_client_str
36 /************ DEBUG and STATISTICS (+ some compression testing) *******/
38 #ifndef NDEBUG
39 #define SENTINELS
40 #define NOINLINE noinline
41 #else
42 #define NOINLINE
43 #endif
45 #ifdef SENTINELS
46 #define DECL_SENTINEL unsigned long sentinel;
47 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
48 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
49 #define ASSERT_SENTINEL(_x,_y) \
50 ASSERT(_x->sentinel != ~_y##_SENTINEL); ASSERT(_x->sentinel == _y##_SENTINEL)
51 #ifdef __i386__
52 #define POOL_SENTINEL 0x87658765
53 #define OBJ_SENTINEL 0x12345678
54 #define OBJNODE_SENTINEL 0xfedcba09
55 #define PGD_SENTINEL 0x43214321
56 #else
57 #define POOL_SENTINEL 0x8765876587658765
58 #define OBJ_SENTINEL 0x1234567812345678
59 #define OBJNODE_SENTINEL 0xfedcba0987654321
60 #define PGD_SENTINEL 0x4321432143214321
61 #endif
62 #else
63 #define DECL_SENTINEL
64 #define SET_SENTINEL(_x,_y) do { } while (0)
65 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
66 #define INVERT_SENTINEL(_x,_y) do { } while (0)
67 #endif
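/*
 * Illustration (not from the original source) of the sentinel discipline
 * above, using a hypothetical structure: debug builds stamp a magic value
 * at allocation, invert it at free, and assert it on every access, so a
 * stale or corrupted pointer trips an ASSERT instead of reading garbage.
 *
 *   struct widget {
 *       DECL_SENTINEL
 *       int payload;
 *   };
 *   #define WIDGET_SENTINEL 0x57ac57ac
 *
 *   static void widget_init(struct widget *w)
 *   {
 *       SET_SENTINEL(w,WIDGET);      // w->sentinel = WIDGET_SENTINEL
 *   }
 *   static void widget_use(struct widget *w)
 *   {
 *       ASSERT_SENTINEL(w,WIDGET);   // catches both junk and freed memory
 *       w->payload++;
 *   }
 *   static void widget_done(struct widget *w)
 *   {
 *       INVERT_SENTINEL(w,WIDGET);   // w->sentinel = ~WIDGET_SENTINEL
 *   }
 */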
69 /* global statistics (none need to be locked) */
70 static unsigned long total_tmem_ops = 0;
71 static unsigned long errored_tmem_ops = 0;
72 static unsigned long total_flush_pool = 0;
73 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
74 static unsigned long evicted_pgs = 0, evict_attempts = 0;
75 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
76 static unsigned long max_evicts_per_relinq = 0;
77 static unsigned long low_on_memory = 0;
78 static int global_obj_count_max = 0;
79 static int global_pgp_count_max = 0;
80 static int global_page_count_max = 0;
81 static int global_rtree_node_count_max = 0;
82 static long global_eph_count_max = 0;
83 static unsigned long failed_copies;
85 DECL_CYC_COUNTER(succ_get);
86 DECL_CYC_COUNTER(succ_put);
87 DECL_CYC_COUNTER(non_succ_get);
88 DECL_CYC_COUNTER(non_succ_put);
89 DECL_CYC_COUNTER(flush);
90 DECL_CYC_COUNTER(flush_obj);
91 #ifdef COMPARE_COPY_PAGE_SSE2
92 EXTERN_CYC_COUNTER(pg_copy1);
93 EXTERN_CYC_COUNTER(pg_copy2);
94 EXTERN_CYC_COUNTER(pg_copy3);
95 EXTERN_CYC_COUNTER(pg_copy4);
96 #else
97 EXTERN_CYC_COUNTER(pg_copy);
98 #endif
99 DECL_CYC_COUNTER(compress);
100 DECL_CYC_COUNTER(decompress);
102 /************ CORE DATA STRUCTURES ************************************/
104 #define MAX_POOLS_PER_DOMAIN 16
105 #define MAX_GLOBAL_SHARED_POOLS 16
107 struct tm_pool;
108 struct client {
109 struct list_head client_list;
110 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
111 tmh_client_t *tmh;
112 struct list_head ephemeral_page_list;
113 long eph_count, eph_count_max;
114 cli_id_t cli_id;
115 uint32_t weight;
116 uint32_t cap;
117 bool_t compress;
118 bool_t frozen;
119 unsigned long compress_poor, compress_nomem;
120 unsigned long compressed_pages;
121 uint64_t compressed_sum_size;
122 uint64_t total_cycles;
123 unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
124 };
125 typedef struct client client_t;
127 struct share_list {
128 struct list_head share_list;
129 client_t *client;
130 };
131 typedef struct share_list sharelist_t;
133 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
134 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
135 #define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
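/*
 * OBJ_HASH_BUCKETS is a power of two, so the AND with
 * OBJ_HASH_BUCKETS_MASK is a cheap equivalent of a modulo.
 * Sketch with a made-up oid:
 *
 *   uint64_t oid = 0x123456789abcdef0ULL;
 *   int bucket = OBJ_HASH(oid);   // always in [0, OBJ_HASH_BUCKETS-1]
 *   ASSERT(bucket == (tmh_hash(oid, BITS_PER_LONG) % OBJ_HASH_BUCKETS));
 */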
137 struct tm_pool {
138 bool_t shared;
139 bool_t persistent;
140 struct list_head pool_list; /* FIXME do we need this anymore? */
141 client_t *client;
142 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
143 uint32_t pool_id;
144 rwlock_t pool_rwlock;
145 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
146 struct list_head share_list; /* valid if shared */
147 DECL_SENTINEL
148 int shared_count; /* valid if shared */
149 atomic_t pgp_count;
150 int pgp_count_max;
151 long obj_count; /* atomicity depends on pool_rwlock held for write */
152 long obj_count_max;
153 unsigned long objnode_count, objnode_count_max;
154 uint64_t sum_life_cycles;
155 uint64_t sum_evicted_cycles;
156 unsigned long puts, good_puts, no_mem_puts;
157 unsigned long dup_puts_flushed, dup_puts_replaced;
158 unsigned long gets, found_gets;
159 unsigned long flushs, flushs_found;
160 unsigned long flush_objs, flush_objs_found;
161 };
162 typedef struct tm_pool pool_t;
164 #define is_persistent(_p) (_p->persistent)
165 #define is_ephemeral(_p) (!(_p->persistent))
166 #define is_shared(_p) (_p->shared)
167 #define is_private(_p) (!(_p->shared))
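/*
 * The predicates above split every pool along two independent axes,
 * persistence and sharing; each pair is an exact complement (sketch,
 * hypothetical pool_t *p):
 *
 *   ASSERT(is_ephemeral(p) == !is_persistent(p));
 *   ASSERT(is_private(p) == !is_shared(p));
 *
 * Ephemeral pools may lose pages to eviction at any time; shared pools
 * track the most recent writer in obj->last_client.
 */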
169 struct tmem_object_root {
170 DECL_SENTINEL
171 uint64_t oid;
172 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
173 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
174 long pgp_count; /* atomicity depends on obj_spinlock */
175 struct radix_tree_root tree_root; /* tree of pages within object */
176 pool_t *pool;
177 cli_id_t last_client;
178 spinlock_t obj_spinlock;
179 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
180 };
181 typedef struct tmem_object_root obj_t;
183 typedef struct radix_tree_node rtn_t;
184 struct tmem_object_node {
185 obj_t *obj;
186 DECL_SENTINEL
187 rtn_t rtn;
188 };
189 typedef struct tmem_object_node objnode_t;
191 struct tmem_page_descriptor {
192 struct list_head global_eph_pages;
193 struct list_head client_eph_pages;
194 obj_t *obj;
195 uint32_t index;
196 size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
197 union {
198 pfp_t *pfp; /* page frame pointer */
199 char *cdata; /* compressed data */
200 };
201 uint64_t timestamp;
202 DECL_SENTINEL
203 };
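/*
 * pgp->size doubles as the compression flag: 0 means pfp points at a
 * whole uncompressed page, any other value is the byte length of the
 * compressed cdata buffer (the two share the union above). Sketch of a
 * reader, mirroring do_tmem_get() below:
 *
 *   if ( pgp->size != 0 )
 *       tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size);
 *   else
 *       tmh_copy_to_client(cmfn, pgp->pfp, 0, 0, PAGE_SIZE);
 */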
204 typedef struct tmem_page_descriptor pgp_t;
206 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
208 static LIST_HEAD(global_client_list);
209 static LIST_HEAD(global_pool_list);
211 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
212 static atomic_t client_weight_total = ATOMIC_INIT(0);
213 static int tmem_initialized = 0;
215 /************ CONCURRENCY ***********************************************/
217 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
218 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
219 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
221 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
222 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
223 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
224 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
225 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
226 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
227 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
228 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
230 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
231 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
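/*
 * The wrappers above let the same code run under two locking regimes:
 * when tmh_lock_all is set, one global lock (tmem_spinlock) covers
 * everything and the fine-grained ops compile down to no-ops; when it
 * is clear, each lock is real. Illustrative caller:
 *
 *   tmem_spin_lock(&eph_lists_spinlock);    // no-op if tmh_lock_all
 *   ASSERT_SPINLOCK(&eph_lists_spinlock);   // holds in either mode
 *   ... manipulate the ephemeral lists ...
 *   tmem_spin_unlock(&eph_lists_spinlock);
 */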
233 /* global counters (should use long_atomic_t access) */
234 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
235 static atomic_t global_obj_count = ATOMIC_INIT(0);
236 static atomic_t global_pgp_count = ATOMIC_INIT(0);
237 static atomic_t global_page_count = ATOMIC_INIT(0);
238 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
240 #define atomic_inc_and_max(_c) do { \
241 atomic_inc(&_c); \
242 if ( _atomic_read(_c) > _c##_max ) \
243 _c##_max = _atomic_read(_c); \
244 } while (0)
246 #define atomic_dec_and_assert(_c) do { \
247 atomic_dec(&_c); \
248 ASSERT(_atomic_read(_c) >= 0); \
249 } while (0)
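/*
 * atomic_inc_and_max relies on token pasting: passing global_pgp_count
 * also names its high-water mark global_pgp_count_max. Any counter pair
 * following that naming convention works (hypothetical example):
 *
 *   static atomic_t widget_count = ATOMIC_INIT(0);
 *   static int widget_count_max = 0;
 *
 *   atomic_inc_and_max(widget_count);    // records a new peak if hit
 *   atomic_dec_and_assert(widget_count); // traps if it goes negative
 */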
252 /************ MEMORY ALLOCATION INTERFACE *****************************/
254 #define tmem_malloc(_type,_pool) \
255 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
257 #define tmem_malloc_bytes(_size,_pool) \
258 _tmem_malloc(_size, 1, _pool)
260 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
261 {
262 void *v;
264 if ( (pool != NULL) && is_persistent(pool) )
265 v = tmh_alloc_subpage_thispool(pool,size,align);
266 else
267 v = tmh_alloc_subpage(pool, size, align);
268 if ( v == NULL )
269 alloc_failed++;
270 return v;
271 }
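/*
 * tmem_malloc passes __alignof__(_type), so sub-page allocations come
 * back naturally aligned, and persistent pools are charged for their own
 * metadata while ephemeral allocations draw from the shared heap.
 * Sketch with a hypothetical type:
 *
 *   typedef struct { uint64_t key; uint32_t val; } thing_t;
 *   thing_t *t = tmem_malloc(thing_t, pool);  // 8-byte aligned
 *   if ( t != NULL )
 *       tmem_free(t, sizeof(thing_t), pool);  // same pool for accounting
 */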
273 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
274 {
275 if ( pool == NULL || !is_persistent(pool) )
276 tmh_free_subpage(p,size);
277 else
278 tmh_free_subpage_thispool(pool,p,size);
279 }
281 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
282 {
283 pfp_t *pfp = NULL;
285 if ( pool != NULL && is_persistent(pool) )
286 pfp = tmh_alloc_page_thispool(pool);
287 else
288 pfp = tmh_alloc_page(pool,0);
289 if ( pfp == NULL )
290 alloc_page_failed++;
291 else
292 atomic_inc_and_max(global_page_count);
293 return pfp;
294 }
296 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
297 {
298 ASSERT(pfp);
299 if ( pool == NULL || !is_persistent(pool) )
300 tmh_free_page(pfp);
301 else
302 tmh_free_page_thispool(pool,pfp);
303 atomic_dec_and_assert(global_page_count);
304 }
306 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
308 /* allocate a pgp_t and associate it with an object */
309 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
310 {
311 pgp_t *pgp;
312 pool_t *pool;
314 ASSERT(obj != NULL);
315 ASSERT(obj->pool != NULL);
316 pool = obj->pool;
317 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
318 return NULL;
319 pgp->obj = obj;
320 INIT_LIST_HEAD(&pgp->global_eph_pages);
321 INIT_LIST_HEAD(&pgp->client_eph_pages);
322 pgp->pfp = NULL;
323 pgp->size = -1;
324 pgp->index = -1;
325 pgp->timestamp = get_cycles();
326 SET_SENTINEL(pgp,PGD);
327 atomic_inc_and_max(global_pgp_count);
328 atomic_inc_and_max(pool->pgp_count);
329 return pgp;
330 }
332 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
333 {
334 ASSERT(obj != NULL);
335 ASSERT_SPINLOCK(&obj->obj_spinlock);
336 ASSERT_SENTINEL(obj,OBJ);
337 ASSERT(obj->pool != NULL);
338 ASSERT_SENTINEL(obj->pool,POOL);
339 return radix_tree_lookup(&obj->tree_root, index);
340 }
342 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
343 {
344 if ( pgp->pfp == NULL )
345 return;
346 if ( !pgp->size )
347 tmem_page_free(pgp->obj->pool,pgp->pfp);
348 else
349 {
350 tmem_free(pgp->cdata,pgp->size,pool);
351 if ( pool != NULL )
352 {
353 pool->client->compressed_pages--;
354 pool->client->compressed_sum_size -= pgp->size;
355 }
356 }
357 pgp->pfp = NULL;
358 pgp->size = -1;
359 }
361 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
362 {
363 pool_t *pool = NULL;
365 ASSERT_SENTINEL(pgp,PGD);
366 ASSERT(pgp->obj != NULL);
367 ASSERT_SENTINEL(pgp->obj,OBJ);
368 ASSERT_SENTINEL(pgp->obj->pool,POOL);
369 ASSERT(list_empty(&pgp->global_eph_pages));
370 ASSERT(list_empty(&pgp->client_eph_pages));
371 if ( from_delete )
372 ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
373 ASSERT(pgp->obj->pool != NULL);
374 pool = pgp->obj->pool;
375 pgp_free_data(pgp, pool);
376 INVERT_SENTINEL(pgp,PGD);
377 pgp->obj = NULL;
378 pgp->index = -1;
379 pgp->size = -1;
380 atomic_dec_and_assert(global_pgp_count);
381 atomic_dec_and_assert(pool->pgp_count);
382 tmem_free(pgp,sizeof(pgp_t),pool);
383 }
385 /* remove the page from appropriate lists but not from parent object */
386 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
387 {
388 ASSERT(pgp != NULL);
389 ASSERT(pgp->obj != NULL);
390 ASSERT(pgp->obj->pool != NULL);
391 ASSERT(pgp->obj->pool->client != NULL);
392 if ( is_ephemeral(pgp->obj->pool) )
393 {
394 if ( !no_eph_lock )
395 tmem_spin_lock(&eph_lists_spinlock);
396 if ( !list_empty(&pgp->client_eph_pages) )
397 pgp->obj->pool->client->eph_count--;
398 ASSERT(pgp->obj->pool->client->eph_count >= 0);
399 list_del_init(&pgp->client_eph_pages);
400 if ( !list_empty(&pgp->global_eph_pages) )
401 global_eph_count--;
402 ASSERT(global_eph_count >= 0);
403 list_del_init(&pgp->global_eph_pages);
404 if ( !no_eph_lock )
405 tmem_spin_unlock(&eph_lists_spinlock);
406 }
407 }
409 /* remove page from lists (but not from parent object) and free it */
410 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
411 {
412 uint64_t life;
414 ASSERT(pgp != NULL);
415 ASSERT(pgp->obj != NULL);
416 ASSERT(pgp->obj->pool != NULL);
417 life = get_cycles() - pgp->timestamp;
418 pgp->obj->pool->sum_life_cycles += life;
419 pgp_delist(pgp, no_eph_lock);
420 pgp_free(pgp,1);
421 }
423 /* called only indirectly by radix_tree_destroy */
424 static NOINLINE void pgp_destroy(void *v)
425 {
426 pgp_t *pgp = (pgp_t *)v;
428 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
429 pgp_delist(pgp,0);
430 ASSERT(pgp->obj != NULL);
431 pgp->obj->pgp_count--;
432 ASSERT(pgp->obj->pgp_count >= 0);
433 pgp_free(pgp,0);
434 }
436 FORWARD static rtn_t *rtn_alloc(void *arg);
437 FORWARD static void rtn_free(rtn_t *rtn);
439 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
440 {
441 int ret;
443 ASSERT_SPINLOCK(&obj->obj_spinlock);
444 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
445 if ( !ret )
446 obj->pgp_count++;
447 return ret;
448 }
450 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
451 {
452 pgp_t *pgp;
454 ASSERT(obj != NULL);
455 ASSERT_SPINLOCK(&obj->obj_spinlock);
456 ASSERT_SENTINEL(obj,OBJ);
457 ASSERT(obj->pool != NULL);
458 ASSERT_SENTINEL(obj->pool,POOL);
459 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
460 if ( pgp != NULL )
461 obj->pgp_count--;
462 ASSERT(obj->pgp_count >= 0);
464 return pgp;
465 }
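/*
 * Each object stores its pages in a radix tree keyed by page index; the
 * helpers above are thin wrappers that keep pgp_count in sync. Put-path
 * sketch (hypothetical obj/index, error paths elided):
 *
 *   pgp_t *pgp = pgp_alloc(obj);
 *   if ( pgp_add_to_obj(obj, index, pgp) == 0 )   // 0 == inserted
 *   {
 *       pgp->index = index;
 *       ASSERT(pgp_lookup_in_obj(obj, index) == pgp);
 *   }
 */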
467 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
469 /* called only indirectly from radix_tree_insert */
470 static NOINLINE rtn_t *rtn_alloc(void *arg)
471 {
472 objnode_t *objnode;
473 obj_t *obj = (obj_t *)arg;
475 ASSERT_SENTINEL(obj,OBJ);
476 ASSERT(obj->pool != NULL);
477 ASSERT_SENTINEL(obj->pool,POOL);
478 objnode = tmem_malloc(objnode_t,obj->pool);
479 if (objnode == NULL)
480 return NULL;
481 objnode->obj = obj;
482 SET_SENTINEL(objnode,OBJNODE);
483 memset(&objnode->rtn, 0, sizeof(rtn_t));
484 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
485 obj->pool->objnode_count_max = obj->pool->objnode_count;
486 atomic_inc_and_max(global_rtree_node_count);
487 obj->objnode_count++;
488 return &objnode->rtn;
489 }
491 /* called only indirectly from radix_tree_delete/destroy */
492 static void rtn_free(rtn_t *rtn)
493 {
494 pool_t *pool;
495 objnode_t *objnode;
496 int i;
498 ASSERT(rtn != NULL);
499 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
500 ASSERT(rtn->slots[i] == NULL);
501 objnode = container_of(rtn,objnode_t,rtn);
502 ASSERT_SENTINEL(objnode,OBJNODE);
503 INVERT_SENTINEL(objnode,OBJNODE);
504 ASSERT(objnode->obj != NULL);
505 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
506 ASSERT_SENTINEL(objnode->obj,OBJ);
507 pool = objnode->obj->pool;
508 ASSERT(pool != NULL);
509 ASSERT_SENTINEL(pool,POOL);
510 pool->objnode_count--;
511 objnode->obj->objnode_count--;
512 objnode->obj = NULL;
513 tmem_free(objnode,sizeof(objnode_t),pool);
514 atomic_dec_and_assert(global_rtree_node_count);
515 }
517 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
519 /* searches for object==oid in pool, returns locked object if found */
520 static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
521 {
522 struct rb_node *node;
523 obj_t *obj;
525 restart_find:
526 tmem_read_lock(&pool->pool_rwlock);
527 node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
528 while ( node )
529 {
530 obj = container_of(node, obj_t, rb_tree_node);
531 if ( obj->oid == oid )
532 {
533 if ( tmh_lock_all )
534 obj->no_evict = 1;
535 else
536 {
537 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
538 {
539 tmem_read_unlock(&pool->pool_rwlock);
540 goto restart_find;
541 }
542 tmem_read_unlock(&pool->pool_rwlock);
543 }
544 return obj;
545 }
546 else if ( oid < obj->oid )
547 node = node->rb_left;
548 else
549 node = node->rb_right;
550 }
551 tmem_read_unlock(&pool->pool_rwlock);
552 return NULL;
553 }
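/*
 * Note the retry pattern: obj_find takes the pool rwlock for the tree
 * walk but must not spin on obj_spinlock while holding it (an evictor
 * may hold the object lock and want the pool lock), so a failed trylock
 * drops everything and restarts. A caller sees (sketch, under
 * !tmh_lock_all, matching the callers below):
 *
 *   obj_t *obj = obj_find(pool, oid);
 *   if ( obj != NULL )
 *   {
 *       ... obj->obj_spinlock is held, pool_rwlock is not ...
 *       obj->no_evict = 0;
 *       tmem_spin_unlock(&obj->obj_spinlock);
 *   }
 */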
555 /* free an object that has no more pgps in it */
556 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
557 {
558 pool_t *pool;
559 uint64_t old_oid;
561 ASSERT_SPINLOCK(&obj->obj_spinlock);
562 ASSERT(obj != NULL);
563 ASSERT_SENTINEL(obj,OBJ);
564 ASSERT(obj->pgp_count == 0);
565 pool = obj->pool;
566 ASSERT(pool != NULL);
567 ASSERT_WRITELOCK(&pool->pool_rwlock);
568 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
569 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
570 ASSERT((long)obj->objnode_count == 0);
571 ASSERT(obj->tree_root.rnode == NULL);
572 pool->obj_count--;
573 ASSERT(pool->obj_count >= 0);
574 INVERT_SENTINEL(obj,OBJ);
575 obj->pool = NULL;
576 old_oid = obj->oid;
577 obj->oid = -1;
578 obj->last_client = CLI_ID_NULL;
579 atomic_dec_and_assert(global_obj_count);
580 /* use no_rebalance only if all objects are being destroyed anyway */
581 if ( !no_rebalance )
582 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
583 tmem_free(obj,sizeof(obj_t),pool);
584 }
586 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
587 {
588 struct rb_node **new, *parent = NULL;
589 obj_t *this;
591 new = &(root->rb_node);
592 while ( *new )
593 {
594 this = container_of(*new, obj_t, rb_tree_node);
595 parent = *new;
596 if ( obj->oid < this->oid )
597 new = &((*new)->rb_left);
598 else if ( obj->oid > this->oid )
599 new = &((*new)->rb_right);
600 else
601 return 0;
602 }
603 rb_link_node(&obj->rb_tree_node, parent, new);
604 rb_insert_color(&obj->rb_tree_node, root);
605 return 1;
606 }
608 /*
609 * allocate, initialize, and insert a tmem_object_root
610 * (should be called only if find failed)
611 */
612 static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
613 {
614 obj_t *obj;
616 ASSERT(pool != NULL);
617 ASSERT_WRITELOCK(&pool->pool_rwlock);
618 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
619 return NULL;
620 pool->obj_count++;
621 if (pool->obj_count > pool->obj_count_max)
622 pool->obj_count_max = pool->obj_count;
623 atomic_inc_and_max(global_obj_count);
624 INIT_RADIX_TREE(&obj->tree_root,0);
625 spin_lock_init(&obj->obj_spinlock);
626 obj->pool = pool;
627 obj->oid = oid;
628 obj->objnode_count = 0;
629 obj->pgp_count = 0;
630 obj->last_client = CLI_ID_NULL;
631 SET_SENTINEL(obj,OBJ);
632 tmem_spin_lock(&obj->obj_spinlock);
633 obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
634 obj->no_evict = 1;
635 ASSERT_SPINLOCK(&obj->obj_spinlock);
636 return obj;
637 }
639 /* free an object after destroying any pgps in it */
640 static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
641 {
642 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
643 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
644 obj_free(obj,no_rebalance);
645 }
647 /* destroys all objs in a pool, or only if obj->last_client matches cli_id */
648 static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
649 {
650 struct rb_node *node;
651 obj_t *obj;
652 int i;
654 tmem_write_lock(&pool->pool_rwlock);
655 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
656 {
657 node = rb_first(&pool->obj_rb_root[i]);
658 while ( node != NULL )
659 {
660 obj = container_of(node, obj_t, rb_tree_node);
661 tmem_spin_lock(&obj->obj_spinlock);
662 node = rb_next(node);
663 ASSERT(obj->no_evict == 0);
664 if ( !selective )
665 obj_destroy(obj,1);
666 else if ( obj->last_client == cli_id )
667 obj_destroy(obj,0);
668 else
669 tmem_spin_unlock(&obj->obj_spinlock);
670 }
671 }
672 tmem_write_unlock(&pool->pool_rwlock);
673 }
676 /************ POOL MANIPULATION ROUTINES ******************************/
678 static pool_t * pool_alloc(void)
679 {
680 pool_t *pool;
681 int i;
683 if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
684 return NULL;
685 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
686 pool->obj_rb_root[i] = RB_ROOT;
687 INIT_LIST_HEAD(&pool->pool_list);
688 rwlock_init(&pool->pool_rwlock);
689 pool->pgp_count_max = pool->obj_count_max = 0;
690 pool->objnode_count = pool->objnode_count_max = 0;
691 atomic_set(&pool->pgp_count,0);
692 pool->obj_count = 0;
693 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
694 pool->dup_puts_replaced = pool->no_mem_puts = 0;
695 pool->found_gets = pool->gets = 0;
696 pool->flushs_found = pool->flushs = 0;
697 pool->flush_objs_found = pool->flush_objs = 0;
698 SET_SENTINEL(pool,POOL);
699 return pool;
700 }
702 static NOINLINE void pool_free(pool_t *pool)
703 {
704 ASSERT_SENTINEL(pool,POOL);
705 INVERT_SENTINEL(pool,POOL);
706 pool->client = NULL;
707 list_del(&pool->pool_list);
708 tmem_free(pool,sizeof(pool_t),NULL);
709 }
711 /* register new_client as a user of this shared pool and return new
712 total number of registered users */
713 static int shared_pool_join(pool_t *pool, client_t *new_client)
714 {
715 sharelist_t *sl;
717 ASSERT(is_shared(pool));
718 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
719 return -1;
720 sl->client = new_client;
721 list_add_tail(&sl->share_list, &pool->share_list);
722 if ( new_client->cli_id != pool->client->cli_id )
723 printk("adding new %s %d to shared pool owned by %s %d\n",
724 client_str, new_client->cli_id, client_str, pool->client->cli_id);
725 return ++pool->shared_count;
726 }
728 /* reassign "ownership" of the pool to another client that shares this pool */
729 static NOINLINE void shared_pool_reassign(pool_t *pool)
730 {
731 sharelist_t *sl;
732 int poolid;
733 client_t *old_client = pool->client, *new_client;
735 ASSERT(is_shared(pool));
736 if ( list_empty(&pool->share_list) )
737 {
738 ASSERT(pool->shared_count == 0);
739 return;
740 }
741 old_client->pools[pool->pool_id] = NULL;
742 sl = list_entry(pool->share_list.next, sharelist_t, share_list);
743 ASSERT(sl->client != old_client);
744 pool->client = new_client = sl->client;
745 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
746 if (new_client->pools[poolid] == pool)
747 break;
748 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
749 new_client->eph_count += _atomic_read(pool->pgp_count);
750 old_client->eph_count -= _atomic_read(pool->pgp_count);
751 list_splice_init(&old_client->ephemeral_page_list,
752 &new_client->ephemeral_page_list);
753 printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
754 cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
755 pool->pool_id = poolid;
756 }
758 /* destroy all objects whose last_client matches the passed cli_id, then
759 remove cli_id from this pool's list of sharers */
760 static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
761 {
762 sharelist_t *sl;
763 int s_poolid;
765 ASSERT(is_shared(pool));
766 ASSERT(pool->client != NULL);
768 ASSERT_WRITELOCK(&tmem_rwlock);
769 pool_destroy_objs(pool,1,cli_id);
770 list_for_each_entry(sl,&pool->share_list, share_list)
771 {
772 if (sl->client->cli_id != cli_id)
773 continue;
774 list_del(&sl->share_list);
775 tmem_free(sl,sizeof(sharelist_t),pool);
776 --pool->shared_count;
777 if (pool->client->cli_id == cli_id)
778 shared_pool_reassign(pool);
779 if (pool->shared_count)
780 return pool->shared_count;
781 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
782 if ( (global_shared_pools[s_poolid]) == pool )
783 {
784 global_shared_pools[s_poolid] = NULL;
785 break;
786 }
787 return 0;
788 }
789 printk("tmem: no match unsharing pool, %s=%d\n",
790 cli_id_str,pool->client->cli_id);
791 return -1;
792 }
794 /* flush all data (owned by cli_id) from a pool and, optionally, free it */
795 static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
796 {
797 ASSERT(pool != NULL);
798 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
799 {
800 printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
801 cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
802 return;
803 }
804 printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
805 is_persistent(pool) ? "persistent" : "ephemeral" ,
806 is_shared(pool) ? "shared" : "private");
807 printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
808 pool_destroy_objs(pool,0,CLI_ID_NULL);
809 if ( destroy )
810 {
811 pool->client->pools[pool->pool_id] = NULL;
812 pool_free(pool);
813 }
814 }
816 /************ CLIENT MANIPULATION OPERATIONS **************************/
818 static client_t *client_create(void)
819 {
820 client_t *client = tmem_malloc(client_t,NULL);
821 cli_id_t cli_id = tmh_get_cli_id_from_current();
823 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
824 if ( client == NULL )
825 {
826 printk("failed... out of memory\n");
827 return NULL;
828 }
829 memset(client,0,sizeof(client_t));
830 if ( (client->tmh = tmh_client_init()) == NULL )
831 {
832 printk("failed... can't allocate host-dependent part of client\n");
833 if ( client )
834 tmem_free(client,sizeof(client_t),NULL);
835 return NULL;
836 }
837 tmh_set_current_client(client);
838 client->cli_id = cli_id;
839 #ifdef __i386__
840 client->compress = 0;
841 #else
842 client->compress = tmh_compression_enabled();
843 #endif
844 list_add_tail(&client->client_list, &global_client_list);
845 INIT_LIST_HEAD(&client->ephemeral_page_list);
846 client->eph_count = client->eph_count_max = 0;
847 client->total_cycles = 0; client->succ_pers_puts = 0;
848 client->succ_eph_gets = 0; client->succ_pers_gets = 0;
849 printk("ok\n");
850 return client;
851 }
853 static void client_free(client_t *client)
854 {
855 list_del(&client->client_list);
856 tmh_client_destroy(client->tmh);
857 tmem_free(client,sizeof(client_t),NULL);
858 }
860 /* flush all data from a client and, optionally, free it */
861 static void client_flush(client_t *client, bool_t destroy)
862 {
863 int i;
864 pool_t *pool;
866 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
867 {
868 if ( (pool = client->pools[i]) == NULL )
869 continue;
870 pool_flush(pool,client->cli_id,destroy);
871 if ( destroy )
872 client->pools[i] = NULL;
873 }
874 if ( destroy )
875 client_free(client);
876 }
878 static bool_t client_over_quota(client_t *client)
879 {
880 int total = _atomic_read(client_weight_total);
882 ASSERT(client != NULL);
883 if ( (total == 0) || (client->weight == 0) ||
884 (client->eph_count == 0) )
885 return 0;
886 return ( ((global_eph_count*100L) / client->eph_count ) >
887 ((total*100L) / client->weight) );
888 }
890 /************ MEMORY REVOCATION ROUTINES *******************************/
892 static int tmem_evict(void)
893 {
894 client_t *client = tmh_client_from_current();
895 pgp_t *pgp = NULL, *pgp_del;
896 obj_t *obj;
897 pool_t *pool;
898 int ret = 0;
899 bool_t hold_pool_rwlock = 0;
901 evict_attempts++;
902 tmem_spin_lock(&eph_lists_spinlock);
903 if ( (client != NULL) && client_over_quota(client) &&
904 !list_empty(&client->ephemeral_page_list) )
905 {
906 list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
907 {
908 obj = pgp->obj;
909 pool = obj->pool;
910 if ( tmh_lock_all && !obj->no_evict )
911 goto found;
912 if ( tmem_spin_trylock(&obj->obj_spinlock) )
913 {
914 if ( obj->pgp_count > 1 )
915 goto found;
916 if ( tmem_write_trylock(&pool->pool_rwlock) )
917 {
918 hold_pool_rwlock = 1;
919 goto found;
920 }
921 tmem_spin_unlock(&obj->obj_spinlock);
922 }
923 }
924 } else if ( list_empty(&global_ephemeral_page_list) ) {
925 goto out;
926 } else {
927 list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
928 {
929 obj = pgp->obj;
930 pool = obj->pool;
931 if ( tmh_lock_all && !obj->no_evict )
932 goto found;
933 if ( tmem_spin_trylock(&obj->obj_spinlock) )
934 {
935 if ( obj->pgp_count > 1 )
936 goto found;
937 if ( tmem_write_trylock(&pool->pool_rwlock) )
938 {
939 hold_pool_rwlock = 1;
940 goto found;
941 }
942 tmem_spin_unlock(&obj->obj_spinlock);
943 }
944 }
945 }
947 ret = 0;
948 goto out;
950 found:
951 ASSERT(pgp != NULL);
952 ASSERT_SENTINEL(pgp,PGD);
953 obj = pgp->obj;
954 ASSERT(obj != NULL);
955 ASSERT(obj->no_evict == 0);
956 ASSERT(obj->pool != NULL);
957 ASSERT_SENTINEL(obj,OBJ);
959 ASSERT_SPINLOCK(&obj->obj_spinlock);
960 pgp_del = pgp_delete_from_obj(obj, pgp->index);
961 ASSERT(pgp_del == pgp);
962 pgp_delete(pgp,1);
963 if ( obj->pgp_count == 0 )
964 {
965 ASSERT_WRITELOCK(&pool->pool_rwlock);
966 obj_free(obj,0);
967 }
968 else
969 tmem_spin_unlock(&obj->obj_spinlock);
970 if ( hold_pool_rwlock )
971 tmem_write_unlock(&pool->pool_rwlock);
972 evicted_pgs++;
973 ret = 1;
975 out:
976 tmem_spin_unlock(&eph_lists_spinlock);
977 return ret;
978 }
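/*
 * Eviction-order note (illustrative): puts append to the tail of the
 * ephemeral lists and gets re-append, so the walks above start at the
 * head with the least-recently-used page, giving a simple LRU. The pool
 * rwlock is only needed when evicting an object's last page, since
 * freeing the emptied object rebalances the rb-tree:
 *
 *   if ( obj->pgp_count > 1 )
 *       goto found;                               // obj lock suffices
 *   if ( tmem_write_trylock(&pool->pool_rwlock) )
 *       goto found;                               // may free the object
 */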
980 static unsigned long tmem_relinquish_npages(unsigned long n)
981 {
982 unsigned long avail_pages = 0;
984 while ( (avail_pages = tmh_avail_pages()) < n )
985 {
986 if ( !tmem_evict() )
987 break;
988 }
989 if ( avail_pages )
990 tmh_release_avail_pages_to_host();
991 return avail_pages;
992 }
994 /************ TMEM CORE OPERATIONS ************************************/
996 static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
997 {
998 void *dst, *p;
999 size_t size;
1000 int ret = 0;
1001 DECL_LOCAL_CYC_COUNTER(compress);
1003 ASSERT(pgp != NULL);
1004 ASSERT(pgp->obj != NULL);
1005 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
1006 ASSERT(pgp->obj->pool != NULL);
1007 ASSERT(pgp->obj->pool->client != NULL);
1008 #ifdef __i386__
1009 return -ENOMEM;
1010 #endif
1011 if ( pgp->pfp != NULL )
1012 pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
1013 START_CYC_COUNTER(compress);
1014 ret = tmh_compress_from_client(cmfn, &dst, &size);
1015 if ( (ret == -EFAULT) || (ret == 0) )
1016 goto out;
1017 else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
1018 ret = 0;
1019 else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
1020 ret = -ENOMEM;
1021 else
1022 {
1023 memcpy(p,dst,size);
1024 pgp->cdata = p;
1025 pgp->size = size;
1026 pgp->obj->pool->client->compressed_pages++;
1027 pgp->obj->pool->client->compressed_sum_size += size;
1028 ret = 1;
1029 }
1031 out:
1032 END_CYC_COUNTER(compress);
1033 return ret;
1034 }
1036 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1037 uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
1038 {
1039 pool_t *pool;
1040 obj_t *obj;
1041 client_t *client;
1042 pgp_t *pgpfound = NULL;
1043 int ret;
1045 /* if we can successfully manipulate pgp to change out the data, do so */
1046 ASSERT(pgp != NULL);
1047 ASSERT(pgp->pfp != NULL);
1048 ASSERT(pgp->size != -1);
1049 obj = pgp->obj;
1050 ASSERT_SPINLOCK(&obj->obj_spinlock);
1051 ASSERT(obj != NULL);
1052 pool = obj->pool;
1053 ASSERT(pool != NULL);
1054 client = pool->client;
1055 if ( len != 0 && tmh_compression_enabled() &&
1056 client->compress && pgp->size != 0 )
1057 {
1058 ret = do_tmem_put_compress(pgp,cmfn);
1059 if ( ret == 1 )
1060 goto done;
1061 else if ( ret == 0 )
1062 goto copy_uncompressed;
1063 else if ( ret == -ENOMEM )
1064 goto failed_dup;
1065 else if ( ret == -EFAULT )
1066 goto bad_copy;
1067 }
1069 copy_uncompressed:
1070 if ( pgp->pfp )
1071 pgp_free_data(pgp, pool);
1072 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1073 goto failed_dup;
1074 /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
1075 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
1076 if ( ret == -EFAULT )
1077 goto bad_copy;
1078 pgp->size = 0;
1080 done:
1081 /* successfully replaced data, clean up and return success */
1082 if ( is_shared(pool) )
1083 obj->last_client = client->cli_id;
1084 obj->no_evict = 0;
1085 tmem_spin_unlock(&obj->obj_spinlock);
1086 pool->dup_puts_replaced++;
1087 pool->good_puts++;
1088 if ( is_persistent(pool) )
1089 client->succ_pers_puts++;
1090 return 1;
1092 bad_copy:
1093 /* this should only happen if the client passed a bad mfn */
1094 failed_copies++;
1095 ASSERT(0);
1096 return -EFAULT;
1098 failed_dup:
1099 /* couldn't change out the data, flush the old data and return
1100 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
1101 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1102 ASSERT(pgpfound == pgp);
1103 pgp_delete(pgpfound,0);
1104 if ( obj->pgp_count == 0 )
1105 {
1106 tmem_write_lock(&pool->pool_rwlock);
1107 obj_free(obj,0);
1108 tmem_write_unlock(&pool->pool_rwlock);
1109 } else {
1110 obj->no_evict = 0;
1111 tmem_spin_unlock(&obj->obj_spinlock);
1112 }
1113 pool->dup_puts_flushed++;
1114 return -ENOSPC;
1115 }
1118 static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
1119 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1120 uint32_t pfn_offset, uint32_t len)
1121 {
1122 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
1123 pgp_t *pgp = NULL, *pgpdel = NULL;
1124 client_t *client = pool->client;
1125 int ret = client->frozen ? -EFROZEN : -ENOMEM;
1127 ASSERT(pool != NULL);
1128 pool->puts++;
1129 /* does page already exist (dup)? if so, handle specially */
1130 if ( (obj = objfound = obj_find(pool,oid)) != NULL )
1131 {
1132 ASSERT_SPINLOCK(&objfound->obj_spinlock);
1133 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
1134 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
1135 }
1137 /* no puts allowed into a frozen pool (except dup puts) */
1138 if ( client->frozen )
1139 goto free;
1141 if ( (objfound == NULL) )
1142 {
1143 tmem_write_lock(&pool->pool_rwlock);
1144 if ( (obj = objnew = obj_new(pool,oid)) == NULL )
1145 {
1146 tmem_write_unlock(&pool->pool_rwlock);
1147 return -ENOMEM;
1148 }
1149 ASSERT_SPINLOCK(&objnew->obj_spinlock);
1150 tmem_write_unlock(&pool->pool_rwlock);
1151 }
1153 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
1154 ASSERT_SPINLOCK(&obj->obj_spinlock);
1155 if ( (pgp = pgp_alloc(obj)) == NULL )
1156 goto free;
1158 ret = pgp_add_to_obj(obj, index, pgp);
1159 if ( ret == -ENOMEM )
1160 /* warning, may result in partially built radix tree ("stump") */
1161 goto free;
1162 ASSERT(ret != -EEXIST);
1163 pgp->index = index;
1165 if ( len != 0 && tmh_compression_enabled() && client->compress )
1166 {
1167 ASSERT(pgp->pfp == NULL);
1168 ret = do_tmem_put_compress(pgp,cmfn);
1169 if ( ret == 1 )
1170 goto insert_page;
1171 if ( ret == -ENOMEM )
1172 {
1173 client->compress_nomem++;
1174 goto delete_and_free;
1175 }
1176 if ( ret == 0 )
1177 {
1178 client->compress_poor++;
1179 goto copy_uncompressed;
1180 }
1181 if ( ret == -EFAULT )
1182 goto bad_copy;
1183 }
1185 copy_uncompressed:
1186 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1187 {
1188 ret = -ENOMEM;
1189 goto delete_and_free;
1190 }
1191 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
1192 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
1193 if ( ret == -EFAULT )
1194 goto bad_copy;
1195 pgp->size = 0;
1197 insert_page:
1198 if ( is_ephemeral(pool) )
1199 {
1200 tmem_spin_lock(&eph_lists_spinlock);
1201 list_add_tail(&pgp->global_eph_pages,
1202 &global_ephemeral_page_list);
1203 if (++global_eph_count > global_eph_count_max)
1204 global_eph_count_max = global_eph_count;
1205 list_add_tail(&pgp->client_eph_pages,
1206 &client->ephemeral_page_list);
1207 if (++client->eph_count > client->eph_count_max)
1208 client->eph_count_max = client->eph_count;
1209 tmem_spin_unlock(&eph_lists_spinlock);
1210 }
1211 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
1212 if ( is_shared(pool) )
1213 obj->last_client = client->cli_id;
1214 obj->no_evict = 0;
1215 tmem_spin_unlock(&obj->obj_spinlock);
1216 pool->good_puts++;
1217 if ( is_persistent(pool) )
1218 client->succ_pers_puts++;
1219 return 1;
1221 delete_and_free:
1222 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1223 pgpdel = pgp_delete_from_obj(obj, pgp->index);
1224 ASSERT(pgp == pgpdel);
1226 free:
1227 if ( pgp )
1228 pgp_delete(pgp,0);
1229 if ( objfound )
1230 {
1231 objfound->no_evict = 0;
1232 tmem_spin_unlock(&objfound->obj_spinlock);
1233 }
1234 if ( objnew )
1235 {
1236 tmem_write_lock(&pool->pool_rwlock);
1237 obj_free(objnew,0);
1238 tmem_write_unlock(&pool->pool_rwlock);
1239 }
1240 pool->no_mem_puts++;
1241 return ret;
1243 bad_copy:
1244 /* this should only happen if the client passed a bad mfn */
1245 failed_copies++;
1246 ASSERT(0);
1247 goto free;
1248 }
1250 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
1251 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1252 uint32_t pfn_offset, uint32_t len)
1253 {
1254 obj_t *obj;
1255 pgp_t *pgp;
1256 client_t *client = pool->client;
1257 DECL_LOCAL_CYC_COUNTER(decompress);
1259 if ( !_atomic_read(pool->pgp_count) )
1260 return -EEMPTY;
1262 pool->gets++;
1263 obj = obj_find(pool,oid);
1264 if ( obj == NULL )
1265 return 0;
1267 ASSERT_SPINLOCK(&obj->obj_spinlock);
1268 if (is_shared(pool) || is_persistent(pool) )
1269 pgp = pgp_lookup_in_obj(obj, index);
1270 else
1271 pgp = pgp_delete_from_obj(obj, index);
1272 if ( pgp == NULL )
1273 {
1274 obj->no_evict = 0;
1275 tmem_spin_unlock(&obj->obj_spinlock);
1276 return 0;
1277 }
1278 ASSERT(pgp->size != -1);
1279 if ( pgp->size != 0 )
1280 {
1281 START_CYC_COUNTER(decompress);
1282 if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
1283 goto bad_copy;
1284 END_CYC_COUNTER(decompress);
1285 }
1286 else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
1287 pfn_offset, len) == -EFAULT)
1288 goto bad_copy;
1289 if ( is_ephemeral(pool) )
1290 {
1291 if ( is_private(pool) )
1292 {
1293 pgp_delete(pgp,0);
1294 if ( obj->pgp_count == 0 )
1295 {
1296 tmem_write_lock(&pool->pool_rwlock);
1297 obj_free(obj,0);
1298 obj = NULL;
1299 tmem_write_unlock(&pool->pool_rwlock);
1300 }
1301 } else {
1302 tmem_spin_lock(&eph_lists_spinlock);
1303 list_del(&pgp->global_eph_pages);
1304 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1305 list_del(&pgp->client_eph_pages);
1306 list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
1307 tmem_spin_unlock(&eph_lists_spinlock);
1308 ASSERT(obj != NULL);
1309 obj->last_client = tmh_get_cli_id_from_current();
1310 }
1311 }
1312 if ( obj != NULL )
1313 {
1314 obj->no_evict = 0;
1315 tmem_spin_unlock(&obj->obj_spinlock);
1316 }
1317 pool->found_gets++;
1318 if ( is_ephemeral(pool) )
1319 client->succ_eph_gets++;
1320 else
1321 client->succ_pers_gets++;
1322 return 1;
1324 bad_copy:
1325 /* this should only happen if the client passed a bad mfn */
1326 failed_copies++;
1327 ASSERT(0);
1328 return -EFAULT;
1329 }
1332 static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
1333 {
1334 obj_t *obj;
1335 pgp_t *pgp;
1337 pool->flushs++;
1338 obj = obj_find(pool,oid);
1339 if ( obj == NULL )
1340 goto out;
1341 pgp = pgp_delete_from_obj(obj, index);
1342 if ( pgp == NULL )
1343 {
1344 obj->no_evict = 0;
1345 tmem_spin_unlock(&obj->obj_spinlock);
1346 goto out;
1347 }
1348 pgp_delete(pgp,0);
1349 if ( obj->pgp_count == 0 )
1350 {
1351 tmem_write_lock(&pool->pool_rwlock);
1352 obj_free(obj,0);
1353 tmem_write_unlock(&pool->pool_rwlock);
1354 } else {
1355 obj->no_evict = 0;
1356 tmem_spin_unlock(&obj->obj_spinlock);
1357 }
1358 pool->flushs_found++;
1360 out:
1361 if ( pool->client->frozen )
1362 return -EFROZEN;
1363 else
1364 return 1;
1365 }
1367 static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
1368 {
1369 obj_t *obj;
1371 pool->flush_objs++;
1372 obj = obj_find(pool,oid);
1373 if ( obj == NULL )
1374 goto out;
1375 tmem_write_lock(&pool->pool_rwlock);
1376 obj_destroy(obj,0);
1377 pool->flush_objs_found++;
1378 tmem_write_unlock(&pool->pool_rwlock);
1380 out:
1381 if ( pool->client->frozen )
1382 return -EFROZEN;
1383 else
1384 return 1;
1385 }
1387 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
1388 {
1389 client_t *client = tmh_client_from_current();
1390 pool_t *pool;
1392 if ( client->pools == NULL )
1393 return 0;
1394 if ( (pool = client->pools[pool_id]) == NULL )
1395 return 0;
1396 client->pools[pool_id] = NULL;
1397 pool_flush(pool,client->cli_id,1);
1398 return 1;
1399 }
1401 static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
1403 client_t *client = tmh_client_from_current();
1404 cli_id_t cli_id = tmh_get_cli_id_from_current();
1405 int persistent = flags & TMEM_POOL_PERSIST;
1406 int shared = flags & TMEM_POOL_SHARED;
1407 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1408 & TMEM_POOL_PAGESIZE_MASK;
1409 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1410 & TMEM_POOL_VERSION_MASK;
1411 pool_t *pool, *shpool;
1412 int s_poolid, d_poolid, first_unused_s_poolid;
1414 ASSERT(client != NULL);
1415 printk("tmem: allocating %s-%s tmem pool for %s=%d...",
1416 persistent ? "persistent" : "ephemeral" ,
1417 shared ? "shared" : "private", cli_id_str, cli_id);
1418 if ( specversion != 0 )
1420 printk("failed... unsupported spec version\n");
1421 return -EPERM;
1423 if ( pagebits != (PAGE_SHIFT - 12) )
1425 printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
1426 return -EPERM;
1428 if ( (pool = pool_alloc()) == NULL )
1430 printk("failed... out of memory\n");
1431 return -ENOMEM;
1433 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1434 if ( client->pools[d_poolid] == NULL )
1435 break;
1436 if ( d_poolid == MAX_POOLS_PER_DOMAIN )
1438 printk("failed... no more pool slots available for this %s\n",
1439 client_str);
1440 goto fail;
1442 pool->shared = shared;
1443 pool->client = client;
1444 if ( shared )
1446 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1447 for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
1449 if ( (shpool = global_shared_pools[s_poolid]) != NULL )
1451 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1453 printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
1454 uuid_hi, uuid_lo);
1455 printk("pool_id=%d\n",d_poolid);
1456 client->pools[d_poolid] = global_shared_pools[s_poolid];
1457 shared_pool_join(global_shared_pools[s_poolid], client);
1458 pool_free(pool);
1459 return d_poolid;
1462 else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1463 first_unused_s_poolid = s_poolid;
1465 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1467 printk("tmem: failed... no global shared pool slots available\n");
1468 goto fail;
1470 else
1472 INIT_LIST_HEAD(&pool->share_list);
1473 pool->shared_count = 0;
1474 global_shared_pools[first_unused_s_poolid] = pool;
1475 (void)shared_pool_join(pool,client);
1478 client->pools[d_poolid] = pool;
1479 list_add_tail(&pool->pool_list, &global_pool_list);
1480 pool->pool_id = d_poolid;
1481 pool->persistent = persistent;
1482 pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
1483 printk("pool_id=%d\n",d_poolid);
1484 return d_poolid;
1486 fail:
1487 pool_free(pool);
1488 return -EPERM;
1491 /************ TMEM CONTROL OPERATIONS ************************************/
1493 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
1494 static int tmemc_freeze_pools(int cli_id, int arg)
1496 client_t *client;
1497 bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
1498 bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
1499 char *s;
1501 s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
1502 if ( cli_id == CLI_ID_NULL )
1504 list_for_each_entry(client,&global_client_list,client_list)
1505 client->frozen = freeze;
1506 printk("tmem: all pools %s for all %ss\n",s,client_str);
1508 else
1510 if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1511 return -1;
1512 client->frozen = freeze;
1513 printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
1515 return 0;
1518 static int tmemc_flush_mem(int cli_id, uint32_t kb)
1519 {
1520 uint32_t npages, flushed_pages, flushed_kb;
1522 if ( cli_id != CLI_ID_NULL )
1523 {
1524 printk("tmem: %s-specific flush not supported yet, use --all\n",
1525 client_str);
1526 return -1;
1527 }
1528 /* convert kb to pages, rounding up if necessary */
1529 npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
1530 flushed_pages = tmem_relinquish_npages(npages);
1531 flushed_kb = flushed_pages << (PAGE_SHIFT-10);
1532 return flushed_kb;
1533 }
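/*
 * The kb-to-pages conversion above rounds up. Worked example assuming
 * 4 KiB pages (PAGE_SHIFT == 12, so PAGE_SHIFT-10 == 2):
 *
 *   kb = 10:  npages = (10 + 3) >> 2 = 3
 *   flushed_kb = 3 << 2 = 12   // 12 KiB released for a 10 KiB request,
 *                              // if all 3 pages can be relinquished
 */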
1535 /*
1536 * These tmemc_list* routines output lots of stats in a format that is
1537 * intended to be program-parseable, not human-readable. Further, by
1538 * tying each group of stats to a line format indicator (e.g. G= for
1539 * global stats) and each individual stat to a two-letter specifier
1540 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
1541 * global ephemeral pool), it should allow the stats reported to be
1542 * forward and backwards compatible as tmem evolves.
1543 */
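/*
 * Illustrative (made-up numbers): a non-verbose global line produced by
 * tmemc_list_global below could look like
 *
 *   G=Tt:1000,Te:2,Cf:0,Af:3,Pf:1,Ta:512,Lm:0,Et:90,Ea:112,Rt:0,Ra:0,Rx:0,Fp:0
 *
 * where Tt is total tmem ops, Te errored ops, Et/Ea evictions vs.
 * attempts, and so on; parsers key on the two-letter names, so new
 * statistics can be appended without breaking old tools.
 */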
1544 #define BSIZE 1024
1546 static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
1547 uint32_t len, bool_t use_long)
1549 char info[BSIZE];
1550 int i, n = 0, sum = 0;
1551 pool_t *p;
1552 bool_t s;
1554 n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d,"
1555 "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c",
1556 c->cli_id, c->weight, c->cap, c->compress, c->frozen,
1557 c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets,
1558 use_long ? ',' : '\n');
1559 if (use_long)
1560 n += scnprintf(info+n,BSIZE-n,
1561 "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
1562 c->eph_count, c->eph_count_max,
1563 c->compressed_pages, c->compressed_sum_size,
1564 c->compress_poor, c->compress_nomem);
1565 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1566 sum += n;
1567 for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
1569 if ( (p = c->pools[i]) == NULL )
1570 continue;
1571 s = is_shared(p);
1572 n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
1573 "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
1574 c->cli_id, p->pool_id,
1575 is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
1576 (uint64_t)(s ? p->uuid[0] : 0),
1577 (uint64_t)(s ? p->uuid[1] : 0LL),
1578 use_long ? ',' : '\n');
1579 if (use_long)
1580 n += scnprintf(info+n,BSIZE-n,
1581 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1582 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1583 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1584 _atomic_read(p->pgp_count), p->pgp_count_max,
1585 p->obj_count, p->obj_count_max,
1586 p->objnode_count, p->objnode_count_max,
1587 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1588 p->no_mem_puts,
1589 p->found_gets, p->gets,
1590 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1591 if ( sum + n >= len )
1592 return sum;
1593 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1594 sum += n;
1596 return sum;
1599 static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
1600 bool_t use_long)
1602 char info[BSIZE];
1603 int i, n = 0, sum = 0;
1604 pool_t *p;
1605 sharelist_t *sl;
1607 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1609 if ( (p = global_shared_pools[i]) == NULL )
1610 continue;
1611 n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
1612 i, is_persistent(p) ? 'P' : 'E',
1613 is_shared(p) ? 'S' : 'P',
1614 p->uuid[0], p->uuid[1]);
1615 list_for_each_entry(sl,&p->share_list, share_list)
1616 n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
1617 n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
1618 if (use_long)
1619 n += scnprintf(info+n,BSIZE-n,
1620 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1621 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1622 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1623 _atomic_read(p->pgp_count), p->pgp_count_max,
1624 p->obj_count, p->obj_count_max,
1625 p->objnode_count, p->objnode_count_max,
1626 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1627 p->no_mem_puts,
1628 p->found_gets, p->gets,
1629 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1630 if ( sum + n >= len )
1631 return sum;
1632 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1633 sum += n;
1635 return sum;
1638 #ifdef TMEM_PERF
1639 static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
1640 bool_t use_long)
1642 char info[BSIZE];
1643 int n = 0, sum = 0;
1645 n = scnprintf(info+n,BSIZE-n,"T=");
1646 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
1647 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
1648 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
1649 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
1650 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
1651 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
1652 #ifdef COMPARE_COPY_PAGE_SSE2
1653 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
1654 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
1655 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
1656 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
1657 #else
1658 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
1659 #endif
1660 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
1661 n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
1662 n--; /* overwrite trailing comma */
1663 n += scnprintf(info+n,BSIZE-n,"\n");
1664 if ( sum + n >= len )
1665 return sum;
1666 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1667 sum += n;
1668 return sum;
1670 #else
1671 #define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
1672 #endif
1674 static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
1675 bool_t use_long)
1677 char info[BSIZE];
1678 int n = 0, sum = off;
1680 n += scnprintf(info,BSIZE,"G="
1681 "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
1682 "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
1683 total_tmem_ops, errored_tmem_ops, failed_copies,
1684 alloc_failed, alloc_page_failed, tmh_avail_pages(),
1685 low_on_memory, evicted_pgs,
1686 evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
1687 total_flush_pool, use_long ? ',' : '\n');
1688 if (use_long)
1689 n += scnprintf(info+n,BSIZE-n,
1690 "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
1691 global_eph_count, global_eph_count_max,
1692 _atomic_read(global_obj_count), global_obj_count_max,
1693 _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
1694 _atomic_read(global_pgp_count), global_pgp_count_max);
1695 if ( sum + n >= len )
1696 return sum;
1697 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1698 sum += n;
1699 return sum;
1702 static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
1703 bool_t use_long)
1705 client_t *client;
1706 int off = 0;
1708 if ( cli_id == CLI_ID_NULL ) {
1709 off = tmemc_list_global(buf,0,len,use_long);
1710 off += tmemc_list_shared(buf,off,len-off,use_long);
1711 list_for_each_entry(client,&global_client_list,client_list)
1712 off += tmemc_list_client(client, buf, off, len-off, use_long);
1713 off += tmemc_list_global_perf(buf,off,len-off,use_long);
1715 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1716 return -1;
1717 else
1718 off = tmemc_list_client(client, buf, 0, len, use_long);
1721 return 0;
1724 static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
1726 cli_id_t cli_id = client->cli_id;
1727 uint32_t old_weight;
1729 switch (subop)
1731 case TMEMC_SET_WEIGHT:
1732 old_weight = client->weight;
1733 client->weight = arg1;
1734 printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1735 atomic_sub(old_weight,&client_weight_total);
1736 atomic_add(client->weight,&client_weight_total);
1737 break;
1738 case TMEMC_SET_CAP:
1739 client->cap = arg1;
1740 printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1741 break;
1742 case TMEMC_SET_COMPRESS:
1743 client->compress = arg1 ? 1 : 0;
1744 printk("tmem: compression %s for %s=%d\n",
1745 arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
1746 break;
1747 default:
1748 printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
1749 return -1;
1751 return 0;
1754 static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
1756 client_t *client;
1758 if ( cli_id == CLI_ID_NULL )
1759 list_for_each_entry(client,&global_client_list,client_list)
1760 tmemc_set_var_one(client, subop, arg1);
1761 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1762 return -1;
1763 else
1764 tmemc_set_var_one(client, subop, arg1);
1765 return 0;
1768 static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
1769 uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
1771 int ret;
1772 cli_id_t cli_id = (cli_id_t)cli_id32;
1774 if (!tmh_current_is_privileged())
1776 /* don't fail... mystery: sometimes dom0 fails here */
1777 /* return -EPERM; */
1779 switch(subop)
1781 case TMEMC_THAW:
1782 case TMEMC_FREEZE:
1783 case TMEMC_DESTROY:
1784 ret = tmemc_freeze_pools(cli_id,subop);
1785 break;
1786 case TMEMC_FLUSH:
1787 ret = tmemc_flush_mem(cli_id,arg1);
1788 break;
1789 case TMEMC_LIST:
1790 ret = tmemc_list(cli_id,buf,arg1,arg2);
1791 break;
1792 case TMEMC_SET_WEIGHT:
1793 case TMEMC_SET_CAP:
1794 case TMEMC_SET_COMPRESS:
1795 ret = tmemc_set_var(cli_id,subop,arg1);
1796 break;
1797 default:
1798 ret = -1;
1800 return ret;
1803 /************ EXPORTed FUNCTIONS **************************************/
1805 EXPORT long do_tmem_op(tmem_cli_op_t uops)
1807 struct tmem_op op;
1808 client_t *client = tmh_client_from_current();
1809 pool_t *pool = NULL;
1810 int rc = 0;
1811 bool_t succ_get = 0, succ_put = 0;
1812 bool_t non_succ_get = 0, non_succ_put = 0;
1813 bool_t flush = 0, flush_obj = 0;
1814 bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
1815 DECL_LOCAL_CYC_COUNTER(succ_get);
1816 DECL_LOCAL_CYC_COUNTER(succ_put);
1817 DECL_LOCAL_CYC_COUNTER(non_succ_get);
1818 DECL_LOCAL_CYC_COUNTER(non_succ_put);
1819 DECL_LOCAL_CYC_COUNTER(flush);
1820 DECL_LOCAL_CYC_COUNTER(flush_obj);
1822 if ( !tmem_initialized )
1823 return -ENODEV;
1825 total_tmem_ops++;
1827 if ( tmh_lock_all )
1829 if ( tmh_lock_all > 1 )
1830 spin_lock_irq(&tmem_spinlock);
1831 else
1832 spin_lock(&tmem_spinlock);
1835 START_CYC_COUNTER(succ_get);
1836 DUP_START_CYC_COUNTER(succ_put,succ_get);
1837 DUP_START_CYC_COUNTER(non_succ_get,succ_get);
1838 DUP_START_CYC_COUNTER(non_succ_put,succ_get);
1839 DUP_START_CYC_COUNTER(flush,succ_get);
1840 DUP_START_CYC_COUNTER(flush_obj,succ_get);
1842 if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
1844 printk("tmem: can't get tmem struct from %s\n",client_str);
1845 rc = -EFAULT;
1846 goto out;
1849 if ( op.cmd == TMEM_CONTROL )
1851 tmem_write_lock(&tmem_rwlock);
1852 tmem_write_lock_set = 1;
1853 rc = do_tmem_control(op.u.ctrl.subop, op.u.ctrl.cli_id,
1854 op.u.ctrl.arg1, op.u.ctrl.arg2, op.u.ctrl.buf);
1855 goto out;
1858 /* create per-client tmem structure dynamically on first use by client */
1859 if ( client == NULL )
1861 tmem_write_lock(&tmem_rwlock);
1862 tmem_write_lock_set = 1;
1863 if ( (client = client_create()) == NULL )
1865 printk("tmem: can't create tmem structure for %s\n",client_str);
1866 rc = -ENOMEM;
1867 goto out;
1871 if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
1873 if ( !tmem_write_lock_set )
1875 tmem_write_lock(&tmem_rwlock);
1876 tmem_write_lock_set = 1;
1879 else
1881 if ( !tmem_write_lock_set )
1883 tmem_read_lock(&tmem_rwlock);
1884 tmem_read_lock_set = 1;
1886 if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
1887 ((pool = client->pools[op.pool_id]) == NULL) )
1889 rc = -ENODEV;
1890 printk("tmem: operation requested on uncreated pool\n");
1891 goto out;
1893 ASSERT_SENTINEL(pool,POOL);
1896 switch ( op.cmd )
1898 case TMEM_NEW_POOL:
1899 rc = do_tmem_new_pool(op.u.new.flags,
1900 op.u.new.uuid[0], op.u.new.uuid[1]);
1901 break;
1902 case TMEM_NEW_PAGE:
1903 rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
1904 0, 0, 0);
1905 break;
1906 case TMEM_PUT_PAGE:
1907 rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
1908 0, 0, PAGE_SIZE);
1909 if (rc == 1) succ_put = 1;
1910 else non_succ_put = 1;
1911 break;
1912 case TMEM_GET_PAGE:
1913 rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
1914 0, 0, PAGE_SIZE);
1915 if (rc == 1) succ_get = 1;
1916 else non_succ_get = 1;
1917 break;
1918 case TMEM_FLUSH_PAGE:
1919 flush = 1;
1920 rc = do_tmem_flush_page(pool, op.u.gen.object, op.u.gen.index);
1921 break;
1922 case TMEM_FLUSH_OBJECT:
1923 rc = do_tmem_flush_object(pool, op.u.gen.object);
1924 flush_obj = 1;
1925 break;
1926 case TMEM_DESTROY_POOL:
1927 flush = 1;
1928 rc = do_tmem_destroy_pool(op.pool_id);
1929 break;
1930 case TMEM_READ:
1931 rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
1932 op.u.gen.tmem_offset, op.u.gen.pfn_offset,
1933 op.u.gen.len);
1934 break;
1935 case TMEM_WRITE:
1936 rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
1937 op.u.gen.tmem_offset, op.u.gen.pfn_offset,
1938 op.u.gen.len);
1939 break;
1940 case TMEM_XCHG:
1941 /* need to hold global lock to ensure xchg is atomic */
1942 printk("tmem_xchg op not implemented yet\n");
1943 rc = 0;
1944 break;
1945 default:
1946 printk("tmem: op %d not implemented\n", op.cmd);
1947 rc = 0;
1948 break;
1951 out:
1952 if ( rc < 0 )
1953 errored_tmem_ops++;
1954 if ( succ_get )
1955 END_CYC_COUNTER_CLI(succ_get,client);
1956 else if ( succ_put )
1957 END_CYC_COUNTER_CLI(succ_put,client);
1958 else if ( non_succ_get )
1959 END_CYC_COUNTER_CLI(non_succ_get,client);
1960 else if ( non_succ_put )
1961 END_CYC_COUNTER_CLI(non_succ_put,client);
1962 else if ( flush )
1963 END_CYC_COUNTER_CLI(flush,client);
1964 else if ( flush_obj )
1965 END_CYC_COUNTER_CLI(flush_obj,client);
1967 if ( tmh_lock_all )
1969 if ( tmh_lock_all > 1 )
1970 spin_unlock_irq(&tmem_spinlock);
1971 else
1972 spin_unlock(&tmem_spinlock);
1973 } else {
1974 if ( tmem_write_lock_set )
1975 write_unlock(&tmem_rwlock);
1976 else if ( tmem_read_lock_set )
1977 read_unlock(&tmem_rwlock);
1978 else
1979 ASSERT(0);
1982 return rc;
1985 /* this should be called when the host is destroying a client */
1986 EXPORT void tmem_destroy(void *v)
1988 client_t *client = (client_t *)v;
1990 if ( client == NULL )
1991 return;
1993 if ( tmh_lock_all )
1994 spin_lock(&tmem_spinlock);
1995 else
1996 write_lock(&tmem_rwlock);
1998 printk("tmem: flushing tmem pools for %s=%d\n",
1999 cli_id_str, client->cli_id);
2000 client_flush(client, 1);
2002 if ( tmh_lock_all )
2003 spin_unlock(&tmem_spinlock);
2004 else
2005 write_unlock(&tmem_rwlock);
2008 /* freezing all pools guarantees that no additional memory will be consumed */
2009 EXPORT void tmem_freeze_all(unsigned char key)
2011 static int freeze = 0;
2013 if ( tmh_lock_all )
2014 spin_lock(&tmem_spinlock);
2015 else
2016 write_lock(&tmem_rwlock);
2018 freeze = !freeze;
2019 tmemc_freeze_pools(CLI_ID_NULL,freeze);
2021 if ( tmh_lock_all )
2022 spin_unlock(&tmem_spinlock);
2023 else
2024 write_unlock(&tmem_rwlock);
2027 #define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
2029 EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2031 pfp_t *pfp;
2032 unsigned long evicts_per_relinq = 0;
2033 int max_evictions = MAX_EVICTS;
2035 if (!tmh_enabled())
2036 return NULL;
2037 #ifdef __i386__
2038 return NULL;
2039 #endif
2041 relinq_attempts++;
2042 if ( order > 0 )
2044 printk("tmem_relinquish_page: failing order=%d\n", order);
2045 return NULL;
2048 if ( tmh_called_from_tmem(memflags) )
2050 if ( tmh_lock_all )
2051 spin_lock(&tmem_spinlock);
2052 else
2053 read_lock(&tmem_rwlock);
2056 while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
2058 if ( (max_evictions-- <= 0) || !tmem_evict())
2059 break;
2060 evicts_per_relinq++;
2062 if ( evicts_per_relinq > max_evicts_per_relinq )
2063 max_evicts_per_relinq = evicts_per_relinq;
2064 tmh_scrub_page(pfp, memflags);
2065 if ( pfp != NULL )
2066 relinq_pgs++;
2068 if ( tmh_called_from_tmem(memflags) )
2070 if ( tmh_lock_all )
2071 spin_unlock(&tmem_spinlock);
2072 else
2073 read_unlock(&tmem_rwlock);
2076 return pfp;
2079 /* called at hypervisor startup */
2080 EXPORT void init_tmem(void)
2082 if ( !tmh_enabled() )
2083 return;
2085 radix_tree_init();
2086 if ( tmh_init() )
2088 printk("tmem: initialized comp=%d global-lock=%d\n",
2089 tmh_compression_enabled(), tmh_lock_all);
2090 tmem_initialized = 1;
2092 else
2093 printk("tmem: initialization FAILED\n");
2096 /*
2097 * Local variables:
2098 * mode: C
2099 * c-set-style: "BSD"
2100 * c-basic-offset: 4
2101 * tab-width: 4
2102 * indent-tabs-mode: nil
2103 * End:
2104 */