debuggers.hg

annotate xen/common/tmem.c @ 20964:a3fa6d444b25

Fix domain reference leaks

Besides two unlikely/rarely hit ones in x86 code, the main offender
was tmh_client_from_cli_id(), which didn't even have a counterpart
(albeit it had a comment correctly saying that it causes d->refcnt to
get incremented). Unfortunately(?) this required a bit of code
restructuring (as I needed to change the code anyway, I also fixed
a couple of missing bounds checks which would sooner or later be
reported as security vulnerabilities), so I would hope Dan could give
it his blessing before it gets applied.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 10 09:18:43 2010 +0000 (2010-02-10)
parents 277bfc2d47b1
children 87f1e5b7660b
rev   line source
keir@19684 1 /******************************************************************************
keir@19684 2 * tmem.c
keir@19684 3 *
keir@19684 4 * Transcendent memory
keir@19684 5 *
keir@19684 6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
keir@19684 7 */
keir@19684 8
keir@19684 9 /* TODO list: 090129
keir@19684 10 - improve on reclamation policy
keir@19684 11 - use different tlsf pools for each client (maybe each pool)
keir@19684 12 - implement page accounting and minimal QoS limits
keir@19684 13 - test shared access more completely (need pv cluster fs)
keir@19684 14 - add feedback-driven compression (not for persistent pools though!)
keir@19684 15 - add data-structure total bytes overhead stats
keir@19684 16 */
keir@19684 17
keir@19684 18 #ifdef __XEN__
keir@19684 19 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
keir@19684 20 #endif
keir@19684 21
keir@19684 22 #include <xen/tmem.h>
keir@19684 23 #include <xen/rbtree.h>
keir@19684 24 #include <xen/radix-tree.h>
keir@19684 25 #include <xen/list.h>
keir@19684 26
keir@19684 27 #define EXPORT /* indicates code other modules are dependent upon */
keir@19684 28 #define FORWARD
keir@19684 29
keir@20067 30 #define TMEM_SPEC_VERSION 0
keir@20067 31
keir@19684 32 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
keir@19684 33
keir@19684 34 #define CLI_ID_NULL TMH_CLI_ID_NULL
keir@19684 35 #define cli_id_str tmh_cli_id_str
keir@19684 36 #define client_str tmh_client_str
keir@19684 37
keir@19684 38 /************ DEBUG and STATISTICS (+ some compression testing) *******/
keir@19684 39
keir@19684 40 #ifndef NDEBUG
keir@19684 41 #define SENTINELS
keir@19684 42 #define NOINLINE noinline
keir@19684 43 #else
keir@19684 44 #define NOINLINE
keir@19684 45 #endif
keir@19684 46
keir@19684 47 #ifdef SENTINELS
keir@19684 48 #define DECL_SENTINEL unsigned long sentinel;
keir@19684 49 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
keir@19684 50 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
keir@19684 51 #define ASSERT_SENTINEL(_x,_y) \
keir@19684 52 ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
keir@19684 53 #ifdef __i386__
keir@19684 54 #define POOL_SENTINEL 0x87658765
keir@19684 55 #define OBJ_SENTINEL 0x12345678
keir@19684 56 #define OBJNODE_SENTINEL 0xfedcba09
keir@19684 57 #define PGD_SENTINEL 0x43214321
keir@19684 58 #else
keir@19684 59 #define POOL_SENTINEL 0x8765876587658765
keir@19684 60 #define OBJ_SENTINEL 0x1234567812345678
keir@19684 61 #define OBJNODE_SENTINEL 0xfedcba0987654321
keir@19684 62 #define PGD_SENTINEL 0x4321432143214321
keir@19684 63 #endif
keir@19684 64 #else
keir@19684 65 #define DECL_SENTINEL
keir@19684 66 #define SET_SENTINEL(_x,_y) do { } while (0)
keir@19684 67 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
keir@19684 68 #define INVERT_SENTINEL(_x,_y) do { } while (0)
keir@19684 69 #endif
keir@19684 70
keir@19684 71 /* global statistics (none need to be locked) */
keir@19684 72 static unsigned long total_tmem_ops = 0;
keir@19684 73 static unsigned long errored_tmem_ops = 0;
keir@19684 74 static unsigned long total_flush_pool = 0;
keir@19684 75 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
keir@19684 76 static unsigned long evicted_pgs = 0, evict_attempts = 0;
keir@19684 77 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
keir@19684 78 static unsigned long max_evicts_per_relinq = 0;
keir@19684 79 static unsigned long low_on_memory = 0;
keir@19684 80 static int global_obj_count_max = 0;
keir@19684 81 static int global_pgp_count_max = 0;
keir@19684 82 static int global_page_count_max = 0;
keir@19684 83 static int global_rtree_node_count_max = 0;
keir@19684 84 static long global_eph_count_max = 0;
keir@19684 85 static unsigned long failed_copies;
keir@19684 86
keir@19684 87 DECL_CYC_COUNTER(succ_get);
keir@19684 88 DECL_CYC_COUNTER(succ_put);
keir@19684 89 DECL_CYC_COUNTER(non_succ_get);
keir@19684 90 DECL_CYC_COUNTER(non_succ_put);
keir@19684 91 DECL_CYC_COUNTER(flush);
keir@19684 92 DECL_CYC_COUNTER(flush_obj);
keir@19684 93 #ifdef COMPARE_COPY_PAGE_SSE2
keir@19684 94 EXTERN_CYC_COUNTER(pg_copy1);
keir@19684 95 EXTERN_CYC_COUNTER(pg_copy2);
keir@19684 96 EXTERN_CYC_COUNTER(pg_copy3);
keir@19684 97 EXTERN_CYC_COUNTER(pg_copy4);
keir@19684 98 #else
keir@19684 99 EXTERN_CYC_COUNTER(pg_copy);
keir@19684 100 #endif
keir@19684 101 DECL_CYC_COUNTER(compress);
keir@19684 102 DECL_CYC_COUNTER(decompress);
keir@19684 103
keir@19684 104 /************ CORE DATA STRUCTURES ************************************/
keir@19684 105
keir@19684 106 #define MAX_POOLS_PER_DOMAIN 16
keir@19684 107 #define MAX_GLOBAL_SHARED_POOLS 16
keir@19684 108
keir@19684 109 struct tm_pool;
keir@20067 110 struct tmem_page_descriptor;
keir@19684 111 struct client {
keir@19684 112 struct list_head client_list;
keir@19684 113 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
keir@19684 114 tmh_client_t *tmh;
keir@19684 115 struct list_head ephemeral_page_list;
keir@19684 116 long eph_count, eph_count_max;
keir@19684 117 cli_id_t cli_id;
keir@19684 118 uint32_t weight;
keir@19684 119 uint32_t cap;
keir@19684 120 bool_t compress;
keir@19684 121 bool_t frozen;
keir@20067 122 bool_t shared_auth_required;
keir@20067 123 /* for save/restore/migration */
keir@20067 124 bool_t live_migrating;
keir@20067 125 bool_t was_frozen;
keir@20067 126 struct list_head persistent_invalidated_list;
keir@20067 127 struct tmem_page_descriptor *cur_pgp;
keir@20067 128 /* statistics collection */
keir@19684 129 unsigned long compress_poor, compress_nomem;
keir@19684 130 unsigned long compressed_pages;
keir@19684 131 uint64_t compressed_sum_size;
keir@19897 132 uint64_t total_cycles;
keir@19897 133 unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
keir@20067 134 /* shared pool authentication */
keir@20067 135 uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
keir@19684 136 };
keir@19684 137 typedef struct client client_t;
keir@19684 138
keir@19684 139 struct share_list {
keir@19684 140 struct list_head share_list;
keir@19684 141 client_t *client;
keir@19684 142 };
keir@19684 143 typedef struct share_list sharelist_t;
keir@19684 144
keir@19684 145 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
keir@19684 146 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
keir@19684 147 #define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
keir@19684 148
keir@19684 149 struct tm_pool {
keir@19684 150 bool_t shared;
keir@19684 151 bool_t persistent;
keir@20535 152 bool_t is_dying;
keir@20067 153 int pageshift; /* 0 == 2**12 */
keir@19684 154 struct list_head pool_list; /* FIXME do we need this anymore? */
keir@19684 155 client_t *client;
keir@19684 156 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
keir@19684 157 uint32_t pool_id;
keir@19684 158 rwlock_t pool_rwlock;
keir@19684 159 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
keir@19684 160 struct list_head share_list; /* valid if shared */
keir@19684 161 int shared_count; /* valid if shared */
keir@20067 162 /* for save/restore/migration */
keir@20067 163 struct list_head persistent_page_list;
keir@20067 164 struct tmem_page_descriptor *cur_pgp;
keir@20067 165 /* statistics collection */
keir@19684 166 atomic_t pgp_count;
keir@19684 167 int pgp_count_max;
keir@19684 168 long obj_count; /* atomicity depends on pool_rwlock held for write */
keir@19684 169 long obj_count_max;
keir@19684 170 unsigned long objnode_count, objnode_count_max;
keir@19684 171 uint64_t sum_life_cycles;
keir@19684 172 uint64_t sum_evicted_cycles;
keir@19684 173 unsigned long puts, good_puts, no_mem_puts;
keir@19684 174 unsigned long dup_puts_flushed, dup_puts_replaced;
keir@19684 175 unsigned long gets, found_gets;
keir@19684 176 unsigned long flushs, flushs_found;
keir@19684 177 unsigned long flush_objs, flush_objs_found;
keir@20067 178 DECL_SENTINEL
keir@19684 179 };
keir@19684 180 typedef struct tm_pool pool_t;
keir@19684 181
keir@19684 182 #define is_persistent(_p) (_p->persistent)
keir@19684 183 #define is_ephemeral(_p) (!(_p->persistent))
keir@19684 184 #define is_shared(_p) (_p->shared)
keir@19684 185 #define is_private(_p) (!(_p->shared))
keir@19684 186
keir@19684 187 struct tmem_object_root {
keir@19684 188 DECL_SENTINEL
keir@19684 189 uint64_t oid;
keir@19684 190 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
keir@19684 191 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
keir@19684 192 long pgp_count; /* atomicity depends on obj_spinlock */
keir@19684 193 struct radix_tree_root tree_root; /* tree of pages within object */
keir@19684 194 pool_t *pool;
keir@19684 195 cli_id_t last_client;
keir@19684 196 spinlock_t obj_spinlock;
keir@19684 197 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
keir@19684 198 };
keir@19684 199 typedef struct tmem_object_root obj_t;
keir@19684 200
keir@19684 201 typedef struct radix_tree_node rtn_t;
keir@19684 202 struct tmem_object_node {
keir@19684 203 obj_t *obj;
keir@19684 204 DECL_SENTINEL
keir@19684 205 rtn_t rtn;
keir@19684 206 };
keir@19684 207 typedef struct tmem_object_node objnode_t;
keir@19684 208
keir@19684 209 struct tmem_page_descriptor {
keir@20067 210 union {
keir@20067 211 struct list_head global_eph_pages;
keir@20067 212 struct list_head client_inv_pages;
keir@20067 213 };
keir@20067 214 union {
keir@20067 215 struct list_head client_eph_pages;
keir@20067 216 struct list_head pool_pers_pages;
keir@20067 217 };
keir@20067 218 union {
keir@20067 219 obj_t *obj;
keir@20067 220 uint64_t inv_oid; /* used for invalid list only */
keir@20067 221 };
keir@19684 222 uint32_t index;
keir@20067 223 size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
keir@20067 224 else compressed data (cdata) */
keir@19684 225 union {
keir@19684 226 pfp_t *pfp; /* page frame pointer */
keir@19684 227 char *cdata; /* compressed data */
keir@19684 228 };
keir@20067 229 union {
keir@20067 230 uint64_t timestamp;
keir@20067 231 uint32_t pool_id; /* used for invalid list only */
keir@20067 232 };
keir@19684 233 DECL_SENTINEL
keir@19684 234 };
keir@19684 235 typedef struct tmem_page_descriptor pgp_t;
keir@19684 236
keir@19684 237 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
keir@19684 238
keir@19684 239 static LIST_HEAD(global_client_list);
keir@19684 240 static LIST_HEAD(global_pool_list);
keir@19684 241
keir@19684 242 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
keir@20067 243 static bool_t global_shared_auth = 0;
keir@19684 244 static atomic_t client_weight_total = ATOMIC_INIT(0);
keir@19684 245 static int tmem_initialized = 0;
keir@19684 246
keir@19684 247 /************ CONCURRENCY ***********************************************/
keir@19684 248
keir@19684 249 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
keir@19684 250 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
keir@19684 251 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
keir@20067 252 static DEFINE_SPINLOCK(pers_lists_spinlock);
keir@19684 253
keir@19684 254 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
keir@19684 255 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
keir@19684 256 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
keir@19684 257 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
keir@19684 258 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
keir@19684 259 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
keir@19684 260 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
keir@19684 261 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
keir@19684 262
keir@19684 263 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
keir@19684 264 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
keir@19684 265
keir@19684 266 /* global counters (should use long_atomic_t access) */
keir@19684 267 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
keir@19684 268 static atomic_t global_obj_count = ATOMIC_INIT(0);
keir@19684 269 static atomic_t global_pgp_count = ATOMIC_INIT(0);
keir@19684 270 static atomic_t global_page_count = ATOMIC_INIT(0);
keir@19684 271 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
keir@19684 272
keir@19684 273 #define atomic_inc_and_max(_c) do { \
keir@19684 274 atomic_inc(&_c); \
keir@19684 275 if ( _atomic_read(_c) > _c##_max ) \
keir@19684 276 _c##_max = _atomic_read(_c); \
keir@19684 277 } while (0)
keir@19684 278
keir@19684 279 #define atomic_dec_and_assert(_c) do { \
keir@19684 280 atomic_dec(&_c); \
keir@19684 281 ASSERT(_atomic_read(_c) >= 0); \
keir@19684 282 } while (0)
keir@19684 283
keir@19684 284
keir@19684 285 /************ MEMORY ALLOCATION INTERFACE *****************************/
keir@19684 286
keir@19684 287 #define tmem_malloc(_type,_pool) \
keir@19684 288 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
keir@19684 289
keir@19684 290 #define tmem_malloc_bytes(_size,_pool) \
keir@19684 291 _tmem_malloc(_size, 1, _pool)
keir@19684 292
keir@19684 293 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
keir@19684 294 {
keir@19684 295 void *v;
keir@19684 296
keir@19684 297 if ( (pool != NULL) && is_persistent(pool) )
keir@19684 298 v = tmh_alloc_subpage_thispool(pool,size,align);
keir@19684 299 else
keir@19684 300 v = tmh_alloc_subpage(pool, size, align);
keir@19684 301 if ( v == NULL )
keir@19684 302 alloc_failed++;
keir@19684 303 return v;
keir@19684 304 }
keir@19684 305
keir@19684 306 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
keir@19684 307 {
keir@19684 308 if ( pool == NULL || !is_persistent(pool) )
keir@19684 309 tmh_free_subpage(p,size);
keir@19684 310 else
keir@19684 311 tmh_free_subpage_thispool(pool,p,size);
keir@19684 312 }
keir@19684 313
keir@19684 314 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
keir@19684 315 {
keir@19684 316 pfp_t *pfp = NULL;
keir@19684 317
keir@19684 318 if ( pool != NULL && is_persistent(pool) )
keir@19684 319 pfp = tmh_alloc_page_thispool(pool);
keir@19684 320 else
keir@19684 321 pfp = tmh_alloc_page(pool,0);
keir@19684 322 if ( pfp == NULL )
keir@19684 323 alloc_page_failed++;
keir@19684 324 else
keir@19684 325 atomic_inc_and_max(global_page_count);
keir@19684 326 return pfp;
keir@19684 327 }
keir@19684 328
keir@19684 329 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
keir@19684 330 {
keir@19684 331 ASSERT(pfp);
keir@19684 332 if ( pool == NULL || !is_persistent(pool) )
keir@19684 333 tmh_free_page(pfp);
keir@19684 334 else
keir@19684 335 tmh_free_page_thispool(pool,pfp);
keir@19684 336 atomic_dec_and_assert(global_page_count);
keir@19684 337 }
keir@19684 338
keir@19684 339 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
keir@19684 340
keir@19684 341 /* allocate a pgp_t and associate it with an object */
keir@19684 342 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
keir@19684 343 {
keir@19684 344 pgp_t *pgp;
keir@19684 345 pool_t *pool;
keir@19684 346
keir@19684 347 ASSERT(obj != NULL);
keir@19684 348 ASSERT(obj->pool != NULL);
keir@19684 349 pool = obj->pool;
keir@19684 350 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
keir@19684 351 return NULL;
keir@19684 352 pgp->obj = obj;
keir@19684 353 INIT_LIST_HEAD(&pgp->global_eph_pages);
keir@19684 354 INIT_LIST_HEAD(&pgp->client_eph_pages);
keir@19684 355 pgp->pfp = NULL;
keir@19684 356 pgp->size = -1;
keir@19684 357 pgp->index = -1;
keir@19684 358 pgp->timestamp = get_cycles();
keir@19684 359 SET_SENTINEL(pgp,PGD);
keir@19684 360 atomic_inc_and_max(global_pgp_count);
keir@19684 361 atomic_inc_and_max(pool->pgp_count);
keir@19684 362 return pgp;
keir@19684 363 }
keir@19684 364
keir@19684 365 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
keir@19684 366 {
keir@19684 367 ASSERT(obj != NULL);
keir@19684 368 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 369 ASSERT_SENTINEL(obj,OBJ);
keir@19684 370 ASSERT(obj->pool != NULL);
keir@19684 371 ASSERT_SENTINEL(obj->pool,POOL);
keir@19684 372 return radix_tree_lookup(&obj->tree_root, index);
keir@19684 373 }
keir@19684 374
keir@19684 375 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
keir@19684 376 {
keir@19684 377 if ( pgp->pfp == NULL )
keir@19684 378 return;
keir@19684 379 if ( !pgp->size )
keir@19684 380 tmem_page_free(pgp->obj->pool,pgp->pfp);
keir@19684 381 else
keir@19684 382 {
keir@19684 383 tmem_free(pgp->cdata,pgp->size,pool);
keir@19684 384 if ( pool != NULL )
keir@19684 385 {
keir@19684 386 pool->client->compressed_pages--;
keir@19684 387 pool->client->compressed_sum_size -= pgp->size;
keir@19684 388 }
keir@19684 389 }
keir@19684 390 pgp->pfp = NULL;
keir@19684 391 pgp->size = -1;
keir@19684 392 }
keir@19684 393
keir@19684 394 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
keir@19684 395 {
keir@19684 396 pool_t *pool = NULL;
keir@19684 397
keir@19684 398 ASSERT_SENTINEL(pgp,PGD);
keir@19684 399 ASSERT(pgp->obj != NULL);
keir@19684 400 ASSERT_SENTINEL(pgp->obj,OBJ);
keir@19684 401 ASSERT_SENTINEL(pgp->obj->pool,POOL);
keir@20067 402 ASSERT(pgp->obj->pool->client != NULL);
keir@19684 403 if ( from_delete )
keir@19684 404 ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
keir@19684 405 ASSERT(pgp->obj->pool != NULL);
keir@19684 406 pool = pgp->obj->pool;
keir@20067 407 if ( is_ephemeral(pool) )
keir@20067 408 {
keir@20067 409 ASSERT(list_empty(&pgp->global_eph_pages));
keir@20067 410 ASSERT(list_empty(&pgp->client_eph_pages));
keir@20067 411 }
keir@19684 412 pgp_free_data(pgp, pool);
keir@20067 413 atomic_dec_and_assert(global_pgp_count);
keir@20067 414 atomic_dec_and_assert(pool->pgp_count);
keir@20067 415 pgp->size = -1;
keir@20067 416 if ( is_persistent(pool) && pool->client->live_migrating )
keir@20067 417 {
keir@20067 418 pgp->inv_oid = pgp->obj->oid;
keir@20067 419 pgp->pool_id = pool->pool_id;
keir@20067 420 return;
keir@20067 421 }
keir@19684 422 INVERT_SENTINEL(pgp,PGD);
keir@19684 423 pgp->obj = NULL;
keir@19684 424 pgp->index = -1;
keir@20067 425 tmem_free(pgp,sizeof(pgp_t),pool);
keir@20067 426 }
keir@20067 427
keir@20067 428 static NOINLINE void pgp_free_from_inv_list(client_t *client, pgp_t *pgp)
keir@20067 429 {
keir@20067 430 pool_t *pool = client->pools[pgp->pool_id];
keir@20067 431
keir@20067 432 ASSERT_SENTINEL(pool,POOL);
keir@20067 433 ASSERT_SENTINEL(pgp,PGD);
keir@20067 434 INVERT_SENTINEL(pgp,PGD);
keir@20067 435 pgp->obj = NULL;
keir@20067 436 pgp->index = -1;
keir@19684 437 tmem_free(pgp,sizeof(pgp_t),pool);
keir@19684 438 }
keir@19684 439
keir@19684 440 /* remove the page from appropriate lists but not from parent object */
keir@19684 441 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
keir@19684 442 {
keir@20067 443 client_t *client;
keir@20067 444
keir@19684 445 ASSERT(pgp != NULL);
keir@19684 446 ASSERT(pgp->obj != NULL);
keir@19684 447 ASSERT(pgp->obj->pool != NULL);
keir@20067 448 client = pgp->obj->pool->client;
keir@20067 449 ASSERT(client != NULL);
keir@19684 450 if ( is_ephemeral(pgp->obj->pool) )
keir@19684 451 {
keir@19684 452 if ( !no_eph_lock )
keir@19684 453 tmem_spin_lock(&eph_lists_spinlock);
keir@19684 454 if ( !list_empty(&pgp->client_eph_pages) )
keir@20067 455 client->eph_count--;
keir@20067 456 ASSERT(client->eph_count >= 0);
keir@19684 457 list_del_init(&pgp->client_eph_pages);
keir@19684 458 if ( !list_empty(&pgp->global_eph_pages) )
keir@19684 459 global_eph_count--;
keir@19684 460 ASSERT(global_eph_count >= 0);
keir@19684 461 list_del_init(&pgp->global_eph_pages);
keir@19684 462 if ( !no_eph_lock )
keir@19684 463 tmem_spin_unlock(&eph_lists_spinlock);
keir@20067 464 } else {
keir@20067 465 if ( client->live_migrating )
keir@20067 466 {
keir@20067 467 tmem_spin_lock(&pers_lists_spinlock);
keir@20067 468 list_add_tail(&pgp->client_inv_pages,
keir@20067 469 &client->persistent_invalidated_list);
keir@20067 470 if ( pgp != pgp->obj->pool->cur_pgp )
keir@20067 471 list_del_init(&pgp->pool_pers_pages);
keir@20067 472 tmem_spin_unlock(&pers_lists_spinlock);
keir@20067 473 } else {
keir@20067 474 tmem_spin_lock(&pers_lists_spinlock);
keir@20067 475 list_del_init(&pgp->pool_pers_pages);
keir@20067 476 tmem_spin_unlock(&pers_lists_spinlock);
keir@20067 477 }
keir@19684 478 }
keir@19684 479 }
keir@19684 480
keir@19684 481 /* remove page from lists (but not from parent object) and free it */
keir@19684 482 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
keir@19684 483 {
keir@19684 484 uint64_t life;
keir@19684 485
keir@19684 486 ASSERT(pgp != NULL);
keir@19684 487 ASSERT(pgp->obj != NULL);
keir@19684 488 ASSERT(pgp->obj->pool != NULL);
keir@19684 489 life = get_cycles() - pgp->timestamp;
keir@19684 490 pgp->obj->pool->sum_life_cycles += life;
keir@19684 491 pgp_delist(pgp, no_eph_lock);
keir@19684 492 pgp_free(pgp,1);
keir@19684 493 }
keir@19684 494
keir@19684 495 /* called only indirectly by radix_tree_destroy */
keir@19684 496 static NOINLINE void pgp_destroy(void *v)
keir@19684 497 {
keir@19684 498 pgp_t *pgp = (pgp_t *)v;
keir@19684 499
keir@19684 500 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
keir@19684 501 pgp_delist(pgp,0);
keir@19684 502 ASSERT(pgp->obj != NULL);
keir@19684 503 pgp->obj->pgp_count--;
keir@19684 504 ASSERT(pgp->obj->pgp_count >= 0);
keir@19684 505 pgp_free(pgp,0);
keir@19684 506 }
keir@19684 507
keir@19684 508 FORWARD static rtn_t *rtn_alloc(void *arg);
keir@19684 509 FORWARD static void rtn_free(rtn_t *rtn);
keir@19684 510
keir@19684 511 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
keir@19684 512 {
keir@19684 513 int ret;
keir@19684 514
keir@19684 515 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 516 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
keir@19684 517 if ( !ret )
keir@19684 518 obj->pgp_count++;
keir@19684 519 return ret;
keir@19684 520 }
keir@19684 521
keir@19684 522 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
keir@19684 523 {
keir@19684 524 pgp_t *pgp;
keir@19684 525
keir@19684 526 ASSERT(obj != NULL);
keir@19684 527 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 528 ASSERT_SENTINEL(obj,OBJ);
keir@19684 529 ASSERT(obj->pool != NULL);
keir@19684 530 ASSERT_SENTINEL(obj->pool,POOL);
keir@19684 531 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
keir@19684 532 if ( pgp != NULL )
keir@19684 533 obj->pgp_count--;
keir@19684 534 ASSERT(obj->pgp_count >= 0);
keir@19684 535
keir@19684 536 return pgp;
keir@19684 537 }
keir@19684 538
keir@19684 539 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
keir@19684 540
keir@19684 541 /* called only indirectly from radix_tree_insert */
keir@19684 542 static NOINLINE rtn_t *rtn_alloc(void *arg)
keir@19684 543 {
keir@19684 544 objnode_t *objnode;
keir@19684 545 obj_t *obj = (obj_t *)arg;
keir@19684 546
keir@19684 547 ASSERT_SENTINEL(obj,OBJ);
keir@19684 548 ASSERT(obj->pool != NULL);
keir@19684 549 ASSERT_SENTINEL(obj->pool,POOL);
keir@19684 550 objnode = tmem_malloc(objnode_t,obj->pool);
keir@19684 551 if (objnode == NULL)
keir@19684 552 return NULL;
keir@19684 553 objnode->obj = obj;
keir@19684 554 SET_SENTINEL(objnode,OBJNODE);
keir@19684 555 memset(&objnode->rtn, 0, sizeof(rtn_t));
keir@19684 556 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
keir@19684 557 obj->pool->objnode_count_max = obj->pool->objnode_count;
keir@19684 558 atomic_inc_and_max(global_rtree_node_count);
keir@19684 559 obj->objnode_count++;
keir@19684 560 return &objnode->rtn;
keir@19684 561 }
keir@19684 562
keir@19684 563 /* called only indirectly from radix_tree_delete/destroy */
keir@19684 564 static void rtn_free(rtn_t *rtn)
keir@19684 565 {
keir@19684 566 pool_t *pool;
keir@19684 567 objnode_t *objnode;
keir@19684 568 int i;
keir@19684 569
keir@19684 570 ASSERT(rtn != NULL);
keir@19684 571 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
keir@19684 572 ASSERT(rtn->slots[i] == NULL);
keir@19684 573 objnode = container_of(rtn,objnode_t,rtn);
keir@19684 574 ASSERT_SENTINEL(objnode,OBJNODE);
keir@19684 575 INVERT_SENTINEL(objnode,OBJNODE);
keir@19684 576 ASSERT(objnode->obj != NULL);
keir@19684 577 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
keir@19684 578 ASSERT_SENTINEL(objnode->obj,OBJ);
keir@19684 579 pool = objnode->obj->pool;
keir@19684 580 ASSERT(pool != NULL);
keir@19684 581 ASSERT_SENTINEL(pool,POOL);
keir@19684 582 pool->objnode_count--;
keir@19684 583 objnode->obj->objnode_count--;
keir@19684 584 objnode->obj = NULL;
keir@19684 585 tmem_free(objnode,sizeof(objnode_t),pool);
keir@19684 586 atomic_dec_and_assert(global_rtree_node_count);
keir@19684 587 }
keir@19684 588
keir@19684 589 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
keir@19684 590
keir@19684 591 /* searches for object==oid in pool, returns locked object if found */
keir@19684 592 static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
keir@19684 593 {
keir@19684 594 struct rb_node *node;
keir@19684 595 obj_t *obj;
keir@19684 596
keir@19684 597 restart_find:
keir@19684 598 tmem_read_lock(&pool->pool_rwlock);
keir@19684 599 node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
keir@19684 600 while ( node )
keir@19684 601 {
keir@19684 602 obj = container_of(node, obj_t, rb_tree_node);
keir@19684 603 if ( obj->oid == oid )
keir@19684 604 {
keir@19684 605 if ( tmh_lock_all )
keir@19684 606 obj->no_evict = 1;
keir@19684 607 else
keir@19684 608 {
keir@19684 609 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
keir@19684 610 {
keir@19684 611 tmem_read_unlock(&pool->pool_rwlock);
keir@19684 612 goto restart_find;
keir@19684 613 }
keir@19684 614 tmem_read_unlock(&pool->pool_rwlock);
keir@19684 615 }
keir@19684 616 return obj;
keir@19684 617 }
keir@19684 618 else if ( oid < obj->oid )
keir@19684 619 node = node->rb_left;
keir@19684 620 else
keir@19684 621 node = node->rb_right;
keir@19684 622 }
keir@19684 623 tmem_read_unlock(&pool->pool_rwlock);
keir@19684 624 return NULL;
keir@19684 625 }
keir@19684 626
keir@19684 627 /* free an object that has no more pgps in it */
keir@19684 628 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
keir@19684 629 {
keir@19684 630 pool_t *pool;
keir@19684 631 uint64_t old_oid;
keir@19684 632
keir@19684 633 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 634 ASSERT(obj != NULL);
keir@19684 635 ASSERT_SENTINEL(obj,OBJ);
keir@19684 636 ASSERT(obj->pgp_count == 0);
keir@19684 637 pool = obj->pool;
keir@19684 638 ASSERT(pool != NULL);
keir@20067 639 ASSERT(pool->client != NULL);
keir@19684 640 ASSERT_WRITELOCK(&pool->pool_rwlock);
keir@19684 641 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
keir@19684 642 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
keir@19684 643 ASSERT((long)obj->objnode_count == 0);
keir@19684 644 ASSERT(obj->tree_root.rnode == NULL);
keir@19684 645 pool->obj_count--;
keir@19684 646 ASSERT(pool->obj_count >= 0);
keir@19684 647 INVERT_SENTINEL(obj,OBJ);
keir@19684 648 obj->pool = NULL;
keir@19684 649 old_oid = obj->oid;
keir@19684 650 obj->oid = -1;
keir@19684 651 obj->last_client = CLI_ID_NULL;
keir@19684 652 atomic_dec_and_assert(global_obj_count);
keir@19684 653 /* use no_rebalance only if all objects are being destroyed anyway */
keir@19684 654 if ( !no_rebalance )
keir@19684 655 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
keir@19684 656 tmem_free(obj,sizeof(obj_t),pool);
keir@19684 657 }
keir@19684 658
keir@19684 659 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
keir@19684 660 {
keir@19684 661 struct rb_node **new, *parent = NULL;
keir@19684 662 obj_t *this;
keir@19684 663
keir@19684 664 new = &(root->rb_node);
keir@19684 665 while ( *new )
keir@19684 666 {
keir@19684 667 this = container_of(*new, obj_t, rb_tree_node);
keir@19684 668 parent = *new;
keir@19684 669 if ( obj->oid < this->oid )
keir@19684 670 new = &((*new)->rb_left);
keir@19684 671 else if ( obj->oid > this->oid )
keir@19684 672 new = &((*new)->rb_right);
keir@19684 673 else
keir@19684 674 return 0;
keir@19684 675 }
keir@19684 676 rb_link_node(&obj->rb_tree_node, parent, new);
keir@19684 677 rb_insert_color(&obj->rb_tree_node, root);
keir@19684 678 return 1;
keir@19684 679 }
keir@19684 680
keir@19684 681 /*
keir@19684 682 * allocate, initialize, and insert an tmem_object_root
keir@19684 683 * (should be called only if find failed)
keir@19684 684 */
keir@19684 685 static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
keir@19684 686 {
keir@19684 687 obj_t *obj;
keir@19684 688
keir@19684 689 ASSERT(pool != NULL);
keir@19684 690 ASSERT_WRITELOCK(&pool->pool_rwlock);
keir@19684 691 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
keir@19684 692 return NULL;
keir@19684 693 pool->obj_count++;
keir@19684 694 if (pool->obj_count > pool->obj_count_max)
keir@19684 695 pool->obj_count_max = pool->obj_count;
keir@19684 696 atomic_inc_and_max(global_obj_count);
keir@19684 697 INIT_RADIX_TREE(&obj->tree_root,0);
keir@19684 698 spin_lock_init(&obj->obj_spinlock);
keir@19684 699 obj->pool = pool;
keir@19684 700 obj->oid = oid;
keir@19684 701 obj->objnode_count = 0;
keir@19684 702 obj->pgp_count = 0;
keir@19684 703 obj->last_client = CLI_ID_NULL;
keir@19684 704 SET_SENTINEL(obj,OBJ);
keir@19684 705 tmem_spin_lock(&obj->obj_spinlock);
keir@19684 706 obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
keir@19684 707 obj->no_evict = 1;
keir@19684 708 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 709 return obj;
keir@19684 710 }
keir@19684 711
keir@19684 712 /* free an object after destroying any pgps in it */
keir@19734 713 static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
keir@19684 714 {
keir@19684 715 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
keir@19684 716 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
keir@19734 717 obj_free(obj,no_rebalance);
keir@19684 718 }
keir@19684 719
/* destroys all objs in a pool, or only if obj->last_client matches cli_id */
static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
{
    struct rb_node *node;
    obj_t *obj;
    int i;

    tmem_write_lock(&pool->pool_rwlock);
    /* mark pool dying so concurrent eviction (tmem_evict) skips it */
    pool->is_dying = 1;
    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
    {
        node = rb_first(&pool->obj_rb_root[i]);
        while ( node != NULL )
        {
            obj = container_of(node, obj_t, rb_tree_node);
            tmem_spin_lock(&obj->obj_spinlock);
            /* advance iterator before obj (and its node) may be freed */
            node = rb_next(node);
            ASSERT(obj->no_evict == 0);
            if ( !selective )
                /* FIXME: should be obj,1 but walking/erasing rbtree is racy */
                obj_destroy(obj,0);
            else if ( obj->last_client == cli_id )
                /* selective: only destroy objs last touched by cli_id */
                obj_destroy(obj,0);
            else
                tmem_spin_unlock(&obj->obj_spinlock);
        }
    }
    tmem_write_unlock(&pool->pool_rwlock);
}
keir@19684 749
keir@19684 750
keir@19684 751 /************ POOL MANIPULATION ROUTINES ******************************/
keir@19684 752
keir@19684 753 static pool_t * pool_alloc(void)
keir@19684 754 {
keir@19684 755 pool_t *pool;
keir@19684 756 int i;
keir@19684 757
keir@20079 758 if ( (pool = tmh_alloc_infra(sizeof(pool_t),__alignof__(pool_t))) == NULL )
keir@19684 759 return NULL;
keir@19684 760 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
keir@19684 761 pool->obj_rb_root[i] = RB_ROOT;
keir@19684 762 INIT_LIST_HEAD(&pool->pool_list);
keir@20067 763 INIT_LIST_HEAD(&pool->persistent_page_list);
keir@20067 764 pool->cur_pgp = NULL;
keir@19684 765 rwlock_init(&pool->pool_rwlock);
keir@19684 766 pool->pgp_count_max = pool->obj_count_max = 0;
keir@19684 767 pool->objnode_count = pool->objnode_count_max = 0;
keir@19684 768 atomic_set(&pool->pgp_count,0);
keir@20067 769 pool->obj_count = 0; pool->shared_count = 0;
keir@20067 770 pool->pageshift = PAGE_SHIFT - 12;
keir@19684 771 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
keir@19684 772 pool->dup_puts_replaced = pool->no_mem_puts = 0;
keir@19684 773 pool->found_gets = pool->gets = 0;
keir@19684 774 pool->flushs_found = pool->flushs = 0;
keir@19684 775 pool->flush_objs_found = pool->flush_objs = 0;
keir@20535 776 pool->is_dying = 0;
keir@19684 777 SET_SENTINEL(pool,POOL);
keir@19684 778 return pool;
keir@19684 779 }
keir@19684 780
static NOINLINE void pool_free(pool_t *pool)
{
    /* caller has already flushed/destroyed all objects in the pool */
    ASSERT_SENTINEL(pool,POOL);
    /* poison the sentinel so any later use of this pool is caught */
    INVERT_SENTINEL(pool,POOL);
    pool->client = NULL;
    /* unlink from the owning client's pool list before freeing */
    list_del(&pool->pool_list);
    tmh_free_infra(pool);
}
keir@19684 789
keir@19684 790 /* register new_client as a user of this shared pool and return new
keir@19684 791 total number of registered users */
keir@19684 792 static int shared_pool_join(pool_t *pool, client_t *new_client)
keir@19684 793 {
keir@19684 794 sharelist_t *sl;
keir@19684 795
keir@19684 796 ASSERT(is_shared(pool));
keir@19684 797 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
keir@19684 798 return -1;
keir@19684 799 sl->client = new_client;
keir@19684 800 list_add_tail(&sl->share_list, &pool->share_list);
keir@19734 801 if ( new_client->cli_id != pool->client->cli_id )
keir@19734 802 printk("adding new %s %d to shared pool owned by %s %d\n",
keir@19734 803 client_str, new_client->cli_id, client_str, pool->client->cli_id);
keir@19684 804 return ++pool->shared_count;
keir@19684 805 }
keir@19684 806
/* reassign "ownership" of the pool to another client that shares this pool */
static NOINLINE void shared_pool_reassign(pool_t *pool)
{
    sharelist_t *sl;
    int poolid;
    client_t *old_client = pool->client, *new_client;

    ASSERT(is_shared(pool));
    if ( list_empty(&pool->share_list) )
    {
        /* no sharers left; nothing to reassign */
        ASSERT(pool->shared_count == 0);
        return;
    }
    old_client->pools[pool->pool_id] = NULL;
    /* the first remaining sharer becomes the new owner */
    sl = list_entry(pool->share_list.next, sharelist_t, share_list);
    ASSERT(sl->client != old_client);
    pool->client = new_client = sl->client;
    /* locate the slot the new owner already uses for this pool */
    for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
        if (new_client->pools[poolid] == pool)
            break;
    ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
    /* transfer ephemeral page accounting and list membership to new owner.
     * NOTE(review): this splices old_client's ENTIRE ephemeral list, not
     * just this pool's pages -- confirm that is intended */
    new_client->eph_count += _atomic_read(pool->pgp_count);
    old_client->eph_count -= _atomic_read(pool->pgp_count);
    list_splice_init(&old_client->ephemeral_page_list,
                     &new_client->ephemeral_page_list);
    printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
        cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
    pool->pool_id = poolid;
}
keir@19684 836
/* destroy all objects with last_client same as passed cli_id,
   remove pool's cli_id from list of sharers of this pool.
   Returns: >0 = number of remaining sharers (pool survives),
            0 = last sharer gone (caller may free pool),
            -1 = cli_id was not a sharer of this pool */
static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
{
    sharelist_t *sl;
    int s_poolid;

    ASSERT(is_shared(pool));
    ASSERT(pool->client != NULL);

    ASSERT_WRITELOCK(&tmem_rwlock);
    /* selectively destroy only objects last touched by the quitting client */
    pool_destroy_objs(pool,1,cli_id);
    list_for_each_entry(sl,&pool->share_list, share_list)
    {
        if (sl->client->cli_id != cli_id)
            continue;
        /* found the quitting client's share entry; drop it */
        list_del(&sl->share_list);
        tmem_free(sl,sizeof(sharelist_t),pool);
        --pool->shared_count;
        /* if the owner is quitting, hand ownership to another sharer */
        if (pool->client->cli_id == cli_id)
            shared_pool_reassign(pool);
        if (pool->shared_count)
            return pool->shared_count;
        /* last sharer gone: remove from the global shared pool table */
        for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
            if ( (global_shared_pools[s_poolid]) == pool )
            {
                global_shared_pools[s_poolid] = NULL;
                break;
            }
        return 0;
    }
    printk("tmem: no match unsharing pool, %s=%d\n",
        cli_id_str,pool->client->cli_id);
    return -1;
}
keir@19684 872
/* flush all data (owned by cli_id) from a pool and, optionally, free it */
static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
{
    ASSERT(pool != NULL);
    /* for shared pools, a positive quit count means other clients still
     * use this pool: only cli_id's data was flushed, pool stays alive */
    if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
    {
        printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
           cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
        return;
    }
    printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
        is_persistent(pool) ? "persistent" : "ephemeral" ,
        is_shared(pool) ? "shared" : "private");
    printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
    /* refuse to alter a pool whose owner is mid live-migration */
    if ( pool->client->live_migrating )
    {
        printk("can't %s pool while %s is live-migrating\n",
               destroy?"destroy":"flush", client_str);
        return;
    }
    /* non-selective: destroy every object in the pool */
    pool_destroy_objs(pool,0,CLI_ID_NULL);
    if ( destroy )
    {
        pool->client->pools[pool->pool_id] = NULL;
        pool_free(pool);
    }
}
keir@19684 900
keir@19684 901 /************ CLIENT MANIPULATION OPERATIONS **************************/
keir@19684 902
keir@20067 903 static client_t *client_create(cli_id_t cli_id)
keir@19684 904 {
keir@20079 905 client_t *client = tmh_alloc_infra(sizeof(client_t),__alignof__(client_t));
keir@20067 906 int i;
keir@19684 907
keir@19684 908 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
keir@19684 909 if ( client == NULL )
keir@19684 910 {
keir@19684 911 printk("failed... out of memory\n");
keir@19684 912 return NULL;
keir@19684 913 }
keir@19684 914 memset(client,0,sizeof(client_t));
keir@20964 915 if ( (client->tmh = tmh_client_init(cli_id)) == NULL )
keir@19684 916 {
keir@19684 917 printk("failed... can't allocate host-dependent part of client\n");
keir@19684 918 if ( client )
keir@20079 919 tmh_free_infra(client);
keir@19684 920 return NULL;
keir@19684 921 }
keir@20964 922 tmh_set_client_from_id(client, client->tmh, cli_id);
keir@19684 923 client->cli_id = cli_id;
keir@19684 924 #ifdef __i386__
keir@19684 925 client->compress = 0;
keir@19684 926 #else
keir@19684 927 client->compress = tmh_compression_enabled();
keir@19684 928 #endif
keir@20067 929 client->shared_auth_required = tmh_shared_auth();
keir@20067 930 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
keir@20067 931 client->shared_auth_uuid[i][0] =
keir@20067 932 client->shared_auth_uuid[i][1] = -1L;
keir@20067 933 client->frozen = 0; client->live_migrating = 0;
keir@20067 934 client->weight = 0; client->cap = 0;
keir@19684 935 list_add_tail(&client->client_list, &global_client_list);
keir@19684 936 INIT_LIST_HEAD(&client->ephemeral_page_list);
keir@20067 937 INIT_LIST_HEAD(&client->persistent_invalidated_list);
keir@20067 938 client->cur_pgp = NULL;
keir@19684 939 client->eph_count = client->eph_count_max = 0;
keir@19897 940 client->total_cycles = 0; client->succ_pers_puts = 0;
keir@19897 941 client->succ_eph_gets = 0; client->succ_pers_gets = 0;
keir@19684 942 printk("ok\n");
keir@19684 943 return client;
keir@19684 944 }
keir@19684 945
static void client_free(client_t *client)
{
    /* unlink from the global client list ... */
    list_del(&client->client_list);
    /* ... tear down host-dependent state ... */
    tmh_client_destroy(client->tmh);
    /* ... then free the client itself */
    tmh_free_infra(client);
}
keir@19684 952
keir@19684 953 /* flush all data from a client and, optionally, free it */
keir@19684 954 static void client_flush(client_t *client, bool_t destroy)
keir@19684 955 {
keir@19684 956 int i;
keir@19684 957 pool_t *pool;
keir@19684 958
keir@19684 959 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
keir@19684 960 {
keir@19684 961 if ( (pool = client->pools[i]) == NULL )
keir@19684 962 continue;
keir@19684 963 pool_flush(pool,client->cli_id,destroy);
keir@19684 964 if ( destroy )
keir@19684 965 client->pools[i] = NULL;
keir@19684 966 }
keir@19684 967 if ( destroy )
keir@19684 968 client_free(client);
keir@19684 969 }
keir@19684 970
static bool_t client_over_quota(client_t *client)
{
    int total = _atomic_read(client_weight_total);

    ASSERT(client != NULL);
    /* quota has no meaning if no weights are configured or the client
     * holds no ephemeral pages */
    if ( (total == 0) || (client->weight == 0) ||
         (client->eph_count == 0) )
        return 0;
    /* NOTE(review): this compares (global pages / client pages) against
     * (total weight / client weight); cross-multiplying, it returns true
     * when the client holds LESS than its weight share, which looks
     * inverted for an "over quota" predicate -- confirm the intended
     * eviction policy before changing */
    return ( ((global_eph_count*100L) / client->eph_count ) >
             ((total*100L) / client->weight) );
}
keir@19684 982
/* set/clear the client's frozen flag (frozen clients get -EFROZEN on puts) */
static void client_freeze(client_t *client, int freeze)
{
    client->frozen = freeze;
}
keir@20067 987
keir@19684 988 /************ MEMORY REVOCATION ROUTINES *******************************/
keir@19684 989
/* evict one ephemeral page to reclaim memory; returns 1 if a page was
 * evicted, 0 if nothing evictable was found.  Candidates are taken from
 * the current client's ephemeral list if it is over quota, otherwise from
 * the global ephemeral (LRU-ordered) list. */
static int tmem_evict(void)
{
    client_t *client = tmh_client_from_current();
    pgp_t *pgp = NULL, *pgp_del;
    obj_t *obj;
    pool_t *pool;
    int ret = 0;
    bool_t hold_pool_rwlock = 0;

    evict_attempts++;
    tmem_spin_lock(&eph_lists_spinlock);
    if ( (client != NULL) && client_over_quota(client) &&
         !list_empty(&client->ephemeral_page_list) )
    {
        /* prefer evicting from the over-quota client's own pages */
        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            /* skip pools being torn down (see pool_destroy_objs) */
            if ( pool->is_dying )
                continue;
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            /* trylock only: never block while holding eph_lists_spinlock */
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                if ( obj->pgp_count > 1 )
                    goto found;
                /* last pgp in obj: need pool write lock to free the obj */
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    } else if ( list_empty(&global_ephemeral_page_list) ) {
        goto out;
    } else {
        /* same scan over the global ephemeral list */
        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            if ( pool->is_dying )
                continue;
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                if ( obj->pgp_count > 1 )
                    goto found;
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    }

    ret = 0;
    goto out;

found:
    /* at this point the obj spinlock is held (or tmh_lock_all is set) */
    ASSERT(pgp != NULL);
    ASSERT_SENTINEL(pgp,PGD);
    obj = pgp->obj;
    ASSERT(obj != NULL);
    ASSERT(obj->no_evict == 0);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj,OBJ);

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    pgp_del = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgp_del == pgp);
    pgp_delete(pgp,1);
    if ( obj->pgp_count == 0 )
    {
        /* obj now empty: free it (requires the pool write lock) */
        ASSERT_WRITELOCK(&pool->pool_rwlock);
        obj_free(obj,0);
    }
    else
        tmem_spin_unlock(&obj->obj_spinlock);
    if ( hold_pool_rwlock )
        tmem_write_unlock(&pool->pool_rwlock);
    evicted_pgs++;
    ret = 1;

out:
    tmem_spin_unlock(&eph_lists_spinlock);
    return ret;
}
keir@19684 1081
/* evict pages until at least n are available, then hand them back to the
 * host allocator; returns the number of pages actually made available */
static unsigned long tmem_relinquish_npages(unsigned long n)
{
    unsigned long avail_pages = 0;

    for ( ; ; )
    {
        avail_pages = tmh_avail_pages();
        if ( avail_pages >= n )
            break;
        /* nothing left to evict: give up with what we have */
        if ( !tmem_evict() )
            break;
    }
    if ( avail_pages )
        tmh_release_avail_pages_to_host();
    return avail_pages;
}
keir@19684 1095
/* Under certain conditions (e.g. if each client is putting pages for exactly
 * one object), once locks are held, freeing up memory may
 * result in livelocks and very long "put" times, so we try to ensure there
 * is a minimum amount of memory (1MB) available BEFORE any data structure
 * locks are held */
static inline void tmem_ensure_avail_pages(void)
{
    int failed_evict = 10;

    /* keep evicting until a free MB exists, tolerating up to 10 failed
     * eviction attempts before giving up */
    while ( !tmh_free_mb() )
        if ( !tmem_evict() && failed_evict-- <= 0 )
            break;
}
keir@20648 1113
keir@19684 1114 /************ TMEM CORE OPERATIONS ************************************/
keir@19684 1115
/* compress the client page and store the result in pool memory.
 * Returns: 1 = stored compressed; 0 = incompressible or too large
 * (caller should store uncompressed); -ENOMEM; -EFAULT = bad client mfn */
static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn,
                                         void *cva)
{
    void *dst, *p;
    size_t size;
    int ret = 0;
    DECL_LOCAL_CYC_COUNTER(compress);

    ASSERT(pgp != NULL);
    ASSERT(pgp->obj != NULL);
    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
    ASSERT(pgp->obj->pool != NULL);
    ASSERT(pgp->obj->pool->client != NULL);
#ifdef __i386__
    /* no compression on 32-bit builds; everything below is dead code there */
    return -ENOMEM;
#endif
    /* a dup put may arrive with old data still attached; drop it first */
    if ( pgp->pfp != NULL )
        pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
    START_CYC_COUNTER(compress);
    ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
    if ( (ret == -EFAULT) || (ret == 0) )
        goto out;
    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
        /* compression did not pay off; tell caller to store uncompressed */
        ret = 0;
    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
        ret = -ENOMEM;
    else
    {
        /* keep a private copy of the compressed data */
        memcpy(p,dst,size);
        pgp->cdata = p;
        pgp->size = size;
        pgp->obj->pool->client->compressed_pages++;
        pgp->obj->pool->client->compressed_sum_size += size;
        ret = 1;
    }

out:
    END_CYC_COUNTER(compress);
    return ret;
}
keir@19684 1156
/* replace the data of an existing pgp (a "dup" put).  Returns 1 on
 * success, -ENOSPC if the old data had to be flushed instead, -EFAULT on
 * a bad client mfn.  Called with the obj spinlock held; releases it. */
static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
{
    pool_t *pool;
    obj_t *obj;
    client_t *client;
    pgp_t *pgpfound = NULL;
    int ret;

    ASSERT(pgp != NULL);
    ASSERT(pgp->pfp != NULL);
    ASSERT(pgp->size != -1);
    obj = pgp->obj;
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    pool = obj->pool;
    ASSERT(pool != NULL);
    client = pool->client;
    if ( client->live_migrating )
        goto failed_dup; /* no dups allowed when migrating */
    /* can we successfully manipulate pgp to change out the data? */
    if ( len != 0 && client->compress && pgp->size != 0 )
    {
        ret = do_tmem_put_compress(pgp,cmfn,cva);
        if ( ret == 1 )
            goto done;
        else if ( ret == 0 )
            goto copy_uncompressed;
        else if ( ret == -ENOMEM )
            goto failed_dup;
        else if ( ret == -EFAULT )
            goto bad_copy;
    }
    /* deliberate fallthrough into copy_uncompressed when len==0 or
     * compression is not applicable */

copy_uncompressed:
    /* discard the old data and allocate a fresh page for the new copy */
    if ( pgp->pfp )
        pgp_free_data(pgp, pool);
    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
        goto failed_dup;
    /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
    if ( ret == -EFAULT )
        goto bad_copy;
    pgp->size = 0;  /* size==0 marks uncompressed data */

done:
    /* successfully replaced data, clean up and return success */
    if ( is_shared(pool) )
        obj->last_client = client->cli_id;
    obj->no_evict = 0;
    tmem_spin_unlock(&obj->obj_spinlock);
    pool->dup_puts_replaced++;
    pool->good_puts++;
    if ( is_persistent(pool) )
        client->succ_pers_puts++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
    ASSERT(0);
    return -EFAULT;

failed_dup:
   /* couldn't change out the data, flush the old data and return
    * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
    pgpfound = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgpfound == pgp);
    pgp_delete(pgpfound,0);
    if ( obj->pgp_count == 0 )
    {
        /* obj now empty: free it under the pool write lock */
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->dup_puts_flushed++;
    return -ENOSPC;
}
keir@19684 1238
keir@19684 1239
keir@20067 1240 static NOINLINE int do_tmem_put(pool_t *pool,
keir@20067 1241 uint64_t oid, uint32_t index,
keir@19684 1242 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
keir@20067 1243 uint32_t pfn_offset, uint32_t len, void *cva)
keir@19684 1244 {
keir@19684 1245 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
keir@19684 1246 pgp_t *pgp = NULL, *pgpdel = NULL;
keir@19684 1247 client_t *client = pool->client;
keir@19684 1248 int ret = client->frozen ? -EFROZEN : -ENOMEM;
keir@19684 1249
keir@19684 1250 ASSERT(pool != NULL);
keir@19684 1251 pool->puts++;
keir@19684 1252 /* does page already exist (dup)? if so, handle specially */
keir@19684 1253 if ( (obj = objfound = obj_find(pool,oid)) != NULL )
keir@19684 1254 {
keir@19684 1255 ASSERT_SPINLOCK(&objfound->obj_spinlock);
keir@19684 1256 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
keir@20067 1257 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva);
keir@19684 1258 }
keir@19684 1259
keir@19684 1260 /* no puts allowed into a frozen pool (except dup puts) */
keir@19684 1261 if ( client->frozen )
keir@19684 1262 goto free;
keir@19684 1263
keir@19684 1264 if ( (objfound == NULL) )
keir@19684 1265 {
keir@19684 1266 tmem_write_lock(&pool->pool_rwlock);
keir@19684 1267 if ( (obj = objnew = obj_new(pool,oid)) == NULL )
keir@19684 1268 {
keir@19684 1269 tmem_write_unlock(&pool->pool_rwlock);
keir@19684 1270 return -ENOMEM;
keir@19684 1271 }
keir@19684 1272 ASSERT_SPINLOCK(&objnew->obj_spinlock);
keir@19684 1273 tmem_write_unlock(&pool->pool_rwlock);
keir@19684 1274 }
keir@19684 1275
keir@19684 1276 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
keir@19684 1277 ASSERT_SPINLOCK(&obj->obj_spinlock);
keir@19684 1278 if ( (pgp = pgp_alloc(obj)) == NULL )
keir@19684 1279 goto free;
keir@19684 1280
keir@19684 1281 ret = pgp_add_to_obj(obj, index, pgp);
keir@19684 1282 if ( ret == -ENOMEM )
keir@19684 1283 /* warning, may result in partially built radix tree ("stump") */
keir@19684 1284 goto free;
keir@19684 1285 ASSERT(ret != -EEXIST);
keir@19684 1286 pgp->index = index;
keir@19684 1287
keir@20067 1288 if ( len != 0 && client->compress )
keir@19684 1289 {
keir@19684 1290 ASSERT(pgp->pfp == NULL);
keir@20067 1291 ret = do_tmem_put_compress(pgp,cmfn,cva);
keir@19684 1292 if ( ret == 1 )
keir@19684 1293 goto insert_page;
keir@19684 1294 if ( ret == -ENOMEM )
keir@19684 1295 {
keir@19684 1296 client->compress_nomem++;
keir@19684 1297 goto delete_and_free;
keir@19684 1298 }
keir@19684 1299 if ( ret == 0 )
keir@19684 1300 {
keir@19684 1301 client->compress_poor++;
keir@19684 1302 goto copy_uncompressed;
keir@19684 1303 }
keir@19684 1304 if ( ret == -EFAULT )
keir@19684 1305 goto bad_copy;
keir@19684 1306 }
keir@19684 1307
keir@19684 1308 copy_uncompressed:
keir@19684 1309 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
keir@19684 1310 {
keir@19684 1311 ret == -ENOMEM;
keir@19684 1312 goto delete_and_free;
keir@19684 1313 }
keir@19684 1314 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
keir@20067 1315 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
keir@19684 1316 if ( ret == -EFAULT )
keir@19684 1317 goto bad_copy;
keir@19684 1318 pgp->size = 0;
keir@19684 1319
keir@19684 1320 insert_page:
keir@19684 1321 if ( is_ephemeral(pool) )
keir@19684 1322 {
keir@19684 1323 tmem_spin_lock(&eph_lists_spinlock);
keir@19684 1324 list_add_tail(&pgp->global_eph_pages,
keir@19684 1325 &global_ephemeral_page_list);
keir@19684 1326 if (++global_eph_count > global_eph_count_max)
keir@19684 1327 global_eph_count_max = global_eph_count;
keir@19684 1328 list_add_tail(&pgp->client_eph_pages,
keir@19684 1329 &client->ephemeral_page_list);
keir@19684 1330 if (++client->eph_count > client->eph_count_max)
keir@19684 1331 client->eph_count_max = client->eph_count;
keir@19684 1332 tmem_spin_unlock(&eph_lists_spinlock);
keir@20067 1333 } else { /* is_persistent */
keir@20067 1334 tmem_spin_lock(&pers_lists_spinlock);
keir@20067 1335 list_add_tail(&pgp->pool_pers_pages,
keir@20067 1336 &pool->persistent_page_list);
keir@20067 1337 tmem_spin_unlock(&pers_lists_spinlock);
keir@19684 1338 }
keir@19684 1339 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
keir@19684 1340 if ( is_shared(pool) )
keir@19684 1341 obj->last_client = client->cli_id;
keir@19684 1342 obj->no_evict = 0;
keir@19684 1343 tmem_spin_unlock(&obj->obj_spinlock);
keir@19684 1344 pool->good_puts++;
keir@19897 1345 if ( is_persistent(pool) )
keir@19897 1346 client->succ_pers_puts++;
keir@19684 1347 return 1;
keir@19684 1348
keir@19684 1349 delete_and_free:
keir@19684 1350 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
keir@19684 1351 pgpdel = pgp_delete_from_obj(obj, pgp->index);
keir@19684 1352 ASSERT(pgp == pgpdel);
keir@19684 1353
keir@19684 1354 free:
keir@19684 1355 if ( pgp )
keir@19684 1356 pgp_delete(pgp,0);
keir@19684 1357 if ( objfound )
keir@19684 1358 {
keir@19684 1359 objfound->no_evict = 0;
keir@19684 1360 tmem_spin_unlock(&objfound->obj_spinlock);
keir@19684 1361 }
keir@19684 1362 if ( objnew )
keir@19684 1363 {
keir@19684 1364 tmem_write_lock(&pool->pool_rwlock);
keir@19684 1365 obj_free(objnew,0);
keir@19684 1366 tmem_write_unlock(&pool->pool_rwlock);
keir@19684 1367 }
keir@19684 1368 pool->no_mem_puts++;
keir@19684 1369 return ret;
keir@19684 1370
keir@19684 1371 bad_copy:
keir@19684 1372 /* this should only happen if the client passed a bad mfn */
keir@19684 1373 failed_copies++;
keir@19684 1374 ASSERT(0);
keir@19684 1375 goto free;
keir@19684 1376 }
keir@19684 1377
/* retrieve the page at (oid,index) into the client frame cmfn.
 * Returns 1 on success, 0 if not found, -EEMPTY if the pool holds no
 * pages at all, -EFAULT on a bad client mfn.  For private ephemeral
 * pools a successful get also removes the page (exclusive get). */
static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
              uint32_t pfn_offset, uint32_t len, void *cva)
{
    obj_t *obj;
    pgp_t *pgp;
    client_t *client = pool->client;
    DECL_LOCAL_CYC_COUNTER(decompress);

    if ( !_atomic_read(pool->pgp_count) )
        return -EEMPTY;

    pool->gets++;
    /* obj_find returns with the obj spinlock held (see ASSERT below) */
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        return 0;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    /* private ephemeral gets are destructive; shared/persistent are not */
    if (is_shared(pool) || is_persistent(pool) )
        pgp = pgp_lookup_in_obj(obj, index);
    else
        pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        return 0;
    }
    ASSERT(pgp->size != -1);
    if ( pgp->size != 0 )
    {
        /* size != 0 means the data is stored compressed */
        START_CYC_COUNTER(decompress);
        if ( tmh_decompress_to_client(cmfn, pgp->cdata,
                                      pgp->size, cva) == -EFAULT )
            goto bad_copy;
        END_CYC_COUNTER(decompress);
    }
    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                 pfn_offset, len, cva) == -EFAULT)
        goto bad_copy;
    if ( is_ephemeral(pool) )
    {
        if ( is_private(pool) )
        {
            /* exclusive get: page already unlinked above, free it now */
            pgp_delete(pgp,0);
            if ( obj->pgp_count == 0 )
            {
                tmem_write_lock(&pool->pool_rwlock);
                obj_free(obj,0);
                obj = NULL;
                tmem_write_unlock(&pool->pool_rwlock);
            }
        } else {
            /* shared ephemeral: move page to the tail (most recently
             * used) of both eviction lists */
            tmem_spin_lock(&eph_lists_spinlock);
            list_del(&pgp->global_eph_pages);
            list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
            list_del(&pgp->client_eph_pages);
            list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
            tmem_spin_unlock(&eph_lists_spinlock);
            ASSERT(obj != NULL);
            obj->last_client = tmh_get_cli_id_from_current();
        }
    }
    if ( obj != NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->found_gets++;
    if ( is_ephemeral(pool) )
        client->succ_eph_gets++;
    else
        client->succ_pers_gets++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
    ASSERT(0);
    return -EFAULT;

}
keir@19684 1460
/* remove the page at (oid,index) if present.  Returns 1 (or -EFROZEN for
 * a frozen client) regardless of whether the page existed. */
static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
{
    obj_t *obj;
    pgp_t *pgp;

    pool->flushs++;
    /* obj_find returns with the obj spinlock held */
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        goto out;
    }
    pgp_delete(pgp,0);
    if ( obj->pgp_count == 0 )
    {
        /* obj now empty: free it under the pool write lock */
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->flushs_found++;

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}
keir@19684 1495
/* remove the whole object oid (and all its pages) if present.  Returns 1
 * (or -EFROZEN for a frozen client) whether or not the object existed. */
static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
{
    obj_t *obj;

    pool->flush_objs++;
    /* obj_find returns with the obj spinlock held */
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    tmem_write_lock(&pool->pool_rwlock);
    obj_destroy(obj,0);
    pool->flush_objs_found++;
    tmem_write_unlock(&pool->pool_rwlock);

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}
keir@19684 1515
keir@19684 1516 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
keir@19684 1517 {
keir@19684 1518 client_t *client = tmh_client_from_current();
keir@19684 1519 pool_t *pool;
keir@19684 1520
keir@19684 1521 if ( client->pools == NULL )
keir@19684 1522 return 0;
keir@19684 1523 if ( (pool = client->pools[pool_id]) == NULL )
keir@19684 1524 return 0;
keir@19684 1525 client->pools[pool_id] = NULL;
keir@19684 1526 pool_flush(pool,client->cli_id,1);
keir@19684 1527 return 1;
keir@19684 1528 }
keir@19684 1529
keir@20067 1530 static NOINLINE int do_tmem_new_pool(cli_id_t this_cli_id,
keir@20964 1531 uint32_t d_poolid, uint32_t flags,
keir@20067 1532 uint64_t uuid_lo, uint64_t uuid_hi)
keir@19684 1533 {
keir@20067 1534 client_t *client;
keir@20067 1535 cli_id_t cli_id;
keir@19684 1536 int persistent = flags & TMEM_POOL_PERSIST;
keir@19684 1537 int shared = flags & TMEM_POOL_SHARED;
keir@19684 1538 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
keir@19684 1539 & TMEM_POOL_PAGESIZE_MASK;
keir@19684 1540 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
keir@19684 1541 & TMEM_POOL_VERSION_MASK;
keir@19684 1542 pool_t *pool, *shpool;
keir@20964 1543 int s_poolid, first_unused_s_poolid;
keir@20067 1544 int i;
keir@19684 1545
keir@20067 1546 if ( this_cli_id == CLI_ID_NULL )
keir@20067 1547 cli_id = tmh_get_cli_id_from_current();
keir@20964 1548 else
keir@20067 1549 cli_id = this_cli_id;
keir@19684 1550 printk("tmem: allocating %s-%s tmem pool for %s=%d...",
keir@19684 1551 persistent ? "persistent" : "ephemeral" ,
keir@19684 1552 shared ? "shared" : "private", cli_id_str, cli_id);
keir@20067 1553 if ( specversion != TMEM_SPEC_VERSION )
keir@19684 1554 {
keir@19684 1555 printk("failed... unsupported spec version\n");
keir@19684 1556 return -EPERM;
keir@19684 1557 }
keir@19684 1558 if ( pagebits != (PAGE_SHIFT - 12) )
keir@19684 1559 {
keir@19684 1560 printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
keir@19684 1561 return -EPERM;
keir@19684 1562 }
keir@19684 1563 if ( (pool = pool_alloc()) == NULL )
keir@19684 1564 {
keir@19684 1565 printk("failed... out of memory\n");
keir@19684 1566 return -ENOMEM;
keir@19684 1567 }
keir@20067 1568 if ( this_cli_id != CLI_ID_NULL )
keir@20067 1569 {
keir@20964 1570 if ( (client = tmh_client_from_cli_id(this_cli_id)) == NULL
keir@20964 1571 || d_poolid >= MAX_POOLS_PER_DOMAIN
keir@20964 1572 || client->pools[d_poolid] != NULL )
keir@20964 1573 goto fail;
keir@20067 1574 }
keir@20964 1575 else
keir@19684 1576 {
keir@20964 1577 client = tmh_client_from_current();
keir@20964 1578 ASSERT(client != NULL);
keir@20964 1579 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
keir@20964 1580 if ( client->pools[d_poolid] == NULL )
keir@20964 1581 break;
keir@20964 1582 if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
keir@20964 1583 {
keir@20964 1584 printk("failed... no more pool slots available for this %s\n",
keir@20964 1585 client_str);
keir@20964 1586 goto fail;
keir@20964 1587 }
keir@19684 1588 }
keir@20067 1589 if ( shared )
keir@20067 1590 {
keir@20067 1591 if ( uuid_lo == -1L && uuid_hi == -1L )
keir@20067 1592 shared = 0;
keir@20067 1593 if ( client->shared_auth_required && !global_shared_auth )
keir@20067 1594 {
keir@20067 1595 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
keir@20067 1596 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
keir@20067 1597 (client->shared_auth_uuid[i][1] == uuid_hi) )
keir@20067 1598 break;
keir@20067 1599 if ( i == MAX_GLOBAL_SHARED_POOLS )
keir@20067 1600 shared = 0;
keir@20067 1601 }
keir@20067 1602 }
keir@19684 1603 pool->shared = shared;
keir@19684 1604 pool->client = client;
keir@19684 1605 if ( shared )
keir@19684 1606 {
keir@19684 1607 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
keir@19684 1608 for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
keir@19684 1609 {
keir@19684 1610 if ( (shpool = global_shared_pools[s_poolid]) != NULL )
keir@19684 1611 {
keir@19684 1612 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
keir@19684 1613 {
keir@19734 1614 printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
keir@19684 1615 uuid_hi, uuid_lo);
keir@19684 1616 printk("pool_id=%d\n",d_poolid);
keir@19684 1617 client->pools[d_poolid] = global_shared_pools[s_poolid];
keir@19684 1618 shared_pool_join(global_shared_pools[s_poolid], client);
keir@19684 1619 pool_free(pool);
keir@20964 1620 if ( this_cli_id != CLI_ID_NULL )
keir@20964 1621 tmh_client_put(client->tmh);
keir@19684 1622 return d_poolid;
keir@19684 1623 }
keir@19684 1624 }
keir@19684 1625 else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
keir@19684 1626 first_unused_s_poolid = s_poolid;
keir@19684 1627 }
keir@19684 1628 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
keir@19684 1629 {
keir@19684 1630 printk("tmem: failed... no global shared pool slots available\n");
keir@19684 1631 goto fail;
keir@19684 1632 }
keir@19684 1633 else
keir@19684 1634 {
keir@19684 1635 INIT_LIST_HEAD(&pool->share_list);
keir@19684 1636 pool->shared_count = 0;
keir@19684 1637 global_shared_pools[first_unused_s_poolid] = pool;
keir@19684 1638 (void)shared_pool_join(pool,client);
keir@19684 1639 }
keir@19684 1640 }
keir@19684 1641 client->pools[d_poolid] = pool;
keir@20964 1642 if ( this_cli_id != CLI_ID_NULL )
keir@20964 1643 tmh_client_put(client->tmh);
keir@19684 1644 list_add_tail(&pool->pool_list, &global_pool_list);
keir@19684 1645 pool->pool_id = d_poolid;
keir@19684 1646 pool->persistent = persistent;
keir@19684 1647 pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
keir@19684 1648 printk("pool_id=%d\n",d_poolid);
keir@19684 1649 return d_poolid;
keir@19684 1650
keir@19684 1651 fail:
keir@19684 1652 pool_free(pool);
keir@20964 1653 if ( this_cli_id != CLI_ID_NULL )
keir@20964 1654 tmh_client_put(client->tmh);
keir@19684 1655 return -EPERM;
keir@19684 1656 }
keir@19684 1657
keir@19684 1658 /************ TMEM CONTROL OPERATIONS ************************************/
keir@19684 1659
keir@19684 1660 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
keir@20067 1661 static int tmemc_freeze_pools(cli_id_t cli_id, int arg)
keir@19684 1662 {
keir@19684 1663 client_t *client;
keir@19684 1664 bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
keir@19684 1665 bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
keir@19684 1666 char *s;
keir@19684 1667
keir@19684 1668 s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
keir@19684 1669 if ( cli_id == CLI_ID_NULL )
keir@19684 1670 {
keir@19684 1671 list_for_each_entry(client,&global_client_list,client_list)
keir@20067 1672 client_freeze(client,freeze);
keir@19734 1673 printk("tmem: all pools %s for all %ss\n",s,client_str);
keir@19684 1674 }
keir@19684 1675 else
keir@19684 1676 {
keir@19684 1677 if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
keir@19684 1678 return -1;
keir@20067 1679 client_freeze(client,freeze);
keir@20964 1680 tmh_client_put(client->tmh);
keir@19684 1681 printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
keir@19684 1682 }
keir@19684 1683 return 0;
keir@19684 1684 }
keir@19684 1685
keir@20067 1686 static int tmemc_flush_mem(cli_id_t cli_id, uint32_t kb)
keir@19684 1687 {
keir@19684 1688 uint32_t npages, flushed_pages, flushed_kb;
keir@19684 1689
keir@19684 1690 if ( cli_id != CLI_ID_NULL )
keir@19684 1691 {
keir@19684 1692 printk("tmem: %s-specific flush not supported yet, use --all\n",
keir@19684 1693 client_str);
keir@19684 1694 return -1;
keir@19684 1695 }
keir@19684 1696 /* convert kb to pages, rounding up if necessary */
keir@19684 1697 npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
keir@19684 1698 flushed_pages = tmem_relinquish_npages(npages);
keir@19684 1699 flushed_kb = flushed_pages << (PAGE_SHIFT-10);
keir@19684 1700 return flushed_kb;
keir@19684 1701 }
keir@19684 1702
keir@19684 1703 /*
keir@19684 1704 * These tmemc_list* routines output lots of stats in a format that is
keir@19684 1705 * intended to be program-parseable, not human-readable. Further, by
keir@19684 1706 * tying each group of stats to a line format indicator (e.g. G= for
keir@19684 1707 * global stats) and each individual stat to a two-letter specifier
keir@19684 1708 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
keir@19684 1709 * global ephemeral pool), it should allow the stats reported to be
keir@19684 1710 * forward and backwards compatible as tmem evolves.
keir@19684 1711 */
keir@19684 1712 #define BSIZE 1024
keir@19684 1713
keir@19684 1714 static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
keir@19684 1715 uint32_t len, bool_t use_long)
keir@19684 1716 {
keir@19684 1717 char info[BSIZE];
keir@19684 1718 int i, n = 0, sum = 0;
keir@19684 1719 pool_t *p;
keir@19684 1720 bool_t s;
keir@19684 1721
keir@19897 1722 n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d,"
keir@19897 1723 "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c",
keir@19897 1724 c->cli_id, c->weight, c->cap, c->compress, c->frozen,
keir@19897 1725 c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets,
keir@19897 1726 use_long ? ',' : '\n');
keir@19684 1727 if (use_long)
keir@19684 1728 n += scnprintf(info+n,BSIZE-n,
keir@19687 1729 "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
keir@19684 1730 c->eph_count, c->eph_count_max,
keir@19687 1731 c->compressed_pages, c->compressed_sum_size,
keir@19684 1732 c->compress_poor, c->compress_nomem);
keir@19684 1733 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
keir@19684 1734 sum += n;
keir@19684 1735 for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
keir@19684 1736 {
keir@19684 1737 if ( (p = c->pools[i]) == NULL )
keir@19684 1738 continue;
keir@19684 1739 s = is_shared(p);
keir@19687 1740 n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
keir@19687 1741 "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
keir@19687 1742 c->cli_id, p->pool_id,
keir@19687 1743 is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
keir@19687 1744 (uint64_t)(s ? p->uuid[0] : 0),
keir@19687 1745 (uint64_t)(s ? p->uuid[1] : 0LL),
keir@19687 1746 use_long ? ',' : '\n');
keir@19684 1747 if (use_long)
keir@19684 1748 n += scnprintf(info+n,BSIZE-n,
keir@19684 1749 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
keir@19684 1750 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
keir@19684 1751 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
keir@19684 1752 _atomic_read(p->pgp_count), p->pgp_count_max,
keir@19684 1753 p->obj_count, p->obj_count_max,
keir@19684 1754 p->objnode_count, p->objnode_count_max,
keir@19684 1755 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
keir@19684 1756 p->no_mem_puts,
keir@19684 1757 p->found_gets, p->gets,
keir@19684 1758 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
keir@19684 1759 if ( sum + n >= len )
keir@19684 1760 return sum;
keir@19684 1761 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
keir@19684 1762 sum += n;
keir@19684 1763 }
keir@19684 1764 return sum;
keir@19684 1765 }
keir@19684 1766
keir@19684 1767 static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
keir@19684 1768 bool_t use_long)
keir@19684 1769 {
keir@19684 1770 char info[BSIZE];
keir@19684 1771 int i, n = 0, sum = 0;
keir@19684 1772 pool_t *p;
keir@19684 1773 sharelist_t *sl;
keir@19684 1774
keir@19684 1775 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
keir@19684 1776 {
keir@19684 1777 if ( (p = global_shared_pools[i]) == NULL )
keir@19684 1778 continue;
keir@19687 1779 n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
keir@19687 1780 i, is_persistent(p) ? 'P' : 'E',
keir@19687 1781 is_shared(p) ? 'S' : 'P',
keir@19687 1782 p->uuid[0], p->uuid[1]);
keir@19684 1783 list_for_each_entry(sl,&p->share_list, share_list)
keir@19684 1784 n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
keir@19684 1785 n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
keir@19684 1786 if (use_long)
keir@19684 1787 n += scnprintf(info+n,BSIZE-n,
keir@19684 1788 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
keir@19684 1789 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
keir@19684 1790 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
keir@19684 1791 _atomic_read(p->pgp_count), p->pgp_count_max,
keir@19684 1792 p->obj_count, p->obj_count_max,
keir@19684 1793 p->objnode_count, p->objnode_count_max,
keir@19684 1794 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
keir@19684 1795 p->no_mem_puts,
keir@19684 1796 p->found_gets, p->gets,
keir@19684 1797 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
keir@19684 1798 if ( sum + n >= len )
keir@19684 1799 return sum;
keir@19684 1800 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
keir@19684 1801 sum += n;
keir@19684 1802 }
keir@19684 1803 return sum;
keir@19684 1804 }
keir@19684 1805
keir@19684 1806 #ifdef TMEM_PERF
/* Emit the global cycle-counter ("T=") line into the guest buffer at
 * offset off, appending at most len bytes.  Returns bytes appended (0
 * if the line would not fit).  use_long is unused here; the parameter
 * exists to match the other tmemc_list_* helpers. */
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                                  bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info+n,BSIZE-n,"T=");
    /* each SCNPRINTF_CYC_COUNTER appends "<tag>:<stats>," */
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    /* n+1 so the terminating NUL is copied as well */
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
keir@19684 1838 #else
keir@19684 1839 #define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
keir@19684 1840 #endif
keir@19684 1841
/* Emit the global statistics ("G=") line into the guest buffer at
 * offset off, appending at most len bytes.  Returns the running byte
 * count.  use_long appends extended global counters.
 * NOTE(review): sum starts at off yet the copy targets off+sum, i.e.
 * 2*off; the only caller (tmemc_list) always passes off==0 so this has
 * no effect today, but a non-zero off would double-offset — verify
 * before adding new callers. */
static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
                              bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = off;

    n += scnprintf(info,BSIZE,"G="
      "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
      "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
      total_tmem_ops, errored_tmem_ops, failed_copies,
      alloc_failed, alloc_page_failed, tmh_avail_pages(),
      low_on_memory, evicted_pgs,
      evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
      total_flush_pool, use_long ? ',' : '\n');
    if (use_long)
        n += scnprintf(info+n,BSIZE-n,
          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
          global_eph_count, global_eph_count_max,
          _atomic_read(global_obj_count), global_obj_count_max,
          _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
          _atomic_read(global_pgp_count), global_pgp_count_max);
    if ( sum + n >= len )
        return sum;
    /* n+1 copies the trailing NUL as well */
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
keir@19684 1869
keir@20067 1870 static int tmemc_list(cli_id_t cli_id, tmem_cli_va_t buf, uint32_t len,
keir@19684 1871 bool_t use_long)
keir@19684 1872 {
keir@19684 1873 client_t *client;
keir@19684 1874 int off = 0;
keir@19684 1875
keir@19684 1876 if ( cli_id == CLI_ID_NULL ) {
keir@19684 1877 off = tmemc_list_global(buf,0,len,use_long);
keir@19684 1878 off += tmemc_list_shared(buf,off,len-off,use_long);
keir@19684 1879 list_for_each_entry(client,&global_client_list,client_list)
keir@19684 1880 off += tmemc_list_client(client, buf, off, len-off, use_long);
keir@19684 1881 off += tmemc_list_global_perf(buf,off,len-off,use_long);
keir@19684 1882 }
keir@19684 1883 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
keir@19684 1884 return -1;
keir@20964 1885 else {
keir@19684 1886 off = tmemc_list_client(client, buf, 0, len, use_long);
keir@20964 1887 tmh_client_put(client->tmh);
keir@20964 1888 }
keir@19684 1889
keir@19684 1890 return 0;
keir@19684 1891 }
keir@19684 1892
keir@19684 1893 static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
keir@19684 1894 {
keir@19684 1895 cli_id_t cli_id = client->cli_id;
keir@19684 1896 uint32_t old_weight;
keir@19684 1897
keir@19684 1898 switch (subop)
keir@19684 1899 {
keir@19684 1900 case TMEMC_SET_WEIGHT:
keir@19684 1901 old_weight = client->weight;
keir@19684 1902 client->weight = arg1;
keir@19684 1903 printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
keir@19684 1904 atomic_sub(old_weight,&client_weight_total);
keir@19684 1905 atomic_add(client->weight,&client_weight_total);
keir@19684 1906 break;
keir@19684 1907 case TMEMC_SET_CAP:
keir@19684 1908 client->cap = arg1;
keir@19684 1909 printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
keir@19684 1910 break;
keir@19684 1911 case TMEMC_SET_COMPRESS:
keir@20067 1912 #ifdef __i386__
keir@20067 1913 return -1;
keir@20067 1914 #endif
keir@19684 1915 client->compress = arg1 ? 1 : 0;
keir@19684 1916 printk("tmem: compression %s for %s=%d\n",
keir@19684 1917 arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
keir@19684 1918 break;
keir@19684 1919 default:
keir@19684 1920 printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
keir@19684 1921 return -1;
keir@19684 1922 }
keir@19684 1923 return 0;
keir@19684 1924 }
keir@19684 1925
keir@20067 1926 static int tmemc_set_var(cli_id_t cli_id, uint32_t subop, uint32_t arg1)
keir@19684 1927 {
keir@19684 1928 client_t *client;
keir@19684 1929
keir@19684 1930 if ( cli_id == CLI_ID_NULL )
keir@19684 1931 list_for_each_entry(client,&global_client_list,client_list)
keir@19684 1932 tmemc_set_var_one(client, subop, arg1);
keir@19684 1933 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
keir@19684 1934 return -1;
keir@19684 1935 else
keir@20964 1936 {
keir@20964 1937 tmemc_set_var_one(client, subop, arg1);
keir@20964 1938 tmh_client_put(client->tmh);
keir@20964 1939 }
keir@19684 1940 return 0;
keir@19684 1941 }
keir@19684 1942
keir@20067 1943 static NOINLINE int tmemc_shared_pool_auth(cli_id_t cli_id, uint64_t uuid_lo,
keir@20067 1944 uint64_t uuid_hi, bool_t auth)
keir@20067 1945 {
keir@20067 1946 client_t *client;
keir@20067 1947 int i, free = -1;
keir@20067 1948
keir@20067 1949 if ( cli_id == CLI_ID_NULL )
keir@20067 1950 {
keir@20067 1951 global_shared_auth = auth;
keir@20067 1952 return 1;
keir@20067 1953 }
keir@20067 1954 client = tmh_client_from_cli_id(cli_id);
keir@20964 1955 if ( client == NULL )
keir@20964 1956 return -EINVAL;
keir@20067 1957 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
keir@20067 1958 {
keir@20067 1959 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
keir@20067 1960 (client->shared_auth_uuid[i][1] == uuid_hi) )
keir@20067 1961 {
keir@20067 1962 if ( auth == 0 )
keir@20067 1963 client->shared_auth_uuid[i][0] =
keir@20067 1964 client->shared_auth_uuid[i][1] = -1L;
keir@20964 1965 tmh_client_put(client->tmh);
keir@20067 1966 return 1;
keir@20067 1967 }
keir@20067 1968 if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) &&
keir@20067 1969 (client->shared_auth_uuid[i][1] == -1L) && (free == -1) )
keir@20067 1970 free = i;
keir@20067 1971 }
keir@20067 1972 if ( auth == 0 )
keir@20964 1973 {
keir@20964 1974 tmh_client_put(client->tmh);
keir@20067 1975 return 0;
keir@20964 1976 }
keir@20067 1977 if ( auth == 1 && free == -1 )
keir@20067 1978 return -ENOMEM;
keir@20067 1979 client->shared_auth_uuid[free][0] = uuid_lo;
keir@20067 1980 client->shared_auth_uuid[free][1] = uuid_hi;
keir@20964 1981 tmh_client_put(client->tmh);
keir@20067 1982 return 1;
keir@20067 1983 }
keir@20067 1984
keir@20067 1985 static NOINLINE int tmemc_save_subop(int cli_id, uint32_t pool_id,
keir@20067 1986 uint32_t subop, tmem_cli_va_t buf, uint32_t arg1)
keir@20067 1987 {
keir@20067 1988 client_t *client = tmh_client_from_cli_id(cli_id);
keir@20964 1989 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
keir@20964 1990 ? NULL : client->pools[pool_id];
keir@20067 1991 uint32_t p;
keir@20067 1992 uint64_t *uuid;
keir@20067 1993 pgp_t *pgp, *pgp2;
keir@20964 1994 int rc = -1;
keir@20067 1995
keir@20067 1996 switch(subop)
keir@20067 1997 {
keir@20067 1998 case TMEMC_SAVE_BEGIN:
keir@20067 1999 if ( client == NULL )
keir@20067 2000 return 0;
keir@20067 2001 for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
keir@20067 2002 if ( client->pools[p] != NULL )
keir@20067 2003 break;
keir@20067 2004 if ( p == MAX_POOLS_PER_DOMAIN )
keir@20964 2005 {
keir@20964 2006 rc = 0;
keir@20964 2007 break;
keir@20964 2008 }
keir@20067 2009 client->was_frozen = client->frozen;
keir@20067 2010 client->frozen = 1;
keir@20067 2011 if ( arg1 != 0 )
keir@20067 2012 client->live_migrating = 1;
keir@20964 2013 rc = 1;
keir@20964 2014 break;
keir@20067 2015 case TMEMC_RESTORE_BEGIN:
keir@20964 2016 if ( client == NULL && (client = client_create(cli_id)) != NULL )
keir@20964 2017 return 1;
keir@20964 2018 break;
keir@20067 2019 case TMEMC_SAVE_GET_VERSION:
keir@20964 2020 rc = TMEM_SPEC_VERSION;
keir@20964 2021 break;
keir@20067 2022 case TMEMC_SAVE_GET_MAXPOOLS:
keir@20964 2023 rc = MAX_POOLS_PER_DOMAIN;
keir@20964 2024 break;
keir@20067 2025 case TMEMC_SAVE_GET_CLIENT_WEIGHT:
keir@20964 2026 rc = client->weight == -1 ? -2 : client->weight;
keir@20964 2027 break;
keir@20067 2028 case TMEMC_SAVE_GET_CLIENT_CAP:
keir@20964 2029 rc = client->cap == -1 ? -2 : client->cap;
keir@20964 2030 break;
keir@20067 2031 case TMEMC_SAVE_GET_CLIENT_FLAGS:
keir@20964 2032 rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
keir@20964 2033 (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
keir@20964 2034 break;
keir@20067 2035 case TMEMC_SAVE_GET_POOL_FLAGS:
keir@20067 2036 if ( pool == NULL )
keir@20964 2037 break;
keir@20964 2038 rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) |
keir@20964 2039 (pool->shared ? TMEM_POOL_SHARED : 0) |
keir@20964 2040 (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT);
keir@20964 2041 break;
keir@20067 2042 case TMEMC_SAVE_GET_POOL_NPAGES:
keir@20067 2043 if ( pool == NULL )
keir@20964 2044 break;
keir@20964 2045 rc = _atomic_read(pool->pgp_count);
keir@20964 2046 break;
keir@20067 2047 case TMEMC_SAVE_GET_POOL_UUID:
keir@20067 2048 if ( pool == NULL )
keir@20964 2049 break;
keir@20067 2050 uuid = (uint64_t *)buf.p;
keir@20067 2051 *uuid++ = pool->uuid[0];
keir@20067 2052 *uuid = pool->uuid[1];
keir@20964 2053 rc = 0;
keir@20067 2054 case TMEMC_SAVE_END:
keir@20067 2055 client->live_migrating = 0;
keir@20067 2056 if ( !list_empty(&client->persistent_invalidated_list) )
keir@20067 2057 list_for_each_entry_safe(pgp,pgp2,
keir@20067 2058 &client->persistent_invalidated_list, client_inv_pages)
keir@20067 2059 pgp_free_from_inv_list(client,pgp);
keir@20067 2060 client->frozen = client->was_frozen;
keir@20964 2061 rc = 0;
keir@20067 2062 }
keir@20964 2063 if ( client )
keir@20964 2064 tmh_client_put(client->tmh);
keir@20964 2065 return rc;
keir@20067 2066 }
keir@20067 2067
keir@20067 2068 static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id,
keir@20067 2069 tmem_cli_va_t buf, uint32_t bufsize)
keir@20067 2070 {
keir@20067 2071 client_t *client = tmh_client_from_cli_id(cli_id);
keir@20964 2072 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
keir@20964 2073 ? NULL : client->pools[pool_id];
keir@20067 2074 pgp_t *pgp;
keir@20067 2075 int ret = 0;
keir@20067 2076 struct tmem_handle *h;
keir@20067 2077 unsigned int pagesize = 1 << (pool->pageshift+12);
keir@20067 2078
keir@20964 2079 if ( pool == NULL || is_ephemeral(pool) )
keir@20964 2080 {
keir@20964 2081 tmh_client_put(client->tmh);
keir@20067 2082 return -1;
keir@20964 2083 }
keir@20067 2084 if ( bufsize < pagesize + sizeof(struct tmem_handle) )
keir@20964 2085 {
keir@20964 2086 tmh_client_put(client->tmh);
keir@20067 2087 return -ENOMEM;
keir@20964 2088 }
keir@20067 2089
keir@20067 2090 tmem_spin_lock(&pers_lists_spinlock);
keir@20067 2091 if ( list_empty(&pool->persistent_page_list) )
keir@20067 2092 {
keir@20067 2093 ret = -1;
keir@20067 2094 goto out;
keir@20067 2095 }
keir@20067 2096 /* note: pool->cur_pgp is the pgp last returned by get_next_page */
keir@20067 2097 if ( pool->cur_pgp == NULL )
keir@20067 2098 {
keir@20067 2099 /* process the first one */
keir@20067 2100 pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
keir@20067 2101 pgp_t,pool_pers_pages);
keir@20067 2102 } else if ( list_is_last(&pool->cur_pgp->pool_pers_pages,
keir@20067 2103 &pool->persistent_page_list) )
keir@20067 2104 {
keir@20067 2105 /* already processed the last one in the list */
keir@20067 2106 ret = -1;
keir@20067 2107 goto out;
keir@20067 2108 }
keir@20067 2109 pgp = list_entry((&pool->cur_pgp->pool_pers_pages)->next,
keir@20067 2110 pgp_t,pool_pers_pages);
keir@20067 2111 pool->cur_pgp = pgp;
keir@20067 2112 h = (struct tmem_handle *)buf.p;
keir@20067 2113 h->oid = pgp->obj->oid;
keir@20067 2114 h->index = pgp->index;
keir@20067 2115 buf.p = (void *)(h+1);
keir@20067 2116 ret = do_tmem_get(pool, h->oid, h->index,0,0,0,pagesize,buf.p);
keir@20067 2117
keir@20067 2118 out:
keir@20067 2119 tmem_spin_unlock(&pers_lists_spinlock);
keir@20964 2120 tmh_client_put(client->tmh);
keir@20067 2121 return ret;
keir@20067 2122 }
keir@20067 2123
/* Report the next page invalidated during live migration (cursor kept
 * in client->cur_pgp) by filling a tmem_handle in the guest buffer.
 * Returns 1 when a handle was written, 0 when the list is exhausted,
 * the client is unknown, or the buffer is too small. */
static NOINLINE int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_t buf,
                        uint32_t bufsize)
{
    client_t *client = tmh_client_from_cli_id(cli_id);
    pgp_t *pgp;
    struct tmem_handle *h;
    int ret = 0;

    /* no reference was taken if lookup failed, so plain return is fine */
    if ( client == NULL )
        return 0;
    if ( bufsize < sizeof(struct tmem_handle) )
    {
        tmh_client_put(client->tmh);
        return 0;
    }
    tmem_spin_lock(&pers_lists_spinlock);
    if ( list_empty(&client->persistent_invalidated_list) )
        goto out;
    if ( client->cur_pgp == NULL )
    {
        /* first call: start at the head of the invalidated list */
        pgp = list_entry((&client->persistent_invalidated_list)->next,
                         pgp_t,client_inv_pages);
        client->cur_pgp = pgp;
    } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
                             &client->persistent_invalidated_list) )
    {
        /* cursor already at the tail: reset it and report exhaustion */
        client->cur_pgp = NULL;
        ret = 0;
        goto out;
    } else {
        /* advance the cursor one entry */
        pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
                         pgp_t,client_inv_pages);
        client->cur_pgp = pgp;
    }
    /* hand the (pool_id, oid, index) triple back to the saving tools */
    h = (struct tmem_handle *)buf.p;
    h->pool_id = pgp->pool_id;
    h->oid = pgp->inv_oid;
    h->index = pgp->index;
    ret = 1;
out:
    tmem_spin_unlock(&pers_lists_spinlock);
    tmh_client_put(client->tmh);
    return ret;
}
keir@20067 2168
keir@20067 2169 static int tmemc_restore_put_page(int cli_id, int pool_id, uint64_t oid,
keir@20067 2170 uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
keir@20067 2171 {
keir@20067 2172 client_t *client = tmh_client_from_cli_id(cli_id);
keir@20964 2173 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
keir@20964 2174 ? NULL : client->pools[pool_id];
keir@20964 2175 int rc = pool ? do_tmem_put(pool,oid,index,0,0,0,bufsize,buf.p) : -1;
keir@20067 2176
keir@20964 2177 if ( client )
keir@20964 2178 tmh_client_put(client->tmh);
keir@20964 2179 return rc;
keir@20067 2180 }
keir@20067 2181
keir@20067 2182 static int tmemc_restore_flush_page(int cli_id, int pool_id, uint64_t oid,
keir@20067 2183 uint32_t index)
keir@20067 2184 {
keir@20067 2185 client_t *client = tmh_client_from_cli_id(cli_id);
keir@20964 2186 pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
keir@20964 2187 ? NULL : client->pools[pool_id];
keir@20964 2188 int rc = pool ? do_tmem_flush_page(pool, oid, index) : -1;
keir@20067 2189
keir@20964 2190 if ( client )
keir@20964 2191 tmh_client_put(client->tmh);
keir@20964 2192 return rc;
keir@20067 2193 }
keir@20067 2194
keir@20067 2195 static NOINLINE int do_tmem_control(struct tmem_op *op)
keir@19684 2196 {
keir@19684 2197 int ret;
keir@20067 2198 uint32_t pool_id = op->pool_id;
keir@20067 2199 uint32_t subop = op->u.ctrl.subop;
keir@19684 2200
keir@19684 2201 if (!tmh_current_is_privileged())
keir@19684 2202 {
keir@19684 2203 /* don't fail... mystery: sometimes dom0 fails here */
keir@19684 2204 /* return -EPERM; */
keir@19684 2205 }
keir@19684 2206 switch(subop)
keir@19684 2207 {
keir@19684 2208 case TMEMC_THAW:
keir@19684 2209 case TMEMC_FREEZE:
keir@19684 2210 case TMEMC_DESTROY:
keir@20067 2211 ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
keir@19684 2212 break;
keir@19684 2213 case TMEMC_FLUSH:
keir@20067 2214 ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
keir@19684 2215 break;
keir@19684 2216 case TMEMC_LIST:
keir@20067 2217 ret = tmemc_list(op->u.ctrl.cli_id,op->u.ctrl.buf,
keir@20067 2218 op->u.ctrl.arg1,op->u.ctrl.arg2);
keir@19684 2219 break;
keir@19684 2220 case TMEMC_SET_WEIGHT:
keir@19684 2221 case TMEMC_SET_CAP:
keir@19684 2222 case TMEMC_SET_COMPRESS:
keir@20067 2223 ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
keir@20067 2224 break;
keir@20079 2225 case TMEMC_QUERY_FREEABLE_MB:
keir@20812 2226 ret = tmh_freeable_pages() >> (20 - PAGE_SHIFT);
keir@20079 2227 break;
keir@20067 2228 case TMEMC_SAVE_BEGIN:
keir@20067 2229 case TMEMC_RESTORE_BEGIN:
keir@20067 2230 case TMEMC_SAVE_GET_VERSION:
keir@20067 2231 case TMEMC_SAVE_GET_MAXPOOLS:
keir@20067 2232 case TMEMC_SAVE_GET_CLIENT_WEIGHT:
keir@20067 2233 case TMEMC_SAVE_GET_CLIENT_CAP:
keir@20067 2234 case TMEMC_SAVE_GET_CLIENT_FLAGS:
keir@20067 2235 case TMEMC_SAVE_GET_POOL_FLAGS:
keir@20067 2236 case TMEMC_SAVE_GET_POOL_NPAGES:
keir@20067 2237 case TMEMC_SAVE_GET_POOL_UUID:
keir@20067 2238 case TMEMC_SAVE_END:
keir@20067 2239 ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
keir@20067 2240 op->u.ctrl.buf,op->u.ctrl.arg1);
keir@20067 2241 break;
keir@20067 2242 case TMEMC_SAVE_GET_NEXT_PAGE:
keir@20067 2243 ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
keir@20067 2244 op->u.ctrl.buf, op->u.ctrl.arg1);
keir@20067 2245 break;
keir@20067 2246 case TMEMC_SAVE_GET_NEXT_INV:
keir@20067 2247 ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, op->u.ctrl.buf,
keir@20067 2248 op->u.ctrl.arg1);
keir@20067 2249 break;
keir@20067 2250 case TMEMC_RESTORE_PUT_PAGE:
keir@20067 2251 ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
keir@20067 2252 op->u.ctrl.arg3, op->u.ctrl.arg2,
keir@20067 2253 op->u.ctrl.buf, op->u.ctrl.arg1);
keir@20067 2254 break;
keir@20067 2255 case TMEMC_RESTORE_FLUSH_PAGE:
keir@20067 2256 ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
keir@20067 2257 op->u.ctrl.arg3, op->u.ctrl.arg2);
keir@19684 2258 break;
keir@19684 2259 default:
keir@19684 2260 ret = -1;
keir@19684 2261 }
keir@19684 2262 return ret;
keir@19684 2263 }
keir@19684 2264
keir@19684 2265 /************ EXPORTed FUNCTIONS **************************************/
keir@19684 2266
/*
 * Main hypercall entry point for all guest tmem operations.
 *
 * Copies a struct tmem_op in from the guest (uops), chooses a global
 * locking discipline (a single irq/non-irq spinlock when tmh_lock_all is
 * set, otherwise the tmem rwlock taken in read or write mode depending on
 * the command), dispatches on op.cmd, then accounts per-operation cycle
 * counters before unlocking.
 *
 * Returns: a command-specific non-negative value on success (e.g. 1 for a
 * successful put/get), or a negative errno-style value on failure
 * (-ENODEV if tmem is uninitialized / the pool doesn't exist, -EFAULT on
 * copy-in failure, -ENOMEM if the per-client structure can't be created).
 */
EXPORT long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    client_t *client = tmh_client_from_current();   /* NULL on first use */
    pool_t *pool = NULL;
    int rc = 0;
    bool_t succ_get = 0, succ_put = 0;
    bool_t non_succ_get = 0, non_succ_put = 0;
    bool_t flush = 0, flush_obj = 0;
    /* Track which rwlock mode we took so the out: path releases the right one. */
    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
    DECL_LOCAL_CYC_COUNTER(succ_get);
    DECL_LOCAL_CYC_COUNTER(succ_put);
    DECL_LOCAL_CYC_COUNTER(non_succ_get);
    DECL_LOCAL_CYC_COUNTER(non_succ_put);
    DECL_LOCAL_CYC_COUNTER(flush);
    DECL_LOCAL_CYC_COUNTER(flush_obj);

    if ( !tmem_initialized )
        return -ENODEV;

    total_tmem_ops++;

    /* Global-lock mode: one big spinlock instead of the rwlock below.
     * tmh_lock_all > 1 additionally disables irqs while held. */
    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_lock_irq(&tmem_spinlock);
        else
            spin_lock(&tmem_spinlock);
    }

    /* Start all counters from one timestamp; only the counter matching the
     * eventual operation outcome is ended (see the out: path). */
    START_CYC_COUNTER(succ_get);
    DUP_START_CYC_COUNTER(succ_put,succ_get);
    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
    DUP_START_CYC_COUNTER(flush,succ_get);
    DUP_START_CYC_COUNTER(flush_obj,succ_get);

    /* Refuse new work for a client whose domain is being torn down. */
    if ( client != NULL && tmh_client_is_dying(client) )
    {
        rc = -ENODEV;
        goto out;
    }

    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
    {
        printk("tmem: can't get tmem struct from %s\n",client_str);
        rc = -EFAULT;
        goto out;
    }

    /* Control-plane commands always take the write lock and return early. */
    if ( op.cmd == TMEM_CONTROL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_control(&op);
        goto out;
    } else if ( op.cmd == TMEM_AUTH ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = tmemc_shared_pool_auth(op.u.new.arg1,op.u.new.uuid[0],
                           op.u.new.uuid[1],op.u.new.flags);
        goto out;
    } else if ( op.cmd == TMEM_RESTORE_NEW ) {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_new_pool(op.u.new.arg1, op.pool_id, op.u.new.flags,
                         op.u.new.uuid[0], op.u.new.uuid[1]);
        goto out;
    }

    /* create per-client tmem structure dynamically on first use by client */
    if ( client == NULL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL )
        {
            printk("tmem: can't create tmem structure for %s\n",client_str);
            rc = -ENOMEM;
            goto out;
        }
    }

    /* Pool create/destroy mutate client->pools[] and need the write lock;
     * all other data-path ops run under the read lock. */
    if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
    {
        if ( !tmem_write_lock_set )
        {
            tmem_write_lock(&tmem_rwlock);
            tmem_write_lock_set = 1;
        }
    }
    else
    {
        if ( !tmem_write_lock_set )
        {
            tmem_read_lock(&tmem_rwlock);
            tmem_read_lock_set = 1;
        }
        /* Bounds-check the guest-supplied pool id before indexing. */
        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
             ((pool = client->pools[op.pool_id]) == NULL) )
        {
            rc = -ENODEV;
            printk("tmem: operation requested on uncreated pool\n");
            goto out;
        }
        ASSERT_SENTINEL(pool,POOL);
    }

    switch ( op.cmd )
    {
    case TMEM_NEW_POOL:
        rc = do_tmem_new_pool(CLI_ID_NULL, 0, op.u.new.flags,
                              op.u.new.uuid[0], op.u.new.uuid[1]);
        break;
    case TMEM_NEW_PAGE:
        tmem_ensure_avail_pages();
        /* len == 0 distinguishes "reserve the slot" from a real data put. */
        rc = do_tmem_put(pool, op.u.gen.object,
                         op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
        break;
    case TMEM_PUT_PAGE:
        tmem_ensure_avail_pages();
        rc = do_tmem_put(pool, op.u.gen.object,
                    op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
        if (rc == 1) succ_put = 1;
        else non_succ_put = 1;
        break;
    case TMEM_GET_PAGE:
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, PAGE_SIZE, 0);
        if (rc == 1) succ_get = 1;
        else non_succ_get = 1;
        break;
    case TMEM_FLUSH_PAGE:
        flush = 1;
        rc = do_tmem_flush_page(pool, op.u.gen.object, op.u.gen.index);
        break;
    case TMEM_FLUSH_OBJECT:
        rc = do_tmem_flush_object(pool, op.u.gen.object);
        flush_obj = 1;
        break;
    case TMEM_DESTROY_POOL:
        flush = 1;
        rc = do_tmem_destroy_pool(op.pool_id);
        break;
    case TMEM_READ:
        /* Sub-page read: offsets and length come from the guest op. */
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len,0);
        break;
    case TMEM_WRITE:
        rc = do_tmem_put(pool, op.u.gen.object,
                         op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len, NULL);
        break;
    case TMEM_XCHG:
        /* need to hold global lock to ensure xchg is atomic */
        printk("tmem_xchg op not implemented yet\n");
        rc = 0;
        break;
    default:
        printk("tmem: op %d not implemented\n", op.cmd);
        rc = 0;
        break;
    }

out:
    if ( rc < 0 )
        errored_tmem_ops++;
    /* End exactly the one counter matching what this call turned out to be. */
    if ( succ_get )
        END_CYC_COUNTER_CLI(succ_get,client);
    else if ( succ_put )
        END_CYC_COUNTER_CLI(succ_put,client);
    else if ( non_succ_get )
        END_CYC_COUNTER_CLI(non_succ_get,client);
    else if ( non_succ_put )
        END_CYC_COUNTER_CLI(non_succ_put,client);
    else if ( flush )
        END_CYC_COUNTER_CLI(flush,client);
    else if ( flush_obj )
        END_CYC_COUNTER_CLI(flush_obj,client);

    /* Release whichever lock was taken above; exactly one mode must be set. */
    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_unlock_irq(&tmem_spinlock);
        else
            spin_unlock(&tmem_spinlock);
    } else {
        if ( tmem_write_lock_set )
            write_unlock(&tmem_rwlock);
        else if ( tmem_read_lock_set )
            read_unlock(&tmem_rwlock);
        else
            ASSERT(0);
    }

    return rc;
}
keir@19684 2466
keir@19684 2467 /* this should be called when the host is destroying a client */
keir@19684 2468 EXPORT void tmem_destroy(void *v)
keir@19684 2469 {
keir@19684 2470 client_t *client = (client_t *)v;
keir@19684 2471
keir@19724 2472 if ( client == NULL )
keir@19724 2473 return;
keir@19724 2474
keir@20499 2475 if ( !tmh_client_is_dying(client) )
keir@20499 2476 {
keir@20499 2477 printk("tmem: tmem_destroy can only destroy dying client\n");
keir@20499 2478 return;
keir@20499 2479 }
keir@20499 2480
keir@19684 2481 if ( tmh_lock_all )
keir@19684 2482 spin_lock(&tmem_spinlock);
keir@19684 2483 else
keir@19684 2484 write_lock(&tmem_rwlock);
keir@19684 2485
keir@19724 2486 printk("tmem: flushing tmem pools for %s=%d\n",
keir@19724 2487 cli_id_str, client->cli_id);
keir@19724 2488 client_flush(client, 1);
keir@19684 2489
keir@19684 2490 if ( tmh_lock_all )
keir@19684 2491 spin_unlock(&tmem_spinlock);
keir@19684 2492 else
keir@19684 2493 write_unlock(&tmem_rwlock);
keir@19684 2494 }
keir@19684 2495
keir@19684 2496 /* freezing all pools guarantees that no additional memory will be consumed */
keir@19684 2497 EXPORT void tmem_freeze_all(unsigned char key)
keir@19684 2498 {
keir@19684 2499 static int freeze = 0;
keir@19684 2500
keir@19684 2501 if ( tmh_lock_all )
keir@19684 2502 spin_lock(&tmem_spinlock);
keir@19684 2503 else
keir@19684 2504 write_lock(&tmem_rwlock);
keir@19684 2505
keir@19684 2506 freeze = !freeze;
keir@19684 2507 tmemc_freeze_pools(CLI_ID_NULL,freeze);
keir@19684 2508
keir@19684 2509 if ( tmh_lock_all )
keir@19684 2510 spin_unlock(&tmem_spinlock);
keir@19684 2511 else
keir@19684 2512 write_unlock(&tmem_rwlock);
keir@19684 2513 }
keir@19684 2514
keir@19684 2515 #define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
keir@19684 2516
keir@19684 2517 EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
keir@19684 2518 {
keir@19684 2519 pfp_t *pfp;
keir@19684 2520 unsigned long evicts_per_relinq = 0;
keir@19684 2521 int max_evictions = 10;
keir@19684 2522
keir@20812 2523 if (!tmh_enabled() || !tmh_freeable_pages())
keir@19684 2524 return NULL;
keir@19684 2525 #ifdef __i386__
keir@19684 2526 return NULL;
keir@19684 2527 #endif
keir@19684 2528
keir@19684 2529 relinq_attempts++;
keir@19684 2530 if ( order > 0 )
keir@19684 2531 {
keir@19684 2532 printk("tmem_relinquish_page: failing order=%d\n", order);
keir@19684 2533 return NULL;
keir@19684 2534 }
keir@19684 2535
keir@19684 2536 if ( tmh_called_from_tmem(memflags) )
keir@19684 2537 {
keir@19684 2538 if ( tmh_lock_all )
keir@19684 2539 spin_lock(&tmem_spinlock);
keir@19684 2540 else
keir@19684 2541 read_lock(&tmem_rwlock);
keir@19684 2542 }
keir@19684 2543
keir@19684 2544 while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
keir@19684 2545 {
keir@19684 2546 if ( (max_evictions-- <= 0) || !tmem_evict())
keir@19684 2547 break;
keir@19684 2548 evicts_per_relinq++;
keir@19684 2549 }
keir@19684 2550 if ( evicts_per_relinq > max_evicts_per_relinq )
keir@19684 2551 max_evicts_per_relinq = evicts_per_relinq;
keir@19684 2552 tmh_scrub_page(pfp, memflags);
keir@19684 2553 if ( pfp != NULL )
keir@19684 2554 relinq_pgs++;
keir@19684 2555
keir@19684 2556 if ( tmh_called_from_tmem(memflags) )
keir@19684 2557 {
keir@19684 2558 if ( tmh_lock_all )
keir@19684 2559 spin_unlock(&tmem_spinlock);
keir@19684 2560 else
keir@19684 2561 read_unlock(&tmem_rwlock);
keir@19684 2562 }
keir@19684 2563
keir@19684 2564 return pfp;
keir@19684 2565 }
keir@19684 2566
keir@19684 2567 /* called at hypervisor startup */
keir@19684 2568 EXPORT void init_tmem(void)
keir@19684 2569 {
keir@19684 2570 if ( !tmh_enabled() )
keir@19684 2571 return;
keir@19684 2572
keir@19684 2573 radix_tree_init();
keir@19684 2574 if ( tmh_init() )
keir@19684 2575 {
keir@19684 2576 printk("tmem: initialized comp=%d global-lock=%d\n",
keir@19684 2577 tmh_compression_enabled(), tmh_lock_all);
keir@19684 2578 tmem_initialized = 1;
keir@19684 2579 }
keir@19684 2580 else
keir@19684 2581 printk("tmem: initialization FAILED\n");
keir@19684 2582 }
keir@19684 2583
keir@19684 2584 /*
keir@19684 2585 * Local variables:
keir@19684 2586 * mode: C
keir@19684 2587 * c-set-style: "BSD"
keir@19684 2588 * c-basic-offset: 4
keir@19684 2589 * tab-width: 4
keir@19684 2590 * indent-tabs-mode: nil
keir@19684 2591 * End:
keir@19684 2592 */