Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/common/tmem.c

Every instrumented line in this file has an execution count of 0; the file is entirely uncovered.

/******************************************************************************
 * tmem.c
 *
 * Transcendent memory
 *
 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
 */

/* TODO list: 090129 (updated 100318)
   - any better reclamation policy?
   - use different tlsf pools for each client (maybe each pool)
   - test shared access more completely (ocfs2)
   - add feedback-driven compression (not for persistent pools though!)
   - add data-structure total bytes overhead stats
 */

#ifdef __XEN__
#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here. */
#endif

#include <public/sysctl.h>
#include <xen/tmem.h>
#include <xen/rbtree.h>
#include <xen/radix-tree.h>
#include <xen/list.h>
#include <xen/init.h>

#define TMEM_SPEC_VERSION 1

struct tmem_statistics tmem_stats = {
    .global_obj_count = ATOMIC_INIT(0),
    .global_pgp_count = ATOMIC_INIT(0),
    .global_pcd_count = ATOMIC_INIT(0),
    .global_page_count = ATOMIC_INIT(0),
    .global_rtree_node_count = ATOMIC_INIT(0),
};

/************ CORE DATA STRUCTURES ************************************/

struct tmem_object_root {
    struct xen_tmem_oid oid;
    struct rb_node rb_tree_node; /* Protected by pool->pool_rwlock. */
    unsigned long objnode_count; /* Atomicity depends on obj_spinlock. */
    long pgp_count; /* Atomicity depends on obj_spinlock. */
    struct radix_tree_root tree_root; /* Tree of pages within object. */
    struct tmem_pool *pool;
    domid_t last_client;
    spinlock_t obj_spinlock;
};

struct tmem_object_node {
    struct tmem_object_root *obj;
    struct radix_tree_node rtn;
};

struct tmem_page_descriptor {
    union {
        struct list_head global_eph_pages;
        struct list_head client_inv_pages;
    };
    union {
        struct {
            union {
                struct list_head client_eph_pages;
                struct list_head pool_pers_pages;
            };
            struct tmem_object_root *obj;
        } us;
        struct xen_tmem_oid inv_oid;  /* Used for invalid list only. */
    };
    pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
                    else compressed data (cdata). */
    uint32_t index;
    bool eviction_attempted;  /* CHANGE TO lifetimes? (settable). */
    union {
        struct page_info *pfp;  /* Page frame pointer. */
        char *cdata; /* Compressed data. */
        struct tmem_page_content_descriptor *pcd; /* Page dedup. */
    };
    union {
        uint64_t timestamp;
        uint32_t pool_id;  /* Used for invalid list only. */
    };
};

#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))

struct tmem_page_content_descriptor {
    union {
        struct page_info *pfp;  /* Page frame pointer. */
        char *cdata; /* If compression_enabled. */
    };
    pagesize_t size; /* If compression_enabled -> 0<size<PAGE_SIZE (*cdata)
                     * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
                     * else PAGE_SIZE -> *pfp. */
};

static int tmem_initialized = 0;

struct xmem_pool *tmem_mempool = 0;
unsigned int tmem_mempool_maxalloc = 0;

DEFINE_SPINLOCK(tmem_page_list_lock);
PAGE_LIST_HEAD(tmem_page_list);
unsigned long tmem_page_list_pages = 0;

DEFINE_RWLOCK(tmem_rwlock);
static DEFINE_SPINLOCK(eph_lists_spinlock); /* Protects global AND clients. */
static DEFINE_SPINLOCK(pers_lists_spinlock);

#define ASSERT_SPINLOCK(_l) ASSERT(spin_is_locked(_l))
#define ASSERT_WRITELOCK(_l) ASSERT(rw_is_write_locked(_l))

    atomic_t client_weight_total;

struct tmem_global tmem_global = {
    .ephemeral_page_list = LIST_HEAD_INIT(tmem_global.ephemeral_page_list),
    .client_list = LIST_HEAD_INIT(tmem_global.client_list),
    .client_weight_total = ATOMIC_INIT(0),
};

/*
 * There are two types of memory allocation interfaces in tmem.
 * One is based on xmem_pool and the other is used to allocate a whole page.
 * Both of them are based on the low-level function __tmem_alloc_page/_thispool().
 * The call trace of the alloc path is as below.
 * Persistent pool:
 *     1.tmem_malloc()
 *         > xmem_pool_alloc()
 *             > tmem_persistent_pool_page_get()
 *                 > __tmem_alloc_page_thispool()
 *     2.tmem_alloc_page()
 *         > __tmem_alloc_page_thispool()
 *
 * Ephemeral pool:
 *     1.tmem_malloc()
 *         > xmem_pool_alloc()
 *             > tmem_mempool_page_get()
 *                 > __tmem_alloc_page()
 *     2.tmem_alloc_page()
 *         > __tmem_alloc_page()
 *
 * The free path is done in the same manner.
 */
static void *tmem_malloc(size_t size, struct tmem_pool *pool)
{
    void *v = NULL;

    if ( (pool != NULL) && is_persistent(pool) ) {
        if ( pool->client->persistent_pool )
            v = xmem_pool_alloc(size, pool->client->persistent_pool);
    }
    else
    {
        ASSERT( size < tmem_mempool_maxalloc );
        ASSERT( tmem_mempool != NULL );
        v = xmem_pool_alloc(size, tmem_mempool);
    }
    if ( v == NULL )
        tmem_stats.alloc_failed++;
    return v;
}

static void tmem_free(void *p, struct tmem_pool *pool)
{
    if ( pool == NULL || !is_persistent(pool) )
    {
        ASSERT( tmem_mempool != NULL );
        xmem_pool_free(p, tmem_mempool);
    }
    else
    {
        ASSERT( pool->client->persistent_pool != NULL );
        xmem_pool_free(p, pool->client->persistent_pool);
    }
}
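
/*
 * (Editorial sketch -- not part of the original file.)  A minimal
 * illustration of how the two interfaces described above pair up:
 * sub-page metadata goes through tmem_malloc()/tmem_free() with the
 * pool that owns it, while whole pages go through tmem_alloc_page()/
 * tmem_free_page() below.  The same pool pointer (or NULL, selecting
 * the global ephemeral tmem_mempool) must be passed to the matching
 * free.  "struct my_meta" is a hypothetical caller-side type.
 *
 *     struct my_meta *m = tmem_malloc(sizeof(*m), pool);
 *     if ( m != NULL )
 *     {
 *         ...use m...
 *         tmem_free(m, pool);   // same pool as at allocation
 *     }
 */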

static struct page_info *tmem_alloc_page(struct tmem_pool *pool)
{
    struct page_info *pfp = NULL;

    if ( pool != NULL && is_persistent(pool) )
        pfp = __tmem_alloc_page_thispool(pool->client->domain);
    else
        pfp = __tmem_alloc_page();
    if ( pfp == NULL )
        tmem_stats.alloc_page_failed++;
    else
        atomic_inc_and_max(global_page_count);
    return pfp;
}

static void tmem_free_page(struct tmem_pool *pool, struct page_info *pfp)
{
    ASSERT(pfp);
    if ( pool == NULL || !is_persistent(pool) )
        __tmem_free_page(pfp);
    else
        __tmem_free_page_thispool(pfp);
    atomic_dec_and_assert(global_page_count);
}

static void *tmem_mempool_page_get(unsigned long size)
{
    struct page_info *pi;

    ASSERT(size == PAGE_SIZE);
    if ( (pi = __tmem_alloc_page()) == NULL )
        return NULL;
    return page_to_virt(pi);
}

static void tmem_mempool_page_put(void *page_va)
{
    ASSERT(IS_PAGE_ALIGNED(page_va));
    __tmem_free_page(virt_to_page(page_va));
}

static int __init tmem_mempool_init(void)
{
    tmem_mempool = xmem_pool_create("tmem", tmem_mempool_page_get,
        tmem_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
    if ( tmem_mempool )
        tmem_mempool_maxalloc = xmem_pool_maxalloc(tmem_mempool);
    return tmem_mempool != NULL;
}

/* Persistent pools are per-domain. */
static void *tmem_persistent_pool_page_get(unsigned long size)
{
    struct page_info *pi;
    struct domain *d = current->domain;

    ASSERT(size == PAGE_SIZE);
    if ( (pi = __tmem_alloc_page_thispool(d)) == NULL )
        return NULL;
    ASSERT(IS_VALID_PAGE(pi));
    return page_to_virt(pi);
}

static void tmem_persistent_pool_page_put(void *page_va)
{
    struct page_info *pi;

    ASSERT(IS_PAGE_ALIGNED(page_va));
    pi = mfn_to_page(virt_to_mfn(page_va));
    ASSERT(IS_VALID_PAGE(pi));
    __tmem_free_page_thispool(pi);
}

/*
 * Page content descriptor manipulation routines.
 */
#define NOT_SHAREABLE ((uint16_t)-1UL)

/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/

/* Allocate a struct tmem_page_descriptor and associate it with an object. */
static struct tmem_page_descriptor *pgp_alloc(struct tmem_object_root *obj)
{
    struct tmem_page_descriptor *pgp;
    struct tmem_pool *pool;

    ASSERT(obj != NULL);
    ASSERT(obj->pool != NULL);
    pool = obj->pool;
    if ( (pgp = tmem_malloc(sizeof(struct tmem_page_descriptor), pool)) == NULL )
        return NULL;
    pgp->us.obj = obj;
    INIT_LIST_HEAD(&pgp->global_eph_pages);
    INIT_LIST_HEAD(&pgp->us.client_eph_pages);
    pgp->pfp = NULL;
    pgp->size = -1;
    pgp->index = -1;
    pgp->timestamp = get_cycles();
    atomic_inc_and_max(global_pgp_count);
    atomic_inc(&pool->pgp_count);
    if ( _atomic_read(pool->pgp_count) > pool->pgp_count_max )
        pool->pgp_count_max = _atomic_read(pool->pgp_count);
    return pgp;
}

static struct tmem_page_descriptor *pgp_lookup_in_obj(struct tmem_object_root *obj, uint32_t index)
{
    ASSERT(obj != NULL);
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj->pool != NULL);
    return radix_tree_lookup(&obj->tree_root, index);
}

static void pgp_free_data(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
{
    pagesize_t pgp_size = pgp->size;

    if ( pgp->pfp == NULL )
        return;
    if ( pgp_size )
        tmem_free(pgp->cdata, pool);
    else
        tmem_free_page(pgp->us.obj->pool,pgp->pfp);
    if ( pool != NULL && pgp_size )
    {
        pool->client->compressed_pages--;
        pool->client->compressed_sum_size -= pgp_size;
    }
    pgp->pfp = NULL;
    pgp->size = -1;
}

static void __pgp_free(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
{
    pgp->us.obj = NULL;
    pgp->index = -1;
    tmem_free(pgp, pool);
}

static void pgp_free(struct tmem_page_descriptor *pgp)
{
    struct tmem_pool *pool = NULL;

    ASSERT(pgp->us.obj != NULL);
    ASSERT(pgp->us.obj->pool != NULL);
    ASSERT(pgp->us.obj->pool->client != NULL);

    pool = pgp->us.obj->pool;
    if ( !is_persistent(pool) )
    {
        ASSERT(list_empty(&pgp->global_eph_pages));
        ASSERT(list_empty(&pgp->us.client_eph_pages));
    }
    pgp_free_data(pgp, pool);
    atomic_dec_and_assert(global_pgp_count);
    atomic_dec(&pool->pgp_count);
    ASSERT(_atomic_read(pool->pgp_count) >= 0);
    pgp->size = -1;
    if ( is_persistent(pool) && pool->client->info.flags.u.migrating )
    {
        pgp->inv_oid = pgp->us.obj->oid;
        pgp->pool_id = pool->pool_id;
        return;
    }
    __pgp_free(pgp, pool);
}

/* Remove pgp from global/pool/client lists and free it. */
static void pgp_delist_free(struct tmem_page_descriptor *pgp)
{
    struct client *client;
    uint64_t life;

    ASSERT(pgp != NULL);
    ASSERT(pgp->us.obj != NULL);
    ASSERT(pgp->us.obj->pool != NULL);
    client = pgp->us.obj->pool->client;
    ASSERT(client != NULL);

    /* Delist pgp. */
    if ( !is_persistent(pgp->us.obj->pool) )
    {
        spin_lock(&eph_lists_spinlock);
        if ( !list_empty(&pgp->us.client_eph_pages) )
            client->eph_count--;
        ASSERT(client->eph_count >= 0);
        list_del_init(&pgp->us.client_eph_pages);
        if ( !list_empty(&pgp->global_eph_pages) )
            tmem_global.eph_count--;
        ASSERT(tmem_global.eph_count >= 0);
        list_del_init(&pgp->global_eph_pages);
        spin_unlock(&eph_lists_spinlock);
    }
    else
    {
        if ( client->info.flags.u.migrating )
        {
            spin_lock(&pers_lists_spinlock);
            list_add_tail(&pgp->client_inv_pages,
                          &client->persistent_invalidated_list);
            if ( pgp != pgp->us.obj->pool->cur_pgp )
                list_del_init(&pgp->us.pool_pers_pages);
            spin_unlock(&pers_lists_spinlock);
        }
        else
        {
            spin_lock(&pers_lists_spinlock);
            list_del_init(&pgp->us.pool_pers_pages);
            spin_unlock(&pers_lists_spinlock);
        }
    }
    life = get_cycles() - pgp->timestamp;
    pgp->us.obj->pool->sum_life_cycles += life;

    /* Free pgp. */
    pgp_free(pgp);
}

/* Called only indirectly by radix_tree_destroy. */
static void pgp_destroy(void *v)
{
    struct tmem_page_descriptor *pgp = (struct tmem_page_descriptor *)v;

    pgp->us.obj->pgp_count--;
    pgp_delist_free(pgp);
}

static int pgp_add_to_obj(struct tmem_object_root *obj, uint32_t index, struct tmem_page_descriptor *pgp)
{
    int ret;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ret = radix_tree_insert(&obj->tree_root, index, pgp);
    if ( !ret )
        obj->pgp_count++;
    return ret;
}

static struct tmem_page_descriptor *pgp_delete_from_obj(struct tmem_object_root *obj, uint32_t index)
{
    struct tmem_page_descriptor *pgp;

    ASSERT(obj != NULL);
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj->pool != NULL);
    pgp = radix_tree_delete(&obj->tree_root, index);
    if ( pgp != NULL )
        obj->pgp_count--;
    ASSERT(obj->pgp_count >= 0);

    return pgp;
}

/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/

/* Called only indirectly from radix_tree_insert. */
static struct radix_tree_node *rtn_alloc(void *arg)
{
    struct tmem_object_node *objnode;
    struct tmem_object_root *obj = (struct tmem_object_root *)arg;

    ASSERT(obj->pool != NULL);
    objnode = tmem_malloc(sizeof(struct tmem_object_node),obj->pool);
    if (objnode == NULL)
        return NULL;
    objnode->obj = obj;
    memset(&objnode->rtn, 0, sizeof(struct radix_tree_node));
    if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
        obj->pool->objnode_count_max = obj->pool->objnode_count;
    atomic_inc_and_max(global_rtree_node_count);
    obj->objnode_count++;
    return &objnode->rtn;
}

/* Called only indirectly from radix_tree_delete/destroy. */
static void rtn_free(struct radix_tree_node *rtn, void *arg)
{
    struct tmem_pool *pool;
    struct tmem_object_node *objnode;

    ASSERT(rtn != NULL);
    objnode = container_of(rtn,struct tmem_object_node,rtn);
    ASSERT(objnode->obj != NULL);
    ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
    pool = objnode->obj->pool;
    ASSERT(pool != NULL);
    pool->objnode_count--;
    objnode->obj->objnode_count--;
    objnode->obj = NULL;
    tmem_free(objnode, pool);
    atomic_dec_and_assert(global_rtree_node_count);
}

/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/

static int oid_compare(struct xen_tmem_oid *left,
                       struct xen_tmem_oid *right)
{
    if ( left->oid[2] == right->oid[2] )
    {
        if ( left->oid[1] == right->oid[1] )
        {
            if ( left->oid[0] == right->oid[0] )
                return 0;
            else if ( left->oid[0] < right->oid[0] )
                return -1;
            else
                return 1;
        }
        else if ( left->oid[1] < right->oid[1] )
            return -1;
        else
            return 1;
    }
    else if ( left->oid[2] < right->oid[2] )
        return -1;
    else
        return 1;
}
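
/*
 * (Editorial note -- not part of the original file.)  oid_compare()
 * orders OIDs lexicographically with oid[2] as the most significant
 * word.  For example, with left = {1, 9, 2} and right = {5, 0, 2}:
 * oid[2] ties (2 == 2), oid[1] decides (9 > 0), so the result is 1 and
 * left sorts after right; oid[0] is never examined.
 */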

static void oid_set_invalid(struct xen_tmem_oid *oidp)
{
    oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
}

static unsigned oid_hash(struct xen_tmem_oid *oidp)
{
    return (tmem_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
                     BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
}

/* Searches for object==oid in pool, returns locked object if found. */
static struct tmem_object_root * obj_find(struct tmem_pool *pool,
                                          struct xen_tmem_oid *oidp)
{
    struct rb_node *node;
    struct tmem_object_root *obj;

restart_find:
    read_lock(&pool->pool_rwlock);
    node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
    while ( node )
    {
        obj = container_of(node, struct tmem_object_root, rb_tree_node);
        switch ( oid_compare(&obj->oid, oidp) )
        {
            case 0: /* Equal. */
                if ( !spin_trylock(&obj->obj_spinlock) )
                {
                    read_unlock(&pool->pool_rwlock);
                    goto restart_find;
                }
                read_unlock(&pool->pool_rwlock);
                return obj;
            case -1:
                node = node->rb_left;
                break;
            case 1:
                node = node->rb_right;
        }
    }
    read_unlock(&pool->pool_rwlock);
    return NULL;
}
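
/*
 * (Editorial note -- not part of the original file.)  The trylock/
 * restart dance above appears to guard against lock-order inversion:
 * obj_find() acquires pool_rwlock before obj_spinlock, while paths that
 * free an empty object (e.g. do_tmem_get()) hold obj_spinlock and then
 * take pool_rwlock for writing.  Spinning on the object lock while
 * holding the read lock could therefore deadlock, so the read lock is
 * dropped and the lookup restarted instead.
 */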

/* Free an object that has no more pgps in it. */
static void obj_free(struct tmem_object_root *obj)
{
    struct tmem_pool *pool;
    struct xen_tmem_oid old_oid;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    ASSERT(obj->pgp_count == 0);
    pool = obj->pool;
    ASSERT(pool != NULL);
    ASSERT(pool->client != NULL);
    ASSERT_WRITELOCK(&pool->pool_rwlock);
    if ( obj->tree_root.rnode != NULL ) /* May be a "stump" with no leaves. */
        radix_tree_destroy(&obj->tree_root, pgp_destroy);
    ASSERT((long)obj->objnode_count == 0);
    ASSERT(obj->tree_root.rnode == NULL);
    pool->obj_count--;
    ASSERT(pool->obj_count >= 0);
    obj->pool = NULL;
    old_oid = obj->oid;
    oid_set_invalid(&obj->oid);
    obj->last_client = TMEM_CLI_ID_NULL;
    atomic_dec_and_assert(global_obj_count);
    rb_erase(&obj->rb_tree_node, &pool->obj_rb_root[oid_hash(&old_oid)]);
    spin_unlock(&obj->obj_spinlock);
    tmem_free(obj, pool);
}

static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj)
{
    struct rb_node **new, *parent = NULL;
    struct tmem_object_root *this;

    ASSERT(obj->pool);
    ASSERT_WRITELOCK(&obj->pool->pool_rwlock);

    new = &(root->rb_node);
    while ( *new )
    {
        this = container_of(*new, struct tmem_object_root, rb_tree_node);
        parent = *new;
        switch ( oid_compare(&this->oid, &obj->oid) )
        {
            case 0:
                return 0;
            case -1:
                new = &((*new)->rb_left);
                break;
            case 1:
                new = &((*new)->rb_right);
                break;
        }
    }
    rb_link_node(&obj->rb_tree_node, parent, new);
    rb_insert_color(&obj->rb_tree_node, root);
    return 1;
}

/*
 * Allocate, initialize, and insert a tmem_object_root
 * (should be called only if find failed).
 */
static struct tmem_object_root * obj_alloc(struct tmem_pool *pool,
                                           struct xen_tmem_oid *oidp)
{
    struct tmem_object_root *obj;

    ASSERT(pool != NULL);
    if ( (obj = tmem_malloc(sizeof(struct tmem_object_root), pool)) == NULL )
        return NULL;
    pool->obj_count++;
    if (pool->obj_count > pool->obj_count_max)
        pool->obj_count_max = pool->obj_count;
    atomic_inc_and_max(global_obj_count);
    radix_tree_init(&obj->tree_root);
    radix_tree_set_alloc_callbacks(&obj->tree_root, rtn_alloc, rtn_free, obj);
    spin_lock_init(&obj->obj_spinlock);
    obj->pool = pool;
    obj->oid = *oidp;
    obj->objnode_count = 0;
    obj->pgp_count = 0;
    obj->last_client = TMEM_CLI_ID_NULL;
    return obj;
}

/* Free an object after destroying any pgps in it. */
static void obj_destroy(struct tmem_object_root *obj)
{
    ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
    radix_tree_destroy(&obj->tree_root, pgp_destroy);
    obj_free(obj);
}

/* Destroys all objs in a pool, or only if obj->last_client matches cli_id. */
static void pool_destroy_objs(struct tmem_pool *pool, domid_t cli_id)
{
    struct rb_node *node;
    struct tmem_object_root *obj;
    int i;

    write_lock(&pool->pool_rwlock);
    pool->is_dying = 1;
    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
    {
        node = rb_first(&pool->obj_rb_root[i]);
        while ( node != NULL )
        {
            obj = container_of(node, struct tmem_object_root, rb_tree_node);
            spin_lock(&obj->obj_spinlock);
            node = rb_next(node);
            if ( obj->last_client == cli_id )
                obj_destroy(obj);
            else
                spin_unlock(&obj->obj_spinlock);
        }
    }
    write_unlock(&pool->pool_rwlock);
}


/************ POOL MANIPULATION ROUTINES ******************************/

static struct tmem_pool * pool_alloc(void)
{
    struct tmem_pool *pool;
    int i;

    if ( (pool = xzalloc(struct tmem_pool)) == NULL )
        return NULL;
    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
        pool->obj_rb_root[i] = RB_ROOT;
    INIT_LIST_HEAD(&pool->persistent_page_list);
    rwlock_init(&pool->pool_rwlock);
    return pool;
}

static void pool_free(struct tmem_pool *pool)
{
    pool->client = NULL;
    xfree(pool);
}

/*
 * Register new_client as a user of this shared pool and return 0 on success.
 */
static int shared_pool_join(struct tmem_pool *pool, struct client *new_client)
{
    struct share_list *sl;
    ASSERT(is_shared(pool));

    if ( (sl = tmem_malloc(sizeof(struct share_list), NULL)) == NULL )
        return -1;
    sl->client = new_client;
    list_add_tail(&sl->share_list, &pool->share_list);
    if ( new_client->cli_id != pool->client->cli_id )
        tmem_client_info("adding new %s %d to shared pool owned by %s %d\n",
                    tmem_client_str, new_client->cli_id, tmem_client_str,
                    pool->client->cli_id);
    else if ( pool->shared_count )
        tmem_client_info("inter-guest sharing of shared pool %s by client %d\n",
                         tmem_client_str, pool->client->cli_id);
    ++pool->shared_count;
    return 0;
}

/* Reassign "ownership" of the pool to another client that shares this pool. */
static void shared_pool_reassign(struct tmem_pool *pool)
{
    struct share_list *sl;
    int poolid;
    struct client *old_client = pool->client, *new_client;

    ASSERT(is_shared(pool));
    if ( list_empty(&pool->share_list) )
    {
        ASSERT(pool->shared_count == 0);
        return;
    }
    old_client->pools[pool->pool_id] = NULL;
    sl = list_entry(pool->share_list.next, struct share_list, share_list);
    /*
     * The sl->client can be old_client if there are multiple shared pools
     * within a guest.
     */
    pool->client = new_client = sl->client;
    for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
        if (new_client->pools[poolid] == pool)
            break;
    ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
    new_client->eph_count += _atomic_read(pool->pgp_count);
    old_client->eph_count -= _atomic_read(pool->pgp_count);
    list_splice_init(&old_client->ephemeral_page_list,
                     &new_client->ephemeral_page_list);
    tmem_client_info("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
        tmem_cli_id_str, old_client->cli_id, tmem_cli_id_str, new_client->cli_id, poolid);
    pool->pool_id = poolid;
}

/*
 * Destroy all objects with last_client same as passed cli_id,
 * remove pool's cli_id from list of sharers of this pool.
 */
static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
{
    struct share_list *sl;
    int s_poolid;

    ASSERT(is_shared(pool));
    ASSERT(pool->client != NULL);

    ASSERT_WRITELOCK(&tmem_rwlock);
    pool_destroy_objs(pool, cli_id);
    list_for_each_entry(sl,&pool->share_list, share_list)
    {
        if (sl->client->cli_id != cli_id)
            continue;
        list_del(&sl->share_list);
        tmem_free(sl, pool);
        --pool->shared_count;
        if (pool->client->cli_id == cli_id)
            shared_pool_reassign(pool);
        if (pool->shared_count)
            return pool->shared_count;
        for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
            if ( (tmem_global.shared_pools[s_poolid]) == pool )
            {
                tmem_global.shared_pools[s_poolid] = NULL;
                break;
            }
        return 0;
    }
    tmem_client_warn("tmem: no match unsharing pool, %s=%d\n",
        tmem_cli_id_str,pool->client->cli_id);
    return -1;
}

/* Flush all data (owned by cli_id) from a pool and, optionally, free it. */
static void pool_flush(struct tmem_pool *pool, domid_t cli_id)
{
    ASSERT(pool != NULL);
    if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
    {
        tmem_client_warn("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
           tmem_cli_id_str, cli_id, pool->pool_id, tmem_cli_id_str,pool->client->cli_id);
        return;
    }
    tmem_client_info("Destroying %s-%s tmem pool %s=%d pool_id=%d\n",
                    is_persistent(pool) ? "persistent" : "ephemeral" ,
                    is_shared(pool) ? "shared" : "private",
                    tmem_cli_id_str, pool->client->cli_id, pool->pool_id);
    if ( pool->client->info.flags.u.migrating )
    {
        tmem_client_warn("can't destroy pool while %s is live-migrating\n",
                    tmem_client_str);
        return;
    }
    pool_destroy_objs(pool, TMEM_CLI_ID_NULL);
    pool->client->pools[pool->pool_id] = NULL;
    pool_free(pool);
}

/************ CLIENT MANIPULATION OPERATIONS **************************/

struct client *client_create(domid_t cli_id)
{
    struct client *client = xzalloc(struct client);
    int i, shift;
    char name[5];
    struct domain *d;

    tmem_client_info("tmem: initializing tmem capability for %s=%d...",
                    tmem_cli_id_str, cli_id);
    if ( client == NULL )
    {
        tmem_client_err("failed... out of memory\n");
        goto fail;
    }

    for (i = 0, shift = 12; i < 4; shift -= 4, i++)
        name[i] = (((unsigned short)cli_id >> shift) & 0xf) + '0';
    name[4] = '\0';
    client->persistent_pool = xmem_pool_create(name, tmem_persistent_pool_page_get,
        tmem_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
    if ( client->persistent_pool == NULL )
    {
        tmem_client_err("failed... can't alloc persistent pool\n");
        goto fail;
    }

    d = rcu_lock_domain_by_id(cli_id);
    if ( d == NULL ) {
        tmem_client_err("failed... can't set client\n");
        xmem_pool_destroy(client->persistent_pool);
        goto fail;
    }
    if ( !d->is_dying ) {
        d->tmem_client = client;
        client->domain = d;
    }
    rcu_unlock_domain(d);

    client->cli_id = cli_id;
    client->info.version = TMEM_SPEC_VERSION;
    client->info.maxpools = MAX_POOLS_PER_DOMAIN;
    client->info.flags.u.compress = tmem_compression_enabled();
    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
        client->shared_auth_uuid[i][0] =
            client->shared_auth_uuid[i][1] = -1L;
    list_add_tail(&client->client_list, &tmem_global.client_list);
    INIT_LIST_HEAD(&client->ephemeral_page_list);
    INIT_LIST_HEAD(&client->persistent_invalidated_list);
    tmem_client_info("ok\n");
    return client;

 fail:
    xfree(client);
    return NULL;
}

static void client_free(struct client *client)
{
    list_del(&client->client_list);
    xmem_pool_destroy(client->persistent_pool);
    xfree(client);
}

/* Flush all data from a client and, optionally, free it. */
static void client_flush(struct client *client)
{
    int i;
    struct tmem_pool *pool;

    for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
    {
        if ( (pool = client->pools[i]) == NULL )
            continue;
        pool_flush(pool, client->cli_id);
        client->pools[i] = NULL;
        client->info.nr_pools--;
    }
    client_free(client);
}

static bool client_over_quota(const struct client *client)
{
    int total = _atomic_read(tmem_global.client_weight_total);

    ASSERT(client != NULL);
    if ( (total == 0) || (client->info.weight == 0) ||
          (client->eph_count == 0) )
        return false;

    return (((tmem_global.eph_count * 100L) / client->eph_count) >
            ((total * 100L) / client->info.weight));
}
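
/*
 * (Editorial worked example -- not part of the original file.)  Both
 * sides of the comparison above are inverted ratios: the left is
 * 100 / (this client's share of ephemeral pages), the right is
 * 100 / (this client's share of the total weight).  Suppose
 * tmem_global.eph_count = 1000 and client->eph_count = 100 (a 10% page
 * share), with total = 50 and client->info.weight = 10 (a 20% weight
 * share).  Then the left side is 1000*100/100 = 1000, the right side is
 * 50*100/10 = 500, and 1000 > 500, so the function returns true.
 */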

/************ MEMORY REVOCATION ROUTINES *******************************/

static bool tmem_try_to_evict_pgp(struct tmem_page_descriptor *pgp,
                                  bool *hold_pool_rwlock)
{
    struct tmem_object_root *obj = pgp->us.obj;
    struct tmem_pool *pool = obj->pool;

    if ( pool->is_dying )
        return false;
    if ( spin_trylock(&obj->obj_spinlock) )
    {
        if ( obj->pgp_count > 1 )
            return true;
        if ( write_trylock(&pool->pool_rwlock) )
        {
            *hold_pool_rwlock = 1;
            return true;
        }
        spin_unlock(&obj->obj_spinlock);
    }
    return false;
}

int tmem_evict(void)
{
    struct client *client = current->domain->tmem_client;
    struct tmem_page_descriptor *pgp = NULL, *pgp_del;
    struct tmem_object_root *obj;
    struct tmem_pool *pool;
    int ret = 0;
    bool hold_pool_rwlock = false;

    tmem_stats.evict_attempts++;
    spin_lock(&eph_lists_spinlock);
    if ( (client != NULL) && client_over_quota(client) &&
         !list_empty(&client->ephemeral_page_list) )
    {
        list_for_each_entry(pgp, &client->ephemeral_page_list, us.client_eph_pages)
            if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
                goto found;
    }
    else if ( !list_empty(&tmem_global.ephemeral_page_list) )
    {
        list_for_each_entry(pgp, &tmem_global.ephemeral_page_list, global_eph_pages)
            if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
            {
                client = pgp->us.obj->pool->client;
                goto found;
            }
    }
    /* The global ephemeral page list is empty, so we bail out. */
    spin_unlock(&eph_lists_spinlock);
    goto out;

found:
    /* Delist. */
    list_del_init(&pgp->us.client_eph_pages);
    client->eph_count--;
    list_del_init(&pgp->global_eph_pages);
    tmem_global.eph_count--;
    ASSERT(tmem_global.eph_count >= 0);
    ASSERT(client->eph_count >= 0);
    spin_unlock(&eph_lists_spinlock);

    ASSERT(pgp != NULL);
    obj = pgp->us.obj;
    ASSERT(obj != NULL);
    ASSERT(obj->pool != NULL);
    pool = obj->pool;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    pgp_del = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgp_del == pgp);

    /* pgp is already delisted, so call pgp_free() directly. */
    pgp_free(pgp);
    if ( obj->pgp_count == 0 )
    {
        ASSERT_WRITELOCK(&pool->pool_rwlock);
        obj_free(obj);
    }
    else
        spin_unlock(&obj->obj_spinlock);
    if ( hold_pool_rwlock )
        write_unlock(&pool->pool_rwlock);
    tmem_stats.evicted_pgs++;
    ret = 1;
out:
    return ret;
}


/*
 * Under certain conditions (e.g. if each client is putting pages for exactly
 * one object), once locks are held, freeing up memory may
 * result in livelocks and very long "put" times, so we try to ensure there
 * is a minimum amount of memory (1MB) available BEFORE any data structure
 * locks are held.
 */
static inline bool tmem_ensure_avail_pages(void)
{
    int failed_evict = 10;
    unsigned long free_mem;

    do {
        free_mem = (tmem_page_list_pages + total_free_pages())
                        >> (20 - PAGE_SHIFT);
        if ( free_mem )
            return true;
        if ( !tmem_evict() )
            failed_evict--;
    } while ( failed_evict > 0 );

    return false;
}
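
/*
 * (Editorial note -- not part of the original file.)  The shift above
 * converts a page count into whole mebibytes: with 4KiB pages
 * (PAGE_SHIFT == 12), 20 - PAGE_SHIFT == 8, so free_mem is nonzero only
 * once at least 2^8 = 256 pages (1MB) are free -- the minimum the
 * comment above calls for.
 */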

/************ TMEM CORE OPERATIONS ************************************/

static int do_tmem_put_compress(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
                                         tmem_cli_va_param_t clibuf)
{
    void *dst, *p;
    size_t size;
    int ret = 0;

    ASSERT(pgp != NULL);
    ASSERT(pgp->us.obj != NULL);
    ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
    ASSERT(pgp->us.obj->pool != NULL);
    ASSERT(pgp->us.obj->pool->client != NULL);

    if ( pgp->pfp != NULL )
        pgp_free_data(pgp, pgp->us.obj->pool);
    ret = tmem_compress_from_client(cmfn, &dst, &size, clibuf);
    if ( ret <= 0 )
        goto out;
    else if ( (size == 0) || (size >= tmem_mempool_maxalloc) ) {
        ret = 0;
        goto out;
    } else if ( (p = tmem_malloc(size,pgp->us.obj->pool)) == NULL ) {
        ret = -ENOMEM;
        goto out;
    } else {
        memcpy(p,dst,size);
        pgp->cdata = p;
    }
    pgp->size = size;
    pgp->us.obj->pool->client->compressed_pages++;
    pgp->us.obj->pool->client->compressed_sum_size += size;
    ret = 1;

out:
    return ret;
}

static int do_tmem_dup_put(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
       tmem_cli_va_param_t clibuf)
{
    struct tmem_pool *pool;
    struct tmem_object_root *obj;
    struct client *client;
    struct tmem_page_descriptor *pgpfound = NULL;
    int ret;

    ASSERT(pgp != NULL);
    ASSERT(pgp->pfp != NULL);
    ASSERT(pgp->size != -1);
    obj = pgp->us.obj;
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    pool = obj->pool;
    ASSERT(pool != NULL);
    client = pool->client;
    if ( client->info.flags.u.migrating )
        goto failed_dup; /* No dups allowed when migrating. */
    /* Can we successfully manipulate pgp to change out the data? */
    if ( client->info.flags.u.compress && pgp->size != 0 )
    {
        ret = do_tmem_put_compress(pgp, cmfn, clibuf);
        if ( ret == 1 )
            goto done;
        else if ( ret == 0 )
            goto copy_uncompressed;
        else if ( ret == -ENOMEM )
            goto failed_dup;
        else if ( ret == -EFAULT )
            goto bad_copy;
    }

copy_uncompressed:
    if ( pgp->pfp )
        pgp_free_data(pgp, pool);
    if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
        goto failed_dup;
    pgp->size = 0;
    ret = tmem_copy_from_client(pgp->pfp, cmfn, tmem_cli_buf_null);
    if ( ret < 0 )
        goto bad_copy;

done:
    /* Successfully replaced data, clean up and return success. */
    if ( is_shared(pool) )
        obj->last_client = client->cli_id;
    spin_unlock(&obj->obj_spinlock);
    pool->dup_puts_replaced++;
    pool->good_puts++;
    if ( is_persistent(pool) )
        client->succ_pers_puts++;
    return 1;

bad_copy:
    tmem_stats.failed_copies++;
    goto cleanup;

failed_dup:
    /*
     * Couldn't change out the data, flush the old data and return
     * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put.
     */
    ret = -ENOSPC;
cleanup:
    pgpfound = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgpfound == pgp);
    pgp_delist_free(pgpfound);
    if ( obj->pgp_count == 0 )
    {
        write_lock(&pool->pool_rwlock);
        obj_free(obj);
        write_unlock(&pool->pool_rwlock);
    } else {
        spin_unlock(&obj->obj_spinlock);
    }
    pool->dup_puts_flushed++;
    return ret;
}

static int do_tmem_put(struct tmem_pool *pool,
                       struct xen_tmem_oid *oidp, uint32_t index,
                       xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
{
    struct tmem_object_root *obj = NULL;
    struct tmem_page_descriptor *pgp = NULL;
    struct client *client;
    int ret, newobj = 0;

    ASSERT(pool != NULL);
    client = pool->client;
    ASSERT(client != NULL);
    ret = client->info.flags.u.frozen ? -EFROZEN : -ENOMEM;
    pool->puts++;

refind:
    /* Does page already exist (dup)?  If so, handle specially. */
    if ( (obj = obj_find(pool, oidp)) != NULL )
    {
        if ((pgp = pgp_lookup_in_obj(obj, index)) != NULL)
        {
            return do_tmem_dup_put(pgp, cmfn, clibuf);
        }
        else
        {
            /* No puts allowed into a frozen pool (except dup puts). */
            if ( client->info.flags.u.frozen )
                goto unlock_obj;
        }
    }
    else
    {
        /* No puts allowed into a frozen pool (except dup puts). */
        if ( client->info.flags.u.frozen )
            return ret;
        if ( (obj = obj_alloc(pool, oidp)) == NULL )
            return -ENOMEM;

        write_lock(&pool->pool_rwlock);
        /*
         * Parallel callers may have already allocated obj and inserted it
         * into obj_rb_root before us.
         */
        if ( !obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj) )
        {
            tmem_free(obj, pool);
            write_unlock(&pool->pool_rwlock);
            goto refind;
        }

        spin_lock(&obj->obj_spinlock);
        newobj = 1;
        write_unlock(&pool->pool_rwlock);
    }

    /* When we arrive here, we have a spinlocked obj for use. */
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    if ( (pgp = pgp_alloc(obj)) == NULL )
        goto unlock_obj;

    ret = pgp_add_to_obj(obj, index, pgp);
    if ( ret == -ENOMEM )
        /* Warning: may result in partially built radix tree ("stump"). */
        goto free_pgp;

    pgp->index = index;
    pgp->size = 0;

    if ( client->info.flags.u.compress )
    {
        ASSERT(pgp->pfp == NULL);
        ret = do_tmem_put_compress(pgp, cmfn, clibuf);
        if ( ret == 1 )
            goto insert_page;
        if ( ret == -ENOMEM )
        {
            client->compress_nomem++;
            goto del_pgp_from_obj;
        }
        if ( ret == 0 )
        {
            client->compress_poor++;
            goto copy_uncompressed;
        }
        if ( ret == -EFAULT )
            goto bad_copy;
    }

copy_uncompressed:
    if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
    {
        ret = -ENOMEM;
        goto del_pgp_from_obj;
    }
    ret = tmem_copy_from_client(pgp->pfp, cmfn, clibuf);
    if ( ret < 0 )
        goto bad_copy;

insert_page:
    if ( !is_persistent(pool) )
    {
        spin_lock(&eph_lists_spinlock);
        list_add_tail(&pgp->global_eph_pages, &tmem_global.ephemeral_page_list);
        if (++tmem_global.eph_count > tmem_stats.global_eph_count_max)
            tmem_stats.global_eph_count_max = tmem_global.eph_count;
        list_add_tail(&pgp->us.client_eph_pages,
            &client->ephemeral_page_list);
        if (++client->eph_count > client->eph_count_max)
            client->eph_count_max = client->eph_count;
        spin_unlock(&eph_lists_spinlock);
    }
    else
    { /* is_persistent. */
        spin_lock(&pers_lists_spinlock);
        list_add_tail(&pgp->us.pool_pers_pages,
            &pool->persistent_page_list);
        spin_unlock(&pers_lists_spinlock);
    }

    if ( is_shared(pool) )
        obj->last_client = client->cli_id;

    /* Free the obj spinlock. */
    spin_unlock(&obj->obj_spinlock);
    pool->good_puts++;

    if ( is_persistent(pool) )
        client->succ_pers_puts++;
    else
        tmem_stats.tot_good_eph_puts++;
    return 1;

bad_copy:
    tmem_stats.failed_copies++;

del_pgp_from_obj:
    ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
    pgp_delete_from_obj(obj, pgp->index);

free_pgp:
    pgp_free(pgp);
unlock_obj:
    if ( newobj )
    {
        write_lock(&pool->pool_rwlock);
        obj_free(obj);
        write_unlock(&pool->pool_rwlock);
    }
    else
    {
        spin_unlock(&obj->obj_spinlock);
    }
    pool->no_mem_puts++;
    return ret;
}

static int do_tmem_get(struct tmem_pool *pool,
                       struct xen_tmem_oid *oidp, uint32_t index,
                       xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
{
    struct tmem_object_root *obj;
    struct tmem_page_descriptor *pgp;
    struct client *client = pool->client;
    int rc;

    if ( !_atomic_read(pool->pgp_count) )
        return -EEMPTY;

    pool->gets++;
    obj = obj_find(pool,oidp);
    if ( obj == NULL )
        return 0;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    if ( is_shared(pool) || is_persistent(pool) )
        pgp = pgp_lookup_in_obj(obj, index);
    else
        pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        spin_unlock(&obj->obj_spinlock);
        return 0;
    }
    ASSERT(pgp->size != -1);
    if ( pgp->size != 0 )
    {
        rc = tmem_decompress_to_client(cmfn, pgp->cdata, pgp->size, clibuf);
    }
    else
        rc = tmem_copy_to_client(cmfn, pgp->pfp, clibuf);
    if ( rc <= 0 )
        goto bad_copy;

    if ( !is_persistent(pool) )
    {
        if ( !is_shared(pool) )
        {
            pgp_delist_free(pgp);
            if ( obj->pgp_count == 0 )
            {
                write_lock(&pool->pool_rwlock);
                obj_free(obj);
                obj = NULL;
                write_unlock(&pool->pool_rwlock);
            }
        } else {
            spin_lock(&eph_lists_spinlock);
            list_del(&pgp->global_eph_pages);
            list_add_tail(&pgp->global_eph_pages,&tmem_global.ephemeral_page_list);
            list_del(&pgp->us.client_eph_pages);
            list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
            spin_unlock(&eph_lists_spinlock);
            obj->last_client = current->domain->domain_id;
        }
    }
    if ( obj != NULL )
    {
        spin_unlock(&obj->obj_spinlock);
    }
    pool->found_gets++;
    if ( is_persistent(pool) )
        client->succ_pers_gets++;
    else
        client->succ_eph_gets++;
    return 1;

bad_copy:
    spin_unlock(&obj->obj_spinlock);
    tmem_stats.failed_copies++;
    return rc;
}

static int do_tmem_flush_page(struct tmem_pool *pool,
                              struct xen_tmem_oid *oidp, uint32_t index)
{
    struct tmem_object_root *obj;
    struct tmem_page_descriptor *pgp;

    pool->flushs++;
    obj = obj_find(pool,oidp);
    if ( obj == NULL )
        goto out;
    pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        spin_unlock(&obj->obj_spinlock);
        goto out;
    }
    pgp_delist_free(pgp);
    if ( obj->pgp_count == 0 )
    {
        write_lock(&pool->pool_rwlock);
        obj_free(obj);
        write_unlock(&pool->pool_rwlock);
    } else {
        spin_unlock(&obj->obj_spinlock);
    }
    pool->flushs_found++;

out:
    if ( pool->client->info.flags.u.frozen )
        return -EFROZEN;
    else
        return 1;
}

static int do_tmem_flush_object(struct tmem_pool *pool,
                                struct xen_tmem_oid *oidp)
{
    struct tmem_object_root *obj;

    pool->flush_objs++;
    obj = obj_find(pool,oidp);
    if ( obj == NULL )
        goto out;
    write_lock(&pool->pool_rwlock);
    obj_destroy(obj);
    pool->flush_objs_found++;
    write_unlock(&pool->pool_rwlock);

out:
    if ( pool->client->info.flags.u.frozen )
        return -EFROZEN;
    else
        return 1;
}

static int do_tmem_destroy_pool(uint32_t pool_id)
{
    struct client *client = current->domain->tmem_client;
    struct tmem_pool *pool;

    if ( pool_id >= MAX_POOLS_PER_DOMAIN )
        return 0;
    if ( (pool = client->pools[pool_id]) == NULL )
        return 0;
    client->pools[pool_id] = NULL;
    pool_flush(pool, client->cli_id);
    client->info.nr_pools--;
    return 1;
}

int do_tmem_new_pool(domid_t this_cli_id,
                     uint32_t d_poolid, uint32_t flags,
                     uint64_t uuid_lo, uint64_t uuid_hi)
{
    struct client *client;
    domid_t cli_id;
    int persistent = flags & TMEM_POOL_PERSIST;
    int shared = flags & TMEM_POOL_SHARED;
    int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
         & TMEM_POOL_PAGESIZE_MASK;
    int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
         & TMEM_POOL_VERSION_MASK;
    struct tmem_pool *pool, *shpool;
    int i, first_unused_s_poolid;

    if ( this_cli_id == TMEM_CLI_ID_NULL )
        cli_id = current->domain->domain_id;
    else
        cli_id = this_cli_id;
    tmem_client_info("tmem: allocating %s-%s tmem pool for %s=%d...",
        persistent ? "persistent" : "ephemeral" ,
        shared ? "shared" : "private", tmem_cli_id_str, cli_id);
    if ( specversion != TMEM_SPEC_VERSION )
    {
        tmem_client_err("failed... unsupported spec version\n");
        return -EPERM;
    }
    if ( shared && persistent )
    {
        tmem_client_err("failed... unable to create a shared-persistent pool\n");
        return -EPERM;
    }
    if ( pagebits != (PAGE_SHIFT - 12) )
    {
        tmem_client_err("failed... unsupported pagesize %d\n",
                       1 << (pagebits + 12));
        return -EPERM;
    }
    if ( flags & TMEM_POOL_PRECOMPRESSED )
    {
        tmem_client_err("failed... precompression flag set but unsupported\n");
        return -EPERM;
    }
    if ( flags & TMEM_POOL_RESERVED_BITS )
    {
        tmem_client_err("failed... reserved bits must be zero\n");
        return -EPERM;
    }
    if ( this_cli_id != TMEM_CLI_ID_NULL )
    {
        if ( (client = tmem_client_from_cli_id(this_cli_id)) == NULL
             || d_poolid >= MAX_POOLS_PER_DOMAIN
             || client->pools[d_poolid] != NULL )
            return -EPERM;
    }
    else
    {
        client = current->domain->tmem_client;
        ASSERT(client != NULL);
        for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
            if ( client->pools[d_poolid] == NULL )
                break;
        if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
        {
            tmem_client_err("failed... no more pool slots available for this %s\n",
                   tmem_client_str);
            return -EPERM;
        }
    }

    if ( (pool = pool_alloc()) == NULL )
    {
        tmem_client_err("failed... out of memory\n");
        return -ENOMEM;
    }
    client->pools[d_poolid] = pool;
    pool->client = client;
    pool->pool_id = d_poolid;
    pool->shared = shared;
    pool->persistent = persistent;
    pool->uuid[0] = uuid_lo;
    pool->uuid[1] = uuid_hi;

    /*
     * We have already created a pool when we arrive here, but a shared pool
     * needs some special processing.
     */
    if ( shared )
    {
        if ( uuid_lo == -1L && uuid_hi == -1L )
        {
            tmem_client_info("Invalid uuid, create non shared pool instead!\n");
            pool->shared = 0;
            goto out;
        }
        if ( !tmem_global.shared_auth )
        {
            for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
                if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
                     (client->shared_auth_uuid[i][1] == uuid_hi) )
                    break;
            if ( i == MAX_GLOBAL_SHARED_POOLS )
            {
                tmem_client_info("Shared auth failed, create non shared pool instead!\n");
                pool->shared = 0;
                goto out;
            }
        }

        /*
         * Authorization okay: match an existing global shared pool or use
         * the newly allocated one.
         */
        first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
        for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
        {
            if ( (shpool = tmem_global.shared_pools[i]) != NULL )
            {
                if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
                {
                    /* Succeeded in matching a global shared pool. */
                    tmem_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n",
                        uuid_hi, uuid_lo, d_poolid);
                    client->pools[d_poolid] = shpool;
                    if ( !shared_pool_join(shpool, client) )
                    {
                        pool_free(pool);
                        goto out;
                    }
                    else
                        goto fail;
                }
            }
            else
            {
                if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
                    first_unused_s_poolid = i;
            }
        }

        /* Failed to find a global shared pool slot. */
        if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
        {
            tmem_client_warn("tmem: failed... no global shared pool slots available\n");
            goto fail;
        }
        /* Add pool to global shared pool. */
        else
        {
            INIT_LIST_HEAD(&pool->share_list);
            pool->shared_count = 0;
            if ( shared_pool_join(pool, client) )
                goto fail;
            tmem_global.shared_pools[first_unused_s_poolid] = pool;
        }
    }

out:
    tmem_client_info("pool_id=%d\n", d_poolid);
    client->info.nr_pools++;
    return d_poolid;

fail:
    pool_free(pool);
    return -EPERM;
}
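
/*
 * (Editorial sketch -- not part of the original file.)  How a caller
 * might assemble the flags word that do_tmem_new_pool() decodes above,
 * assuming the TMEM_POOL_* shift/mask constants from public/tmem.h; only
 * the layout logic is shown here.
 *
 *     uint32_t flags = TMEM_POOL_PERSIST                      // persistent,
 *         | ((PAGE_SHIFT - 12) << TMEM_POOL_PAGESIZE_SHIFT)   // 4KiB pages,
 *         | (TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT);   // version 1
 *     do_tmem_new_pool(TMEM_CLI_ID_NULL, 0, flags, 0, 0);
 *
 * Note that TMEM_POOL_PERSIST and TMEM_POOL_SHARED are mutually
 * exclusive here: the function rejects the combination with -EPERM.
 */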
1605
1606
/************ TMEM CONTROL OPERATIONS ************************************/
1607
1608
int tmemc_shared_pool_auth(domid_t cli_id, uint64_t uuid_lo,
1609
                           uint64_t uuid_hi, bool auth)
1610
0
{
1611
0
    struct client *client;
1612
0
    int i, free = -1;
1613
0
1614
0
    if ( cli_id == TMEM_CLI_ID_NULL )
1615
0
    {
1616
0
        tmem_global.shared_auth = auth;
1617
0
        return 1;
1618
0
    }
1619
0
    client = tmem_client_from_cli_id(cli_id);
1620
0
    if ( client == NULL )
1621
0
        return -EINVAL;
1622
0
1623
0
    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1624
0
    {
1625
0
        if ( auth == 0 )
1626
0
        {
1627
0
            if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1628
0
                 (client->shared_auth_uuid[i][1] == uuid_hi) )
1629
0
            {
1630
0
                client->shared_auth_uuid[i][0] = -1L;
1631
0
                client->shared_auth_uuid[i][1] = -1L;
1632
0
                return 1;
1633
0
            }
1634
0
        }
1635
0
        else
1636
0
        {
1637
0
            if ( (client->shared_auth_uuid[i][0] == -1L) &&
1638
0
                 (client->shared_auth_uuid[i][1] == -1L) )
1639
0
            {
1640
0
                free = i;
1641
0
                break;
1642
0
            }
1643
0
        }
1644
0
    }
1645
0
    if ( auth == 0 )
1646
0
        return 0;
1647
0
    else if ( free == -1 )
1648
0
        return -ENOMEM;
1649
0
    else
1650
0
    {
1651
0
        client->shared_auth_uuid[free][0] = uuid_lo;
1652
0
        client->shared_auth_uuid[free][1] = uuid_hi;
1653
0
        return 1;
1654
0
    }
1655
0
}
1656
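tmemc_shared_pool_auth() treats client->shared_auth_uuid[] as a small fixed table in which the pair (-1, -1) marks a free entry: revoking (auth == 0) clears a matching entry and returns 1 (or 0 if nothing matched), while granting claims the first free entry or fails with -ENOMEM. A hedged, self-contained model of those semantics follows; auth_set() and MAX_SLOTS are illustrative stand-ins.

/* Model of the fixed UUID authorization table; stand-in names, not Xen code. */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define MAX_SLOTS 16

static uint64_t auth[MAX_SLOTS][2];  /* (-1, -1) == free slot. */

static void auth_init(void)
{
    for ( int i = 0; i < MAX_SLOTS; i++ )
        auth[i][0] = auth[i][1] = (uint64_t)-1;
}

/* Grant (add=1) or revoke (add=0) access to the pool named by the UUID. */
static int auth_set(uint64_t lo, uint64_t hi, int add)
{
    int i, free_slot = -1;

    for ( i = 0; i < MAX_SLOTS; i++ )
    {
        if ( !add && auth[i][0] == lo && auth[i][1] == hi )
        {
            auth[i][0] = auth[i][1] = (uint64_t)-1;   /* Revoke. */
            return 1;
        }
        if ( add && auth[i][0] == (uint64_t)-1 && auth[i][1] == (uint64_t)-1 )
        {
            free_slot = i;
            break;
        }
    }

    if ( !add )
        return 0;            /* Nothing matched the revoke request. */
    if ( free_slot == -1 )
        return -ENOMEM;      /* Table is full. */

    auth[free_slot][0] = lo;
    auth[free_slot][1] = hi;
    return 1;
}

int main(void)
{
    auth_init();
    printf("grant:  %d\n", auth_set(0xaaULL, 0xbbULL, 1));  /* 1 */
    printf("revoke: %d\n", auth_set(0xaaULL, 0xbbULL, 0));  /* 1 */
    printf("revoke: %d\n", auth_set(0xaaULL, 0xbbULL, 0));  /* 0 */
    return 0;
}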
1657
static int tmemc_save_subop(int cli_id, uint32_t pool_id,
1658
                        uint32_t subop, tmem_cli_va_param_t buf, uint32_t arg)
1659
0
{
1660
0
    struct client *client = tmem_client_from_cli_id(cli_id);
1661
0
    uint32_t p;
1662
0
    struct tmem_page_descriptor *pgp, *pgp2;
1663
0
    int rc = -ENOENT;
1664
0
1665
0
    switch ( subop )
1666
0
    {
1667
0
    case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
1668
0
        if ( client == NULL )
1669
0
            break;
1670
0
        for ( p = 0; p < MAX_POOLS_PER_DOMAIN; p++ )
1671
0
            if ( client->pools[p] != NULL )
1672
0
                break;
1673
0
1674
0
        if ( p == MAX_POOLS_PER_DOMAIN )
1675
0
            break;
1676
0
1677
0
        client->was_frozen = client->info.flags.u.frozen;
1678
0
        client->info.flags.u.frozen = 1;
1679
0
        if ( arg != 0 )
1680
0
            client->info.flags.u.migrating = 1;
1681
0
        rc = 0;
1682
0
        break;
1683
0
    case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
1684
0
        if ( client == NULL )
1685
0
            rc = client_create(cli_id) ? 0 : -ENOMEM;
1686
0
        else
1687
0
            rc = -EEXIST;
1688
0
        break;
1689
0
    case XEN_SYSCTL_TMEM_OP_SAVE_END:
1690
0
        if ( client == NULL )
1691
0
            break;
1692
0
        client->info.flags.u.migrating = 0;
1693
0
        if ( !list_empty(&client->persistent_invalidated_list) )
1694
0
            list_for_each_entry_safe(pgp, pgp2,
1695
0
              &client->persistent_invalidated_list, client_inv_pages)
1696
0
                __pgp_free(pgp, client->pools[pgp->pool_id]);
1697
0
        client->info.flags.u.frozen = client->was_frozen;
1698
0
        rc = 0;
1699
0
        break;
1700
0
    }
1701
0
    return rc;
1702
0
}
1703
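Taken together, the three subops sketch a save protocol: SAVE_BEGIN freezes the client (and, when arg is nonzero, marks it migrating), the two get-next helpers below stream pages and invalidations, and SAVE_END frees the invalidated-page list and restores the frozen flag. A rough driver-loop sketch under those return conventions follows; the save_* wrappers are stand-ins, not a real toolstack API, and a real caller would issue XEN_SYSCTL_TMEM_OP_* sysctls instead.

/* Sketch of the save-side call order; the wrappers below are stubs. */
#include <stdio.h>

static int pages_left = 3, invs_left = 2;

static int save_begin(int live)           { (void)live; return 0; }
static int save_get_next_page(char *buf)  { (void)buf; return pages_left-- > 0 ? 0 : -1; }
static int save_get_next_inv(char *buf)   { (void)buf; return invs_left-- > 0 ? 1 : 0; }
static int save_end(void)                 { return 0; }

int main(void)
{
    /* One page plus room for the handle header, as checked by
     * tmemc_save_get_next_page() above; 64 stands in for sizeof(struct
     * tmem_handle). */
    char buf[4096 + 64];

    if ( save_begin(1 /* live */) )
        return 1;

    /* Stream every page of every persistent pool (-1 == exhausted)... */
    while ( save_get_next_page(buf) != -1 )
        printf("saved one page\n");

    /* ...then any invalidations that raced with the copy phase (0 == done)... */
    while ( save_get_next_inv(buf) == 1 )
        printf("saved one invalidation\n");

    /* ...and finally drop the migrating/frozen state. */
    return save_end();
}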
1704
static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
1705
                        tmem_cli_va_param_t buf, uint32_t bufsize)
1706
0
{
1707
0
    struct client *client = tmem_client_from_cli_id(cli_id);
1708
0
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1709
0
                   ? NULL : client->pools[pool_id];
1710
0
    struct tmem_page_descriptor *pgp;
1711
0
    struct xen_tmem_oid *oid;
1712
0
    int ret = 0;
1713
0
    struct tmem_handle h;
1714
0
1715
0
    if ( pool == NULL || !is_persistent(pool) )
1716
0
        return -1;
1717
0
1718
0
    if ( bufsize < PAGE_SIZE + sizeof(struct tmem_handle) )
1719
0
        return -ENOMEM;
1720
0
1721
0
    spin_lock(&pers_lists_spinlock);
1722
0
    if ( list_empty(&pool->persistent_page_list) )
1723
0
    {
1724
0
        ret = -1;
1725
0
        goto out;
1726
0
    }
1727
0
    /* Note: pool->cur_pgp is the pgp last returned by get_next_page. */
1728
0
    if ( pool->cur_pgp == NULL )
1729
0
    {
1730
0
        /* Process the first one. */
1731
0
        pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
1732
0
                         struct tmem_page_descriptor, us.pool_pers_pages);
1733
0
    } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
1734
0
                             &pool->persistent_page_list) )
1735
0
    {
1736
0
        /* Already processed the last one in the list. */
1737
0
        ret = -1;
1738
0
        goto out;
1739
0
    }
1740
0
    pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
1741
0
                         struct tmem_page_descriptor, us.pool_pers_pages);
1742
0
    pool->cur_pgp = pgp;
1743
0
    oid = &pgp->us.obj->oid;
1744
0
    h.pool_id = pool_id;
1745
0
    BUILD_BUG_ON(sizeof(h.oid) != sizeof(*oid));
1746
0
    memcpy(&(h.oid), oid, sizeof(h.oid));
1747
0
    h.index = pgp->index;
1748
0
    if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
1749
0
    {
1750
0
        ret = -EFAULT;
1751
0
        goto out;
1752
0
    }
1753
0
    guest_handle_add_offset(buf, sizeof(h));
1754
0
    ret = do_tmem_get(pool, oid, pgp->index, 0, buf);
1755
0
1756
0
out:
1757
0
    spin_unlock(&pers_lists_spinlock);
1758
0
    return ret;
1759
0
}
1760
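Because each call can hand back only one page, iteration state has to survive across hypercalls; pool->cur_pgp records the element last returned so the next call can resume after it. Below is a minimal, self-contained sketch of that saved-cursor pattern, simplified so the first call yields the head of the list; struct pgd and get_next() are illustrative names, not Xen code.

/* Saved-cursor iteration across repeated calls; stand-in types. */
#include <stdio.h>

struct pgd { int index; struct pgd *next; };   /* Stand-in for the pgp list. */

/* *curp plays the role of pool->cur_pgp; NULL means "not started yet". */
static struct pgd *get_next(struct pgd *head, struct pgd **curp)
{
    struct pgd *n = (*curp == NULL) ? head : (*curp)->next;

    if ( n != NULL )
        *curp = n;          /* Remember progress for the next call. */
    return n;               /* NULL == list exhausted. */
}

int main(void)
{
    struct pgd c = { 2, NULL }, b = { 1, &c }, a = { 0, &b };
    struct pgd *cur = NULL, *n;

    while ( (n = get_next(&a, &cur)) != NULL )
        printf("page index %d\n", n->index);
    return 0;
}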
1761
static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf,
1762
                        uint32_t bufsize)
1763
0
{
1764
0
    struct client *client = tmem_client_from_cli_id(cli_id);
1765
0
    struct tmem_page_descriptor *pgp;
1766
0
    struct tmem_handle h;
1767
0
    int ret = 0;
1768
0
1769
0
    if ( client == NULL )
1770
0
        return 0;
1771
0
    if ( bufsize < sizeof(struct tmem_handle) )
1772
0
        return 0;
1773
0
    spin_lock(&pers_lists_spinlock);
1774
0
    if ( list_empty(&client->persistent_invalidated_list) )
1775
0
        goto out;
1776
0
    if ( client->cur_pgp == NULL )
1777
0
    {
1778
0
        pgp = list_entry((&client->persistent_invalidated_list)->next,
1779
0
                         struct tmem_page_descriptor, client_inv_pages);
1780
0
        client->cur_pgp = pgp;
1781
0
    } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
1782
0
                             &client->persistent_invalidated_list) )
1783
0
    {
1784
0
        client->cur_pgp = NULL;
1785
0
        ret = 0;
1786
0
        goto out;
1787
0
    } else {
1788
0
        pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
1789
0
                         struct tmem_page_descriptor, client_inv_pages);
1790
0
        client->cur_pgp = pgp;
1791
0
    }
1792
0
    h.pool_id = pgp->pool_id;
1793
0
    BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid));
1794
0
    memcpy(&(h.oid), &(pgp->inv_oid), sizeof(h.oid));
1795
0
    h.index = pgp->index;
1796
0
    ret = 1;
1797
0
    if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
1798
0
        ret = -EFAULT;
1799
0
out:
1800
0
    spin_unlock(&pers_lists_spinlock);
1801
0
    return ret;
1802
0
}
1803
1804
static int tmemc_restore_put_page(int cli_id, uint32_t pool_id,
1805
                                  struct xen_tmem_oid *oidp,
1806
                                  uint32_t index, tmem_cli_va_param_t buf,
1807
                                  uint32_t bufsize)
1808
0
{
1809
0
    struct client *client = tmem_client_from_cli_id(cli_id);
1810
0
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1811
0
                   ? NULL : client->pools[pool_id];
1812
0
1813
0
    if ( pool == NULL )
1814
0
        return -1;
1815
0
    if ( bufsize != PAGE_SIZE ) {
1816
0
        tmem_client_err("tmem: %s: invalid parameter bufsize(%d) != (%ld)\n",
1817
0
                        __func__, bufsize, PAGE_SIZE);
1818
0
        return -EINVAL;
1819
0
    }
1820
0
    return do_tmem_put(pool, oidp, index, 0, buf);
1821
0
}
1822
1823
static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id,
1824
                                    struct xen_tmem_oid *oidp,
1825
                                    uint32_t index)
1826
0
{
1827
0
    struct client *client = tmem_client_from_cli_id(cli_id);
1828
0
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1829
0
                   ? NULL : client->pools[pool_id];
1830
0
1831
0
    if ( pool == NULL )
1832
0
        return -1;
1833
0
    return do_tmem_flush_page(pool, oidp, index);
1834
0
}
1835
1836
int do_tmem_control(struct xen_sysctl_tmem_op *op)
1837
0
{
1838
0
    int ret;
1839
0
    uint32_t pool_id = op->pool_id;
1840
0
    uint32_t cmd = op->cmd;
1841
0
    struct xen_tmem_oid *oidp = &op->oid;
1842
0
1843
0
    ASSERT(rw_is_write_locked(&tmem_rwlock));
1844
0
1845
0
    switch ( cmd )
1846
0
    {
1847
0
    case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
1848
0
    case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
1849
0
    case XEN_SYSCTL_TMEM_OP_SAVE_END:
1850
0
        ret = tmemc_save_subop(op->cli_id, pool_id, cmd,
1851
0
                               guest_handle_cast(op->u.buf, char), op->arg);
1852
0
        break;
1853
0
    case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE:
1854
0
        ret = tmemc_save_get_next_page(op->cli_id, pool_id,
1855
0
                                       guest_handle_cast(op->u.buf, char), op->len);
1856
0
        break;
1857
0
    case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV:
1858
0
        ret = tmemc_save_get_next_inv(op->cli_id,
1859
0
                                      guest_handle_cast(op->u.buf, char), op->len);
1860
0
        break;
1861
0
    case XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE:
1862
0
        ret = tmemc_restore_put_page(op->cli_id, pool_id, oidp, op->arg,
1863
0
                                     guest_handle_cast(op->u.buf, char), op->len);
1864
0
        break;
1865
0
    case XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE:
1866
0
        ret = tmemc_restore_flush_page(op->cli_id, pool_id, oidp, op->arg);
1867
0
        break;
1868
0
    default:
1869
0
        ret = -1;
1870
0
    }
1871
0
1872
0
    return ret;
1873
0
}
1874
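do_tmem_control() is a plain dispatcher: op->cmd selects the helper, and op->cli_id, pool_id, arg, len, and oid parameterize it. The stand-in below shows the same shape; struct tmem_ctl is deliberately simplified and is not the real xen_sysctl_tmem_op layout.

/* Simplified dispatcher stand-in; types and enum values are illustrative. */
#include <stdio.h>
#include <stdint.h>

struct oid { uint64_t oid[3]; };

struct tmem_ctl {               /* Simplified, hypothetical layout. */
    uint32_t cmd, cli_id, pool_id, arg, len;
    struct oid oid;
};

enum { OP_SAVE_BEGIN = 1, OP_RESTORE_FLUSH_PAGE = 2 };

static int do_control(const struct tmem_ctl *op)
{
    switch ( op->cmd )
    {
    case OP_SAVE_BEGIN:
        printf("save begin: client %u, live=%u\n", op->cli_id, op->arg);
        return 0;
    case OP_RESTORE_FLUSH_PAGE:
        printf("flush: client %u pool %u index %u\n",
               op->cli_id, op->pool_id, op->arg);
        return 0;
    default:
        return -1;              /* Unknown command, as in the code above. */
    }
}

int main(void)
{
    struct tmem_ctl op = { .cmd = OP_SAVE_BEGIN, .cli_id = 7, .arg = 1 };

    return do_control(&op);
}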
1875
/************ EXPORTed FUNCTIONS **************************************/
1876
1877
long do_tmem_op(tmem_cli_op_t uops)
1878
0
{
1879
0
    struct tmem_op op;
1880
0
    struct client *client = current->domain->tmem_client;
1881
0
    struct tmem_pool *pool = NULL;
1882
0
    struct xen_tmem_oid *oidp;
1883
0
    int rc = 0;
1884
0
1885
0
    if ( !tmem_initialized )
1886
0
        return -ENODEV;
1887
0
1888
0
    if ( xsm_tmem_op(XSM_HOOK) )
1889
0
        return -EPERM;
1890
0
1891
0
    tmem_stats.total_tmem_ops++;
1892
0
1893
0
    if ( client != NULL && client->domain->is_dying )
1894
0
    {
1895
0
        tmem_stats.errored_tmem_ops++;
1896
0
        return -ENODEV;
1897
0
    }
1898
0
1899
0
    if ( unlikely(tmem_get_tmemop_from_client(&op, uops) != 0) )
1900
0
    {
1901
0
        tmem_client_err("tmem: can't get tmem struct from %s\n", tmem_client_str);
1902
0
        tmem_stats.errored_tmem_ops++;
1903
0
        return -EFAULT;
1904
0
    }
1905
0
1906
0
    /* Initially, acquire the write lock for all commands. */
1907
0
    write_lock(&tmem_rwlock);
1908
0
1909
0
    switch ( op.cmd )
1910
0
    {
1911
0
    case TMEM_CONTROL:
1912
0
    case TMEM_RESTORE_NEW:
1913
0
    case TMEM_AUTH:
1914
0
        rc = -EOPNOTSUPP;
1915
0
        break;
1916
0
1917
0
    default:
1918
0
        /*
1919
0
         * For other commands, create the per-client tmem structure dynamically
1920
0
         * on first use by the client.
1921
0
         */
1922
0
        if ( client == NULL )
1923
0
        {
1924
0
            if ( (client = client_create(current->domain->domain_id)) == NULL )
1925
0
            {
1926
0
                tmem_client_err("tmem: can't create tmem structure for %s\n",
1927
0
                               tmem_client_str);
1928
0
                rc = -ENOMEM;
1929
0
                goto out;
1930
0
            }
1931
0
        }
1932
0
1933
0
        if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
1934
0
        {
1935
0
            if ( op.cmd == TMEM_NEW_POOL )
1936
0
                rc = do_tmem_new_pool(TMEM_CLI_ID_NULL, 0, op.u.creat.flags,
1937
0
                                op.u.creat.uuid[0], op.u.creat.uuid[1]);
1938
0
            else
1939
0
                rc = do_tmem_destroy_pool(op.pool_id);
1940
0
        }
1941
0
        else
1942
0
        {
1943
0
            if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
1944
0
                 ((pool = client->pools[op.pool_id]) == NULL) )
1945
0
            {
1946
0
                tmem_client_err("tmem: operation requested on uncreated pool\n");
1947
0
                rc = -ENODEV;
1948
0
                goto out;
1949
0
            }
1950
0
            /* The remaining commands need only the read lock. */
1951
0
            write_unlock(&tmem_rwlock);
1952
0
            read_lock(&tmem_rwlock);
1953
0
1954
0
            oidp = &op.u.gen.oid;
1955
0
            switch ( op.cmd )
1956
0
            {
1957
0
            case TMEM_NEW_POOL:
1958
0
            case TMEM_DESTROY_POOL:
1959
0
                BUG(); /* Done earlier. */
1960
0
                break;
1961
0
            case TMEM_PUT_PAGE:
1962
0
                if ( tmem_ensure_avail_pages() )
1963
0
                    rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
1964
0
                                tmem_cli_buf_null);
1965
0
                else
1966
0
                    rc = -ENOMEM;
1967
0
                break;
1968
0
            case TMEM_GET_PAGE:
1969
0
                rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
1970
0
                                tmem_cli_buf_null);
1971
0
                break;
1972
0
            case TMEM_FLUSH_PAGE:
1973
0
                rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
1974
0
                break;
1975
0
            case TMEM_FLUSH_OBJECT:
1976
0
                rc = do_tmem_flush_object(pool, oidp);
1977
0
                break;
1978
0
            default:
1979
0
                tmem_client_warn("tmem: op %d not implemented\n", op.cmd);
1980
0
                rc = -ENOSYS;
1981
0
                break;
1982
0
            }
1983
0
            read_unlock(&tmem_rwlock);
1984
0
            if ( rc < 0 )
1985
0
                tmem_stats.errored_tmem_ops++;
1986
0
            return rc;
1987
0
        }
1988
0
        break;
1989
0
1990
0
    }
1991
0
out:
1992
0
    write_unlock(&tmem_rwlock);
1993
0
    if ( rc < 0 )
1994
0
        tmem_stats.errored_tmem_ops++;
1995
0
    return rc;
1996
0
}
1997
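Note the locking shape of do_tmem_op(): every request enters under the write lock, and the data-path commands then drop it for a read lock so gets, puts, and flushes can run concurrently. The handover is unlock-then-lock, so other writers can run in the gap; the code relies on that window being benign. Below is a userspace analogue of the pattern using POSIX rwlocks (compile with -pthread); it is a sketch of the pattern only, not hypervisor code.

/* Write-then-read lock handover, modeled with pthreads. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;

static void data_path_op(void)
{
    pthread_rwlock_wrlock(&lk);     /* All commands enter exclusively... */

    /* ...client creation and pool lookup happen here, then demote: */
    pthread_rwlock_unlock(&lk);
    pthread_rwlock_rdlock(&lk);     /* Other writers may run in this gap. */

    printf("data-path work under the read lock\n");
    pthread_rwlock_unlock(&lk);
}

int main(void)
{
    data_path_op();
    return 0;
}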
1998
/* This should be called when the host is destroying a client (domain). */
1999
void tmem_destroy(void *v)
2000
0
{
2001
0
    struct client *client = (struct client *)v;
2002
0
2003
0
    if ( client == NULL )
2004
0
        return;
2005
0
2006
0
    if ( !client->domain->is_dying )
2007
0
    {
2008
0
        printk("tmem: tmem_destroy can only destroy dying client\n");
2009
0
        return;
2010
0
    }
2011
0
2012
0
    write_lock(&tmem_rwlock);
2013
0
2014
0
    printk("tmem: flushing tmem pools for %s=%d\n",
2015
0
           tmem_cli_id_str, client->cli_id);
2016
0
    client_flush(client);
2017
0
2018
0
    write_unlock(&tmem_rwlock);
2019
0
}
2020
2021
#define MAX_EVICTS 10  /* Should this be a variable, or settable via XEN_SYSCTL_TMEM_OP_ ?? */
2022
void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2023
0
{
2024
0
    struct page_info *pfp;
2025
0
    unsigned long evicts_per_relinq = 0;
2026
0
    int max_evictions = MAX_EVICTS;
2027
0
2028
0
    if ( !tmem_enabled() || !tmem_freeable_pages() )
2029
0
        return NULL;
2030
0
2031
0
    tmem_stats.relinq_attempts++;
2032
0
    if ( order > 0 )
2033
0
    {
2034
0
#ifndef NDEBUG
2035
0
        printk("tmem_relinquish_page: failing order=%d\n", order);
2036
0
#endif
2037
0
        return NULL;
2038
0
    }
2039
0
2040
0
    while ( (pfp = tmem_page_list_get()) == NULL )
2041
0
    {
2042
0
        if ( (max_evictions-- <= 0) || !tmem_evict() )
2043
0
            break;
2044
0
        evicts_per_relinq++;
2045
0
    }
2046
0
    if ( evicts_per_relinq > tmem_stats.max_evicts_per_relinq )
2047
0
        tmem_stats.max_evicts_per_relinq = evicts_per_relinq;
2048
0
    if ( pfp != NULL )
2049
0
    {
2050
0
        if ( !(memflags & MEMF_tmem) )
2051
0
            scrub_one_page(pfp);
2052
0
        tmem_stats.relinq_pgs++;
2053
0
    }
2054
0
2055
0
    return pfp;
2056
0
}
2057
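The relinquish path above is a bounded retry loop: keep taking from the free page list, evicting to refill it, but cap the number of evictions so a failing allocation cannot stall indefinitely. A self-contained model of that loop follows; page_list_get(), evict(), and the counters are stand-ins, not Xen code.

/* Bounded eviction-and-retry loop; stand-in state, not Xen code. */
#include <stdio.h>

#define MAX_EVICTS 10

static int free_pages = 0, evictable = 3;

static int page_list_get(void)
{
    if ( free_pages == 0 )
        return 0;
    free_pages--;
    return 1;
}

static int evict(void)        /* Evicting one page refills the free list. */
{
    if ( evictable == 0 )
        return 0;
    evictable--;
    free_pages++;
    return 1;
}

static int relinquish(void)
{
    int budget = MAX_EVICTS, got;

    while ( !(got = page_list_get()) )
    {
        if ( budget-- <= 0 || !evict() )
            break;            /* Out of budget, or nothing left to evict. */
    }
    return got;
}

int main(void)
{
    printf("relinquished: %d\n", relinquish());
    return 0;
}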
2058
unsigned long tmem_freeable_pages(void)
2059
86.5k
{
2060
86.5k
    if ( !tmem_enabled() )
2061
86.5k
        return 0;
2062
86.5k
2063
0
    return tmem_page_list_pages + _atomic_read(freeable_page_count);
2064
86.5k
}
2065
2066
/* Called at hypervisor startup. */
2067
static int __init init_tmem(void)
2068
1
{
2069
1
    if ( !tmem_enabled() )
2070
1
        return 0;
2071
1
2072
0
    if ( !tmem_mempool_init() )
2073
0
        return 0;
2074
0
2075
0
    if ( tmem_init() )
2076
0
    {
2077
0
        printk("tmem: initialized comp=%d\n", tmem_compression_enabled());
2078
0
        tmem_initialized = 1;
2079
0
    }
2080
0
    else
2081
0
        printk("tmem: initialization FAILED\n");
2082
0
2083
0
    return 0;
2084
0
}
2085
__initcall(init_tmem);
2086
2087
/*
2088
 * Local variables:
2089
 * mode: C
2090
 * c-file-style: "BSD"
2091
 * c-basic-offset: 4
2092
 * tab-width: 4
2093
 * indent-tabs-mode: nil
2094
 * End:
2095
 */