debuggers.hg

changeset 21149:61372a4f4e76

tmem: add page deduplication with optional compression or trailing-zero-elimination

Add "page deduplication" capability (with optional compression
and trailing-zero elimination) to Xen's tmem.

(Transparent to tmem-enabled guests.) Ephemeral pages
that have the exact same content are "combined" so that only
one page frame is needed. Since ephemeral pages are essentially
read-only, no copy-on-write (and thus no equivalent of swapping) is
necessary. Deduplication can be combined with compression
or "trailing zero elimination" for even more space savings.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Apr 06 07:11:48 2010 +0100 (2010-04-06)
parents 44bef2b4a075
children 4822f82acec6
files tools/misc/xen-tmem-list-parse.c xen/common/tmem.c xen/common/tmem_xen.c xen/include/xen/tmem_xen.h
line diff
     1.1 --- a/tools/misc/xen-tmem-list-parse.c	Tue Apr 06 07:09:35 2010 +0100
     1.2 +++ b/tools/misc/xen-tmem-list-parse.c	Tue Apr 06 07:11:48 2010 +0100
     1.3 @@ -110,13 +110,39 @@ void parse_global(char *s)
     1.4      unsigned long long rtree_node_max = parse(s,"Nm");
     1.5      unsigned long long pgp_count = parse(s,"Pc");
     1.6      unsigned long long pgp_max = parse(s,"Pm");
     1.7 +    unsigned long long page_count = parse(s,"Fc");
     1.8 +    unsigned long long max_page_count = parse(s,"Fm");
     1.9 +    unsigned long long pcd_count = parse(s,"Sc");
    1.10 +    unsigned long long max_pcd_count = parse(s,"Sm");
    1.11 +    unsigned long long pcd_tot_tze_size = parse(s,"Zt");
    1.12 +    unsigned long long pcd_tot_csize = parse(s,"Gz");
    1.13 +    unsigned long long deduped_puts = parse(s,"Gd");
    1.14 +    unsigned long long tot_good_eph_puts = parse(s,"Ep");
    1.15  
    1.16      printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
    1.17             total_ops, errored_ops, avail_pages);
    1.18      printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
    1.19 -           "nodes=%llu (max=%llu)\n",
    1.20 +           "nodes=%llu (max=%llu) pages=%llu (max=%llu) ",
    1.21             obj_count, obj_max, pgp_count, pgp_max,
    1.22 -           rtree_node_count, rtree_node_max);
    1.23 +           rtree_node_count, rtree_node_max,
    1.24 +           page_count,max_page_count);
    1.25 +    if (max_pcd_count != 0 && global_eph_count != 0 && tot_good_eph_puts != 0) {
    1.26 +           printf("pcds=%llu (max=%llu) ",
    1.27 +               pcd_count,max_pcd_count);
    1.28 +           printf("deduped: avg=%4.2f%% (curr=%4.2f%%) ",
    1.29 +                   ((deduped_puts*1.0)/tot_good_eph_puts)*100,
    1.30 +                   (1.0-(pcd_count*1.0)/global_eph_count)*100);
    1.31 +    }
    1.32 +    if (pcd_count != 0)
    1.33 +    {
    1.34 +           if (pcd_tot_tze_size && (pcd_tot_tze_size < pcd_count*PAGE_SIZE))
    1.35 +               printf("tze savings=%4.2f%% ",
    1.36 +                   (1.0-(pcd_tot_tze_size*1.0)/(pcd_count*PAGE_SIZE))*100);
    1.37 +           if (pcd_tot_csize && (pcd_tot_csize < pcd_count*PAGE_SIZE))
    1.38 +               printf("compression savings=%4.2f%% ",
    1.39 +                   (1.0-(pcd_tot_csize*1.0)/(pcd_count*PAGE_SIZE))*100);
    1.40 +    }
    1.41 +    printf("\n");
    1.42      printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
    1.43             "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
    1.44             "max_evicts_per_relinq=%llu, flush_pools=%llu, "
     2.1 --- a/xen/common/tmem.c	Tue Apr 06 07:09:35 2010 +0100
     2.2 +++ b/xen/common/tmem.c	Tue Apr 06 07:11:48 2010 +0100
     2.3 @@ -6,11 +6,10 @@
     2.4   * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
     2.5   */
     2.6  
     2.7 -/* TODO list: 090129
     2.8 -   - improve on reclamation policy
     2.9 +/* TODO list: 090129 (updated 100318)
    2.10 +   - any better reclamation policy?
    2.11     - use different tlsf pools for each client (maybe each pool)
    2.12 -   - implement page accounting and minimal QoS limits
    2.13 -   - test shared access more completely (need pv cluster fs)
    2.14 +   - test shared access more completely (ocfs2)
    2.15     - add feedback-driven compression (not for persistent pools though!)
    2.16     - add data-structure total bytes overhead stats
    2.17   */
    2.18 @@ -77,12 +76,17 @@ static unsigned long evicted_pgs = 0, ev
    2.19  static unsigned long relinq_pgs = 0, relinq_attempts = 0;
    2.20  static unsigned long max_evicts_per_relinq = 0;
    2.21  static unsigned long low_on_memory = 0;
    2.22 +static unsigned long deduped_puts = 0;
    2.23 +static unsigned long tot_good_eph_puts = 0;
    2.24  static int global_obj_count_max = 0;
    2.25  static int global_pgp_count_max = 0;
    2.26 +static int global_pcd_count_max = 0;
    2.27  static int global_page_count_max = 0;
    2.28  static int global_rtree_node_count_max = 0;
    2.29  static long global_eph_count_max = 0;
    2.30  static unsigned long failed_copies;
    2.31 +static unsigned long pcd_tot_tze_size = 0;
    2.32 +static unsigned long pcd_tot_csize = 0;
    2.33  
    2.34  DECL_CYC_COUNTER(succ_get);
    2.35  DECL_CYC_COUNTER(succ_put);
    2.36 @@ -108,6 +112,7 @@ DECL_CYC_COUNTER(decompress);
    2.37  
    2.38  struct tm_pool;
    2.39  struct tmem_page_descriptor;
    2.40 +struct tmem_page_content_descriptor;
    2.41  struct client {
    2.42      struct list_head client_list;
    2.43      struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
    2.44 @@ -219,12 +224,17 @@ struct tmem_page_descriptor {
    2.45          obj_t *obj;
    2.46          uint64_t inv_oid;  /* used for invalid list only */
    2.47      };
    2.48 +    pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
    2.49 +                    else compressed data (cdata) */
    2.50      uint32_t index;
    2.51 -    size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
    2.52 -                    else compressed data (cdata) */
    2.53 +    /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
    2.54 +    uint16_t firstbyte; /* NOT_SHAREABLE->pfp  otherwise->pcd */
    2.55 +    bool_t eviction_attempted;  /* CHANGE TO lifetimes? (settable) */
    2.56 +    struct list_head pcd_siblings;
    2.57      union {
    2.58          pfp_t *pfp;  /* page frame pointer */
    2.59          char *cdata; /* compressed data */
    2.60 +        struct tmem_page_content_descriptor *pcd; /* page dedup */
    2.61      };
    2.62      union {
    2.63          uint64_t timestamp;
    2.64 @@ -234,6 +244,25 @@ struct tmem_page_descriptor {
    2.65  };
    2.66  typedef struct tmem_page_descriptor pgp_t;
    2.67  
    2.68 +#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
    2.69 +
    2.70 +struct tmem_page_content_descriptor {
    2.71 +    union {
    2.72 +        pfp_t *pfp;  /* page frame pointer */
    2.73 +        char *cdata; /* if compression_enabled */
    2.74 +        char *tze; /* if !compression_enabled, trailing zeroes eliminated */
    2.75 +    };
    2.76 +    struct list_head pgp_list;
    2.77 +    struct rb_node pcd_rb_tree_node;
    2.78 +    uint32_t pgp_ref_count;
    2.79 +    pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
    2.80 +                     * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
    2.81 +                     * else PAGE_SIZE -> *pfp */
    2.82 +};
    2.83 +typedef struct tmem_page_content_descriptor pcd_t;
    2.84 +struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
    2.85 +rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
    2.86 +
    2.87  static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
    2.88  
    2.89  static LIST_HEAD(global_client_list);
    2.90 @@ -267,6 +296,7 @@ static DEFINE_SPINLOCK(pers_lists_spinlo
    2.91  static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
    2.92  static atomic_t global_obj_count = ATOMIC_INIT(0);
    2.93  static atomic_t global_pgp_count = ATOMIC_INIT(0);
    2.94 +static atomic_t global_pcd_count = ATOMIC_INIT(0);
    2.95  static atomic_t global_page_count = ATOMIC_INIT(0);
    2.96  static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
    2.97  
    2.98 @@ -336,6 +366,229 @@ static NOINLINE void tmem_page_free(pool
    2.99      atomic_dec_and_assert(global_page_count);
   2.100  }
   2.101  
   2.102 +/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
   2.103 +
   2.104 +#define NOT_SHAREABLE ((uint16_t)-1UL)
   2.105 +
   2.106 +static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp)
   2.107 +{
   2.108 +    uint8_t firstbyte = pgp->firstbyte;
   2.109 +    pcd_t *pcd;
   2.110 +    int ret;
   2.111 +
   2.112 +    ASSERT(tmh_dedup_enabled());
   2.113 +    tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
   2.114 +    pcd = pgp->pcd;
   2.115 +    if ( pgp->size < PAGE_SIZE && pgp->size != 0 &&
   2.116 +         pcd->size < PAGE_SIZE && pcd->size != 0 )
   2.117 +        ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL);
   2.118 +    else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE )
   2.119 +        ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size);
   2.120 +    else
   2.121 +        ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL);
   2.122 +    tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
   2.123 +    return ret;
   2.124 +}
   2.125 +
   2.126 +/* ensure pgp no longer points to pcd, nor vice-versa */
   2.127 +/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
   2.128 +static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
   2.129 +{
   2.130 +    pcd_t *pcd = pgp->pcd;
   2.131 +    pfp_t *pfp = pgp->pcd->pfp;
   2.132 +    uint16_t firstbyte = pgp->firstbyte;
   2.133 +    char *pcd_tze = pgp->pcd->tze;
   2.134 +    pagesize_t pcd_size = pcd->size;
   2.135 +    pagesize_t pgp_size = pgp->size;
   2.136 +    char *pcd_cdata = pgp->pcd->cdata;
   2.137 +    pagesize_t pcd_csize = pgp->pcd->size;
   2.138 +
   2.139 +    ASSERT(tmh_dedup_enabled());
   2.140 +    ASSERT(firstbyte != NOT_SHAREABLE);
   2.141 +    ASSERT(firstbyte < 256);
   2.142 +
   2.143 +    if ( have_pcd_rwlock )
   2.144 +        ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
   2.145 +    else
   2.146 +        tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
   2.147 +    list_del_init(&pgp->pcd_siblings);
   2.148 +    pgp->pcd = NULL;
   2.149 +    pgp->firstbyte = NOT_SHAREABLE;
   2.150 +    pgp->size = -1;
   2.151 +    if ( --pcd->pgp_ref_count )
   2.152 +    {
   2.153 +        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
   2.154 +        return;
   2.155 +    }
   2.156 +
   2.157 +    /* no more references to this pcd, recycle it and the physical page */
   2.158 +    ASSERT(list_empty(&pcd->pgp_list));
   2.159 +    pcd->pfp = NULL;
   2.160 +    /* remove pcd from rbtree */
   2.161 +    rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
   2.162 +    /* reinit the struct for safety for now */
   2.163 +    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
   2.164 +    /* now free up the pcd memory */
   2.165 +    tmem_free(pcd,sizeof(pcd_t),NULL);
   2.166 +    atomic_dec_and_assert(global_pcd_count);
   2.167 +    if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
   2.168 +    {
   2.169 +        /* compressed data */
   2.170 +        tmem_free(pcd_cdata,pcd_csize,pool);
   2.171 +        pcd_tot_csize -= pcd_csize;
   2.172 +    }
   2.173 +    else if ( pcd_size != PAGE_SIZE )
   2.174 +    {
   2.175 +        /* trailing zero data */
   2.176 +        pcd_tot_tze_size -= pcd_size;
   2.177 +        if ( pcd_size )
   2.178 +            tmem_free(pcd_tze,pcd_size,pool);
   2.179 +    } else {
   2.180 +        /* real physical page */
   2.181 +        if ( tmh_tze_enabled() )
   2.182 +            pcd_tot_tze_size -= PAGE_SIZE;
   2.183 +        if ( tmh_compression_enabled() )
   2.184 +            pcd_tot_csize -= PAGE_SIZE;
   2.185 +        tmem_page_free(pool,pfp);
   2.186 +    }
   2.187 +    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
   2.188 +}
   2.189 +
   2.190 +
   2.191 +static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize)
   2.192 +{
   2.193 +    struct rb_node **new, *parent = NULL;
   2.194 +    struct rb_root *root;
   2.195 +    pcd_t *pcd;
   2.196 +    int cmp;
   2.197 +    pagesize_t pfp_size = 0;
   2.198 +    uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) : *cdata;
   2.199 +    int ret = 0;
   2.200 +
   2.201 +    if ( !tmh_dedup_enabled() )
   2.202 +        return 0;
   2.203 +    ASSERT(pgp->obj != NULL);
   2.204 +    ASSERT(pgp->obj->pool != NULL);
   2.205 +    ASSERT(!pgp->obj->pool->persistent);
   2.206 +    if ( cdata == NULL )
   2.207 +    {
   2.208 +        ASSERT(pgp->pfp != NULL);
   2.209 +        pfp_size = PAGE_SIZE;
   2.210 +        if ( tmh_tze_enabled() )
   2.211 +        {
   2.212 +            pfp_size = tmh_tze_pfp_scan(pgp->pfp);
   2.213 +            if ( pfp_size > PCD_TZE_MAX_SIZE )
   2.214 +                pfp_size = PAGE_SIZE;
   2.215 +        }
   2.216 +        ASSERT(pfp_size <= PAGE_SIZE);
   2.217 +        ASSERT(!(pfp_size & (sizeof(uint64_t)-1)));
   2.218 +    }
   2.219 +    tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
   2.220 +
   2.221 +    /* look for page match */
   2.222 +    root = &pcd_tree_roots[firstbyte];
   2.223 +    new = &(root->rb_node);
   2.224 +    while ( *new )
   2.225 +    {
   2.226 +        pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
   2.227 +        parent = *new;
   2.228 +        /* compare new entry and rbtree entry, set cmp accordingly */
   2.229 +        if ( cdata != NULL )
   2.230 +        {
   2.231 +            if ( pcd->size < PAGE_SIZE )
   2.232 +                /* both new entry and rbtree entry are compressed */
   2.233 +                cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
   2.234 +            else
   2.235 +                /* new entry is compressed, rbtree entry is not */
   2.236 +                cmp = -1;
   2.237 +        } else if ( pcd->size < PAGE_SIZE )
   2.238 +            /* rbtree entry is compressed, new entry is not */
   2.239 +            cmp = 1;
   2.240 +        else if ( tmh_tze_enabled() ) {
   2.241 +            if ( pcd->size < PAGE_SIZE )
   2.242 +                /* both new entry and rbtree entry are trailing zero */
   2.243 +                cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
   2.244 +            else
   2.245 +                /* new entry is trailing zero, rbtree entry is not */
   2.246 +                cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
   2.247 +        } else  {
   2.248 +            /* both new entry and rbtree entry are full physical pages */
   2.249 +            ASSERT(pgp->pfp != NULL);
   2.250 +            ASSERT(pcd->pfp != NULL);
   2.251 +            cmp = tmh_page_cmp(pgp->pfp,pcd->pfp);
   2.252 +        }
   2.253 +
   2.254 +        /* walk tree or match depending on cmp */
   2.255 +        if ( cmp < 0 )
   2.256 +            new = &((*new)->rb_left);
   2.257 +        else if ( cmp > 0 )
   2.258 +            new = &((*new)->rb_right);
   2.259 +        else
   2.260 +        {
   2.261 +            /* match! if not compressed, free the no-longer-needed page */
   2.262 +            /* but if compressed, data is assumed static so don't free! */
   2.263 +            if ( cdata == NULL )
   2.264 +                tmem_page_free(pgp->obj->pool,pgp->pfp);
   2.265 +            deduped_puts++;
   2.266 +            goto match;
   2.267 +        }
   2.268 +    }
   2.269 +
   2.270 +    /* exited while loop with no match, so alloc a pcd and put it in the tree */
   2.271 +    if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
   2.272 +    {
   2.273 +        ret = -ENOMEM;
   2.274 +        goto unlock;
   2.275 +    } else if ( cdata != NULL ) {
   2.276 +        if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->obj->pool)) == NULL )
   2.277 +        {
   2.278 +            tmem_free(pcd,sizeof(pcd_t),NULL);
   2.279 +            ret = -ENOMEM;
   2.280 +            goto unlock;
   2.281 +        }
   2.282 +    }
   2.283 +    atomic_inc_and_max(global_pcd_count);
   2.284 +    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);  /* is this necessary */
   2.285 +    INIT_LIST_HEAD(&pcd->pgp_list);  /* is this necessary */
   2.286 +    pcd->pgp_ref_count = 0;
   2.287 +    if ( cdata != NULL )
   2.288 +    {
   2.289 +        memcpy(pcd->cdata,cdata,csize);
   2.290 +        pcd->size = csize;
   2.291 +        pcd_tot_csize += csize;
   2.292 +    } else if ( pfp_size == 0 ) {
   2.293 +        ASSERT(tmh_tze_enabled());
   2.294 +        pcd->size = 0;
   2.295 +        pcd->tze = NULL;
   2.296 +    } else if ( pfp_size < PAGE_SIZE &&
   2.297 +         ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->obj->pool)) != NULL) ) {
   2.298 +        tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
   2.299 +        pcd->size = pfp_size;
   2.300 +        pcd_tot_tze_size += pfp_size;
   2.301 +        tmem_page_free(pgp->obj->pool,pgp->pfp);
   2.302 +    } else {
   2.303 +        pcd->pfp = pgp->pfp;
   2.304 +        pcd->size = PAGE_SIZE;
   2.305 +        if ( tmh_tze_enabled() )
   2.306 +            pcd_tot_tze_size += PAGE_SIZE;
   2.307 +        if ( tmh_compression_enabled() )
   2.308 +            pcd_tot_csize += PAGE_SIZE;
   2.309 +    }
   2.310 +    rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
   2.311 +    rb_insert_color(&pcd->pcd_rb_tree_node, root);
   2.312 +
   2.313 +match:
   2.314 +    pcd->pgp_ref_count++;
   2.315 +    list_add(&pgp->pcd_siblings,&pcd->pgp_list);
   2.316 +    pgp->firstbyte = firstbyte;
   2.317 +    pgp->eviction_attempted = 0;
   2.318 +    pgp->pcd = pcd;
   2.319 +
   2.320 +unlock:
   2.321 +    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
   2.322 +    return ret;
   2.323 +}
   2.324 +
   2.325  /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
   2.326  
   2.327  /* allocate a pgp_t and associate it with an object */
   2.328 @@ -353,6 +606,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
   2.329      INIT_LIST_HEAD(&pgp->global_eph_pages);
   2.330      INIT_LIST_HEAD(&pgp->client_eph_pages);
   2.331      pgp->pfp = NULL;
   2.332 +    if ( tmh_dedup_enabled() )
   2.333 +    {
   2.334 +        pgp->firstbyte = NOT_SHAREABLE;
   2.335 +        pgp->eviction_attempted = 0;
   2.336 +        INIT_LIST_HEAD(&pgp->pcd_siblings);
   2.337 +    }
   2.338      pgp->size = -1;
   2.339      pgp->index = -1;
   2.340      pgp->timestamp = get_cycles();
   2.341 @@ -374,18 +633,20 @@ static pgp_t *pgp_lookup_in_obj(obj_t *o
   2.342  
   2.343  static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
   2.344  {
   2.345 +    pagesize_t pgp_size = pgp->size;
   2.346 +
   2.347      if ( pgp->pfp == NULL )
   2.348          return;
   2.349 -    if ( !pgp->size )
   2.350 +    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
   2.351 +        pcd_disassociate(pgp,pool,0); /* pgp->size lost */
   2.352 +    else if ( pgp_size )
   2.353 +        tmem_free(pgp->cdata,pgp_size,pool);
   2.354 +    else
   2.355          tmem_page_free(pgp->obj->pool,pgp->pfp);
   2.356 -    else
   2.357 +    if ( pool != NULL && pgp_size )
   2.358      {
   2.359 -        tmem_free(pgp->cdata,pgp->size,pool);
   2.360 -        if ( pool != NULL )
   2.361 -        {
   2.362 -            pool->client->compressed_pages--;
   2.363 -            pool->client->compressed_sum_size -= pgp->size;
   2.364 -        }
   2.365 +        pool->client->compressed_pages--;
   2.366 +        pool->client->compressed_sum_size -= pgp_size;
   2.367      }
   2.368      pgp->pfp = NULL;
   2.369      pgp->size = -1;
   2.370 @@ -987,10 +1248,56 @@ static void client_freeze(client_t *clie
   2.371  
   2.372  /************ MEMORY REVOCATION ROUTINES *******************************/
   2.373  
   2.374 +static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
   2.375 +{
   2.376 +    obj_t *obj = pgp->obj;
   2.377 +    pool_t *pool = obj->pool;
   2.378 +    client_t *client = pool->client;
   2.379 +    uint16_t firstbyte = pgp->firstbyte;
   2.380 +
   2.381 +    if ( pool->is_dying )
   2.382 +        return 0;
   2.383 +    if ( tmh_lock_all && !obj->no_evict )
   2.384 +       return 1;
   2.385 +    if ( tmem_spin_trylock(&obj->obj_spinlock) )
   2.386 +    {
   2.387 +        if ( tmh_dedup_enabled() )
   2.388 +        {
   2.389 +            firstbyte = pgp->firstbyte;
   2.390 +            if ( firstbyte ==  NOT_SHAREABLE )
   2.391 +                goto obj_unlock;
   2.392 +            ASSERT(firstbyte < 256);
   2.393 +            if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
   2.394 +                goto obj_unlock;
   2.395 +            if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted )
   2.396 +            {
   2.397 +                pgp->eviction_attempted++;
   2.398 +                list_del(&pgp->global_eph_pages);
   2.399 +                list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
   2.400 +                list_del(&pgp->client_eph_pages);
   2.401 +                list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
   2.402 +                goto pcd_unlock;
   2.403 +            }
   2.404 +        }
   2.405 +        if ( obj->pgp_count > 1 )
   2.406 +            return 1;
   2.407 +        if ( tmem_write_trylock(&pool->pool_rwlock) )
   2.408 +        {
   2.409 +            *hold_pool_rwlock = 1;
   2.410 +            return 1;
   2.411 +        }
   2.412 +pcd_unlock:
   2.413 +        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
   2.414 +obj_unlock:
   2.415 +        tmem_spin_unlock(&obj->obj_spinlock);
   2.416 +    }
   2.417 +    return 0;
   2.418 +}
   2.419 +
   2.420  static int tmem_evict(void)
   2.421  {
   2.422      client_t *client = tmh_client_from_current();
   2.423 -    pgp_t *pgp = NULL, *pgp_del;
   2.424 +    pgp_t *pgp = NULL, *pgp2, *pgp_del;
   2.425      obj_t *obj;
   2.426      pool_t *pool;
   2.427      int ret = 0;
   2.428 @@ -1001,49 +1308,15 @@ static int tmem_evict(void)
   2.429      if ( (client != NULL) && client_over_quota(client) &&
   2.430           !list_empty(&client->ephemeral_page_list) )
   2.431      {
   2.432 -        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
   2.433 -        {
   2.434 -            obj = pgp->obj;
   2.435 -            pool = obj->pool;
   2.436 -            if ( pool->is_dying )
   2.437 -                continue;
   2.438 -            if ( tmh_lock_all && !obj->no_evict )
   2.439 +        list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
   2.440 +            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
   2.441                  goto found;
   2.442 -            if ( tmem_spin_trylock(&obj->obj_spinlock) )
   2.443 -            {
   2.444 -                if ( obj->pgp_count > 1 )
   2.445 -                    goto found;
   2.446 -                if ( tmem_write_trylock(&pool->pool_rwlock) )
   2.447 -                {
   2.448 -                    hold_pool_rwlock = 1;
   2.449 -                    goto found;
   2.450 -                }
   2.451 -                tmem_spin_unlock(&obj->obj_spinlock);
   2.452 -            }
   2.453 -        }
   2.454      } else if ( list_empty(&global_ephemeral_page_list) ) {
   2.455          goto out;
   2.456      } else {
   2.457 -        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
   2.458 -        {
   2.459 -            obj = pgp->obj;
   2.460 -            pool = obj->pool;
   2.461 -            if ( pool->is_dying )
   2.462 -                continue;
   2.463 -            if ( tmh_lock_all && !obj->no_evict )
   2.464 +        list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
   2.465 +            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
   2.466                  goto found;
   2.467 -            if ( tmem_spin_trylock(&obj->obj_spinlock) )
   2.468 -            {
   2.469 -                if ( obj->pgp_count > 1 )
   2.470 -                    goto found;
   2.471 -                if ( tmem_write_trylock(&pool->pool_rwlock) )
   2.472 -                {
   2.473 -                    hold_pool_rwlock = 1;
   2.474 -                    goto found;
   2.475 -                }
   2.476 -                tmem_spin_unlock(&obj->obj_spinlock);
   2.477 -            }
   2.478 -        }
   2.479      }
   2.480  
   2.481      ret = 0;
   2.482 @@ -1057,10 +1330,16 @@ found:
   2.483      ASSERT(obj->no_evict == 0);
   2.484      ASSERT(obj->pool != NULL);
   2.485      ASSERT_SENTINEL(obj,OBJ);
   2.486 +    pool = obj->pool;
   2.487  
   2.488      ASSERT_SPINLOCK(&obj->obj_spinlock);
   2.489      pgp_del = pgp_delete_from_obj(obj, pgp->index);
   2.490      ASSERT(pgp_del == pgp);
   2.491 +    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
   2.492 +    {
   2.493 +        ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted);
   2.494 +        pcd_disassociate(pgp,pool,1);
   2.495 +    }
   2.496      pgp_delete(pgp,1);
   2.497      if ( obj->pgp_count == 0 )
   2.498      {
   2.499 @@ -1129,25 +1408,30 @@ static NOINLINE int do_tmem_put_compress
   2.500  #ifdef __i386__
   2.501      return -ENOMEM;
   2.502  #endif
   2.503 +
   2.504      if ( pgp->pfp != NULL )
   2.505 -        pgp_free_data(pgp, pgp->obj->pool);  /* FIXME... is this right? */
   2.506 +        pgp_free_data(pgp, pgp->obj->pool);
   2.507      START_CYC_COUNTER(compress);
   2.508      ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
   2.509      if ( (ret == -EFAULT) || (ret == 0) )
   2.510          goto out;
   2.511 -    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
   2.512 +    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
   2.513          ret = 0;
   2.514 -    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
   2.515 +        goto out;
   2.516 +    } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) {
   2.517 +        if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
   2.518 +            goto out;
   2.519 +    } else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) {
   2.520          ret = -ENOMEM;
   2.521 -    else
   2.522 -    {
   2.523 +        goto out;
   2.524 +    } else {
   2.525          memcpy(p,dst,size);
   2.526          pgp->cdata = p;
   2.527 -        pgp->size = size;
   2.528 -        pgp->obj->pool->client->compressed_pages++;
   2.529 -        pgp->obj->pool->client->compressed_sum_size += size;
   2.530 -        ret = 1;
   2.531      }
   2.532 +    pgp->size = size;
   2.533 +    pgp->obj->pool->client->compressed_pages++;
   2.534 +    pgp->obj->pool->client->compressed_sum_size += size;
   2.535 +    ret = 1;
   2.536  
   2.537  out:
   2.538      END_CYC_COUNTER(compress);
   2.539 @@ -1155,7 +1439,7 @@ out:
   2.540  }
   2.541  
   2.542  static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
   2.543 -       uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
   2.544 +       pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva)
   2.545  {
   2.546      pool_t *pool;
   2.547      obj_t *obj;
   2.548 @@ -1197,6 +1481,11 @@ copy_uncompressed:
   2.549      ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
   2.550      if ( ret == -EFAULT )
   2.551          goto bad_copy;
   2.552 +    if ( tmh_dedup_enabled() && !is_persistent(pool) )
   2.553 +    {
   2.554 +        if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
   2.555 +            goto failed_dup;
   2.556 +    }
   2.557      pgp->size = 0;
   2.558  
   2.559  done:
   2.560 @@ -1239,8 +1528,8 @@ failed_dup:
   2.561  
   2.562  static NOINLINE int do_tmem_put(pool_t *pool,
   2.563                uint64_t oid, uint32_t index,
   2.564 -              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
   2.565 -              uint32_t pfn_offset, uint32_t len, void *cva)
   2.566 +              tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
   2.567 +              pagesize_t pfn_offset, pagesize_t len, void *cva)
   2.568  {
   2.569      obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
   2.570      pgp_t *pgp = NULL, *pgpdel = NULL;
   2.571 @@ -1315,6 +1604,11 @@ copy_uncompressed:
   2.572      ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
   2.573      if ( ret == -EFAULT )
   2.574          goto bad_copy;
   2.575 +    if ( tmh_dedup_enabled() && !is_persistent(pool) )
   2.576 +    {
   2.577 +        if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
   2.578 +            goto delete_and_free;
   2.579 +    }
   2.580      pgp->size = 0;
   2.581  
   2.582  insert_page:
   2.583 @@ -1344,6 +1638,8 @@ insert_page:
   2.584      pool->good_puts++;
   2.585      if ( is_persistent(pool) )
   2.586          client->succ_pers_puts++;
   2.587 +    else
   2.588 +        tot_good_eph_puts++;
   2.589      return 1;
   2.590  
   2.591  delete_and_free:
   2.592 @@ -1376,8 +1672,8 @@ ASSERT(0);
   2.593  }
   2.594  
   2.595  static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
   2.596 -              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
   2.597 -              uint32_t pfn_offset, uint32_t len, void *cva)
   2.598 +              tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
   2.599 +              pagesize_t pfn_offset, pagesize_t len, void *cva)
   2.600  {
   2.601      obj_t *obj;
   2.602      pgp_t *pgp;
   2.603 @@ -1404,15 +1700,18 @@ static NOINLINE int do_tmem_get(pool_t *
   2.604          return 0;
   2.605      }
   2.606      ASSERT(pgp->size != -1);
   2.607 -    if ( pgp->size != 0 )
   2.608 +    if ( tmh_dedup_enabled() && !is_persistent(pool) &&
   2.609 +              pgp->firstbyte != NOT_SHAREABLE )
   2.610      {
   2.611 +        if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT )
   2.612 +            goto bad_copy;
   2.613 +    } else if ( pgp->size != 0 ) {
   2.614          START_CYC_COUNTER(decompress);
   2.615          if ( tmh_decompress_to_client(cmfn, pgp->cdata,
   2.616                                        pgp->size, cva) == -EFAULT )
   2.617              goto bad_copy;
   2.618          END_CYC_COUNTER(decompress);
   2.619 -    }
   2.620 -    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
   2.621 +    } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
   2.622                                   pfn_offset, len, cva) == -EFAULT)
   2.623          goto bad_copy;
   2.624      if ( is_ephemeral(pool) )
   2.625 @@ -1855,11 +2154,15 @@ static int tmemc_list_global(tmem_cli_va
   2.626        total_flush_pool, use_long ? ',' : '\n');
   2.627      if (use_long)
   2.628          n += scnprintf(info+n,BSIZE-n,
   2.629 -          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
   2.630 +          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
   2.631 +          "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n",
   2.632            global_eph_count, global_eph_count_max,
   2.633            _atomic_read(global_obj_count), global_obj_count_max,
   2.634            _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
   2.635 -          _atomic_read(global_pgp_count), global_pgp_count_max);
   2.636 +          _atomic_read(global_pgp_count), global_pgp_count_max,
   2.637 +          _atomic_read(global_page_count), global_page_count_max,
   2.638 +          _atomic_read(global_pcd_count), global_pcd_count_max,
   2.639 +         tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize);
   2.640      if ( sum + n >= len )
   2.641          return sum;
   2.642      tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
   2.643 @@ -1912,6 +2215,13 @@ static int tmemc_set_var_one(client_t *c
   2.644  #ifdef __i386__
   2.645          return -1;
   2.646  #endif
   2.647 +        if ( tmh_dedup_enabled() )
   2.648 +        {
   2.649 +            printk("tmem: compression %s for all %ss, cannot be changed "
   2.650 +                   "when tmem_dedup is enabled\n",
   2.651 +            tmh_compression_enabled() ? "enabled" : "disabled",client_str);
   2.652 +            return -1;
   2.653 +        }
   2.654          client->compress = arg1 ? 1 : 0;
   2.655          printk("tmem: compression %s for %s=%d\n",
   2.656              arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
   2.657 @@ -2569,14 +2879,28 @@ EXPORT void *tmem_relinquish_pages(unsig
   2.658  /* called at hypervisor startup */
   2.659  EXPORT void init_tmem(void)
   2.660  {
   2.661 +    int i;
   2.662      if ( !tmh_enabled() )
   2.663          return;
   2.664  
   2.665      radix_tree_init();
   2.666 +    if ( tmh_dedup_enabled() )
   2.667 +        for (i = 0; i < 256; i++ )
   2.668 +        {
   2.669 +            pcd_tree_roots[i] = RB_ROOT;
   2.670 +            rwlock_init(&pcd_tree_rwlocks[i]);
   2.671 +        }
   2.672 +
   2.673      if ( tmh_init() )
   2.674      {
   2.675 -        printk("tmem: initialized comp=%d global-lock=%d\n",
   2.676 -            tmh_compression_enabled(), tmh_lock_all);
   2.677 +        printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n",
   2.678 +            tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enabled(),
   2.679 +            tmh_lock_all);
   2.680 +        if ( tmh_dedup_enabled()&&tmh_compression_enabled()&&tmh_tze_enabled() )
   2.681 +        {
   2.682 +            tmh_tze_disable();
   2.683 +            printk("tmem: tze and compression not compatible, disabling tze\n");
   2.684 +        }
   2.685          tmem_initialized = 1;
   2.686      }
   2.687      else
     3.1 --- a/xen/common/tmem_xen.c	Tue Apr 06 07:09:35 2010 +0100
     3.2 +++ b/xen/common/tmem_xen.c	Tue Apr 06 07:11:48 2010 +0100
     3.3 @@ -20,6 +20,12 @@ boolean_param("tmem", opt_tmem);
     3.4  EXPORT int opt_tmem_compress = 0;
     3.5  boolean_param("tmem_compress", opt_tmem_compress);
     3.6  
     3.7 +EXPORT int opt_tmem_dedup = 0;   /* page deduplication, off by default */
     3.8 +boolean_param("tmem_dedup", opt_tmem_dedup);
     3.9 +
    3.10 +EXPORT int opt_tmem_tze = 0;   /* trailing-zero elimination, off by default */
    3.11 +boolean_param("tmem_tze", opt_tmem_tze);
    3.12 +
    3.13  EXPORT int opt_tmem_shared_auth = 0;
    3.14  boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
    3.15  
    3.16 @@ -103,8 +109,8 @@ static inline void *cli_mfn_to_va(tmem_c
    3.17  #endif
    3.18  
    3.19  EXPORT int tmh_copy_from_client(pfp_t *pfp,
    3.20 -    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
    3.21 -    uint32_t pfn_offset, uint32_t len, void *cli_va)
    3.22 +    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
    3.23 +    pagesize_t pfn_offset, pagesize_t len, void *cli_va)
    3.24  {
    3.25      unsigned long tmem_mfn;
    3.26      void *tmem_va;
    3.27 @@ -148,7 +154,7 @@ EXPORT int tmh_compress_from_client(tmem
    3.28  }
    3.29  
    3.30  EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
    3.31 -    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cli_va)
    3.32 +    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cli_va)
    3.33  {
    3.34      unsigned long tmem_mfn, cli_mfn = 0;
    3.35      int mark_dirty = 1;
    3.36 @@ -199,6 +205,27 @@ EXPORT int tmh_decompress_to_client(tmem
    3.37      return 1;
    3.38  }
    3.39  
    3.40 +EXPORT int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
    3.41 +                                    pagesize_t len)
    3.42 +{
    3.43 +    unsigned long cli_mfn;
    3.44 +    char *dst;   /* mapping of the client page that receives the data */
    3.45 +    ASSERT(!(len & (sizeof(uint64_t)-1)));
    3.46 +    ASSERT(len <= PAGE_SIZE);
    3.47 +    ASSERT(len > 0 || tmem_va == NULL);
    3.48 +    dst = cli_mfn_to_va(cmfn,&cli_mfn);
    3.49 +    if ( dst == NULL )
    3.50 +        return -EFAULT;
    3.51 +    if ( len > 0 )   /* copy the len bytes that were actually stored */
    3.52 +        memcpy(dst,(char *)tmem_va,len);
    3.53 +    if ( len < PAGE_SIZE )   /* zero-fill the remainder of the client page */
    3.54 +        memset(dst+len,0,PAGE_SIZE-len);
    3.55 +    unmap_domain_page(dst);
    3.56 +    paging_mark_dirty(current->domain,cli_mfn);
    3.57 +    mb();
    3.58 +    return 1;   /* 1 on success, -EFAULT if the client page cannot be mapped */
    3.59 +}
    3.60 +
    3.61  /******************  XEN-SPECIFIC MEMORY ALLOCATION ********************/
    3.62  
    3.63  EXPORT struct xmem_pool *tmh_mempool = 0;
     4.1 --- a/xen/include/xen/tmem_xen.h	Tue Apr 06 07:09:35 2010 +0100
     4.2 +++ b/xen/include/xen/tmem_xen.h	Tue Apr 06 07:11:48 2010 +0100
     4.3 @@ -26,6 +26,8 @@ struct tmem_host_dependent_client {
     4.4  };
     4.5  typedef struct tmem_host_dependent_client tmh_client_t;
     4.6  
     4.7 +typedef uint32_t pagesize_t;  /* like size_t, must handle largest PAGE_SIZE */
     4.8 +
     4.9  #define IS_PAGE_ALIGNED(addr) \
    4.10    ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr)
    4.11  #define IS_VALID_PAGE(_pi)  ( mfn_valid(page_to_mfn(_pi)) )
    4.12 @@ -54,6 +56,23 @@ static inline int tmh_compression_enable
    4.13      return opt_tmem_compress;
    4.14  }
    4.15  
    4.16 +extern int opt_tmem_dedup;   /* defined in tmem_xen.c, boolean_param "tmem_dedup" */
    4.17 +static inline int tmh_dedup_enabled(void)
    4.18 +{
    4.19 +    return opt_tmem_dedup;   /* non-zero when page deduplication is switched on */
    4.20 +}
    4.21 +
    4.22 +extern int opt_tmem_tze;   /* defined in tmem_xen.c, boolean_param "tmem_tze" */
    4.23 +static inline int tmh_tze_enabled(void)
    4.24 +{
    4.25 +    return opt_tmem_tze;   /* non-zero while trailing-zero elimination is on */
    4.26 +}
    4.27 +
    4.28 +static inline void tmh_tze_disable(void)
    4.29 +{
    4.30 +    opt_tmem_tze = 0;   /* clears the flag that tmh_tze_enabled() reports */
    4.31 +}
    4.32 +
    4.33  extern int opt_tmem_shared_auth;
    4.34  static inline int tmh_shared_auth(void)
    4.35  {
    4.36 @@ -326,6 +345,101 @@ static inline bool_t tmh_current_is_priv
    4.37      return IS_PRIV(current->domain);
    4.38  }
    4.39  
    4.40 +static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
    4.41 +{
    4.42 +    const uint8_t *p = __map_domain_page(pfp), first = *p;   /* byte 0 of page */
    4.43 +    unmap_domain_page(p);   /* pair the __map_domain_page() with its unmap */
    4.44 +    return first;
    4.45 +}
    4.46 +
    4.47 +static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
    4.48 +{
    4.49 +    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
    4.50 +    const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
    4.51 +    const int nwords = PAGE_SIZE/sizeof(uint64_t);
    4.52 +    int i, rc = 0;   /* rc stays 0 when every word matches */
    4.53 +    /* word-wise three-way compare; FIXME: code in assembly? */
    4.54 +    ASSERT(p1 != NULL);
    4.55 +    ASSERT(p2 != NULL);
    4.56 +    for ( i = 0; i < nwords && p1[i] == p2[i]; i++ );
    4.57 +    if ( i < nwords )   /* stopped early: word i is the first difference */
    4.58 +        rc = ( p1[i] < p2[i] ) ? -1 : 1;
    4.59 +    unmap_domain_page(p2);   /* both mappings are released on every path */
    4.60 +    unmap_domain_page(p1);
    4.61 +    return rc;
    4.62 +}
    4.63 +
    4.64 +static inline int tmh_pcd_cmp(void *va1, pagesize_t len1, void *va2, pagesize_t len2)
    4.65 +{
    4.66 +    const char *a = (const char *)va1;
    4.67 +    const char *b = (const char *)va2;
    4.68 +    pagesize_t left = len2;   /* bytes not yet compared */
    4.69 +
    4.70 +    ASSERT(len1 <= PAGE_SIZE);
    4.71 +    ASSERT(len2 <= PAGE_SIZE);
    4.72 +    /* order by length first; only equal lengths reach the byte compare */
    4.73 +    if ( len1 != len2 )
    4.74 +        return ( len1 < len2 ) ? -1 : 1;
    4.75 +    ASSERT(len1 == len2);
    4.76 +
    4.77 +    /* skip the common prefix, then order by the first differing char */
    4.78 +    for ( ; left != 0 && *a == *b; left--, a++, b++ )
    4.79 +        continue;
    4.80 +    if ( left == 0 )
    4.81 +        return 0;
    4.82 +    return ( *a < *b ) ? -1 : 1;
    4.83 +}
    4.84 +
    4.85 +static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len, void *tva, pagesize_t tze_len)
    4.86 +{
    4.87 +    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
    4.88 +    /* a PAGE_SIZE entry is passed as a page frame, anything shorter as memory */
    4.89 +    const uint64_t *p2 = ( tze_len == PAGE_SIZE ) ?
    4.90 +        (uint64_t *)__map_domain_page((pfp_t *)tva) : (uint64_t *)tva;
    4.91 +    pagesize_t i, nwords = tze_len/sizeof(uint64_t);
    4.92 +    int rc = 0;   /* stays 0 when lengths and contents are equal */
    4.93 +    ASSERT(pfp_len <= PAGE_SIZE);
    4.94 +    ASSERT(!(pfp_len & (sizeof(uint64_t)-1)));
    4.95 +    ASSERT(tze_len <= PAGE_SIZE);
    4.96 +    ASSERT(!(tze_len & (sizeof(uint64_t)-1)));
    4.97 +    if ( pfp_len != tze_len )
    4.98 +        rc = ( pfp_len < tze_len ) ? -1 : 1;   /* shorter data sorts first */
    4.99 +    else
   4.100 +    {
   4.101 +        for ( i = 0; i < nwords && p1[i] == p2[i]; i++ );
   4.102 +        if ( i < nwords )
   4.103 +            rc = ( p1[i] < p2[i] ) ? -1 : 1;
   4.104 +    }
   4.105 +    /* single exit so that every mapping taken above is released again */
   4.106 +    if ( tze_len == PAGE_SIZE )
   4.107 +        unmap_domain_page(p2);
   4.108 +    unmap_domain_page(p1);
   4.109 +    return rc;
   4.110 +}
   4.111 +
   4.112 +/* return the size of the data in the pfp, ignoring trailing zeroes and
   4.113 + * rounded up to the nearest multiple of 8; the page is unmapped before return */
   4.114 +static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp)
   4.115 +{
   4.116 +    const uint64_t *p = (uint64_t *)__map_domain_page(pfp);
   4.117 +    pagesize_t len = PAGE_SIZE/sizeof(uint64_t);   /* length in 64-bit words */
   4.118 +
   4.119 +    while ( len != 0 && p[len-1] == 0 )   /* strip trailing all-zero words */
   4.120 +        len--;
   4.121 +    unmap_domain_page(p);
   4.122 +    return len * sizeof(uint64_t);   /* back to bytes; 0 for an all-zero page */
   4.123 +}
   4.124 +
   4.125 +static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t len)
   4.126 +{
   4.127 +    uint64_t *dst = (uint64_t *)tva;
   4.128 +    const uint64_t *src = (uint64_t *)__map_domain_page(pfp);
   4.129 +    pagesize_t i;   /* word index, counts down from the last word to 0 */
   4.130 +    ASSERT(!(len & (sizeof(uint64_t)-1)));   /* len is in bytes, whole words */
   4.131 +    for ( i = len/sizeof(uint64_t); i--; dst[i] = src[i] );
   4.132 +    unmap_domain_page(src);   /* release the mapping once the copy is done */
   4.133 +}
   4.134 +
   4.135  /* these typedefs are in the public/tmem.h interface
   4.136  typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
   4.137  typedef XEN_GUEST_HANDLE(char) cli_va_t;
   4.138 @@ -378,11 +492,13 @@ extern int tmh_decompress_to_client(tmem
   4.139  extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*);
   4.140  
   4.141  extern int tmh_copy_from_client(pfp_t *pfp,
   4.142 -    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
   4.143 -    uint32_t pfn_offset, uint32_t len, void *cva);
   4.144 +    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
   4.145 +    pagesize_t pfn_offset, pagesize_t len, void *cva);
   4.146  
   4.147  extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
   4.148 -    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva);
   4.149 +    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva);
   4.150 +
   4.151 +extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, pagesize_t len);
   4.152  
   4.153  
   4.154  #define TMEM_PERF