
view xen/include/xen/tmem_xen.h @ 21149:61372a4f4e76

tmem: add page deduplication with optional compression or trailing-zero-elimination

Add a "page deduplication" capability (with optional compression
and trailing-zero elimination) to Xen's tmem.

The feature is transparent to tmem-enabled guests. Ephemeral pages
with identical content are "combined" so that only one page frame
is needed. Since ephemeral pages are essentially read-only, no
copy-on-write (and thus no equivalent of swapping) is necessary.
Deduplication can be combined with compression or "trailing zero
elimination" for even greater space savings. (A sketch of the
resulting put-path decisions appears after the changeset metadata
below.)

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Apr 06 07:11:48 2010 +0100 (2010-04-06)
parents a3fa6d444b25
children a33909be109c
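
To make the description concrete, here is a minimal sketch of the decision
order a dedup-aware put path can follow for an ephemeral page. It is
hypothetical: it uses the option checks defined in this header, but the
find_*/share_*/store_* helpers are illustrative names only and do not exist
in the tree.

/* Hypothetical sketch only -- not the actual tmem put path. */
static int example_put_ephemeral(pfp_t *pfp)
{
    if ( tmh_dedup_enabled() && find_matching_page(pfp) ) /* illustrative */
        return share_existing_frame(pfp);                 /* illustrative */
    if ( tmh_tze_enabled() )
    {
        /* Store only the non-zero prefix, rounded up to 8 bytes. */
        pagesize_t len = tmh_tze_pfp_scan(pfp);
        if ( len < PAGE_SIZE )
            return store_tze(pfp, len);                   /* illustrative */
    }
    if ( tmh_compression_enabled() )
        return store_compressed(pfp);                     /* illustrative */
    return store_whole_page(pfp);                         /* illustrative */
}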
/******************************************************************************
 * tmem_xen.h
 *
 * Xen-specific Transcendent memory
 *
 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
 */

#ifndef __XEN_TMEM_XEN_H__
#define __XEN_TMEM_XEN_H__

#include <xen/config.h>
#include <xen/mm.h>           /* heap alloc/free */
#include <xen/xmalloc.h>      /* xmalloc/xfree */
#include <xen/sched.h>        /* struct domain */
#include <xen/guest_access.h> /* copy_from_guest */
#include <xen/hash.h>         /* hash_long */
#include <public/tmem.h>
#ifdef CONFIG_COMPAT
#include <compat/tmem.h>
#endif
struct tmem_host_dependent_client {
    struct domain *domain;
    struct xmem_pool *persistent_pool;
};
typedef struct tmem_host_dependent_client tmh_client_t;

typedef uint32_t pagesize_t;  /* like size_t, must handle largest PAGE_SIZE */

#define IS_PAGE_ALIGNED(addr) \
    ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr)
#define IS_VALID_PAGE(_pi)  ( mfn_valid(page_to_mfn(_pi)) )

extern struct xmem_pool *tmh_mempool;
extern unsigned int tmh_mempool_maxalloc;
extern struct page_list_head tmh_page_list;
extern spinlock_t tmh_page_list_lock;
extern unsigned long tmh_page_list_pages;
extern atomic_t freeable_page_count;

extern spinlock_t tmem_lock;
extern spinlock_t tmem_spinlock;
extern rwlock_t tmem_rwlock;
extern void tmh_copy_page(char *to, char *from);
extern int tmh_init(void);
#define tmh_hash hash_long

extern void tmh_release_avail_pages_to_host(void);
extern void tmh_scrub_page(struct page_info *pi, unsigned int memflags);

extern int opt_tmem_compress;
static inline int tmh_compression_enabled(void)
{
    return opt_tmem_compress;
}

extern int opt_tmem_dedup;
static inline int tmh_dedup_enabled(void)
{
    return opt_tmem_dedup;
}

extern int opt_tmem_tze;
static inline int tmh_tze_enabled(void)
{
    return opt_tmem_tze;
}

static inline void tmh_tze_disable(void)
{
    opt_tmem_tze = 0;
}

extern int opt_tmem_shared_auth;
static inline int tmh_shared_auth(void)
{
    return opt_tmem_shared_auth;
}

extern int opt_tmem;
static inline int tmh_enabled(void)
{
    return opt_tmem;
}

extern int opt_tmem_lock;

extern int opt_tmem_flush_dups;
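
/* Note (descriptive, not part of this patch): each opt_tmem_* flag above is
 * expected to be wired to a Xen boot command-line parameter in the tmem host
 * code (e.g. a boolean_param()/integer_param() definition); the exact
 * parameter names live in the .c file, not in this header. */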
/*
 * Memory free page list management
 */

static inline struct page_info *tmh_page_list_get(void)
{
    struct page_info *pi;

    spin_lock(&tmh_page_list_lock);
    if ( (pi = page_list_remove_head(&tmh_page_list)) != NULL )
        tmh_page_list_pages--;
    spin_unlock(&tmh_page_list_lock);
    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
    return pi;
}

static inline void tmh_page_list_put(struct page_info *pi)
{
    ASSERT(IS_VALID_PAGE(pi));
    spin_lock(&tmh_page_list_lock);
    page_list_add(pi, &tmh_page_list);
    tmh_page_list_pages++;
    spin_unlock(&tmh_page_list_lock);
}

static inline unsigned long tmh_avail_pages(void)
{
    return tmh_page_list_pages;
}
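
/* Minimal usage sketch (hypothetical, illustration only): callers treat
 * tmh_page_list as a cache of free frames and fall back to the domheap when
 * it is empty, as the allocators below do. */
#if 0
static struct page_info *example_get_frame(void)
{
    struct page_info *pi = tmh_page_list_get(); /* NULL if the list is empty */

    if ( pi == NULL )
        pi = alloc_domheap_pages(0, 0, MEMF_tmem); /* fall back to the heap */
    return pi;
}
#endif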
/*
 * Memory allocation for persistent data
 */

static inline bool_t domain_fully_allocated(struct domain *d)
{
    return ( d->tot_pages >= d->max_pages );
}
#define tmh_client_memory_fully_allocated(_pool) \
    domain_fully_allocated(_pool->client->tmh->domain)

static inline void *_tmh_alloc_subpage_thispool(struct xmem_pool *cmem_mempool,
                                                size_t size, size_t align)
{
#if 0
    if ( d->tot_pages >= d->max_pages )
        return NULL;
#endif
#ifdef __i386__
    return _xmalloc(size, align);
#else
    ASSERT( size < tmh_mempool_maxalloc );
    if ( cmem_mempool == NULL )
        return NULL;
    return xmem_pool_alloc(size, cmem_mempool);
#endif
}
#define tmh_alloc_subpage_thispool(_pool, _s, _a) \
    _tmh_alloc_subpage_thispool(_pool->client->tmh->persistent_pool, \
                                _s, _a)

static inline void _tmh_free_subpage_thispool(struct xmem_pool *cmem_mempool,
                                              void *ptr, size_t size)
{
#ifdef __i386__
    xfree(ptr);
#else
    ASSERT( size < tmh_mempool_maxalloc );
    ASSERT( cmem_mempool != NULL );
    xmem_pool_free(ptr, cmem_mempool);
#endif
}
#define tmh_free_subpage_thispool(_pool, _p, _s) \
    _tmh_free_subpage_thispool(_pool->client->tmh->persistent_pool, _p, _s)

static inline struct page_info *_tmh_alloc_page_thispool(struct domain *d)
{
    struct page_info *pi;

    /* Note that this tot_pages check is not protected by d->page_alloc_lock,
     * so it may race and periodically fail in donate_page or
     * alloc_domheap_pages. That's OK... neither is a problem, though it is
     * chatty if log_lvl is set. */
    if ( d->tot_pages >= d->max_pages )
        return NULL;

    if ( tmh_page_list_pages )
    {
        if ( (pi = tmh_page_list_get()) != NULL )
        {
            if ( donate_page(d, pi, 0) == 0 )
                goto out;
            else
                tmh_page_list_put(pi);
        }
    }

    pi = alloc_domheap_pages(d, 0, MEMF_tmem);

out:
    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
    return pi;
}
#define tmh_alloc_page_thispool(_pool) \
    _tmh_alloc_page_thispool(_pool->client->tmh->domain)

static inline void _tmh_free_page_thispool(struct page_info *pi)
{
    struct domain *d = page_get_owner(pi);

    ASSERT(IS_VALID_PAGE(pi));
    if ( (d == NULL) || steal_page(d, pi, 0) == 0 )
        tmh_page_list_put(pi);
    else
    {
        scrub_one_page(pi);
        ASSERT((pi->count_info & ~(PGC_allocated | 1)) == 0);
        free_domheap_pages(pi, 0);
    }
}
#define tmh_free_page_thispool(_pool, _pg) \
    _tmh_free_page_thispool(_pg)
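
/* Accounting note: a persistent pool's frames are charged to the owning
 * domain -- _tmh_alloc_page_thispool() refuses once tot_pages reaches
 * max_pages and otherwise donate_page()s a free-list frame (or allocates
 * from the domheap), while _tmh_free_page_thispool() steals the page back
 * before returning it to tmh_page_list. */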
/*
 * Memory allocation for ephemeral (non-persistent) data
 */

static inline void *tmh_alloc_subpage(void *pool, size_t size,
                                      size_t align)
{
#ifdef __i386__
    ASSERT( size < PAGE_SIZE );
    return _xmalloc(size, align);
#else
    ASSERT( size < tmh_mempool_maxalloc );
    ASSERT( tmh_mempool != NULL );
    return xmem_pool_alloc(size, tmh_mempool);
#endif
}

static inline void tmh_free_subpage(void *ptr, size_t size)
{
#ifdef __i386__
    ASSERT( size < PAGE_SIZE );
    xfree(ptr);
#else
    ASSERT( size < tmh_mempool_maxalloc );
    xmem_pool_free(ptr, tmh_mempool);
#endif
}

static inline struct page_info *tmh_alloc_page(void *pool, int no_heap)
{
    struct page_info *pi = tmh_page_list_get();

    if ( pi == NULL && !no_heap )
        pi = alloc_domheap_pages(0, 0, MEMF_tmem);
    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
    if ( pi != NULL && !no_heap )
        atomic_inc(&freeable_page_count);
    return pi;
}

static inline void tmh_free_page(struct page_info *pi)
{
    ASSERT(IS_VALID_PAGE(pi));
    tmh_page_list_put(pi);
    atomic_dec(&freeable_page_count);
}

static inline unsigned int tmem_subpage_maxsize(void)
{
    return tmh_mempool_maxalloc;
}

static inline unsigned long tmh_freeable_pages(void)
{
    return tmh_avail_pages() + _atomic_read(freeable_page_count);
}

static inline unsigned long tmh_free_mb(void)
{
    return (tmh_avail_pages() + total_free_pages()) >> (20 - PAGE_SHIFT);
}
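
/* Worked example for tmh_free_mb(): with 4KiB pages, PAGE_SHIFT == 12, so
 * the shift is 20 - 12 = 8 and a page count >> 8 yields MiB (256 pages of
 * 4KiB each per MiB). */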
/*
 * Memory allocation for "infrastructure" data
 */

static inline void *tmh_alloc_infra(size_t size, size_t align)
{
    return _xmalloc(size, align);
}

static inline void tmh_free_infra(void *p)
{
    xfree(p);
}

#define tmh_lock_all  opt_tmem_lock
#define tmh_flush_dups  opt_tmem_flush_dups
#define tmh_called_from_tmem(_memflags) (_memflags & MEMF_tmem)
/* "Client" (== domain) abstraction */

struct client;
typedef domid_t cli_id_t;
typedef struct domain tmh_cli_ptr_t;
typedef struct page_info pfp_t;

extern tmh_client_t *tmh_client_init(cli_id_t);
extern void tmh_client_destroy(tmh_client_t *);

/* this appears to be unreliable when a domain is being shut down */
static inline struct client *tmh_client_from_cli_id(cli_id_t cli_id)
{
    struct domain *d = get_domain_by_id(cli_id); /* incs d->refcnt! */

    if ( d == NULL )
        return NULL;
    return (struct client *)(d->tmem);
}

static inline void tmh_client_put(tmh_client_t *tmh)
{
    put_domain(tmh->domain);
}

static inline struct client *tmh_client_from_current(void)
{
    return (struct client *)(current->domain->tmem);
}

#define tmh_client_is_dying(_client) (!!_client->tmh->domain->is_dying)

static inline cli_id_t tmh_get_cli_id_from_current(void)
{
    return current->domain->domain_id;
}

static inline tmh_cli_ptr_t *tmh_get_cli_ptr_from_current(void)
{
    return current->domain;
}

static inline void tmh_set_client_from_id(struct client *client,
                                          tmh_client_t *tmh, cli_id_t cli_id)
{
    struct domain *d = get_domain_by_id(cli_id);

    d->tmem = client;
    tmh->domain = d;
}

static inline bool_t tmh_current_is_privileged(void)
{
    return IS_PRIV(current->domain);
}

static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
{
    void *p = __map_domain_page(pfp);

    return (uint8_t)(*(char *)p);
}
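
/* Note: the first byte gives a cheap 256-way discriminator, so the common
 * deduplication code can bucket candidate pages by tmh_get_first_byte() and
 * only run full compares within a bucket (descriptive note; the bucketing
 * itself lives outside this header). */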
static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
{
    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
    const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
    int i;

    // FIXME: code in assembly?
    ASSERT(p1 != NULL);
    ASSERT(p2 != NULL);
    for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i--, p1++, p2++ );
    if ( !i )
        return 0;
    if ( *p1 < *p2 )
        return -1;
    return 1;
}
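
/* tmh_page_cmp() is a memcmp-style comparator over 64-bit words (returning
 * -1/0/1), giving a total order that lets the deduplication code keep
 * candidate pages in a sorted search structure. */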
static inline int tmh_pcd_cmp(void *va1, pagesize_t len1, void *va2, pagesize_t len2)
{
    const char *p1 = (char *)va1;
    const char *p2 = (char *)va2;
    pagesize_t i;

    ASSERT(len1 <= PAGE_SIZE);
    ASSERT(len2 <= PAGE_SIZE);
    if ( len1 < len2 )
        return -1;
    if ( len1 > len2 )
        return 1;
    ASSERT(len1 == len2);
    for ( i = len2; i && *p1 == *p2; i--, p1++, p2++ );
    if ( !i )
        return 0;
    if ( *p1 < *p2 )
        return -1;
    return 1;
}
static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len, void *tva, pagesize_t tze_len)
{
    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
    const uint64_t *p2;
    pagesize_t i;

    if ( tze_len == PAGE_SIZE )
        p2 = (uint64_t *)__map_domain_page((pfp_t *)tva);
    else
        p2 = (uint64_t *)tva;
    ASSERT(pfp_len <= PAGE_SIZE);
    ASSERT(!(pfp_len & (sizeof(uint64_t)-1)));
    ASSERT(tze_len <= PAGE_SIZE);
    ASSERT(!(tze_len & (sizeof(uint64_t)-1)));
    if ( pfp_len < tze_len )
        return -1;
    if ( pfp_len > tze_len )
        return 1;
    ASSERT(pfp_len == tze_len);
    for ( i = tze_len/sizeof(uint64_t); i && *p1 == *p2; i--, p1++, p2++ );
    if ( !i )
        return 0;
    if ( *p1 < *p2 )
        return -1;
    return 1;
}
/* return the size of the data in the pfp, ignoring trailing zeroes and
 * rounded up to the nearest multiple of 8 */
static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp)
{
    const uint64_t *p = (uint64_t *)__map_domain_page(pfp);
    pagesize_t bytecount = PAGE_SIZE;
    pagesize_t len = PAGE_SIZE/sizeof(uint64_t);

    p += len;
    while ( len-- && !*--p )
        bytecount -= sizeof(uint64_t);
    return bytecount;
}

static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t len)
{
    uint64_t *p1 = (uint64_t *)tva;
    const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp);
    pagesize_t i;

    ASSERT(!(len & (sizeof(uint64_t)-1)));
    for ( i = len/sizeof(uint64_t); i--; *p1++ = *p2++ );
}
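
/* Round-trip sketch (hypothetical, illustration only): scan a page's
 * non-zero length, copy the prefix out, then verify it against the original
 * frame with tmh_tze_pfp_cmp(). */
#if 0
static void example_tze_roundtrip(pfp_t *pfp)
{
    pagesize_t len = tmh_tze_pfp_scan(pfp); /* multiple of 8, <= PAGE_SIZE */
    void *buf;

    if ( len == 0 || len >= tmem_subpage_maxsize() )
        return; /* empty or full-size pages are handled differently */
    buf = tmh_alloc_subpage(NULL, len, sizeof(uint64_t));
    if ( buf == NULL )
        return;
    tmh_tze_copy_from_pfp(buf, pfp, len);
    ASSERT(tmh_tze_pfp_cmp(pfp, len, buf, len) == 0);
    tmh_free_subpage(buf, len);
}
#endif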
/* these typedefs are in the public/tmem.h interface
typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
typedef XEN_GUEST_HANDLE(char) cli_va_t;
*/
typedef XEN_GUEST_HANDLE(tmem_op_t) tmem_cli_op_t;

static inline int tmh_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops)
{
#ifdef CONFIG_COMPAT
    if ( is_pv_32on64_vcpu(current) )
    {
        int rc;
        enum XLAT_tmem_op_u u;
        tmem_op_compat_t cop;

        rc = copy_from_guest(&cop, guest_handle_cast(uops, void), 1);
        if ( rc )
            return rc;
        switch ( cop.cmd )
        {
        case TMEM_NEW_POOL:    u = XLAT_tmem_op_u_new;  break;
        case TMEM_CONTROL:     u = XLAT_tmem_op_u_ctrl; break;
        case TMEM_AUTH:        u = XLAT_tmem_op_u_new;  break;
        case TMEM_RESTORE_NEW: u = XLAT_tmem_op_u_new;  break;
        default:               u = XLAT_tmem_op_u_gen;  break;
        }
#define XLAT_tmem_op_HNDL_u_ctrl_buf(_d_, _s_) \
        guest_from_compat_handle((_d_)->u.ctrl.buf, (_s_)->u.ctrl.buf)
        XLAT_tmem_op(op, &cop);
#undef XLAT_tmem_op_HNDL_u_ctrl_buf
        return 0;
    }
#endif
    return copy_from_guest(op, uops, 1);
}

static inline void tmh_copy_to_client_buf_offset(tmem_cli_va_t clibuf, int off,
                                                 char *tmembuf, int len)
{
    copy_to_guest_offset(clibuf, off, tmembuf, len);
}

#define TMH_CLI_ID_NULL ((cli_id_t)((domid_t)-1L))

#define tmh_cli_id_str "domid"
#define tmh_client_str "domain"

extern int tmh_decompress_to_client(tmem_cli_mfn_t, void *, size_t, void *);

extern int tmh_compress_from_client(tmem_cli_mfn_t, void **, size_t *, void *);

extern int tmh_copy_from_client(pfp_t *pfp,
    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
    pagesize_t pfn_offset, pagesize_t len, void *cva);

extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva);

extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, pagesize_t len);
#define TMEM_PERF
#ifdef TMEM_PERF
#define DECL_CYC_COUNTER(x) \
    uint64_t x##_sum_cycles = 0, x##_count = 0; \
    uint32_t x##_min_cycles = 0x7fffffff, x##_max_cycles = 0;
#define EXTERN_CYC_COUNTER(x) \
    extern uint64_t x##_sum_cycles, x##_count; \
    extern uint32_t x##_min_cycles, x##_max_cycles;
#define DECL_LOCAL_CYC_COUNTER(x) \
    int64_t x##_start = 0
#define START_CYC_COUNTER(x) x##_start = get_cycles()
#define DUP_START_CYC_COUNTER(x,y) x##_start = y##_start
/* the following might race, but since it's advisory only, don't care */
#define END_CYC_COUNTER(x) \
    do { \
      x##_start = get_cycles() - x##_start; \
      if (x##_start > 0 && x##_start < 1000000000) { \
        x##_sum_cycles += x##_start; x##_count++; \
        if ((uint32_t)x##_start < x##_min_cycles) x##_min_cycles = x##_start; \
        if ((uint32_t)x##_start > x##_max_cycles) x##_max_cycles = x##_start; \
      } \
    } while (0)
#define END_CYC_COUNTER_CLI(x,y) \
    do { \
      x##_start = get_cycles() - x##_start; \
      if (x##_start > 0 && x##_start < 1000000000) { \
        x##_sum_cycles += x##_start; x##_count++; \
        if ((uint32_t)x##_start < x##_min_cycles) x##_min_cycles = x##_start; \
        if ((uint32_t)x##_start > x##_max_cycles) x##_max_cycles = x##_start; \
        y->total_cycles += x##_start; \
      } \
    } while (0)
#define RESET_CYC_COUNTER(x) { x##_sum_cycles = 0, x##_count = 0; \
    x##_min_cycles = 0x7fffffff, x##_max_cycles = 0; }
#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) \
    scnprintf(buf, size, \
        tag"n:%"PRIu64","tag"t:%"PRIu64","tag"x:%"PRId32","tag"m:%"PRId32",", \
        x##_count, x##_sum_cycles, x##_max_cycles, x##_min_cycles)
#else
#define DECL_CYC_COUNTER(x)
#define EXTERN_CYC_COUNTER(x) \
    extern uint64_t x##_sum_cycles, x##_count; \
    extern uint32_t x##_min_cycles, x##_max_cycles;
#define DECL_LOCAL_CYC_COUNTER(x) do { } while (0)
#define START_CYC_COUNTER(x) do { } while (0)
#define DUP_START_CYC_COUNTER(x,y) do { } while (0)
#define END_CYC_COUNTER(x) do { } while (0)
#define END_CYC_COUNTER_CLI(x,y) do { } while (0)
#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) (0)
#define RESET_CYC_COUNTER(x) do { } while (0)
#endif
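
/* Usage sketch (hypothetical, illustration only): a counter is defined once
 * at file scope, timed around a region, and reported with scnprintf. */
#if 0
DECL_CYC_COUNTER(example_op)

static void timed_region(void)
{
    DECL_LOCAL_CYC_COUNTER(example_op);

    START_CYC_COUNTER(example_op);
    /* ... work being measured ... */
    END_CYC_COUNTER(example_op);
}

static int report(char *buf, int size)
{
    return SCNPRINTF_CYC_COUNTER(buf, size, example_op, "ex_");
}
#endif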
#endif /* __XEN_TMEM_XEN_H__ */