
view tools/libxc/xc_domain_save.c @ 20838:0447c5532e9f

x86: add and use XEN_DOMCTL_getpageframeinfo3

To support wider than 28-bit MFNs, add XEN_DOMCTL_getpageframeinfo3
(with the type replacing the passed-in MFN rather than being or-ed
into it) to properly back xc_get_pfn_type_batch().

With xc_get_pfn_type_batch() only used internally to libxc, move its
prototype from xenctrl.h to xc_private.h.
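For illustration only, a minimal sketch (not part of this patch) of how a
libxc-internal caller consumes the new semantics: each xen_pfn_t entry holds
an MFN going in and only its XEN_DOMCTL_PFINFO_* type bits coming back,
mirroring the usage visible in xc_domain_save() below. query_types() is a
hypothetical helper name.

    static int query_types(int xc_handle, uint32_t dom,
                           xen_pfn_t *arr, unsigned int n)
    {
        unsigned int i;

        /* On success every arr[i] is replaced by the page's type bits. */
        if ( xc_get_pfn_type_batch(xc_handle, dom, n, arr) )
            return -1;

        for ( i = 0; i < n; i++ )
        {
            unsigned long ltab = arr[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( ltab == XEN_DOMCTL_PFINFO_XTAB )
                continue;   /* invalid frame: the save loop skips these */

            /* XEN_DOMCTL_PFINFO_L1TAB..L4TAB (after masking with
             * XEN_DOMCTL_PFINFO_LTABTYPE_MASK) mark page-table pages,
             * which must be canonicalised before they are sent. */
        }
        return 0;
    }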

This also fixes a couple of bugs in pre-existing code:
- the failure path for init_mem_info() leaked minfo->pfn_type,
- one error path of the XEN_DOMCTL_getpageframeinfo2 handler used
put_domain() where rcu_unlock_domain() was meant, and
- the XEN_DOMCTL_getpageframeinfo2 handler could call
xsm_getpageframeinfo() with an invalid struct page_info pointer.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jan 13 08:14:01 2010 +0000 (2010-01-13)
parents 0b138a019292
children b684d9e57b8f
1 /******************************************************************************
2 * xc_domain_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. Can override by passing
25 ** non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider whether we want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
33 struct save_ctx {
34 unsigned long hvirt_start; /* virtual starting address of the hypervisor */
35 unsigned int pt_levels; /* #levels of page tables used by the current guest */
36 unsigned long max_mfn; /* max mfn of the whole machine */
37 xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
38 xen_pfn_t *live_m2p; /* Live mapping of system MFN to PFN table. */
39 unsigned long m2p_mfn0;
40 struct domain_info_context dinfo;
41 };
43 /* buffer for output */
44 struct outbuf {
45 void* buf;
46 size_t size;
47 size_t pos;
48 };
50 #define OUTBUF_SIZE (16384 * 1024)
52 /* grep fodder: machine_to_phys */
54 #define mfn_to_pfn(_mfn) (ctx->live_m2p[(_mfn)])
56 #define pfn_to_mfn(_pfn) \
57 ((xen_pfn_t) ((dinfo->guest_width==8) \
58 ? (((uint64_t *)ctx->live_p2m)[(_pfn)]) \
59 : ((((uint32_t *)ctx->live_p2m)[(_pfn)]) == 0xffffffffU \
60 ? (-1UL) : (((uint32_t *)ctx->live_p2m)[(_pfn)]))))
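/* For 32-bit guests the p2m holds 32-bit entries, so an invalid entry
** (0xffffffff) is widened to -1UL above; comparisons against
** INVALID_P2M_ENTRY then behave the same for either guest width. */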
62 /*
63 * Returns TRUE if the given machine frame number has a unique mapping
64 * in the guest's pseudophysical map.
65 */
66 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
67 (((_mfn) < (ctx->max_mfn)) && \
68 ((mfn_to_pfn(_mfn) < (dinfo->p2m_size)) && \
69 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
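/* i.e. the MFN is below the machine maximum, the M2P maps it to a PFN
** inside this guest's P2M, and that PFN maps back to the same MFN. */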
71 /*
72 ** During (live) save/migrate, we maintain a number of bitmaps to track
73 ** which pages we have to send, to fixup, and to skip.
74 */
76 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
77 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
78 #define BITMAP_SIZE (BITS_TO_LONGS(dinfo->p2m_size) * sizeof(unsigned long))
80 #define BITMAP_ENTRY(_nr,_bmap) \
81 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
83 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
85 #define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
87 static inline int test_bit (int nr, volatile void * addr)
88 {
89 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
90 }
92 static inline void clear_bit (int nr, volatile void * addr)
93 {
94 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
95 }
97 static inline void set_bit ( int nr, volatile void * addr)
98 {
99 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
100 }
102 /* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
103 static inline unsigned int hweight32(unsigned int w)
104 {
105 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
106 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
107 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
108 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
109 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
110 }
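/* e.g. hweight32(0xA5A5A5A5) == 16: each byte 0xA5 has four bits set. */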
112 static inline int count_bits ( int nr, volatile void *addr)
113 {
114 int i, count = 0;
115 volatile unsigned long *p = (volatile unsigned long *)addr;
116 /* We know that the array is padded to unsigned long. */
117 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
118 count += hweight32(*p);
119 return count;
120 }
122 static uint64_t tv_to_us(struct timeval *new)
123 {
124 return ((uint64_t)new->tv_sec * 1000000) + new->tv_usec;
125 }
127 static uint64_t llgettimeofday(void)
128 {
129 struct timeval now;
130 gettimeofday(&now, NULL);
131 return tv_to_us(&now);
132 }
134 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
135 {
136 return (((new->tv_sec - old->tv_sec)*1000000) +
137 (new->tv_usec - old->tv_usec));
138 }
140 static int noncached_write(int fd, int live, void *buffer, int len)
141 {
142 static int write_count = 0;
143 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
145 write_count += len;
146 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
147 {
148 /* Time to discard cache - don't care if this fails */
149 discard_file_cache(fd, 0 /* no flush */);
150 write_count = 0;
151 }
153 return rc;
154 }
156 static int outbuf_init(struct outbuf* ob, size_t size)
157 {
158 memset(ob, 0, sizeof(*ob));
160 if ( !(ob->buf = malloc(size)) ) {
161 DPRINTF("error allocating output buffer of size %zu\n", size);
162 return -1;
163 }
165 ob->size = size;
167 return 0;
168 }
170 static inline int outbuf_write(struct outbuf* ob, void* buf, size_t len)
171 {
172 if ( len > ob->size - ob->pos ) {
173 DPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
174 return -1;
175 }
177 memcpy(ob->buf + ob->pos, buf, len);
178 ob->pos += len;
180 return 0;
181 }
183 /* prep for nonblocking I/O */
184 static int outbuf_flush(struct outbuf* ob, int fd)
185 {
186 int rc;
187 int cur = 0;
189 if ( !ob->pos )
190 return 0;
192 rc = write(fd, ob->buf, ob->pos);
193 while (rc < 0 || cur + rc < ob->pos) {
194 if (rc < 0 && errno != EAGAIN && errno != EINTR) {
195 DPRINTF("error flushing output: %d\n", errno);
196 return -1;
197 }
198 if (rc > 0)
199 cur += rc;
201 rc = write(fd, ob->buf + cur, ob->pos - cur);
202 }
204 ob->pos = 0;
206 return 0;
207 }
209 /* if there's no room in the buffer, flush it and try again. */
210 static inline int outbuf_hardwrite(struct outbuf* ob, int fd, void* buf,
211 size_t len)
212 {
213 if ( !len )
214 return 0;
216 if ( !outbuf_write(ob, buf, len) )
217 return 0;
219 if ( outbuf_flush(ob, fd) < 0 )
220 return -1;
222 return outbuf_write(ob, buf, len);
223 }
225 /* start buffering output once we've reached checkpoint mode. */
226 static inline int write_buffer(int dobuf, struct outbuf* ob, int fd, void* buf,
227 size_t len)
228 {
229 if ( dobuf )
230 return outbuf_hardwrite(ob, fd, buf, len);
231 else
232 return write_exact(fd, buf, len);
233 }
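/* Once the 'copypages' macros below redefine write_exact()/ratewrite(),
** ordinary writes route through here and are buffered in the outbuf only
** after last_iter is set, i.e. once checkpoint mode has been reached. */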
235 #ifdef ADAPTIVE_SAVE
237 /*
238 ** We control the rate at which we transmit (or save) to minimize impact
239 ** on running domains (including the target if we're doing live migrate).
240 */
242 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
243 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
245 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
246 #define RATE_TO_BTU 781250
248 /* Amount in bytes we allow ourselves to send in a burst */
249 #define BURST_BUDGET (100*1024)
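/* 781250 appears to come from BURST_BUDGET * 8 bits / 2^20, i.e. the number
** of microseconds one burst takes at 1 Mb/s (treating a megabit as 2^20
** bits); ratewrite() divides it by mbit_rate to get the slot time. */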
251 /* We keep track of the current and previous transmission rate */
252 static int mbit_rate, ombit_rate = 0;
254 /* Have we reached the maximum transmission rate? */
255 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
257 static inline void initialize_mbit_rate()
258 {
259 mbit_rate = START_MBIT_RATE;
260 }
262 static int ratewrite(int io_fd, int live, void *buf, int n)
263 {
264 static int budget = 0;
265 static int burst_time_us = -1;
266 static struct timeval last_put = { 0 };
267 struct timeval now;
268 struct timespec delay;
269 long long delta;
271 if ( START_MBIT_RATE == 0 )
272 return noncached_write(io_fd, live, buf, n);
274 budget -= n;
275 if ( budget < 0 )
276 {
277 if ( mbit_rate != ombit_rate )
278 {
279 burst_time_us = RATE_TO_BTU / mbit_rate;
280 ombit_rate = mbit_rate;
281 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
282 mbit_rate, BURST_BUDGET, burst_time_us);
283 }
284 if ( last_put.tv_sec == 0 )
285 {
286 budget += BURST_BUDGET;
287 gettimeofday(&last_put, NULL);
288 }
289 else
290 {
291 while ( budget < 0 )
292 {
293 gettimeofday(&now, NULL);
294 delta = tv_delta(&now, &last_put);
295 while ( delta > burst_time_us )
296 {
297 budget += BURST_BUDGET;
298 last_put.tv_usec += burst_time_us;
299 if ( last_put.tv_usec > 1000000 )
300 {
301 last_put.tv_usec -= 1000000;
302 last_put.tv_sec++;
303 }
304 delta -= burst_time_us;
305 }
306 if ( budget > 0 )
307 break;
308 delay.tv_sec = 0;
309 delay.tv_nsec = 1000 * (burst_time_us - delta);
310 while ( delay.tv_nsec > 0 )
311 if ( nanosleep(&delay, &delay) == 0 )
312 break;
313 }
314 }
315 }
316 return noncached_write(io_fd, live, buf, n);
317 }
319 #else /* ! ADAPTIVE SAVE */
321 #define RATE_IS_MAX() (0)
322 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
323 #define initialize_mbit_rate()
325 #endif
327 /* like write_buffer for ratewrite, which returns number of bytes written */
328 static inline int ratewrite_buffer(int dobuf, struct outbuf* ob, int fd,
329 int live, void* buf, size_t len)
330 {
331 if ( dobuf )
332 return outbuf_hardwrite(ob, fd, buf, len) ? -1 : len;
333 else
334 return ratewrite(fd, live, buf, len);
335 }
337 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
338 xc_shadow_op_stats_t *stats, int print)
339 {
340 static struct timeval wall_last;
341 static long long d0_cpu_last;
342 static long long d1_cpu_last;
344 struct timeval wall_now;
345 long long wall_delta;
346 long long d0_cpu_now, d0_cpu_delta;
347 long long d1_cpu_now, d1_cpu_delta;
349 gettimeofday(&wall_now, NULL);
351 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
352 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
354 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
355 DPRINTF("ARRHHH!!\n");
357 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
358 if ( wall_delta == 0 )
359 wall_delta = 1;
361 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
362 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
364 if ( print )
365 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
366 "dirtied %dMb/s %" PRId32 " pages\n",
367 wall_delta,
368 (int)((d0_cpu_delta*100)/wall_delta),
369 (int)((d1_cpu_delta*100)/wall_delta),
370 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
371 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
372 stats->dirty_count);
374 #ifdef ADAPTIVE_SAVE
375 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
376 {
377 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
378 + 50;
379 if ( mbit_rate > MAX_MBIT_RATE )
380 mbit_rate = MAX_MBIT_RATE;
381 }
382 #endif
384 d0_cpu_last = d0_cpu_now;
385 d1_cpu_last = d1_cpu_now;
386 wall_last = wall_now;
388 return 0;
389 }
392 static int analysis_phase(int xc_handle, uint32_t domid, struct save_ctx *ctx,
393 unsigned long *arr, int runs)
394 {
395 long long start, now;
396 xc_shadow_op_stats_t stats;
397 int j;
398 struct domain_info_context *dinfo = &ctx->dinfo;
400 start = llgettimeofday();
402 for ( j = 0; j < runs; j++ )
403 {
404 int i;
406 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
407 arr, dinfo->p2m_size, NULL, 0, NULL);
408 DPRINTF("#Flush\n");
409 for ( i = 0; i < 40; i++ )
410 {
411 usleep(50000);
412 now = llgettimeofday();
413 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
414 NULL, 0, NULL, 0, &stats);
415 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
416 ((now-start)+500)/1000,
417 stats.fault_count, stats.dirty_count);
418 }
419 }
421 return -1;
422 }
424 static int suspend_and_state(int (*suspend)(void*), void* data,
425 int xc_handle, int io_fd, int dom,
426 xc_dominfo_t *info)
427 {
428 if ( !(*suspend)(data) )
429 {
430 ERROR("Suspend request failed");
431 return -1;
432 }
434 if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) ||
435 !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
436 {
437 ERROR("Domain not in suspended state");
438 return -1;
439 }
441 return 0;
442 }
444 /*
445 ** Map the top-level page of MFNs from the guest. The guest might not have
446 ** finished resuming from a previous restore operation, so we wait a while for
447 ** it to update the MFN to a reasonable value.
448 */
449 static void *map_frame_list_list(int xc_handle, uint32_t dom,
450 struct save_ctx *ctx,
451 shared_info_any_t *shinfo)
452 {
453 int count = 100;
454 void *p;
455 struct domain_info_context *dinfo = &ctx->dinfo;
456 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
458 while ( count-- && (fll == 0) )
459 {
460 usleep(10000);
461 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
462 }
464 if ( fll == 0 )
465 {
466 ERROR("Timed out waiting for frame list to be updated.");
467 return NULL;
468 }
470 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
471 if ( p == NULL )
472 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
474 return p;
475 }
477 /*
478 ** During transfer (or in the state file), all page-table pages must be
479 ** converted into a 'canonical' form where references to actual mfns
480 ** are replaced with references to the corresponding pfns.
481 **
482 ** This function performs the appropriate conversion, taking into account
483 ** which entries do not require canonicalization (in particular, those
484 ** entries which map the virtual address reserved for the hypervisor).
485 */
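/* Concretely: a present PTE keeps its flag bits but has its frame field
** rewritten from the MFN to mfn_to_pfn(MFN); entries covering the
** hypervisor's reserved virtual range are simply zeroed. */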
486 static int canonicalize_pagetable(struct save_ctx *ctx,
487 unsigned long type, unsigned long pfn,
488 const void *spage, void *dpage)
489 {
490 struct domain_info_context *dinfo = &ctx->dinfo;
491 int i, pte_last, xen_start, xen_end, race = 0;
492 uint64_t pte;
494 /*
495 ** We need to determine which entries in this page table hold
496 ** reserved hypervisor mappings. This depends on the current
497 ** page table type as well as the number of paging levels.
498 */
499 xen_start = xen_end = pte_last = PAGE_SIZE / ((ctx->pt_levels == 2) ? 4 : 8);
501 if ( (ctx->pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
502 xen_start = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT);
504 if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
505 xen_start = L3_PAGETABLE_ENTRIES_PAE;
507 /*
508 ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
509 ** We can spot this by looking for the guest's mapping of the m2p.
510 ** Guests must ensure that this check will fail for other L2s.
511 */
512 if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
513 {
514 int hstart;
515 uint64_t he;
517 hstart = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
518 he = ((const uint64_t *) spage)[hstart];
520 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
521 {
522 /* hvirt starts with xen stuff... */
523 xen_start = hstart;
524 }
525 else if ( ctx->hvirt_start != 0xf5800000 )
526 {
527 /* old L2s from before hole was shrunk... */
528 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
529 he = ((const uint64_t *) spage)[hstart];
530 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
531 xen_start = hstart;
532 }
533 }
535 if ( (ctx->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
536 {
537 /*
538 ** XXX SMH: should compute these from hvirt_start (which we have)
539 ** and hvirt_end (which we don't)
540 */
541 xen_start = 256;
542 xen_end = 272;
543 }
545 /* Now iterate through the page table, canonicalizing each PTE */
546 for (i = 0; i < pte_last; i++ )
547 {
548 unsigned long pfn, mfn;
550 if ( ctx->pt_levels == 2 )
551 pte = ((const uint32_t*)spage)[i];
552 else
553 pte = ((const uint64_t*)spage)[i];
555 if ( (i >= xen_start) && (i < xen_end) )
556 pte = 0;
558 if ( pte & _PAGE_PRESENT )
559 {
560 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
561 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
562 {
563 /* This will happen if the type info is stale which
564 is quite feasible under live migration */
565 pfn = 0; /* zap it - we'll retransmit this page later */
566 /* XXX: We can't spot Xen mappings in compat-mode L2es
567 * from 64-bit tools, but the only thing in them is the
568 * compat m2p, so we quietly zap them. This doesn't
569 * count as a race, so don't report it. */
570 if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
571 && sizeof (unsigned long) > dinfo->guest_width) )
572 race = 1; /* inform the caller; fatal if !live */
573 }
574 else
575 pfn = mfn_to_pfn(mfn);
577 pte &= ~MADDR_MASK_X86;
578 pte |= (uint64_t)pfn << PAGE_SHIFT;
580 /*
581 * PAE guest L3Es can contain these flags when running on
582 * a 64bit hypervisor. We zap these here to avoid any
583 * surprise at restore time...
584 */
585 if ( (ctx->pt_levels == 3) &&
586 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
587 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
588 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
589 }
591 if ( ctx->pt_levels == 2 )
592 ((uint32_t*)dpage)[i] = pte;
593 else
594 ((uint64_t*)dpage)[i] = pte;
595 }
597 return race;
598 }
600 xen_pfn_t *xc_map_m2p(int xc_handle,
601 unsigned long max_mfn,
602 int prot,
603 unsigned long *mfn0)
604 {
605 struct xen_machphys_mfn_list xmml;
606 privcmd_mmap_entry_t *entries;
607 unsigned long m2p_chunks, m2p_size;
608 xen_pfn_t *m2p;
609 xen_pfn_t *extent_start;
610 int i;
612 m2p = NULL;
613 m2p_size = M2P_SIZE(max_mfn);
614 m2p_chunks = M2P_CHUNKS(max_mfn);
616 xmml.max_extents = m2p_chunks;
618 extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
619 if ( !extent_start )
620 {
621 ERROR("failed to allocate space for m2p mfns");
622 goto err0;
623 }
624 set_xen_guest_handle(xmml.extent_start, extent_start);
626 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
627 (xmml.nr_extents != m2p_chunks) )
628 {
629 ERROR("xc_get_m2p_mfns");
630 goto err1;
631 }
633 entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
634 if (entries == NULL)
635 {
636 ERROR("failed to allocate space for mmap entries");
637 goto err1;
638 }
640 for ( i = 0; i < m2p_chunks; i++ )
641 entries[i].mfn = extent_start[i];
643 m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
644 m2p_size, prot, M2P_CHUNK_SIZE,
645 entries, m2p_chunks);
646 if (m2p == NULL)
647 {
648 ERROR("xc_mmap_foreign_ranges failed");
649 goto err2;
650 }
652 if (mfn0)
653 *mfn0 = entries[0].mfn;
655 err2:
656 free(entries);
657 err1:
658 free(extent_start);
660 err0:
661 return m2p;
662 }
665 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
666 int io_fd,
667 uint32_t dom,
668 struct save_ctx *ctx,
669 shared_info_any_t *live_shinfo)
670 {
671 vcpu_guest_context_any_t ctxt;
672 struct domain_info_context *dinfo = &ctx->dinfo;
674 /* Double and single indirect references to the live P2M table */
675 void *live_p2m_frame_list_list = NULL;
676 void *live_p2m_frame_list = NULL;
678 /* Copies of the above. */
679 xen_pfn_t *p2m_frame_list_list = NULL;
680 xen_pfn_t *p2m_frame_list = NULL;
682 /* The mapping of the live p2m table itself */
683 xen_pfn_t *p2m = NULL;
685 int i, success = 0;
687 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom, ctx,
688 live_shinfo);
689 if ( !live_p2m_frame_list_list )
690 goto out;
692 /* Get a local copy of the live_P2M_frame_list_list */
693 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
694 {
695 ERROR("Couldn't allocate p2m_frame_list_list array");
696 goto out;
697 }
698 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
700 /* Canonicalize guest's unsigned long vs ours */
701 if ( dinfo->guest_width > sizeof(unsigned long) )
702 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
703 if ( i < PAGE_SIZE/dinfo->guest_width )
704 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
705 else
706 p2m_frame_list_list[i] = 0;
707 else if ( dinfo->guest_width < sizeof(unsigned long) )
708 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
709 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
711 live_p2m_frame_list =
712 xc_map_foreign_pages(xc_handle, dom, PROT_READ,
713 p2m_frame_list_list,
714 P2M_FLL_ENTRIES);
715 if ( !live_p2m_frame_list )
716 {
717 ERROR("Couldn't map p2m_frame_list");
718 goto out;
719 }
721 /* Get a local copy of the live_P2M_frame_list */
722 if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
723 {
724 ERROR("Couldn't allocate p2m_frame_list array");
725 goto out;
726 }
727 memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
728 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
730 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
731 live_p2m_frame_list = NULL;
733 /* Canonicalize guest's unsigned long vs ours */
734 if ( dinfo->guest_width > sizeof(unsigned long) )
735 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
736 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
737 else if ( dinfo->guest_width < sizeof(unsigned long) )
738 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
739 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
742 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
743 the guest must not change which frames are used for this purpose.
744 (it's not clear why it would want to change them, and we'll be OK
745 from a safety POV anyhow.) */
747 p2m = xc_map_foreign_pages(xc_handle, dom, PROT_READ,
748 p2m_frame_list,
749 P2M_FL_ENTRIES);
750 if ( !p2m )
751 {
752 ERROR("Couldn't map p2m table");
753 goto out;
754 }
755 ctx->live_p2m = p2m; /* So that translation macros will work */
757 /* Canonicalise the pfn-to-mfn table frame-number list. */
758 for ( i = 0; i < dinfo->p2m_size; i += FPP )
759 {
760 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
761 {
762 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
763 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
764 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], ctx->max_mfn);
765 if ( p2m_frame_list[i/FPP] < ctx->max_mfn )
766 {
767 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
768 (uint64_t)p2m_frame_list[i/FPP],
769 (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]]);
770 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
771 (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]],
772 (uint64_t)p2m[ctx->live_m2p[p2m_frame_list[i/FPP]]]);
774 }
775 goto out;
776 }
777 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
778 }
780 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
781 {
782 ERROR("Could not get vcpu context");
783 goto out;
784 }
786 /*
787 * Write an extended-info structure to inform the restore code that
788 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
789 * slow paths in the restore code.
790 */
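/* The record written below is: ~0UL signature, total size, then a "vcpu"
 * chunk (4-byte tag, 4-byte size, VCPU0 context) followed by an empty
 * "extv" chunk whose presence advertises extended-CR3 support. */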
791 {
792 unsigned long signature = ~0UL;
793 uint32_t chunk1_sz = ((dinfo->guest_width==8)
794 ? sizeof(ctxt.x64)
795 : sizeof(ctxt.x32));
796 uint32_t chunk2_sz = 0;
797 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
798 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
799 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
800 write_exact(io_fd, "vcpu", 4) ||
801 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
802 write_exact(io_fd, &ctxt, chunk1_sz) ||
803 write_exact(io_fd, "extv", 4) ||
804 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
805 {
806 PERROR("write: extended info");
807 goto out;
808 }
809 }
811 if ( write_exact(io_fd, p2m_frame_list,
812 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
813 {
814 PERROR("write: p2m_frame_list");
815 goto out;
816 }
818 success = 1;
820 out:
822 if ( !success && p2m )
823 munmap(p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
825 if ( live_p2m_frame_list_list )
826 munmap(live_p2m_frame_list_list, PAGE_SIZE);
828 if ( live_p2m_frame_list )
829 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
831 if ( p2m_frame_list_list )
832 free(p2m_frame_list_list);
834 if ( p2m_frame_list )
835 free(p2m_frame_list);
837 return success ? p2m : NULL;
838 }
840 /* must be done AFTER suspend_and_state() */
841 static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
842 {
843 int marker = -7;
844 uint32_t tsc_mode, khz, incarn;
845 uint64_t nsec;
847 if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
848 &nsec, &khz, &incarn) < 0 ||
849 write_exact(io_fd, &marker, sizeof(marker)) ||
850 write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
851 write_exact(io_fd, &nsec, sizeof(nsec)) ||
852 write_exact(io_fd, &khz, sizeof(khz)) ||
853 write_exact(io_fd, &incarn, sizeof(incarn)) )
854 return -1;
855 return 0;
856 }
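/* Like the other negative markers in this file (-1 debug resend, -2 vcpumap,
** -3 ident PT, -4 VM86 TSS, -5/-6 tmem), the -7 marker tags an optional
** record in the save stream, here followed by tsc_mode/nsec/khz/incarn. */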
858 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
859 uint32_t max_factor, uint32_t flags,
860 struct save_callbacks* callbacks,
861 int hvm, void (*switch_qemu_logdirty)(int, unsigned))
862 {
863 xc_dominfo_t info;
864 DECLARE_DOMCTL;
866 int rc = 1, frc, i, j, last_iter = 0, iter = 0;
867 int live = (flags & XCFLAGS_LIVE);
868 int debug = (flags & XCFLAGS_DEBUG);
869 int race = 0, sent_last_iter, skip_this_iter;
870 int tmem_saved = 0;
872 /* The new domain's shared-info frame number. */
873 unsigned long shared_info_frame;
875 /* A copy of the CPU context of the guest. */
876 vcpu_guest_context_any_t ctxt;
878 /* A table containing the type of each PFN (/not/ MFN!). */
879 xen_pfn_t *pfn_type = NULL;
880 unsigned long *pfn_batch = NULL;
881 int *pfn_err = NULL;
883 /* A copy of one frame of guest memory. */
884 char page[PAGE_SIZE];
886 /* Live mapping of shared info structure */
887 shared_info_any_t *live_shinfo = NULL;
889 /* base of the region in which domain memory is mapped */
890 unsigned char *region_base = NULL;
892 /* bitmap of pages:
893 - that should be sent this iteration (unless later marked as skip);
894 - to skip this iteration because already dirty;
895 - to fixup by sending at the end if not already resent; */
896 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
898 xc_shadow_op_stats_t stats;
900 unsigned long needed_to_fix = 0;
901 unsigned long total_sent = 0;
903 uint64_t vcpumap = 1ULL;
905 /* HVM: a buffer for holding HVM context */
906 uint32_t hvm_buf_size = 0;
907 uint8_t *hvm_buf = NULL;
909 /* HVM: magic frames for ioreqs and xenstore comms. */
910 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
912 unsigned long mfn;
914 struct outbuf ob;
915 static struct save_ctx _ctx = {
916 .live_p2m = NULL,
917 .live_m2p = NULL,
918 };
919 static struct save_ctx *ctx = &_ctx;
920 struct domain_info_context *dinfo = &ctx->dinfo;
922 int completed = 0;
924 outbuf_init(&ob, OUTBUF_SIZE);
926 /* If no explicit control parameters given, use defaults */
927 max_iters = max_iters ? : DEF_MAX_ITERS;
928 max_factor = max_factor ? : DEF_MAX_FACTOR;
930 initialize_mbit_rate();
932 if ( !get_platform_info(xc_handle, dom,
933 &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
934 {
935 ERROR("Unable to get platform info.");
936 return 1;
937 }
939 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
940 {
941 ERROR("Could not get domain info");
942 return 1;
943 }
945 shared_info_frame = info.shared_info_frame;
947 /* Map the shared info frame */
948 if ( !hvm )
949 {
950 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
951 PROT_READ, shared_info_frame);
952 if ( !live_shinfo )
953 {
954 ERROR("Couldn't map live_shinfo");
955 goto out;
956 }
957 }
959 /* Get the size of the P2M table */
960 dinfo->p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
962 if ( dinfo->p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
963 {
964 ERROR("Cannot save this big a guest");
965 goto out;
966 }
968 /* Domain is still running at this point */
969 if ( live )
970 {
971 /* Live suspend. Enable log-dirty mode. */
972 if ( xc_shadow_control(xc_handle, dom,
973 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
974 NULL, 0, NULL, 0, NULL) < 0 )
975 {
976 /* log-dirty already enabled? There's no test op,
977 so attempt to disable then reenable it */
978 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
979 NULL, 0, NULL, 0, NULL);
980 if ( frc >= 0 )
981 {
982 frc = xc_shadow_control(xc_handle, dom,
983 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
984 NULL, 0, NULL, 0, NULL);
985 }
987 if ( frc < 0 )
988 {
989 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
990 goto out;
991 }
992 }
994 /* Enable qemu-dm logging dirty pages to xen */
995 if ( hvm )
996 switch_qemu_logdirty(dom, 1);
997 }
998 else
999 {
1000 /* This is a non-live suspend. Suspend the domain. */
1001 if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
1002 io_fd, dom, &info) )
1003 {
1004 ERROR("Domain appears not to have suspended");
1005 goto out;
1006 }
1007 }
1009 last_iter = !live;
1011 /* pretend we sent all the pages last iteration */
1012 sent_last_iter = dinfo->p2m_size;
1014 /* Setup to_send / to_fix and to_skip bitmaps */
1015 to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
1016 to_fix = calloc(1, BITMAP_SIZE);
1017 to_skip = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
1019 if ( !to_send || !to_fix || !to_skip )
1021 ERROR("Couldn't allocate to_send array");
1022 goto out;
1025 memset(to_send, 0xff, BITMAP_SIZE);
1027 if ( lock_pages(to_send, BITMAP_SIZE) )
1029 ERROR("Unable to lock to_send");
1030 return 1;
1033 /* (to fix is local only) */
1034 if ( lock_pages(to_skip, BITMAP_SIZE) )
1036 ERROR("Unable to lock to_skip");
1037 return 1;
1040 if ( hvm )
1042 /* Need another buffer for HVM context */
1043 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
1044 if ( hvm_buf_size == -1 )
1046 ERROR("Couldn't get HVM context size from Xen");
1047 goto out;
1049 hvm_buf = malloc(hvm_buf_size);
1050 if ( !hvm_buf )
1052 ERROR("Couldn't allocate memory");
1053 goto out;
1057 analysis_phase(xc_handle, dom, ctx, to_skip, 0);
1059 pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
1060 MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
1061 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
1062 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
1064 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
1065 errno = ENOMEM;
1066 goto out;
1068 memset(pfn_type, 0,
1069 ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
1071 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
1073 ERROR("Unable to lock pfn_type array");
1074 goto out;
1077 /* Setup the mfn_to_pfn table mapping */
1078 if ( !(ctx->live_m2p = xc_map_m2p(xc_handle, ctx->max_mfn, PROT_READ, &ctx->m2p_mfn0)) )
1080 ERROR("Failed to map live M2P table");
1081 goto out;
1084 /* Start writing out the saved-domain record. */
1085 if ( write_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
1087 PERROR("write: p2m_size");
1088 goto out;
1091 if ( !hvm )
1093 int err = 0;
1095 /* Map the P2M table, and write the list of P2M frames */
1096 ctx->live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, ctx, live_shinfo);
1097 if ( ctx->live_p2m == NULL )
1099 ERROR("Failed to map/save the p2m frame list");
1100 goto out;
1103 /*
1104 * Quick belt and braces sanity check.
1105 */
1107 for ( i = 0; i < dinfo->p2m_size; i++ )
1109 mfn = pfn_to_mfn(i);
1110 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
1112 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
1113 mfn, mfn_to_pfn(mfn));
1114 err++;
1117 DPRINTF("Had %d unexplained entries in p2m table\n", err);
1120 print_stats(xc_handle, dom, 0, &stats, 0);
1122 tmem_saved = xc_tmem_save(xc_handle, dom, io_fd, live, -5);
1123 if ( tmem_saved == -1 )
1125 ERROR("Error when writing to state file (tmem)");
1126 goto out;
1129 if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
1131 ERROR("Error when writing to state file (tsc)");
1132 goto out;
1135 copypages:
1136 #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
1137 #ifdef ratewrite
1138 #undef ratewrite
1139 #endif
1140 #define ratewrite(fd, live, buf, len) ratewrite_buffer(last_iter, &ob, (fd), (live), (buf), (len))
1142 /* Now write out each data page, canonicalising page tables as we go... */
1143 for ( ; ; )
1145 unsigned int prev_pc, sent_this_iter, N, batch, run;
1147 iter++;
1148 sent_this_iter = 0;
1149 skip_this_iter = 0;
1150 prev_pc = 0;
1151 N = 0;
1153 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1155 while ( N < dinfo->p2m_size )
1157 unsigned int this_pc = (N * 100) / dinfo->p2m_size;
1159 if ( (this_pc - prev_pc) >= 5 )
1161 DPRINTF("\b\b\b\b%3d%%", this_pc);
1162 prev_pc = this_pc;
1165 if ( !last_iter )
1167 /* Slightly wasteful to peek the whole array every time,
1168 but this is fast enough for the moment. */
1169 frc = xc_shadow_control(
1170 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1171 dinfo->p2m_size, NULL, 0, NULL);
1172 if ( frc != dinfo->p2m_size )
1174 ERROR("Error peeking shadow bitmap");
1175 goto out;
1179 /* load pfn_type[] with the mfn of all the pages we're doing in
1180 this batch. */
1181 for ( batch = 0;
1182 (batch < MAX_BATCH_SIZE) && (N < dinfo->p2m_size);
1183 N++ )
1185 int n = N;
1187 if ( debug )
1189 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1190 iter, (unsigned long)n,
1191 hvm ? 0 : pfn_to_mfn(n),
1192 test_bit(n, to_send));
1193 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1194 DPRINTF(" [mfn]= %08lx",
1195 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1196 DPRINTF("\n");
1199 if ( completed )
1201 /* for sparse bitmaps, word-by-word may save time */
1202 if ( !to_send[N >> ORDER_LONG] )
1204 /* incremented again in for loop! */
1205 N += BITS_PER_LONG - 1;
1206 continue;
1209 if ( !test_bit(n, to_send) )
1210 continue;
1212 pfn_batch[batch] = n;
1213 if ( hvm )
1214 pfn_type[batch] = n;
1215 else
1216 pfn_type[batch] = pfn_to_mfn(n);
1218 else
1220 if ( !last_iter &&
1221 test_bit(n, to_send) &&
1222 test_bit(n, to_skip) )
1223 skip_this_iter++; /* stats keeping */
1225 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1226 (test_bit(n, to_send) && last_iter) ||
1227 (test_bit(n, to_fix) && last_iter)) )
1228 continue;
1230 /*
1231 ** we get here if:
1232 ** 1. page is marked to_send & hasn't already been re-dirtied
1233 ** 2. (ignore to_skip in last iteration)
1234 ** 3. add in pages that still need fixup (net bufs)
1235 */
1237 pfn_batch[batch] = n;
1239 /* Hypercall interfaces operate in PFNs for HVM guests
1240 * and MFNs for PV guests */
1241 if ( hvm )
1242 pfn_type[batch] = n;
1243 else
1244 pfn_type[batch] = pfn_to_mfn(n);
1246 if ( !is_mapped(pfn_type[batch]) )
1248 /*
1249 ** not currently in pseudo-physical map -- set bit
1250 ** in to_fix since we must send this page in last_iter
1251 ** unless it's sent sooner anyhow, or it never enters
1252 ** pseudo-physical map (e.g. for ballooned down doms)
1253 */
1254 set_bit(n, to_fix);
1255 continue;
1258 if ( last_iter &&
1259 test_bit(n, to_fix) &&
1260 !test_bit(n, to_send) )
1262 needed_to_fix++;
1263 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1264 iter, n, pfn_type[batch]);
1267 clear_bit(n, to_fix);
1270 batch++;
1273 if ( batch == 0 )
1274 goto skip; /* vanishingly unlikely... */
1276 region_base = xc_map_foreign_bulk(
1277 xc_handle, dom, PROT_READ, pfn_type, pfn_err, batch);
1278 if ( region_base == NULL )
1280 ERROR("map batch failed");
1281 goto out;
1284 if ( hvm )
1286 /* Look for and skip completely empty batches. */
1287 for ( j = 0; j < batch; j++ )
1289 if ( !pfn_err[j] )
1290 break;
1291 pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB;
1293 if ( j == batch )
1295 munmap(region_base, batch*PAGE_SIZE);
1296 continue; /* bail on this batch: no valid pages */
1298 for ( ; j < batch; j++ )
1299 if ( pfn_err[j] )
1300 pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB;
1302 else
1304 /* Get page types */
1305 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) )
1307 ERROR("get_pfn_type_batch failed");
1308 goto out;
1311 for ( j = 0; j < batch; j++ )
1313 unsigned long mfn = pfn_to_mfn(pfn_batch[j]);
1315 if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
1317 DPRINTF("type fail: page %i mfn %08lx\n",
1318 j, mfn);
1319 continue;
1322 if ( debug )
1323 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1324 " sum= %08lx\n",
1325 iter,
1326 pfn_type[j] | pfn_batch[j],
1327 mfn,
1328 mfn_to_pfn(mfn),
1329 csum_page(region_base + (PAGE_SIZE*j)));
1331 /* canonicalise mfn->pfn */
1332 pfn_type[j] |= pfn_batch[j];
1336 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1338 PERROR("Error when writing to state file (2)");
1339 goto out;
1342 if ( sizeof(unsigned long) < sizeof(*pfn_type) )
1343 for ( j = 0; j < batch; j++ )
1344 ((unsigned long *)pfn_type)[j] = pfn_type[j];
1345 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1347 PERROR("Error when writing to state file (3)");
1348 goto out;
1350 if ( sizeof(unsigned long) < sizeof(*pfn_type) )
1351 while ( --j >= 0 )
1352 pfn_type[j] = ((unsigned long *)pfn_type)[j];
1354 /* entering this loop, pfn_type is now in pfns (Not mfns) */
1355 run = 0;
1356 for ( j = 0; j < batch; j++ )
1358 unsigned long pfn, pagetype;
1359 void *spage = (char *)region_base + (PAGE_SIZE*j);
1361 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1362 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1364 if ( pagetype != 0 )
1366 /* If the page is not a normal data page, write out any
1367 run of pages we may have previously accumulated */
1368 if ( run )
1370 if ( ratewrite(io_fd, live,
1371 (char*)region_base+(PAGE_SIZE*(j-run)),
1372 PAGE_SIZE*run) != PAGE_SIZE*run )
1374 ERROR("Error when writing to state file (4a)"
1375 " (errno %d)", errno);
1376 goto out;
1378 run = 0;
1382 /* skip pages that aren't present */
1383 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1384 continue;
1386 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1388 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1389 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1391 /* We have a pagetable page: need to rewrite it. */
1392 race =
1393 canonicalize_pagetable(ctx, pagetype, pfn, spage, page);
1395 if ( race && !live )
1397 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1398 pagetype);
1399 goto out;
1402 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1404 ERROR("Error when writing to state file (4b)"
1405 " (errno %d)", errno);
1406 goto out;
1409 else
1411 /* We have a normal page: accumulate it for writing. */
1412 run++;
1414 } /* end of the write out for this batch */
1416 if ( run )
1418 /* write out the last accumulated run of pages */
1419 if ( ratewrite(io_fd, live,
1420 (char*)region_base+(PAGE_SIZE*(j-run)),
1421 PAGE_SIZE*run) != PAGE_SIZE*run )
1423 ERROR("Error when writing to state file (4c)"
1424 " (errno %d)", errno);
1425 goto out;
1429 sent_this_iter += batch;
1431 munmap(region_base, batch*PAGE_SIZE);
1433 } /* end of this while loop for this iteration */
1435 skip:
1437 total_sent += sent_this_iter;
1439 DPRINTF("\r %d: sent %d, skipped %d, ",
1440 iter, sent_this_iter, skip_this_iter );
1442 if ( last_iter )
1444 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1446 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1447 total_sent, ((float)total_sent)/dinfo->p2m_size );
1448 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1451 if ( last_iter && debug )
1453 int minusone = -1;
1454 memset(to_send, 0xff, BITMAP_SIZE);
1455 debug = 0;
1456 DPRINTF("Entering debug resend-all mode\n");
1458 /* send "-1" to put receiver into debug mode */
1459 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1461 PERROR("Error when writing to state file (6)");
1462 goto out;
1465 continue;
1468 if ( last_iter )
1469 break;
1471 if ( live )
1473 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1474 (iter >= max_iters) ||
1475 (sent_this_iter+skip_this_iter < 50) ||
1476 (total_sent > dinfo->p2m_size*max_factor) )
1478 DPRINTF("Start last iteration\n");
1479 last_iter = 1;
1481 if ( suspend_and_state(callbacks->suspend, callbacks->data,
1482 xc_handle, io_fd, dom, &info) )
1484 ERROR("Domain appears not to have suspended");
1485 goto out;
1488 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1489 if ( (tmem_saved > 0) &&
1490 (xc_tmem_save_extra(xc_handle,dom,io_fd,-6) == -1) )
1492 ERROR("Error when writing to state file (tmem)");
1493 goto out;
1496 if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
1498 ERROR("Error when writing to state file (tsc)");
1499 goto out;
1505 if ( xc_shadow_control(xc_handle, dom,
1506 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1507 dinfo->p2m_size, NULL, 0, &stats) != dinfo->p2m_size )
1509 ERROR("Error flushing shadow PT");
1510 goto out;
1513 sent_last_iter = sent_this_iter;
1515 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1518 } /* end of infinite for loop */
1520 DPRINTF("All memory is saved\n");
1523 struct {
1524 int minustwo;
1525 int max_vcpu_id;
1526 uint64_t vcpumap;
1527 } chunk = { -2, info.max_vcpu_id };
1529 if ( info.max_vcpu_id >= 64 )
1531 ERROR("Too many VCPUS in guest!");
1532 goto out;
1535 for ( i = 1; i <= info.max_vcpu_id; i++ )
1537 xc_vcpuinfo_t vinfo;
1538 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1539 vinfo.online )
1540 vcpumap |= 1ULL << i;
1543 chunk.vcpumap = vcpumap;
1544 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1546 PERROR("Error when writing to state file");
1547 goto out;
1551 if ( hvm )
1553 struct {
1554 int id;
1555 uint32_t pad;
1556 uint64_t data;
1557 } chunk = { 0, };
1559 chunk.id = -3;
1560 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
1561 (unsigned long *)&chunk.data);
1563 if ( (chunk.data != 0) &&
1564 write_exact(io_fd, &chunk, sizeof(chunk)) )
1566 PERROR("Error when writing the ident_pt for EPT guest");
1567 goto out;
1570 chunk.id = -4;
1571 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
1572 (unsigned long *)&chunk.data);
1574 if ( (chunk.data != 0) &&
1575 write_exact(io_fd, &chunk, sizeof(chunk)) )
1577 PERROR("Error when writing the vm86 TSS for guest");
1578 goto out;
1582 /* Zero terminate */
1583 i = 0;
1584 if ( write_exact(io_fd, &i, sizeof(int)) )
1586 PERROR("Error when writing to state file (6')");
1587 goto out;
1590 if ( hvm )
1592 uint32_t rec_size;
1594 /* Save magic-page locations. */
1595 memset(magic_pfns, 0, sizeof(magic_pfns));
1596 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1597 (unsigned long *)&magic_pfns[0]);
1598 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1599 (unsigned long *)&magic_pfns[1]);
1600 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1601 (unsigned long *)&magic_pfns[2]);
1602 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1604 PERROR("Error when writing to state file (7)");
1605 goto out;
1608 /* Get HVM context from Xen and save it too */
1609 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1610 hvm_buf_size)) == -1 )
1612 ERROR("HVM:Could not get hvm buffer");
1613 goto out;
1616 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1618 PERROR("error write hvm buffer size");
1619 goto out;
1622 if ( write_exact(io_fd, hvm_buf, rec_size) )
1624 PERROR("write HVM info failed!\n");
1625 goto out;
1628 /* HVM guests are done now */
1629 rc = 0;
1630 goto out;
1633 /* PV guests only from now on */
1635 /* Send through a list of all the PFNs that were not in map at the close */
1637 unsigned int i,j;
1638 unsigned long pfntab[1024];
1640 for ( i = 0, j = 0; i < dinfo->p2m_size; i++ )
1642 if ( !is_mapped(pfn_to_mfn(i)) )
1643 j++;
1646 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1648 PERROR("Error when writing to state file (6a)");
1649 goto out;
1652 for ( i = 0, j = 0; i < dinfo->p2m_size; )
1654 if ( !is_mapped(pfn_to_mfn(i)) )
1655 pfntab[j++] = i;
1657 i++;
1658 if ( (j == 1024) || (i == dinfo->p2m_size) )
1660 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1662 PERROR("Error when writing to state file (6b)");
1663 goto out;
1665 j = 0;
1670 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
1672 ERROR("Could not get vcpu context");
1673 goto out;
1676 /* Canonicalise the suspend-record frame number. */
1677 mfn = GET_FIELD(&ctxt, user_regs.edx);
1678 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1680 ERROR("Suspend record is not in range of pseudophys map");
1681 goto out;
1683 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1685 for ( i = 0; i <= info.max_vcpu_id; i++ )
1687 if ( !(vcpumap & (1ULL << i)) )
1688 continue;
1690 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
1692 ERROR("No context for VCPU%d", i);
1693 goto out;
1696 /* Canonicalise each GDT frame number. */
1697 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1699 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1700 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1702 ERROR("GDT frame is not in range of pseudophys map");
1703 goto out;
1705 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1708 /* Canonicalise the page table base pointer. */
1709 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(
1710 GET_FIELD(&ctxt, ctrlreg[3]))) )
1712 ERROR("PT base is not in range of pseudophys map");
1713 goto out;
1715 SET_FIELD(&ctxt, ctrlreg[3],
1716 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3])))));
1718 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1719 if ( (ctx->pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1721 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
1723 ERROR("PT base is not in range of pseudophys map");
1724 goto out;
1726 /* Least-significant bit means 'valid PFN'. */
1727 ctxt.x64.ctrlreg[1] = 1 |
1728 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
1731 if ( write_exact(io_fd, &ctxt, ((dinfo->guest_width==8)
1732 ? sizeof(ctxt.x64)
1733 : sizeof(ctxt.x32))) )
1735 PERROR("Error when writing to state file (1)");
1736 goto out;
1739 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1740 domctl.domain = dom;
1741 domctl.u.ext_vcpucontext.vcpu = i;
1742 if ( xc_domctl(xc_handle, &domctl) < 0 )
1744 ERROR("No extended context for VCPU%d", i);
1745 goto out;
1747 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1749 PERROR("Error when writing to state file (2)");
1750 goto out;
1754 /*
1755 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1756 */
1757 memcpy(page, live_shinfo, PAGE_SIZE);
1758 SET_FIELD(((shared_info_any_t *)page),
1759 arch.pfn_to_mfn_frame_list_list, 0);
1760 if ( write_exact(io_fd, page, PAGE_SIZE) )
1762 PERROR("Error when writing to state file (1)");
1763 goto out;
1766 /* Success! */
1767 rc = 0;
1769 out:
1770 completed = 1;
1772 if ( !rc && callbacks->postcopy )
1773 callbacks->postcopy(callbacks->data);
1775 /* Flush last write and discard cache for file. */
1776 if ( outbuf_flush(&ob, io_fd) < 0 ) {
1777 ERROR("Error when flushing output buffer\n");
1778 rc = 1;
1781 discard_file_cache(io_fd, 1 /* flush */);
1783 /* checkpoint_cb can spend arbitrarily long in between rounds */
1784 if (!rc && callbacks->checkpoint &&
1785 callbacks->checkpoint(callbacks->data) > 0)
1787 /* reset stats timer */
1788 print_stats(xc_handle, dom, 0, &stats, 0);
1790 rc = 1;
1791 /* last_iter = 1; */
1792 if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
1793 io_fd, dom, &info) )
1795 ERROR("Domain appears not to have suspended");
1796 goto out;
1798 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1799 print_stats(xc_handle, dom, 0, &stats, 1);
1801 if ( xc_shadow_control(xc_handle, dom,
1802 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1803 dinfo->p2m_size, NULL, 0, &stats) != dinfo->p2m_size )
1805 ERROR("Error flushing shadow PT");
1808 goto copypages;
1811 if ( tmem_saved != 0 && live )
1812 xc_tmem_save_done(xc_handle, dom);
1814 if ( live )
1816 if ( xc_shadow_control(xc_handle, dom,
1817 XEN_DOMCTL_SHADOW_OP_OFF,
1818 NULL, 0, NULL, 0, NULL) < 0 )
1819 DPRINTF("Warning - couldn't disable shadow mode");
1820 if ( hvm )
1821 switch_qemu_logdirty(dom, 0);
1824 if ( live_shinfo )
1825 munmap(live_shinfo, PAGE_SIZE);
1827 if ( ctx->live_p2m )
1828 munmap(ctx->live_p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
1830 if ( ctx->live_m2p )
1831 munmap(ctx->live_m2p, M2P_SIZE(ctx->max_mfn));
1833 free(pfn_type);
1834 free(pfn_batch);
1835 free(to_send);
1836 free(to_fix);
1837 free(to_skip);
1839 DPRINTF("Save exit rc=%d\n",rc);
1841 return !!rc;
1844 /*
1845 * Local variables:
1846 * mode: C
1847 * c-set-style: "BSD"
1848 * c-basic-offset: 4
1849 * tab-width: 4
1850 * indent-tabs-mode: nil
1851 * End:
1852 */