
view tools/libxc/xc_linux_save.c @ 13643:a9165141e52d

During suspend, return immediately with a failure if the domain will
never suspend, instead of pointlessly retrying.

Signed-off-by: John Levon <john.levon@sun.com>
author kfraser@localhost.localdomain
date Thu Jan 25 12:30:25 2007 +0000 (2007-01-25)
parents 30af6cfdb05c
children 2efe681d1194
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xg_private.h"
#include "xg_save_restore.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;
static unsigned long m2p_mfn0;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (max_pfn)) &&         \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)             \
({                                              \
    unsigned long mfn = *(_pmfn);               \
    int _res = 1;                               \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )       \
        _res = 0;                               \
    else                                        \
        *(_pmfn) = mfn_to_pfn(mfn);             \
    _res;                                       \
})
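
/*
 * Usage sketch (illustrative only, not from this file): canonicalise one
 * frame number in place before writing it to the state file.
 *
 *     xen_pfn_t frame = ...;                  // some MFN
 *     if ( !translate_mfn_to_pfn(&frame) )
 *         ERROR("MFN not in pseudophys map"); // hypothetical handling
 *     // on success, 'frame' now holds the corresponding PFN
 */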
/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)

#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

static inline int test_bit(int nr, volatile void *addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit(int nr, volatile void *addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit(int nr, volatile void *addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}
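
/*
 * Worked example: hweight32(0x000000F1) == 5. Each line above halves the
 * granularity of the partial sums: adjacent bit pairs first, then
 * nibbles, bytes, and halfwords; after five steps the full population
 * count sits in the low bits of the word.
 */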
static inline int count_bits(int nr, volatile void *addr)
{
    int i, count = 0;
    volatile unsigned long *p = (volatile unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    /* Caveat: hweight32() only sees the low 32 bits of each long, so this
       undercounts on platforms where unsigned long is 64 bits wide. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
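
/*
 * Example (assuming order_nr == 17, as in the 128k-page case above): each
 * step rotates the 17-bit index left by 10 bits, so successive N values
 * visit pfns roughly 2^10 apart rather than sequentially; results that
 * fall outside the domain (i >= nr) are simply rotated again.
 */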
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}

static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
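
/*
 * In effect ratewrite() is a token-bucket limiter: every burst_time_us
 * slot that elapses refills BURST_BUDGET bytes of credit, and a write
 * that would overdraw the budget nanosleep()s until enough slots have
 * passed to cover it.
 */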
#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif

static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    /* Note: a short write is treated as failure; the remainder is not
       retried. */
    if(write(fd, buf, count) != count)
        return 0;
    return 1;
}
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        DPRINTF(
            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
            "dirtied %dMb/s %" PRId32 " pages\n",
            wall_delta,
            (int)((d0_cpu_delta*100)/wall_delta),
            (int)((d1_cpu_delta*100)/wall_delta),
            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
            stats->dirty_count);
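
    /*
     * Unit note: wall_delta is in ms, so bytes/(wall_delta*125) equals
     * bytes*8/(wall_delta*1000), i.e. bits per microsecond == Mbit/s.
     */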
#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");

    if (info->dying) {
        ERROR("domain is dying");
        return -1;
    }

    if (info->crashed) {
        ERROR("domain has crashed");
        return -1;
    }

    if (info->shutdown) {
        switch (info->shutdown_reason) {
        case SHUTDOWN_poweroff:
        case SHUTDOWN_reboot:
            ERROR("domain has shut down");
            return -1;
        case SHUTDOWN_suspend:
            return 0;
        case SHUTDOWN_crash:
            ERROR("domain has crashed");
            return -1;
        }
    }

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        goto retry;
    }

    if( ++i < 100 ) {
        ERROR("Retry suspend domain");
        usleep(10000); // 10ms
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
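
/*
 * Note: the dying/crashed/shutdown checks above implement the behaviour
 * described in the changeset comment -- states that can never lead to
 * SHUTDOWN_suspend now fail immediately instead of consuming all 100
 * retry rounds.
 */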
/*
** Map the top-level page of MFNs from the guest. The guest might not have
** finished resuming from a previous restore operation, so we wait a while for
** it to update the MFN to a reasonable value.
*/
static void *map_frame_list_list(int xc_handle, uint32_t dom,
                                 shared_info_t *shinfo)
{
    int count = 100;
    void *p;

    while (count-- && shinfo->arch.pfn_to_mfn_frame_list_list == 0)
        usleep(10000);

    if (shinfo->arch.pfn_to_mfn_frame_list_list == 0) {
        ERROR("Timed out waiting for frame list to be updated.");
        return NULL;
    }

    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             shinfo->arch.pfn_to_mfn_frame_list_list);

    if (p == NULL)
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);

    return p;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    if (pt_levels == 2 && type == XEN_DOMCTL_PFINFO_L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L2TAB) {
        int hstart;
        unsigned long he;

        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
        he = ((const uint64_t *) spage)[hstart];

        if ( ((he >> PAGE_SHIFT) & 0x0fffffff) == m2p_mfn0 ) {
            /* hvirt starts with xen stuff... */
            xen_start = hstart;
        } else if ( hvirt_start != 0xf5800000 ) {
            /* old L2s from before hole was shrunk... */
            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
            he = ((const uint64_t *) spage)[hstart];

            if ( ((he >> PAGE_SHIFT) & 0x0fffffff) == m2p_mfn0 )
                xen_start = hstart;
        }
    }

    if (pt_levels == 4 && type == XEN_DOMCTL_PFINFO_L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale, which
                   is quite feasible under live migration */
                DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
                        type, i, (unsigned long long)pte, mfn);
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return race;
}
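
/*
 * Worked example (illustrative numbers): a present 64-bit PTE of
 * 0x00000000abcd0067 maps mfn 0xabcd0 with flags 0x067; if that mfn's
 * pfn is 0x1234, the canonical PTE written to dpage is
 * (0x1234ULL << PAGE_SHIFT) | 0x067 == 0x0000000001234067.
 */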
static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for (i=0; i < m2p_chunks; i++) {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                    entries, m2p_chunks)) < 0) {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    m2p_mfn0 = entries[0].mfn;

    free(extent_start);
    free(entries);

    return m2p;
}
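
/*
 * Note: m2p_mfn0 (the machine frame backing the first m2p chunk) is kept
 * in a global because canonicalize_pagetable() uses it to recognise the
 * guest linear mapping in PAE L2 tables.
 */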
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;

    /* If no explicit control parameters given, use defaults */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("Could not get domain info");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERROR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("Couldn't map live_shinfo");
        goto out;
    }

    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
                                                   live_shinfo);

    if (!live_p2m_frame_list_list)
        goto out;

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */
    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    /* Set up the mfn_to_pfn table mapping */
    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERROR("Failed to map live M2P table");
        goto out;
    }

    /* Get a local copy of the live p2m_frame_list */
    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += fpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%d] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }
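
    /*
     * fpp (from xg_save_restore.h) is the number of p2m entries that fit
     * in one page, so entry i/fpp of the frame list covers pfns
     * [i, i+fpp); the loop above rewrites each such frame's MFN as a PFN.
     */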
    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("Domain appears not to have suspended");
            goto out;
        }

    }

    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;

    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;

    /* Set up to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
        ERROR("Unable to lock");
        goto out;
    }

    /*
     * Quick belt and braces sanity check.
     */
    {
        int err=0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }

    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ((pt_levels == 3) &&
        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
            !write_exact(io_fd, &chunk_sig, 4) ||
            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
            ERROR("write: extended info");
            goto out;
        }
    }
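
    /*
     * Layout of the extended-info record written above:
     *
     *     ~0UL marker | tot_sz | "vcpu" | chunk_sz | vcpu_guest_context_t
     *
     * The all-ones marker cannot be a valid frame-list entry, which is how
     * the restore side can distinguish this record from the p2m frame list
     * that would otherwise start here.
     */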
    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);
    /* Now write out each data page, canonicalising page tables as we go... */

    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N=0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while( N < max_pfn ){

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking shadow bitmap");
                goto out;
            }

            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if(!is_mapped(pfn_type[batch])) {

                    /*
                    ** not currently in pseudo-physical map -- set bit
                    ** in to_fix since we must send this page in last_iter
                    ** unless it's sent sooner anyhow, or it never enters
                    ** pseudo-physical map (e.g. for ballooned down domains)
                    */

                    set_bit(n, to_fix);
                    continue;
                }

                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }
            if (batch == 0)
                goto skip; /* vanishingly unlikely... */

            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
                ((uint32_t *)pfn_type)[j] = pfn_type[j];

            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
                                       (uint32_t *)pfn_type) )
            {
                ERROR("get_pfn_type_batch failed");
                goto out;
            }

            for ( j = batch-1; j >= 0; j-- )
                pfn_type[j] = ((uint32_t *)pfn_type)[j];

            for ( j = 0; j < batch; j++ )
            {

                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                     XEN_DOMCTL_PFINFO_XTAB )
                {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                            pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j] &
                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                    pfn_batch[j];
            }

            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            /* (j == batch after the loop above) */
            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (Not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if(race && !live)
                        goto out;

                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
  skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if (last_iter && debug) {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if (last_iter)
            break;

        if (live) {
            if (((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor)) {

                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }
    } /* end of while 1 */
1165 DPRINTF("All memory is saved\n");
1167 /* Zero terminate */
1168 i = 0;
1169 if (!write_exact(io_fd, &i, sizeof(int))) {
1170 ERROR("Error when writing to state file (6') (errno %d)", errno);
1171 goto out;
1174 /* Send through a list of all the PFNs that were not in map at the close */
1176 unsigned int i,j;
1177 unsigned long pfntab[1024];
1179 for (i = 0, j = 0; i < max_pfn; i++) {
1180 if (!is_mapped(live_p2m[i]))
1181 j++;
1184 if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
1185 ERROR("Error when writing to state file (6a) (errno %d)", errno);
1186 goto out;
1189 for (i = 0, j = 0; i < max_pfn; ) {
1191 if (!is_mapped(live_p2m[i]))
1192 pfntab[j++] = i;
1194 i++;
1195 if (j == 1024 || i == max_pfn) {
1196 if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
1197 ERROR("Error when writing to state file (6b) (errno %d)",
1198 errno);
1199 goto out;
1201 j = 0;
1207 /* Canonicalise the suspend-record frame number. */
1208 if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
1209 ERROR("Suspend record is not in range of pseudophys map");
1210 goto out;
1213 /* Canonicalise each GDT frame number. */
1214 for ( i = 0; (512*i) < ctxt.gdt_ents; i++ ) {
1215 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1216 ERROR("GDT frame is not in range of pseudophys map");
1217 goto out;
1221 /* Canonicalise the page table base pointer. */
1222 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) {
1223 ERROR("PT base is not in range of pseudophys map");
1224 goto out;
1226 ctxt.ctrlreg[3] =
1227 xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
1229 /*
1230 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1231 */
1232 memcpy(page, live_shinfo, PAGE_SIZE);
1233 ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
1235 if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
1236 !write_exact(io_fd, page, PAGE_SIZE)) {
1237 ERROR("Error when writing to state file (1) (errno %d)", errno);
1238 goto out;
    /* Success! */
    rc = 0;

 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if(live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if(live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */