debuggers.hg

view tools/libxc/xc_domain_save.c @ 0:7d21f7218375

Exact replica of unstable on 051908 + README-this
author Mukesh Rathor
date Mon May 19 15:34:57 2008 -0700 (2008-05-19)
parents
children 5c0bf00e371d
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. Can be overridden by passing
25 ** non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider whether we want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
33 /* max mfn of the whole machine */
34 static unsigned long max_mfn;
36 /* virtual starting address of the hypervisor */
37 static unsigned long hvirt_start;
39 /* #levels of page tables used by the current guest */
40 static unsigned int pt_levels;
42 /* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
43 static unsigned long *qemu_bitmaps[2];
44 static int qemu_active;
45 static int qemu_non_active;
47 /* number of pfns this guest has (i.e. number of entries in the P2M) */
48 static unsigned long p2m_size;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* Live mapping of system MFN to PFN table. */
54 static xen_pfn_t *live_m2p = NULL;
55 static unsigned long m2p_mfn0;
57 /* Address size of the guest */
58 unsigned int guest_width;
60 /* grep fodder: machine_to_phys */
62 #define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
64 #define pfn_to_mfn(_pfn) \
65 ((xen_pfn_t) ((guest_width==8) \
66 ? (((uint64_t *)live_p2m)[(_pfn)]) \
67 : ((((uint32_t *)live_p2m)[(_pfn)]) == 0xffffffffU \
68 ? (-1UL) : (((uint32_t *)live_p2m)[(_pfn)]))))
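/*
 * Illustrative sketch, not part of the original file: the pfn_to_mfn()
 * macro above, unfolded into a function to make the width handling
 * explicit.  A 64-bit guest keeps 8-byte p2m entries; a 32-bit guest
 * keeps 4-byte entries, and the 32-bit invalid marker 0xffffffff is
 * widened to -1UL so callers can compare against a single sentinel.
 * The helper name is hypothetical.
 */
static inline xen_pfn_t pfn_to_mfn_sketch(unsigned long pfn)
{
    if ( guest_width == 8 )
        return (xen_pfn_t)((uint64_t *)live_p2m)[pfn];
    if ( ((uint32_t *)live_p2m)[pfn] == 0xffffffffU )
        return (xen_pfn_t)(-1UL);               /* invalid entry */
    return (xen_pfn_t)((uint32_t *)live_p2m)[pfn];
}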
70 /*
71 * Returns TRUE if the given machine frame number has a unique mapping
72 * in the guest's pseudophysical map.
73 */
74 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
75 (((_mfn) < (max_mfn)) && \
76 ((mfn_to_pfn(_mfn) < (p2m_size)) && \
77 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
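/*
 * Illustrative sketch, not part of the original file: what the
 * MFN_IS_IN_PSEUDOPHYS_MAP() macro above actually tests -- a bounds
 * check followed by a round trip through both tables (mfn -> pfn via
 * the m2p, then pfn -> mfn via the p2m).  Only a frame that maps back
 * to itself is treated as belonging to this guest's pseudophysical map.
 * The helper name is hypothetical.
 */
static inline int mfn_in_pseudophys_sketch(xen_pfn_t mfn)
{
    xen_pfn_t pfn;

    if ( mfn >= max_mfn )               /* outside the machine's frames */
        return 0;
    pfn = mfn_to_pfn(mfn);              /* m2p lookup */
    if ( pfn >= p2m_size )              /* guest has no such pfn */
        return 0;
    return pfn_to_mfn(pfn) == mfn;      /* p2m must map straight back */
}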
79 /*
80 ** During (live) save/migrate, we maintain a number of bitmaps to track
81 ** which pages we have to send, to fixup, and to skip.
82 */
84 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
85 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
86 #define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
88 #define BITMAP_ENTRY(_nr,_bmap) \
89 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
91 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
93 static inline int test_bit (int nr, volatile void * addr)
94 {
95 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
96 }
98 static inline void clear_bit (int nr, volatile void * addr)
99 {
100 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
101 }
103 static inline void set_bit ( int nr, volatile void * addr)
104 {
105 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
106 }
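/*
 * Illustrative sketch, not part of the original file: how the save loop
 * in xc_domain_save() below combines the bitmaps for one pfn.  A page is
 * put in the current batch if it is marked to_send and has not been
 * re-dirtied since the peek (to_skip), or -- on the final pass only --
 * if it is marked to_send or still needs fixing up (to_fix).  The helper
 * name is hypothetical.
 */
static inline int want_page_this_iter(int pfn, int last_iter,
                                      volatile void *send_map,
                                      volatile void *skip_map,
                                      volatile void *fix_map)
{
    return (test_bit(pfn, send_map) && !test_bit(pfn, skip_map)) ||
           (last_iter && test_bit(pfn, send_map)) ||
           (last_iter && test_bit(pfn, fix_map));
}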
108 /* Returns the hamming weight (i.e. the number of bits set) in an N-bit word */
109 static inline unsigned int hweight32(unsigned int w)
110 {
111 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
112 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
113 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
114 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
115 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
116 }
118 static inline int count_bits ( int nr, volatile void *addr)
119 {
120 int i, count = 0;
121 volatile unsigned long *p = (volatile unsigned long *)addr;
122 /* We know that the array is padded to unsigned long. */
123 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
124 count += hweight32(*p);
125 return count;
126 }
128 static uint64_t tv_to_us(struct timeval *new)
129 {
130 return (new->tv_sec * 1000000) + new->tv_usec;
131 }
133 static uint64_t llgettimeofday(void)
134 {
135 struct timeval now;
136 gettimeofday(&now, NULL);
137 return tv_to_us(&now);
138 }
140 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
141 {
142 return (((new->tv_sec - old->tv_sec)*1000000) +
143 (new->tv_usec - old->tv_usec));
144 }
146 static int noncached_write(int fd, int live, void *buffer, int len)
147 {
148 static int write_count = 0;
149 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
151 write_count += len;
152 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
153 {
154 /* Time to discard cache - don't care if this fails */
155 discard_file_cache(fd, 0 /* no flush */);
156 write_count = 0;
157 }
159 return rc;
160 }
162 #ifdef ADAPTIVE_SAVE
164 /*
165 ** We control the rate at which we transmit (or save) to minimize impact
166 ** on running domains (including the target if we're doing live migrate).
167 */
169 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
170 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
172 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
173 #define RATE_TO_BTU 781250
175 /* Amount in bytes we allow ourselves to send in a burst */
176 #define BURST_BUDGET (100*1024)
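/*
 * One plausible derivation of RATE_TO_BTU above (an editorial reading,
 * not a comment from the original author): ratewrite() computes
 * burst_time_us = RATE_TO_BTU / mbit_rate, i.e. the time one burst
 * budget takes at the current rate, treating 1 Mbit as 2^20 bits:
 *
 *     BURST_BUDGET * 8      = 100 * 1024 * 8   = 819200 bits
 *     819200 / 2^20         = 0.78125 Mbit
 *     0.78125 * 10^6 usec   = 781250           = RATE_TO_BTU
 *
 * so sending BURST_BUDGET bytes at mbit_rate Mb/s takes roughly
 * 781250 / mbit_rate microseconds.
 */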
178 /* We keep track of the current and previous transmission rate */
179 static int mbit_rate, ombit_rate = 0;
181 /* Have we reached the maximum transmission rate? */
182 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
184 static inline void initialize_mbit_rate()
185 {
186 mbit_rate = START_MBIT_RATE;
187 }
189 static int ratewrite(int io_fd, int live, void *buf, int n)
190 {
191 static int budget = 0;
192 static int burst_time_us = -1;
193 static struct timeval last_put = { 0 };
194 struct timeval now;
195 struct timespec delay;
196 long long delta;
198 if ( START_MBIT_RATE == 0 )
199 return noncached_write(io_fd, live, buf, n);
201 budget -= n;
202 if ( budget < 0 )
203 {
204 if ( mbit_rate != ombit_rate )
205 {
206 burst_time_us = RATE_TO_BTU / mbit_rate;
207 ombit_rate = mbit_rate;
208 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
209 mbit_rate, BURST_BUDGET, burst_time_us);
210 }
211 if ( last_put.tv_sec == 0 )
212 {
213 budget += BURST_BUDGET;
214 gettimeofday(&last_put, NULL);
215 }
216 else
217 {
218 while ( budget < 0 )
219 {
220 gettimeofday(&now, NULL);
221 delta = tv_delta(&now, &last_put);
222 while ( delta > burst_time_us )
223 {
224 budget += BURST_BUDGET;
225 last_put.tv_usec += burst_time_us;
226 if ( last_put.tv_usec > 1000000 )
227 {
228 last_put.tv_usec -= 1000000;
229 last_put.tv_sec++;
230 }
231 delta -= burst_time_us;
232 }
233 if ( budget > 0 )
234 break;
235 delay.tv_sec = 0;
236 delay.tv_nsec = 1000 * (burst_time_us - delta);
237 while ( delay.tv_nsec > 0 )
238 if ( nanosleep(&delay, &delay) == 0 )
239 break;
240 }
241 }
242 }
243 return noncached_write(io_fd, live, buf, n);
244 }
246 #else /* ! ADAPTIVE SAVE */
248 #define RATE_IS_MAX() (0)
249 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
250 #define initialize_mbit_rate()
252 #endif
254 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
255 xc_shadow_op_stats_t *stats, int print)
256 {
257 static struct timeval wall_last;
258 static long long d0_cpu_last;
259 static long long d1_cpu_last;
261 struct timeval wall_now;
262 long long wall_delta;
263 long long d0_cpu_now, d0_cpu_delta;
264 long long d1_cpu_now, d1_cpu_delta;
266 gettimeofday(&wall_now, NULL);
268 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
269 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
271 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
272 DPRINTF("ARRHHH!!\n");
274 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
275 if ( wall_delta == 0 )
276 wall_delta = 1;
278 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
279 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
281 if ( print )
282 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
283 "dirtied %dMb/s %" PRId32 " pages\n",
284 wall_delta,
285 (int)((d0_cpu_delta*100)/wall_delta),
286 (int)((d1_cpu_delta*100)/wall_delta),
287 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
288 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
289 stats->dirty_count);
291 #ifdef ADAPTIVE_SAVE
292 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
293 {
294 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
295 + 50;
296 if ( mbit_rate > MAX_MBIT_RATE )
297 mbit_rate = MAX_MBIT_RATE;
298 }
299 #endif
301 d0_cpu_last = d0_cpu_now;
302 d1_cpu_last = d1_cpu_now;
303 wall_last = wall_now;
305 return 0;
306 }
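/*
 * A note on the Mb/s arithmetic above (an editorial reading, not a
 * comment from the original author): wall_delta is in milliseconds, so
 * for B bytes moved in that interval,
 *
 *     rate  = (B * 8) bits / (wall_delta / 1000) s / 10^6
 *           = (B * 8) / (wall_delta * 1000)  Mb/s
 *           = B / (wall_delta * (1000/8))    Mb/s
 *
 * which is the expression used for both the "sent" and "dirtied" rates.
 */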
309 static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
310 unsigned long *arr, int runs)
311 {
312 long long start, now;
313 xc_shadow_op_stats_t stats;
314 int j;
316 start = llgettimeofday();
318 for ( j = 0; j < runs; j++ )
319 {
320 int i;
322 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
323 arr, p2m_size, NULL, 0, NULL);
324 DPRINTF("#Flush\n");
325 for ( i = 0; i < 40; i++ )
326 {
327 usleep(50000);
328 now = llgettimeofday();
329 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
330 NULL, 0, NULL, 0, &stats);
331 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
332 ((now-start)+500)/1000,
333 stats.fault_count, stats.dirty_count);
334 }
335 }
337 return -1;
338 }
341 static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
342 int dom, xc_dominfo_t *info)
343 {
344 int i = 0;
346 if ( !(*suspend)(dom) )
347 {
348 ERROR("Suspend request failed");
349 return -1;
350 }
352 retry:
354 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
355 {
356 ERROR("Could not get domain info");
357 return -1;
358 }
360 if ( info->dying )
361 {
362 ERROR("domain is dying");
363 return -1;
364 }
366 if ( info->crashed )
367 {
368 ERROR("domain has crashed");
369 return -1;
370 }
372 if ( info->shutdown )
373 {
374 switch ( info->shutdown_reason )
375 {
376 case SHUTDOWN_poweroff:
377 case SHUTDOWN_reboot:
378 ERROR("domain has shut down");
379 return -1;
380 case SHUTDOWN_suspend:
381 return 0;
382 case SHUTDOWN_crash:
383 ERROR("domain has crashed");
384 return -1;
385 }
386 }
388 if ( info->paused )
389 {
390 /* Try unpausing domain, wait, and retest. */
391 xc_domain_unpause( xc_handle, dom );
392 ERROR("Domain was paused. Wait and re-test.");
393 usleep(10000); /* 10ms */
394 goto retry;
395 }
397 if ( ++i < 100 )
398 {
399 ERROR("Retry suspend domain");
400 usleep(10000); /* 10ms */
401 goto retry;
402 }
404 ERROR("Unable to suspend domain.");
406 return -1;
407 }
409 /*
410 ** Map the top-level page of MFNs from the guest. The guest might not have
411 ** finished resuming from a previous restore operation, so we wait a while for
412 ** it to update the MFN to a reasonable value.
413 */
414 static void *map_frame_list_list(int xc_handle, uint32_t dom,
415 shared_info_either_t *shinfo)
416 {
417 int count = 100;
418 void *p;
419 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
421 while ( count-- && (fll == 0) )
422 {
423 usleep(10000);
424 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
425 }
427 if ( fll == 0 )
428 {
429 ERROR("Timed out waiting for frame list updated.");
430 return NULL;
431 }
433 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
434 if ( p == NULL )
435 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
437 return p;
438 }
440 /*
441 ** During transfer (or in the state file), all page-table pages must be
442 ** converted into a 'canonical' form where references to actual mfns
443 ** are replaced with references to the corresponding pfns.
444 **
445 ** This function performs the appropriate conversion, taking into account
446 ** which entries do not require canonicalization (in particular, those
447 ** entries which map the virtual address reserved for the hypervisor).
448 */
449 static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
450 const void *spage, void *dpage)
451 {
453 int i, pte_last, xen_start, xen_end, race = 0;
454 uint64_t pte;
456 /*
457 ** We need to determine which entries in this page table hold
458 ** reserved hypervisor mappings. This depends on the current
459 ** page table type as well as the number of paging levels.
460 */
461 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
463 if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
464 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
466 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
467 xen_start = L3_PAGETABLE_ENTRIES_PAE;
469 /*
470 ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
471 ** We can spot this by looking for the guest's mapping of the m2p.
472 ** Guests must ensure that this check will fail for other L2s.
473 */
474 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
475 {
476 int hstart;
477 uint64_t he;
479 hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
480 he = ((const uint64_t *) spage)[hstart];
482 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
483 {
484 /* hvirt starts with xen stuff... */
485 xen_start = hstart;
486 }
487 else if ( hvirt_start != 0xf5800000 )
488 {
489 /* old L2s from before hole was shrunk... */
490 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
491 he = ((const uint64_t *) spage)[hstart];
492 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
493 xen_start = hstart;
494 }
495 }
497 if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
498 {
499 /*
500 ** XXX SMH: should compute these from hvirt_start (which we have)
501 ** and hvirt_end (which we don't)
502 */
503 xen_start = 256;
504 xen_end = 272;
505 }
507 /* Now iterate through the page table, canonicalizing each PTE */
508 for (i = 0; i < pte_last; i++ )
509 {
510 unsigned long pfn, mfn;
512 if ( pt_levels == 2 )
513 pte = ((const uint32_t*)spage)[i];
514 else
515 pte = ((const uint64_t*)spage)[i];
517 if ( (i >= xen_start) && (i < xen_end) )
518 pte = 0;
520 if ( pte & _PAGE_PRESENT )
521 {
522 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
523 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
524 {
525 /* This will happen if the type info is stale, which
526 is quite feasible under live migration */
527 pfn = 0; /* zap it - we'll retransmit this page later */
528 /* XXX: We can't spot Xen mappings in compat-mode L2es
529 * from 64-bit tools, but the only thing in them is the
530 * compat m2p, so we quietly zap them. This doesn't
531 * count as a race, so don't report it. */
532 if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
533 && sizeof (unsigned long) > guest_width) )
534 race = 1; /* inform the caller; fatal if !live */
535 }
536 else
537 pfn = mfn_to_pfn(mfn);
539 pte &= ~MADDR_MASK_X86;
540 pte |= (uint64_t)pfn << PAGE_SHIFT;
542 /*
543 * PAE guest L3Es can contain these flags when running on
544 * a 64bit hypervisor. We zap these here to avoid any
545 * surprise at restore time...
546 */
547 if ( (pt_levels == 3) &&
548 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
549 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
550 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
551 }
553 if ( pt_levels == 2 )
554 ((uint32_t*)dpage)[i] = pte;
555 else
556 ((uint64_t*)dpage)[i] = pte;
557 }
559 return race;
560 }
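/*
 * Illustrative sketch, not part of the original file: the per-entry
 * rewrite performed by canonicalize_pagetable() above, in isolation and
 * for a 64-bit PTE only.  The machine frame number in a present entry is
 * swapped for its pseudophysical frame number while the flag bits are
 * kept; entries whose mfn is not in the pseudophys map are zapped to
 * pfn 0 and resent later.  The race bookkeeping and the PAE L3 flag
 * clearing are omitted here, and the helper name is hypothetical.
 */
static inline uint64_t canonicalize_pte_sketch(uint64_t pte)
{
    unsigned long mfn, pfn;

    if ( !(pte & _PAGE_PRESENT) )
        return pte;                          /* nothing to translate */

    mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
    pfn = MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ? mfn_to_pfn(mfn) : 0;

    pte &= ~MADDR_MASK_X86;                  /* drop the machine address */
    pte |= (uint64_t)pfn << PAGE_SHIFT;      /* substitute the pfn */
    return pte;
}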
562 static xen_pfn_t *xc_map_m2p(int xc_handle,
563 unsigned long max_mfn,
564 int prot)
565 {
566 struct xen_machphys_mfn_list xmml;
567 privcmd_mmap_entry_t *entries;
568 unsigned long m2p_chunks, m2p_size;
569 xen_pfn_t *m2p;
570 xen_pfn_t *extent_start;
571 int i, rc;
573 m2p_size = M2P_SIZE(max_mfn);
574 m2p_chunks = M2P_CHUNKS(max_mfn);
576 xmml.max_extents = m2p_chunks;
577 if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
578 {
579 ERROR("failed to allocate space for m2p mfns");
580 return NULL;
581 }
582 set_xen_guest_handle(xmml.extent_start, extent_start);
584 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
585 (xmml.nr_extents != m2p_chunks) )
586 {
587 ERROR("xc_get_m2p_mfns");
588 return NULL;
589 }
591 if ( (m2p = mmap(NULL, m2p_size, prot,
592 MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
593 {
594 ERROR("failed to mmap m2p");
595 return NULL;
596 }
598 if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
599 {
600 ERROR("failed to allocate space for mmap entries");
601 return NULL;
602 }
604 for ( i = 0; i < m2p_chunks; i++ )
605 {
606 entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
607 entries[i].mfn = extent_start[i];
608 entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
609 }
611 if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
612 entries, m2p_chunks)) < 0 )
613 {
614 ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
615 return NULL;
616 }
618 m2p_mfn0 = entries[0].mfn;
620 free(extent_start);
621 free(entries);
623 return m2p;
624 }
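/*
 * Illustrative sketch, not part of the original file: the in-place width
 * conversion that map_and_save_p2m_table() below applies to the frame
 * lists copied from the guest.  When the guest's unsigned long is
 * narrower than the tools' (32-bit guest, 64-bit tools), each 32-bit
 * entry is widened into a 64-bit slot of the same buffer; walking
 * backwards guarantees every source entry is read before the growing
 * destination overwrites it.  The helper name is hypothetical.
 */
static void widen_entries_inplace_sketch(unsigned long *buf, int nr_entries)
{
    int i;

    for ( i = nr_entries - 1; i >= 0; i-- )
        buf[i] = ((uint32_t *)buf)[i];
}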
627 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
628 int io_fd,
629 uint32_t dom,
630 unsigned long p2m_size,
631 shared_info_either_t *live_shinfo)
632 {
633 vcpu_guest_context_either_t ctxt;
635 /* Double and single indirect references to the live P2M table */
636 void *live_p2m_frame_list_list = NULL;
637 void *live_p2m_frame_list = NULL;
639 /* Copies of the above. */
640 xen_pfn_t *p2m_frame_list_list = NULL;
641 xen_pfn_t *p2m_frame_list = NULL;
643 /* The mapping of the live p2m table itself */
644 xen_pfn_t *p2m = NULL;
646 int i, success = 0;
648 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
649 live_shinfo);
650 if ( !live_p2m_frame_list_list )
651 goto out;
653 /* Get a local copy of the live_P2M_frame_list_list */
654 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
655 {
656 ERROR("Couldn't allocate p2m_frame_list_list array");
657 goto out;
658 }
659 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
661 /* Canonicalize guest's unsigned long vs ours */
662 if ( guest_width > sizeof(unsigned long) )
663 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
664 if ( i < PAGE_SIZE/guest_width )
665 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
666 else
667 p2m_frame_list_list[i] = 0;
668 else if ( guest_width < sizeof(unsigned long) )
669 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
670 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
672 live_p2m_frame_list =
673 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
674 p2m_frame_list_list,
675 P2M_FLL_ENTRIES);
676 if ( !live_p2m_frame_list )
677 {
678 ERROR("Couldn't map p2m_frame_list");
679 goto out;
680 }
682 /* Get a local copy of the live_P2M_frame_list */
683 if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
684 {
685 ERROR("Couldn't allocate p2m_frame_list array");
686 goto out;
687 }
688 memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
689 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
691 /* Canonicalize guest's unsigned long vs ours */
692 if ( guest_width > sizeof(unsigned long) )
693 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
694 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
695 else if ( guest_width < sizeof(unsigned long) )
696 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
697 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
700 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
701 the guest must not change which frames are used for this purpose.
702 (it's not clear why it would want to change them, and we'll be OK
703 from a safety POV anyhow.) */
705 p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
706 p2m_frame_list,
707 P2M_FL_ENTRIES);
708 if ( !p2m )
709 {
710 ERROR("Couldn't map p2m table");
711 goto out;
712 }
713 live_p2m = p2m; /* So that translation macros will work */
715 /* Canonicalise the pfn-to-mfn table frame-number list. */
716 for ( i = 0; i < p2m_size; i += FPP )
717 {
718 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
719 {
720 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
721 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
722 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn);
723 if ( p2m_frame_list[i/FPP] < max_mfn )
724 {
725 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
726 (uint64_t)p2m_frame_list[i/FPP],
727 (uint64_t)live_m2p[p2m_frame_list[i/FPP]]);
728 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
729 (uint64_t)live_m2p[p2m_frame_list[i/FPP]],
730 (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]);
732 }
733 goto out;
734 }
735 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
736 }
738 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
739 {
740 ERROR("Could not get vcpu context");
741 goto out;
742 }
744 /*
745 * Write an extended-info structure to inform the restore code that
746 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
747 * slow paths in the restore code.
748 */
749 {
750 unsigned long signature = ~0UL;
751 uint32_t chunk1_sz = ((guest_width==8)
752 ? sizeof(ctxt.x64)
753 : sizeof(ctxt.x32));
754 uint32_t chunk2_sz = 0;
755 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
756 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
757 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
758 write_exact(io_fd, "vcpu", 4) ||
759 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
760 write_exact(io_fd, &ctxt, chunk1_sz) ||
761 write_exact(io_fd, "extv", 4) ||
762 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
763 {
764 PERROR("write: extended info");
765 goto out;
766 }
767 }
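/*
 * Layout of the extended-info record emitted just above (an editorial
 * reading of the writes, not an authoritative format description):
 *
 *     signature   unsigned long    ~0UL, marks an extended-info block
 *     tot_sz      uint32_t         total bytes that follow
 *     "vcpu"      4 bytes          tag for chunk 1
 *     chunk1_sz   uint32_t         size of the VCPU0 context
 *     ctxt        chunk1_sz bytes  VCPU0 context (32- or 64-bit layout)
 *     "extv"      4 bytes          tag for chunk 2 (empty marker chunk)
 *     chunk2_sz   uint32_t         0
 */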
769 if ( write_exact(io_fd, p2m_frame_list,
770 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
771 {
772 PERROR("write: p2m_frame_list");
773 goto out;
774 }
776 success = 1;
778 out:
780 if ( !success && p2m )
781 munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
783 if ( live_p2m_frame_list_list )
784 munmap(live_p2m_frame_list_list, PAGE_SIZE);
786 if ( live_p2m_frame_list )
787 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
789 if ( p2m_frame_list_list )
790 free(p2m_frame_list_list);
792 if ( p2m_frame_list )
793 free(p2m_frame_list);
795 return success ? p2m : NULL;
796 }
800 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
801 uint32_t max_factor, uint32_t flags, int (*suspend)(int),
802 int hvm, void *(*init_qemu_maps)(int, unsigned),
803 void (*qemu_flip_buffer)(int, int))
804 {
805 xc_dominfo_t info;
806 DECLARE_DOMCTL;
808 int rc = 1, frc, i, j, last_iter, iter = 0;
809 int live = (flags & XCFLAGS_LIVE);
810 int debug = (flags & XCFLAGS_DEBUG);
811 int race = 0, sent_last_iter, skip_this_iter;
813 /* The new domain's shared-info frame number. */
814 unsigned long shared_info_frame;
816 /* A copy of the CPU context of the guest. */
817 vcpu_guest_context_either_t ctxt;
819 /* A table containing the type of each PFN (/not/ MFN!). */
820 unsigned long *pfn_type = NULL;
821 unsigned long *pfn_batch = NULL;
823 /* A copy of one frame of guest memory. */
824 char page[PAGE_SIZE];
826 /* Live mapping of shared info structure */
827 shared_info_either_t *live_shinfo = NULL;
829 /* base of the region in which domain memory is mapped */
830 unsigned char *region_base = NULL;
832 /* bitmap of pages:
833 - that should be sent this iteration (unless later marked as skip);
834 - to skip this iteration because already dirty;
835 - to fixup by sending at the end if not already resent; */
836 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
838 xc_shadow_op_stats_t stats;
840 unsigned long needed_to_fix = 0;
841 unsigned long total_sent = 0;
843 uint64_t vcpumap = 1ULL;
845 /* HVM: a buffer for holding HVM context */
846 uint32_t hvm_buf_size = 0;
847 uint8_t *hvm_buf = NULL;
849 /* HVM: magic frames for ioreqs and xenstore comms. */
850 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
852 unsigned long mfn;
854 /* If no explicit control parameters given, use defaults */
855 max_iters = max_iters ? : DEF_MAX_ITERS;
856 max_factor = max_factor ? : DEF_MAX_FACTOR;
858 initialize_mbit_rate();
860 if ( !get_platform_info(xc_handle, dom,
861 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
862 {
863 ERROR("Unable to get platform info.");
864 return 1;
865 }
867 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
868 {
869 ERROR("Could not get domain info");
870 return 1;
871 }
873 shared_info_frame = info.shared_info_frame;
875 /* Map the shared info frame */
876 if ( !hvm )
877 {
878 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
879 PROT_READ, shared_info_frame);
880 if ( !live_shinfo )
881 {
882 ERROR("Couldn't map live_shinfo");
883 goto out;
884 }
885 }
887 /* Get the size of the P2M table */
888 p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
890 /* Domain is still running at this point */
891 if ( live )
892 {
893 /* Live suspend. Enable log-dirty mode. */
894 if ( xc_shadow_control(xc_handle, dom,
895 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
896 NULL, 0, NULL, 0, NULL) < 0 )
897 {
898 /* log-dirty already enabled? There's no test op,
899 so attempt to disable then reenable it */
900 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
901 NULL, 0, NULL, 0, NULL);
902 if ( frc >= 0 )
903 {
904 frc = xc_shadow_control(xc_handle, dom,
905 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
906 NULL, 0, NULL, 0, NULL);
907 }
909 if ( frc < 0 )
910 {
911 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
912 goto out;
913 }
914 }
916 if ( hvm )
917 {
918 /* Get qemu-dm logging dirty pages too */
919 void *seg = init_qemu_maps(dom, BITMAP_SIZE);
920 qemu_bitmaps[0] = seg;
921 qemu_bitmaps[1] = seg + BITMAP_SIZE;
922 qemu_active = 0;
923 qemu_non_active = 1;
924 }
925 }
926 else
927 {
928 /* This is a non-live suspend. Suspend the domain. */
929 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
930 {
931 ERROR("Domain appears not to have suspended");
932 goto out;
933 }
934 }
936 last_iter = !live;
938 /* pretend we sent all the pages last iteration */
939 sent_last_iter = p2m_size;
941 /* Setup to_send / to_fix and to_skip bitmaps */
942 to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
943 to_fix = calloc(1, BITMAP_SIZE);
944 to_skip = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
946 if ( !to_send || !to_fix || !to_skip )
947 {
948 ERROR("Couldn't allocate to_send array");
949 goto out;
950 }
952 memset(to_send, 0xff, BITMAP_SIZE);
954 if ( lock_pages(to_send, BITMAP_SIZE) )
955 {
956 ERROR("Unable to lock to_send");
957 return 1;
958 }
960 /* (to fix is local only) */
961 if ( lock_pages(to_skip, BITMAP_SIZE) )
962 {
963 ERROR("Unable to lock to_skip");
964 return 1;
965 }
967 if ( hvm )
968 {
969 /* Need another buffer for HVM context */
970 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
971 if ( hvm_buf_size == -1 )
972 {
973 ERROR("Couldn't get HVM context size from Xen");
974 goto out;
975 }
976 hvm_buf = malloc(hvm_buf_size);
977 if ( !hvm_buf )
978 {
979 ERROR("Couldn't allocate memory");
980 goto out;
981 }
982 }
984 analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
986 pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
987 MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
988 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
989 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
990 {
991 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
992 errno = ENOMEM;
993 goto out;
994 }
995 memset(pfn_type, 0,
996 ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
998 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
999 {
1000 ERROR("Unable to lock pfn_type array");
1001 goto out;
1004 /* Setup the mfn_to_pfn table mapping */
1005 if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
1007 ERROR("Failed to map live M2P table");
1008 goto out;
1011 /* Start writing out the saved-domain record. */
1012 if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
1014 PERROR("write: p2m_size");
1015 goto out;
1018 if ( !hvm )
1020 int err = 0;
1022 /* Map the P2M table, and write the list of P2M frames */
1023 live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
1024 p2m_size, live_shinfo);
1025 if ( live_p2m == NULL )
1027 ERROR("Failed to map/save the p2m frame list");
1028 goto out;
1031 /*
1032 * Quick belt and braces sanity check.
1033 */
1035 for ( i = 0; i < p2m_size; i++ )
1037 mfn = pfn_to_mfn(i);
1038 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
1040 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
1041 mfn, mfn_to_pfn(mfn));
1042 err++;
1045 DPRINTF("Had %d unexplained entries in p2m table\n", err);
1048 print_stats(xc_handle, dom, 0, &stats, 0);
1050 /* Now write out each data page, canonicalising page tables as we go... */
1051 for ( ; ; )
1053 unsigned int prev_pc, sent_this_iter, N, batch, run;
1055 iter++;
1056 sent_this_iter = 0;
1057 skip_this_iter = 0;
1058 prev_pc = 0;
1059 N = 0;
1061 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1063 while ( N < p2m_size )
1065 unsigned int this_pc = (N * 100) / p2m_size;
1067 if ( (this_pc - prev_pc) >= 5 )
1069 DPRINTF("\b\b\b\b%3d%%", this_pc);
1070 prev_pc = this_pc;
1073 if ( !last_iter )
1075 /* Slightly wasteful to peek the whole array every time,
1076 but this is fast enough for the moment. */
1077 frc = xc_shadow_control(
1078 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1079 p2m_size, NULL, 0, NULL);
1080 if ( frc != p2m_size )
1082 ERROR("Error peeking shadow bitmap");
1083 goto out;
1087 /* load pfn_type[] with the mfn of all the pages we're doing in
1088 this batch. */
1089 for ( batch = 0;
1090 (batch < MAX_BATCH_SIZE) && (N < p2m_size);
1091 N++ )
1093 int n = N;
1095 if ( debug )
1097 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1098 iter, (unsigned long)n,
1099 hvm ? 0 : pfn_to_mfn(n),
1100 test_bit(n, to_send));
1101 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1102 DPRINTF(" [mfn]= %08lx",
1103 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1104 DPRINTF("\n");
1106 if ( !last_iter &&
1107 test_bit(n, to_send) &&
1108 test_bit(n, to_skip) )
1109 skip_this_iter++; /* stats keeping */
1111 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1112 (test_bit(n, to_send) && last_iter) ||
1113 (test_bit(n, to_fix) && last_iter)) )
1114 continue;
1116 /* Skip PFNs that aren't really there */
1117 if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
1118 || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
1119 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
1120 continue;
1122 /*
1123 ** we get here if:
1124 ** 1. page is marked to_send & hasn't already been re-dirtied
1125 ** 2. (ignore to_skip in last iteration)
1126 ** 3. add in pages that still need fixup (net bufs)
1127 */
1129 pfn_batch[batch] = n;
1131 /* Hypercall interfaces operate in PFNs for HVM guests
1132 * and MFNs for PV guests */
1133 if ( hvm )
1134 pfn_type[batch] = n;
1135 else
1136 pfn_type[batch] = pfn_to_mfn(n);
1138 if ( !is_mapped(pfn_type[batch]) )
1140 /*
1141 ** not currently in pseudo-physical map -- set bit
1142 ** in to_fix since we must send this page in last_iter
1143 ** unless it's sent sooner anyhow, or it never enters
1144 ** pseudo-physical map (e.g. for ballooned down doms)
1145 */
1146 set_bit(n, to_fix);
1147 continue;
1150 if ( last_iter &&
1151 test_bit(n, to_fix) &&
1152 !test_bit(n, to_send) )
1154 needed_to_fix++;
1155 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1156 iter, n, pfn_type[batch]);
1159 clear_bit(n, to_fix);
1161 batch++;
1164 if ( batch == 0 )
1165 goto skip; /* vanishingly unlikely... */
1167 region_base = xc_map_foreign_batch(
1168 xc_handle, dom, PROT_READ, pfn_type, batch);
1169 if ( region_base == NULL )
1171 ERROR("map batch failed");
1172 goto out;
1175 if ( !hvm )
1177 /* Get page types */
1178 for ( j = 0; j < batch; j++ )
1179 ((uint32_t *)pfn_type)[j] = pfn_type[j];
1180 if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
1181 (uint32_t *)pfn_type) )
1183 ERROR("get_pfn_type_batch failed");
1184 goto out;
1186 for ( j = batch-1; j >= 0; j-- )
1187 pfn_type[j] = ((uint32_t *)pfn_type)[j];
1189 for ( j = 0; j < batch; j++ )
1192 if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
1193 XEN_DOMCTL_PFINFO_XTAB )
1195 DPRINTF("type fail: page %i mfn %08lx\n",
1196 j, pfn_type[j]);
1197 continue;
1200 if ( debug )
1201 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1202 " sum= %08lx\n",
1203 iter,
1204 (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1205 pfn_batch[j],
1206 pfn_type[j],
1207 mfn_to_pfn(pfn_type[j] &
1208 ~XEN_DOMCTL_PFINFO_LTAB_MASK),
1209 csum_page(region_base + (PAGE_SIZE*j)));
1211 /* canonicalise mfn->pfn */
1212 pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1213 pfn_batch[j];
1217 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1219 PERROR("Error when writing to state file (2)");
1220 goto out;
1223 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1225 PERROR("Error when writing to state file (3)");
1226 goto out;
1229 /* entering this loop, pfn_type is now in pfns (Not mfns) */
1230 run = 0;
1231 for ( j = 0; j < batch; j++ )
1233 unsigned long pfn, pagetype;
1234 void *spage = (char *)region_base + (PAGE_SIZE*j);
1236 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1237 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1239 if ( pagetype != 0 )
1241 /* If the page is not a normal data page, write out any
1242 run of pages we may have previously accumulated */
1243 if ( run )
1245 if ( ratewrite(io_fd, live,
1246 (char*)region_base+(PAGE_SIZE*(j-run)),
1247 PAGE_SIZE*run) != PAGE_SIZE*run )
1249 ERROR("Error when writing to state file (4a)"
1250 " (errno %d)", errno);
1251 goto out;
1253 run = 0;
1257 /* skip pages that aren't present */
1258 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1259 continue;
1261 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1263 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1264 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1266 /* We have a pagetable page: need to rewrite it. */
1267 race =
1268 canonicalize_pagetable(pagetype, pfn, spage, page);
1270 if ( race && !live )
1272 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1273 pagetype);
1274 goto out;
1277 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1279 ERROR("Error when writing to state file (4b)"
1280 " (errno %d)", errno);
1281 goto out;
1284 else
1286 /* We have a normal page: accumulate it for writing. */
1287 run++;
1289 } /* end of the write out for this batch */
1291 if ( run )
1293 /* write out the last accumulated run of pages */
1294 if ( ratewrite(io_fd, live,
1295 (char*)region_base+(PAGE_SIZE*(j-run)),
1296 PAGE_SIZE*run) != PAGE_SIZE*run )
1298 ERROR("Error when writing to state file (4c)"
1299 " (errno %d)", errno);
1300 goto out;
1304 sent_this_iter += batch;
1306 munmap(region_base, batch*PAGE_SIZE);
1308 } /* end of this while loop for this iteration */
1310 skip:
1312 total_sent += sent_this_iter;
1314 DPRINTF("\r %d: sent %d, skipped %d, ",
1315 iter, sent_this_iter, skip_this_iter );
1317 if ( last_iter )
1319 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1321 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1322 total_sent, ((float)total_sent)/p2m_size );
1323 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1326 if ( last_iter && debug )
1328 int minusone = -1;
1329 memset(to_send, 0xff, BITMAP_SIZE);
1330 debug = 0;
1331 DPRINTF("Entering debug resend-all mode\n");
1333 /* send "-1" to put receiver into debug mode */
1334 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1336 PERROR("Error when writing to state file (6)");
1337 goto out;
1340 continue;
1343 if ( last_iter )
1344 break;
1346 if ( live )
1348 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1349 (iter >= max_iters) ||
1350 (sent_this_iter+skip_this_iter < 50) ||
1351 (total_sent > p2m_size*max_factor) )
1353 DPRINTF("Start last iteration\n");
1354 last_iter = 1;
1356 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
1358 ERROR("Domain appears not to have suspended");
1359 goto out;
1362 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1365 if ( xc_shadow_control(xc_handle, dom,
1366 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1367 p2m_size, NULL, 0, &stats) != p2m_size )
1369 ERROR("Error flushing shadow PT");
1370 goto out;
1373 if ( hvm )
1375 /* Pull in the dirty bits from qemu-dm too */
1376 if ( !last_iter )
1378 qemu_active = qemu_non_active;
1379 qemu_non_active = qemu_active ? 0 : 1;
1380 qemu_flip_buffer(dom, qemu_active);
1381 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1383 to_send[j] |= qemu_bitmaps[qemu_non_active][j];
1384 qemu_bitmaps[qemu_non_active][j] = 0;
1387 else
1389 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1390 to_send[j] |= qemu_bitmaps[qemu_active][j];
1394 sent_last_iter = sent_this_iter;
1396 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1399 } /* end of infinite for loop */
1401 DPRINTF("All memory is saved\n");
1404 struct {
1405 int minustwo;
1406 int max_vcpu_id;
1407 uint64_t vcpumap;
1408 } chunk = { -2, info.max_vcpu_id };
1410 if ( info.max_vcpu_id >= 64 )
1412 ERROR("Too many VCPUS in guest!");
1413 goto out;
1416 for ( i = 1; i <= info.max_vcpu_id; i++ )
1418 xc_vcpuinfo_t vinfo;
1419 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1420 vinfo.online )
1421 vcpumap |= 1ULL << i;
1424 chunk.vcpumap = vcpumap;
1425 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1427 PERROR("Error when writing to state file");
1428 goto out;
1432 /* Zero terminate */
1433 i = 0;
1434 if ( write_exact(io_fd, &i, sizeof(int)) )
1436 PERROR("Error when writing to state file (6')");
1437 goto out;
1440 if ( hvm )
1442 uint32_t rec_size;
1444 /* Save magic-page locations. */
1445 memset(magic_pfns, 0, sizeof(magic_pfns));
1446 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1447 (unsigned long *)&magic_pfns[0]);
1448 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1449 (unsigned long *)&magic_pfns[1]);
1450 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1451 (unsigned long *)&magic_pfns[2]);
1452 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1454 PERROR("Error when writing to state file (7)");
1455 goto out;
1458 /* Get HVM context from Xen and save it too */
1459 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1460 hvm_buf_size)) == -1 )
1462 ERROR("HVM:Could not get hvm buffer");
1463 goto out;
1466 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1468 PERROR("error write hvm buffer size");
1469 goto out;
1472 if ( write_exact(io_fd, hvm_buf, rec_size) )
1474 PERROR("write HVM info failed!\n");
1475 goto out;
1478 /* HVM guests are done now */
1479 rc = 0;
1480 goto out;
1483 /* PV guests only from now on */
1485 /* Send through a list of all the PFNs that were not in map at the close */
1487 unsigned int i,j;
1488 unsigned long pfntab[1024];
1490 for ( i = 0, j = 0; i < p2m_size; i++ )
1492 if ( !is_mapped(pfn_to_mfn(i)) )
1493 j++;
1496 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1498 PERROR("Error when writing to state file (6a)");
1499 goto out;
1502 for ( i = 0, j = 0; i < p2m_size; )
1504 if ( !is_mapped(pfn_to_mfn(i)) )
1505 pfntab[j++] = i;
1507 i++;
1508 if ( (j == 1024) || (i == p2m_size) )
1510 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1512 PERROR("Error when writing to state file (6b)");
1513 goto out;
1515 j = 0;
1520 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
1522 ERROR("Could not get vcpu context");
1523 goto out;
1526 /* Canonicalise the suspend-record frame number. */
1527 mfn = GET_FIELD(&ctxt, user_regs.edx);
1528 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1530 ERROR("Suspend record is not in range of pseudophys map");
1531 goto out;
1533 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1535 for ( i = 0; i <= info.max_vcpu_id; i++ )
1537 if ( !(vcpumap & (1ULL << i)) )
1538 continue;
1540 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
1542 ERROR("No context for VCPU%d", i);
1543 goto out;
1546 /* Canonicalise each GDT frame number. */
1547 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1549 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1550 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1552 ERROR("GDT frame is not in range of pseudophys map");
1553 goto out;
1555 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1558 /* Canonicalise the page table base pointer. */
1559 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(
1560 GET_FIELD(&ctxt, ctrlreg[3]))) )
1562 ERROR("PT base is not in range of pseudophys map");
1563 goto out;
1565 SET_FIELD(&ctxt, ctrlreg[3],
1566 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3])))));
1568 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1569 if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1571 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
1573 ERROR("PT base is not in range of pseudophys map");
1574 goto out;
1576 /* Least-significant bit means 'valid PFN'. */
1577 ctxt.x64.ctrlreg[1] = 1 |
1578 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
1581 if ( write_exact(io_fd, &ctxt, ((guest_width==8)
1582 ? sizeof(ctxt.x64)
1583 : sizeof(ctxt.x32))) )
1585 PERROR("Error when writing to state file (1)");
1586 goto out;
1589 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1590 domctl.domain = dom;
1591 domctl.u.ext_vcpucontext.vcpu = i;
1592 if ( xc_domctl(xc_handle, &domctl) < 0 )
1594 ERROR("No extended context for VCPU%d", i);
1595 goto out;
1597 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1599 PERROR("Error when writing to state file (2)");
1600 goto out;
1604 /*
1605 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1606 */
1607 memcpy(page, live_shinfo, PAGE_SIZE);
1608 SET_FIELD(((shared_info_either_t *)page),
1609 arch.pfn_to_mfn_frame_list_list, 0);
1610 if ( write_exact(io_fd, page, PAGE_SIZE) )
1612 PERROR("Error when writing to state file (1)");
1613 goto out;
1616 /* Success! */
1617 rc = 0;
1619 out:
1621 if ( live )
1623 if ( xc_shadow_control(xc_handle, dom,
1624 XEN_DOMCTL_SHADOW_OP_OFF,
1625 NULL, 0, NULL, 0, NULL) < 0 )
1626 DPRINTF("Warning - couldn't disable shadow mode");
1629 /* Flush last write and discard cache for file. */
1630 discard_file_cache(io_fd, 1 /* flush */);
1632 if ( live_shinfo )
1633 munmap(live_shinfo, PAGE_SIZE);
1635 if ( live_p2m )
1636 munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
1638 if ( live_m2p )
1639 munmap(live_m2p, M2P_SIZE(max_mfn));
1641 free(pfn_type);
1642 free(pfn_batch);
1643 free(to_send);
1644 free(to_fix);
1645 free(to_skip);
1647 DPRINTF("Save exit rc=%d\n",rc);
1649 return !!rc;
1652 /*
1653 * Local variables:
1654 * mode: C
1655 * c-set-style: "BSD"
1656 * c-basic-offset: 4
1657 * tab-width: 4
1658 * indent-tabs-mode: nil
1659 * End:
1660 */