
tools/libxc/xc_domain_save.c @ 16408:f669bf5c6720

libxc: Consolidate read()/write() syscall wrappers to read/write an
exact number of bytes. The consolidated versions are more watertight
than the various versions previously distributed around the library
source code.
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Sun Nov 11 18:22:33 2007 +0000 (2007-11-11)
parents 168beb9a27a5
children dfca1120813f
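
For context: such a wrapper has to loop, because read() and write() may transfer fewer bytes than requested and may be interrupted by a signal. A minimal sketch of the write side follows (illustrative only, with a hypothetical name, not necessarily the exact code committed; the call sites in this file rely on the convention of returning 0 on success and non-zero on failure, and a read_exact counterpart is analogous with EOF also treated as an error):

    #include <errno.h>
    #include <unistd.h>

    /* Write exactly 'size' bytes, or fail: short writes are continued,
     * EINTR is retried, and any other error aborts with -1. */
    static int write_exact_sketch(int fd, const void *data, size_t size)
    {
        size_t offset = 0;
        ssize_t len;

        while ( offset < size )
        {
            len = write(fd, (const char *)data + offset, size - offset);
            if ( (len == -1) && (errno == EINTR) )
                continue;
            if ( len <= 0 )
                return -1; /* no progress: treat as an error */
            offset += len;
        }

        return 0;
    }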
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xc_dom.h"
#include "xg_private.h"
#include "xg_save_restore.h"

#include <xen/hvm/params.h>
#include "xc_e820.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_domain_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
static unsigned long *qemu_bitmaps[2];
static int qemu_active;
static int qemu_non_active;

/* number of pfns this guest has (i.e. number of entries in the P2M) */
static unsigned long p2m_size;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;
static unsigned long m2p_mfn0;

/* Address size of the guest */
unsigned int guest_width;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn)  (live_m2p[(_mfn)])

#define pfn_to_mfn(_pfn)                                            \
    ((xen_pfn_t) ((guest_width==8)                                  \
                  ? (((uint64_t *)live_p2m)[(_pfn)])                \
                  : (((uint32_t *)live_p2m)[(_pfn)])))
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
      (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))

/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
/* Returns the hamming weight (i.e. the number of bits set) in an N-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    volatile unsigned long *p = (volatile unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}
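
/* NB: hweight32() only sees the low 32 bits of its argument, so on a
 * platform with 64-bit unsigned long the count above misses set bits in
 * the top half of each word. */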
static inline int permute( int i, int nr, int order_nr  )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
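
/* The rotation above moves bit 0 of i up to bit 10, so consecutive values
 * of N land roughly 1024 pfns apart rather than walking the address space
 * linearly. */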
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return (((new->tv_sec - old->tv_sec)*1000000) +
            (new->tv_usec - old->tv_usec));
}
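
/* Write the buffer via write_exact(), and after every MAX_PAGECACHE_USAGE
 * pages' worth of output drop the kernel's page cache for the file, so a
 * multi-gigabyte save image does not evict more useful data from memory. */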
static int noncached_write(int fd, int live, void *buffer, int len)
{
    static int write_count = 0;
    int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;

    write_count += len;
    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
    {
        /* Time to discard cache - don't care if this fails */
        discard_file_cache(fd, 0 /* no flush */);
        write_count = 0;
    }

    return rc;
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET     (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}

static int ratewrite(int io_fd, int live, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if ( START_MBIT_RATE == 0 )
        return noncached_write(io_fd, live, buf, n);

    budget -= n;
    if ( budget < 0 )
    {
        if ( mbit_rate != ombit_rate )
        {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if ( last_put.tv_sec == 0 )
        {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        }
        else
        {
            while ( budget < 0 )
            {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while ( delta > burst_time_us )
                {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if ( last_put.tv_usec > 1000000 )
                    {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if ( budget > 0 )
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while ( delay.tv_nsec > 0 )
                    if ( nanosleep(&delay, &delay) == 0 )
                        break;
            }
        }
    }
    return noncached_write(io_fd, live, buf, n);
}

#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
#define initialize_mbit_rate()

#endif
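
/* Print one line of migration statistics: wall-clock delta, dom0 and target
 * CPU usage, and send/dirty bandwidth over the last interval. Under
 * ADAPTIVE_SAVE, also nudges mbit_rate up towards the observed dirty rate. */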
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
    if ( wall_delta == 0 )
        wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if ( print )
        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
    {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if ( mbit_rate > MAX_MBIT_RATE )
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
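
/* Debugging aid: repeatedly clean the log-dirty bitmap and then sample the
 * fault/dirty counters so the domain's dirtying rate can be observed. The
 * single caller below passes runs == 0, which skips the loop entirely. */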
static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for ( j = 0; j < runs; j++ )
    {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, p2m_size, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ )
        {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);
            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
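
/* Invoke the caller-supplied suspend callback and poll the domain state
 * until it reports SHUTDOWN_suspend; unpause and retry if the domain was
 * merely paused, and give up after on the order of a hundred retries. */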
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info)
{
    int i = 0;

    if ( !(*suspend)(dom) )
    {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
    {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( info->dying )
    {
        ERROR("domain is dying");
        return -1;
    }

    if ( info->crashed )
    {
        ERROR("domain has crashed");
        return -1;
    }

    if ( info->shutdown )
    {
        switch ( info->shutdown_reason )
        {
        case SHUTDOWN_poweroff:
        case SHUTDOWN_reboot:
            ERROR("domain has shut down");
            return -1;
        case SHUTDOWN_suspend:
            return 0;
        case SHUTDOWN_crash:
            ERROR("domain has crashed");
            return -1;
        }
    }

    if ( info->paused )
    {
        /* Try unpausing domain, wait, and retest. */
        xc_domain_unpause( xc_handle, dom );
        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); /* 10ms */
        goto retry;
    }

    if ( ++i < 100 )
    {
        ERROR("Retry suspend domain");
        usleep(10000); /* 10ms */
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
/*
** Map the top-level page of MFNs from the guest. The guest might not have
** finished resuming from a previous restore operation, so we wait a while for
** it to update the MFN to a reasonable value.
*/
static void *map_frame_list_list(int xc_handle, uint32_t dom,
                                 shared_info_either_t *shinfo)
{
    int count = 100;
    void *p;
    uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);

    while ( count-- && (fll == 0) )
    {
        usleep(10000);
        fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
    }

    if ( fll == 0 )
    {
        ERROR("Timed out waiting for frame list updated.");
        return NULL;
    }

    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
    if ( p == NULL )
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);

    return p;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
    {
        int hstart;
        uint64_t he;

        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
        he = ((const uint64_t *) spage)[hstart];

        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
        {
            /* hvirt starts with xen stuff... */
            xen_start = hstart;
        }
        else if ( hvirt_start != 0xf5800000 )
        {
            /* old L2s from before hole was shrunk... */
            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
            he = ((const uint64_t *) spage)[hstart];
            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
                xen_start = hstart;
        }
    }

    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
    {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for ( i = 0; i < pte_last; i++ )
    {
        unsigned long pfn, mfn;

        if ( pt_levels == 2 )
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        if ( (i >= xen_start) && (i < xen_end) )
            pte = 0;

        if ( pte & _PAGE_PRESENT )
        {
            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
            {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */
            }
            else
                pfn = mfn_to_pfn(mfn);

            pte &= ~MADDR_MASK_X86;
            pte |= (uint64_t)pfn << PAGE_SHIFT;

            /*
             * PAE guest L3Es can contain these flags when running on
             * a 64bit hypervisor. We zap these here to avoid any
             * surprise at restore time...
             */
            if ( (pt_levels == 3) &&
                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
        }

        if ( pt_levels == 2 )
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;
    }

    return race;
}
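
/* Map the machine-to-phys (M2P) table from Xen into the local address
 * space, chunk by chunk, and record the MFN backing the first chunk in
 * m2p_mfn0 (used by the PAE Xen-mapping heuristic above). */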
static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
    {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
         (xmml.nr_extents != m2p_chunks) )
    {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ( (m2p = mmap(NULL, m2p_size, prot,
                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
    {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
    {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for ( i = 0; i < m2p_chunks; i++ )
    {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                     entries, m2p_chunks)) < 0 )
    {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    m2p_mfn0 = entries[0].mfn;

    free(extent_start);
    free(entries);

    return m2p;
}
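
/*
** Map the guest's P2M table, canonicalize the frame lists that describe it
** against the host's word size, and write the extended-info chunk plus the
** P2M frame list to the state file. Returns the mapped P2M on success,
** NULL on failure.
*/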
static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
                                         int io_fd,
                                         uint32_t dom,
                                         unsigned long p2m_size,
                                         shared_info_either_t *live_shinfo)
{
    vcpu_guest_context_either_t ctxt;

    /* Double and single indirect references to the live P2M table */
    void *live_p2m_frame_list_list = NULL;
    void *live_p2m_frame_list = NULL;

    /* Copies of the above. */
    xen_pfn_t *p2m_frame_list_list = NULL;
    xen_pfn_t *p2m_frame_list = NULL;

    /* The mapping of the live p2m table itself */
    xen_pfn_t *p2m = NULL;

    int i, success = 0;

    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
                                                   live_shinfo);
    if ( !live_p2m_frame_list_list )
        goto out;

    /* Get a local copy of the live_P2M_frame_list_list */
    if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
    {
        ERROR("Couldn't allocate p2m_frame_list_list array");
        goto out;
    }
    memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);

    /* Canonicalize guest's unsigned long vs ours */
    if ( guest_width > sizeof(unsigned long) )
        for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
            if ( i < PAGE_SIZE/guest_width )
                p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
            else
                p2m_frame_list_list[i] = 0;
    else if ( guest_width < sizeof(unsigned long) )
        for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
            p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             p2m_frame_list_list,
                             P2M_FLL_ENTRIES);
    if ( !live_p2m_frame_list )
    {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Get a local copy of the live_P2M_frame_list */
    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
    {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalize guest's unsigned long vs ours */
    if ( guest_width > sizeof(unsigned long) )
        for ( i = 0; i < P2M_FL_ENTRIES; i++ )
            p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
    else if ( guest_width < sizeof(unsigned long) )
        for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
            p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */
    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                               p2m_frame_list,
                               P2M_FL_ENTRIES);
    if ( !p2m )
    {
        ERROR("Couldn't map p2m table");
        goto out;
    }
    live_p2m = p2m; /* So that translation macros will work */

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < p2m_size; i += FPP )
    {
        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
        {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
                  i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn);
            if ( p2m_frame_list[i/FPP] < max_mfn )
            {
                ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
                      (uint64_t)p2m_frame_list[i/FPP],
                      (uint64_t)live_m2p[p2m_frame_list[i/FPP]]);
                ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
                      (uint64_t)live_m2p[p2m_frame_list[i/FPP]],
                      (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]);
            }
            goto out;
        }
        p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
    {
        ERROR("Could not get vcpu context");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    {
        unsigned long signature = ~0UL;
        uint32_t chunk1_sz = ((guest_width==8)
                              ? sizeof(ctxt.x64)
                              : sizeof(ctxt.x32));
        uint32_t chunk2_sz = 0;
        uint32_t tot_sz    = (chunk1_sz + 8) + (chunk2_sz + 8);
        if ( write_exact(io_fd, &signature, sizeof(signature)) ||
             write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
             write_exact(io_fd, "vcpu", 4) ||
             write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
             write_exact(io_fd, &ctxt, chunk1_sz) ||
             write_exact(io_fd, "extv", 4) ||
             write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
        {
            ERROR("write: extended info");
            goto out;
        }
    }

    if ( write_exact(io_fd, p2m_frame_list,
                     P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
    {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    success = 1;

 out:

    if ( !success && p2m )
        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));

    if ( live_p2m_frame_list_list )
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if ( live_p2m_frame_list )
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if ( p2m_frame_list_list )
        free(p2m_frame_list_list);

    if ( p2m_frame_list )
        free(p2m_frame_list);

    return success ? p2m : NULL;
}
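
/*
** Top-level save routine. For a live save, this repeatedly walks guest
** memory in pseudo-randomly permuted batches, using Xen's log-dirty mode
** to detect and resend pages dirtied since the previous pass; once the
** dirty set stops shrinking (or the iteration/send budget is exhausted)
** the domain is suspended and the remaining dirty pages, vcpu contexts
** and shared-info page are written out.
*/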
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
                   int hvm, void *(*init_qemu_maps)(int, unsigned),
                   void (*qemu_flip_buffer)(int, int))
{
    xc_dominfo_t info;
    DECLARE_DOMCTL;

    int rc = 1, frc, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_either_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A copy of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Live mapping of shared info structure */
    shared_info_either_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of p2m_size */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;

    uint64_t vcpumap = 1ULL;

    /* HVM: a buffer for holding HVM context */
    uint32_t hvm_buf_size = 0;
    uint8_t *hvm_buf = NULL;

    /* HVM: magic frames for ioreqs and xenstore comms. */
    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */

    unsigned long mfn;
    /* If no explicit control parameters given, use defaults */
    max_iters  = max_iters  ? : DEF_MAX_ITERS;
    max_factor = max_factor ? : DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if ( !get_platform_info(xc_handle, dom,
                            &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
    {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
    {
        ERROR("Could not get domain info");
        return 1;
    }

    shared_info_frame = info.shared_info_frame;

    /* Map the shared info frame */
    if ( !hvm )
    {
        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                           PROT_READ, shared_info_frame);
        if ( !live_shinfo )
        {
            ERROR("Couldn't map live_shinfo");
            goto out;
        }
    }

    /* Get the size of the P2M table */
    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;

    /* Domain is still running at this point */
    if ( live )
    {
        /* Live suspend. Enable log-dirty mode. */
        if ( xc_shadow_control(xc_handle, dom,
                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                               NULL, 0, NULL, 0, NULL) < 0 )
        {
            /* log-dirty already enabled? There's no test op,
               so attempt to disable then reenable it */
            frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
                                    NULL, 0, NULL, 0, NULL);
            if ( frc >= 0 )
            {
                frc = xc_shadow_control(xc_handle, dom,
                                        XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                                        NULL, 0, NULL, 0, NULL);
            }

            if ( frc < 0 )
            {
                ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
                goto out;
            }
        }

        if ( hvm )
        {
            /* Get qemu-dm logging dirty pages too */
            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
            qemu_bitmaps[0] = seg;
            qemu_bitmaps[1] = seg + BITMAP_SIZE;
            qemu_active = 0;
            qemu_non_active = 1;
        }
    }
    else
    {
        /* This is a non-live suspend. Suspend the domain. */
        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
        {
            ERROR("Domain appears not to have suspended");
            goto out;
        }
    }
    last_iter = !live;

    /* pretend we sent all the pages last iteration */
    sent_last_iter = p2m_size;

    /* calculate the power of 2 order of p2m_size, e.g.
       15->4 16->4 17->5 */
    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
        continue;

    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if ( !to_send || !to_fix || !to_skip )
    {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if ( lock_pages(to_send, BITMAP_SIZE) )
    {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if ( lock_pages(to_skip, BITMAP_SIZE) )
    {
        ERROR("Unable to lock to_skip");
        return 1;
    }
    if ( hvm )
    {
        /* Need another buffer for HVM context */
        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
        if ( hvm_buf_size == -1 )
        {
            ERROR("Couldn't get HVM context size from Xen");
            goto out;
        }
        hvm_buf = malloc(hvm_buf_size);
        if ( !hvm_buf )
        {
            ERROR("Couldn't allocate memory");
            goto out;
        }
    }
    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
    {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
    {
        ERROR("Unable to lock");
        goto out;
    }

    /* Setup the mfn_to_pfn table mapping */
    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
    {
        ERROR("Failed to map live M2P table");
        goto out;
    }

    /* Start writing out the saved-domain record. */
    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
    {
        ERROR("write: p2m_size");
        goto out;
    }
    if ( !hvm )
    {
        int err = 0;

        /* Map the P2M table, and write the list of P2M frames */
        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
                                          p2m_size, live_shinfo);
        if ( live_p2m == NULL )
        {
            ERROR("Failed to map/save the p2m frame list");
            goto out;
        }

        /*
         * Quick belt and braces sanity check.
         */

        for ( i = 0; i < p2m_size; i++ )
        {
            mfn = pfn_to_mfn(i);
            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
            {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }

        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }

    print_stats(xc_handle, dom, 0, &stats, 0);
    /* Now write out each data page, canonicalising page tables as we go... */
    for ( ; ; )
    {
        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N = 0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while ( N < p2m_size )
        {
            unsigned int this_pc = (N * 100) / p2m_size;

            if ( (this_pc - prev_pc) >= 5 )
            {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }
            if ( !last_iter )
            {
                /* Slightly wasteful to peek the whole array every time,
                   but this is fast enough for the moment. */
                frc = xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
                    p2m_size, NULL, 0, NULL);
                if ( frc != p2m_size )
                {
                    ERROR("Error peeking shadow bitmap");
                    goto out;
                }
            }
            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for ( batch = 0;
                  (batch < MAX_BATCH_SIZE) && (N < p2m_size);
                  N++ )
            {
                int n = permute(N, p2m_size, order_nr);

                if ( debug )
                {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
                            iter, (unsigned long)n,
                            hvm ? 0 : pfn_to_mfn(n),
                            test_bit(n, to_send));
                    if ( !hvm && is_mapped(pfn_to_mfn(n)) )
                        DPRINTF("  [mfn]= %08lx",
                                mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
                    DPRINTF("\n");
                }

                if ( !last_iter &&
                     test_bit(n, to_send) &&
                     test_bit(n, to_skip) )
                    skip_this_iter++; /* stats keeping */

                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                       (test_bit(n, to_send) && last_iter) ||
                       (test_bit(n, to_fix)  && last_iter)) )
                    continue;

                /* Skip PFNs that aren't really there */
                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;

                /* Hypercall interfaces operate in PFNs for HVM guests
                 * and MFNs for PV guests */
                if ( hvm )
                    pfn_type[batch] = n;
                else
                    pfn_type[batch] = pfn_to_mfn(n);
                if ( !is_mapped(pfn_type[batch]) )
                {
                    /*
                    ** not currently in pseudo-physical map -- set bit
                    ** in to_fix since we must send this page in last_iter
                    ** unless it's sent sooner anyhow, or it never enters
                    ** pseudo-physical map (e.g. for ballooned down doms)
                    */
                    set_bit(n, to_fix);
                    continue;
                }
                if ( last_iter &&
                     test_bit(n, to_fix) &&
                     !test_bit(n, to_send) )
                {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }

            if ( batch == 0 )
                goto skip; /* vanishingly unlikely... */

            region_base = xc_map_foreign_batch(
                xc_handle, dom, PROT_READ, pfn_type, batch);
            if ( region_base == NULL )
            {
                ERROR("map batch failed");
                goto out;
            }

            if ( !hvm )
            {
                /* Get page types */
                for ( j = 0; j < batch; j++ )
                    ((uint32_t *)pfn_type)[j] = pfn_type[j];
                if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
                                           (uint32_t *)pfn_type) )
                {
                    ERROR("get_pfn_type_batch failed");
                    goto out;
                }
                for ( j = batch-1; j >= 0; j-- )
                    pfn_type[j] = ((uint32_t *)pfn_type)[j];

                for ( j = 0; j < batch; j++ )
                {

                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                         XEN_DOMCTL_PFINFO_XTAB )
                    {
                        DPRINTF("type fail: page %i mfn %08lx\n",
                                j, pfn_type[j]);
                        continue;
                    }

                    if ( debug )
                        DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                                " sum= %08lx\n",
                                iter,
                                (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                                pfn_batch[j],
                                pfn_type[j],
                                mfn_to_pfn(pfn_type[j] &
                                           ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                                csum_page(region_base + (PAGE_SIZE*j)));

                    /* canonicalise mfn->pfn */
                    pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                                  pfn_batch[j];
                }
            }
            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
            {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
            {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (Not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if ( race && !live )
                    {
                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
                              pagetype);
                        goto out;
                    }

                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
                    {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
                else
                {
                    /* We have a normal page: just write it directly. */
                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
                         PAGE_SIZE )
                    {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
  skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if ( last_iter )
        {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/p2m_size );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if ( last_iter && debug )
        {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if ( write_exact(io_fd, &minusone, sizeof(int)) )
            {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if ( last_iter )
            break;

        if ( live )
        {
            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                 (iter >= max_iters) ||
                 (sent_this_iter+skip_this_iter < 50) ||
                 (total_sent > p2m_size*max_factor) )
            {
                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
                {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
            }

            if ( xc_shadow_control(xc_handle, dom,
                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                   p2m_size, NULL, 0, &stats) != p2m_size )
            {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            if ( hvm )
            {
                /* Pull in the dirty bits from qemu-dm too */
                if ( !last_iter )
                {
                    qemu_active = qemu_non_active;
                    qemu_non_active = qemu_active ? 0 : 1;
                    qemu_flip_buffer(dom, qemu_active);
                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
                    {
                        to_send[j] |= qemu_bitmaps[qemu_non_active][j];
                        qemu_bitmaps[qemu_non_active][j] = 0;
                    }
                }
                else
                {
                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
                        to_send[j] |= qemu_bitmaps[qemu_active][j];
                }
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }
    } /* end of infinite for loop */
    DPRINTF("All memory is saved\n");
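
    /* Send a '-2' chunk carrying max_vcpu_id and the online-VCPU bitmap;
       the receiver tells it apart from a page batch by the negative count. */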
    {
        struct {
            int minustwo;
            int max_vcpu_id;
            uint64_t vcpumap;
        } chunk = { -2, info.max_vcpu_id };

        if ( info.max_vcpu_id >= 64 )
        {
            ERROR("Too many VCPUS in guest!");
            goto out;
        }

        for ( i = 1; i <= info.max_vcpu_id; i++ )
        {
            xc_vcpuinfo_t vinfo;
            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
                 vinfo.online )
                vcpumap |= 1ULL << i;
        }

        chunk.vcpumap = vcpumap;
        if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
        {
            ERROR("Error when writing to state file (errno %d)", errno);
            goto out;
        }
    }

    /* Zero terminate */
    i = 0;
    if ( write_exact(io_fd, &i, sizeof(int)) )
    {
        ERROR("Error when writing to state file (6') (errno %d)", errno);
        goto out;
    }
    if ( hvm )
    {
        uint32_t rec_size;

        /* Save magic-page locations. */
        memset(magic_pfns, 0, sizeof(magic_pfns));
        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
                         (unsigned long *)&magic_pfns[0]);
        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
                         (unsigned long *)&magic_pfns[1]);
        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
                         (unsigned long *)&magic_pfns[2]);
        if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
        {
            ERROR("Error when writing to state file (7)");
            goto out;
        }

        /* Get HVM context from Xen and save it too */
        if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
                                                  hvm_buf_size)) == -1 )
        {
            ERROR("HVM:Could not get hvm buffer");
            goto out;
        }

        if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
        {
            ERROR("error write hvm buffer size");
            goto out;
        }

        if ( write_exact(io_fd, hvm_buf, rec_size) )
        {
            ERROR("write HVM info failed!\n");
            goto out;
        }

        /* HVM guests are done now */
        rc = 0;
        goto out;
    }
    /* PV guests only from now on */

    /* Send through a list of all the PFNs that were not in map at the close */
    {
        unsigned int i,j;
        unsigned long pfntab[1024];

        for ( i = 0, j = 0; i < p2m_size; i++ )
        {
            if ( !is_mapped(pfn_to_mfn(i)) )
                j++;
        }

        if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
        {
            ERROR("Error when writing to state file (6a) (errno %d)", errno);
            goto out;
        }

        for ( i = 0, j = 0; i < p2m_size; )
        {
            if ( !is_mapped(pfn_to_mfn(i)) )
                pfntab[j++] = i;

            i++;
            if ( (j == 1024) || (i == p2m_size) )
            {
                if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
                {
                    ERROR("Error when writing to state file (6b) (errno %d)",
                          errno);
                    goto out;
                }
                j = 0;
            }
        }
    }
    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
    {
        ERROR("Could not get vcpu context");
        goto out;
    }

    /* Canonicalise the suspend-record frame number. */
    mfn = GET_FIELD(&ctxt, user_regs.edx);
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
    {
        ERROR("Suspend record is not in range of pseudophys map");
        goto out;
    }
    SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));

    for ( i = 0; i <= info.max_vcpu_id; i++ )
    {
        if ( !(vcpumap & (1ULL << i)) )
            continue;

        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
        {
            ERROR("No context for VCPU%d", i);
            goto out;
        }

        /* Canonicalise each GDT frame number. */
        for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
        {
            mfn = GET_FIELD(&ctxt, gdt_frames[j]);
            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
            {
                ERROR("GDT frame is not in range of pseudophys map");
                goto out;
            }
            SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
        }

        /* Canonicalise the page table base pointer. */
        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(
                                           GET_FIELD(&ctxt, ctrlreg[3]))) )
        {
            ERROR("PT base is not in range of pseudophys map");
            goto out;
        }
        SET_FIELD(&ctxt, ctrlreg[3],
                  xen_pfn_to_cr3(
                      mfn_to_pfn(
                          xen_cr3_to_pfn(
                              GET_FIELD(&ctxt, ctrlreg[3])))));

        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
        if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] )
        {
            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(
                     xen_cr3_to_pfn(ctxt.x64.ctrlreg[1])) )
            {
                ERROR("PT base is not in range of pseudophys map");
                goto out;
            }
            /* Least-significant bit means 'valid PFN'. */
            ctxt.x64.ctrlreg[1] = 1 |
                xen_pfn_to_cr3(
                    mfn_to_pfn(xen_cr3_to_pfn(ctxt.x64.ctrlreg[1])));
        }

        if ( write_exact(io_fd, &ctxt, ((guest_width==8)
                                        ? sizeof(ctxt.x64)
                                        : sizeof(ctxt.x32))) )
        {
            ERROR("Error when writing to state file (1) (errno %d)", errno);
            goto out;
        }

        domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
        domctl.domain = dom;
        domctl.u.ext_vcpucontext.vcpu = i;
        if ( xc_domctl(xc_handle, &domctl) < 0 )
        {
            ERROR("No extended context for VCPU%d", i);
            goto out;
        }
        if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
        {
            ERROR("Error when writing to state file (2) (errno %d)", errno);
            goto out;
        }
    }

    /*
     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
     */
    memcpy(page, live_shinfo, PAGE_SIZE);
    SET_FIELD(((shared_info_either_t *)page),
              arch.pfn_to_mfn_frame_list_list, 0);
    if ( write_exact(io_fd, page, PAGE_SIZE) )
    {
        ERROR("Error when writing to state file (1) (errno %d)", errno);
        goto out;
    }
    /* Success! */
    rc = 0;

 out:

    if ( live )
    {
        if ( xc_shadow_control(xc_handle, dom,
                               XEN_DOMCTL_SHADOW_OP_OFF,
                               NULL, 0, NULL, 0, NULL) < 0 )
            DPRINTF("Warning - couldn't disable shadow mode");
    }

    /* Flush last write and discard cache for file. */
    discard_file_cache(io_fd, 1 /* flush */);

    if ( live_shinfo )
        munmap(live_shinfo, PAGE_SIZE);

    if ( live_p2m )
        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));

    if ( live_m2p )
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */