
view tools/libxc/xc_domain_save.c @ 20868:b684d9e57b8f

xc_domain_save: allocate pfn_err before use

Due to recent changes related to xc_map_foreign_bulk, xc_domain_save
segfaults because it tries to use pfn_err without allocating it first.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Jan 19 09:40:30 2010 +0000 (2010-01-19)
parents 0447c5532e9f
children fbe8f32fa257
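
The crux of the fix shows up at line 1276 of the listing below: pfn_err starts out as a NULL pointer (line 881) and is now (re)allocated for the current batch immediately before being handed to xc_map_foreign_bulk(), which stores one status entry per pfn; previously the call wrote through the NULL pointer and crashed. A minimal sketch of the pattern, using the identifiers from the file (the committed code does not check the realloc result, so the NULL check here is purely illustrative):

    /* Size the per-page status array for this batch before mapping;
       xc_map_foreign_bulk() fills in one error code per pfn. */
    pfn_err = realloc(pfn_err, sizeof(int) * batch);
    if ( pfn_err == NULL )
        goto out;                      /* illustrative: bail out on OOM */

    region_base = xc_map_foreign_bulk(
        xc_handle, dom, PROT_READ, pfn_type, pfn_err, batch);
    if ( region_base == NULL )
    {
        ERROR("map batch failed");
        goto out;
    }
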
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. Can be overridden by passing
25 ** non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider if we want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
33 struct save_ctx {
34 unsigned long hvirt_start; /* virtual starting address of the hypervisor */
35 unsigned int pt_levels; /* #levels of page tables used by the current guest */
36 unsigned long max_mfn; /* max mfn of the whole machine */
37 xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
38 xen_pfn_t *live_m2p; /* Live mapping of system MFN to PFN table. */
39 unsigned long m2p_mfn0;
40 struct domain_info_context dinfo;
41 };
43 /* buffer for output */
44 struct outbuf {
45 void* buf;
46 size_t size;
47 size_t pos;
48 };
50 #define OUTBUF_SIZE (16384 * 1024)
52 /* grep fodder: machine_to_phys */
54 #define mfn_to_pfn(_mfn) (ctx->live_m2p[(_mfn)])
56 #define pfn_to_mfn(_pfn) \
57 ((xen_pfn_t) ((dinfo->guest_width==8) \
58 ? (((uint64_t *)ctx->live_p2m)[(_pfn)]) \
59 : ((((uint32_t *)ctx->live_p2m)[(_pfn)]) == 0xffffffffU \
60 ? (-1UL) : (((uint32_t *)ctx->live_p2m)[(_pfn)]))))
62 /*
63 * Returns TRUE if the given machine frame number has a unique mapping
64 * in the guest's pseudophysical map.
65 */
66 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
67 (((_mfn) < (ctx->max_mfn)) && \
68 ((mfn_to_pfn(_mfn) < (dinfo->p2m_size)) && \
69 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
71 /*
72 ** During (live) save/migrate, we maintain a number of bitmaps to track
73 ** which pages we have to send, to fixup, and to skip.
74 */
76 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
77 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
78 #define BITMAP_SIZE (BITS_TO_LONGS(dinfo->p2m_size) * sizeof(unsigned long))
80 #define BITMAP_ENTRY(_nr,_bmap) \
81 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
83 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
85 #define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
87 static inline int test_bit (int nr, volatile void * addr)
88 {
89 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
90 }
92 static inline void clear_bit (int nr, volatile void * addr)
93 {
94 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
95 }
97 static inline void set_bit ( int nr, volatile void * addr)
98 {
99 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
100 }
102 /* Returns the Hamming weight (i.e. the number of bits set) in an N-bit word */
103 static inline unsigned int hweight32(unsigned int w)
104 {
105 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
106 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
107 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
108 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
109 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
110 }
112 static inline int count_bits ( int nr, volatile void *addr)
113 {
114 int i, count = 0;
115 volatile unsigned long *p = (volatile unsigned long *)addr;
116 /* We know that the array is padded to unsigned long. */
117 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
118 count += hweight32(*p);
119 return count;
120 }
122 static uint64_t tv_to_us(struct timeval *new)
123 {
124 return (new->tv_sec * 1000000) + new->tv_usec;
125 }
127 static uint64_t llgettimeofday(void)
128 {
129 struct timeval now;
130 gettimeofday(&now, NULL);
131 return tv_to_us(&now);
132 }
134 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
135 {
136 return (((new->tv_sec - old->tv_sec)*1000000) +
137 (new->tv_usec - old->tv_usec));
138 }
140 static int noncached_write(int fd, int live, void *buffer, int len)
141 {
142 static int write_count = 0;
143 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
145 write_count += len;
146 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
147 {
148 /* Time to discard cache - don't care if this fails */
149 discard_file_cache(fd, 0 /* no flush */);
150 write_count = 0;
151 }
153 return rc;
154 }
156 static int outbuf_init(struct outbuf* ob, size_t size)
157 {
158 memset(ob, 0, sizeof(*ob));
160 if ( !(ob->buf = malloc(size)) ) {
161 DPRINTF("error allocating output buffer of size %zu\n", size);
162 return -1;
163 }
165 ob->size = size;
167 return 0;
168 }
170 static inline int outbuf_write(struct outbuf* ob, void* buf, size_t len)
171 {
172 if ( len > ob->size - ob->pos ) {
173 DPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
174 return -1;
175 }
177 memcpy(ob->buf + ob->pos, buf, len);
178 ob->pos += len;
180 return 0;
181 }
183 /* prep for nonblocking I/O */
184 static int outbuf_flush(struct outbuf* ob, int fd)
185 {
186 int rc;
187 int cur = 0;
189 if ( !ob->pos )
190 return 0;
192 rc = write(fd, ob->buf, ob->pos);
193 while (rc < 0 || cur + rc < ob->pos) {
194 if (rc < 0 && errno != EAGAIN && errno != EINTR) {
195 DPRINTF("error flushing output: %d\n", errno);
196 return -1;
197 }
198 if (rc > 0)
199 cur += rc;
201 rc = write(fd, ob->buf + cur, ob->pos - cur);
202 }
204 ob->pos = 0;
206 return 0;
207 }
209 /* if there's no room in the buffer, flush it and try again. */
210 static inline int outbuf_hardwrite(struct outbuf* ob, int fd, void* buf,
211 size_t len)
212 {
213 if ( !len )
214 return 0;
216 if ( !outbuf_write(ob, buf, len) )
217 return 0;
219 if ( outbuf_flush(ob, fd) < 0 )
220 return -1;
222 return outbuf_write(ob, buf, len);
223 }
225 /* start buffering output once we've reached checkpoint mode. */
226 static inline int write_buffer(int dobuf, struct outbuf* ob, int fd, void* buf,
227 size_t len)
228 {
229 if ( dobuf )
230 return outbuf_hardwrite(ob, fd, buf, len);
231 else
232 return write_exact(fd, buf, len);
233 }
235 #ifdef ADAPTIVE_SAVE
237 /*
238 ** We control the rate at which we transmit (or save) to minimize impact
239 ** on running domains (including the target if we're doing live migrate).
240 */
242 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
243 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
245 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
246 #define RATE_TO_BTU 781250
248 /* Amount in bytes we allow ourselves to send in a burst */
249 #define BURST_BUDGET (100*1024)
251 /* We keep track of the current and previous transmission rate */
252 static int mbit_rate, ombit_rate = 0;
254 /* Have we reached the maximum transmission rate? */
255 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
257 static inline void initialize_mbit_rate()
258 {
259 mbit_rate = START_MBIT_RATE;
260 }
262 static int ratewrite(int io_fd, int live, void *buf, int n)
263 {
264 static int budget = 0;
265 static int burst_time_us = -1;
266 static struct timeval last_put = { 0 };
267 struct timeval now;
268 struct timespec delay;
269 long long delta;
271 if ( START_MBIT_RATE == 0 )
272 return noncached_write(io_fd, live, buf, n);
274 budget -= n;
275 if ( budget < 0 )
276 {
277 if ( mbit_rate != ombit_rate )
278 {
279 burst_time_us = RATE_TO_BTU / mbit_rate;
280 ombit_rate = mbit_rate;
281 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
282 mbit_rate, BURST_BUDGET, burst_time_us);
283 }
284 if ( last_put.tv_sec == 0 )
285 {
286 budget += BURST_BUDGET;
287 gettimeofday(&last_put, NULL);
288 }
289 else
290 {
291 while ( budget < 0 )
292 {
293 gettimeofday(&now, NULL);
294 delta = tv_delta(&now, &last_put);
295 while ( delta > burst_time_us )
296 {
297 budget += BURST_BUDGET;
298 last_put.tv_usec += burst_time_us;
299 if ( last_put.tv_usec > 1000000 )
300 {
301 last_put.tv_usec -= 1000000;
302 last_put.tv_sec++;
303 }
304 delta -= burst_time_us;
305 }
306 if ( budget > 0 )
307 break;
308 delay.tv_sec = 0;
309 delay.tv_nsec = 1000 * (burst_time_us - delta);
310 while ( delay.tv_nsec > 0 )
311 if ( nanosleep(&delay, &delay) == 0 )
312 break;
313 }
314 }
315 }
316 return noncached_write(io_fd, live, buf, n);
317 }
319 #else /* ! ADAPTIVE SAVE */
321 #define RATE_IS_MAX() (0)
322 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
323 #define initialize_mbit_rate()
325 #endif
327 /* like write_buffer for ratewrite, which returns number of bytes written */
328 static inline int ratewrite_buffer(int dobuf, struct outbuf* ob, int fd,
329 int live, void* buf, size_t len)
330 {
331 if ( dobuf )
332 return outbuf_hardwrite(ob, fd, buf, len) ? -1 : len;
333 else
334 return ratewrite(fd, live, buf, len);
335 }
337 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
338 xc_shadow_op_stats_t *stats, int print)
339 {
340 static struct timeval wall_last;
341 static long long d0_cpu_last;
342 static long long d1_cpu_last;
344 struct timeval wall_now;
345 long long wall_delta;
346 long long d0_cpu_now, d0_cpu_delta;
347 long long d1_cpu_now, d1_cpu_delta;
349 gettimeofday(&wall_now, NULL);
351 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
352 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
354 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
355 DPRINTF("ARRHHH!!\n");
357 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
358 if ( wall_delta == 0 )
359 wall_delta = 1;
361 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
362 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
364 if ( print )
365 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
366 "dirtied %dMb/s %" PRId32 " pages\n",
367 wall_delta,
368 (int)((d0_cpu_delta*100)/wall_delta),
369 (int)((d1_cpu_delta*100)/wall_delta),
370 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
371 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
372 stats->dirty_count);
374 #ifdef ADAPTIVE_SAVE
375 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
376 {
377 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
378 + 50;
379 if ( mbit_rate > MAX_MBIT_RATE )
380 mbit_rate = MAX_MBIT_RATE;
381 }
382 #endif
384 d0_cpu_last = d0_cpu_now;
385 d1_cpu_last = d1_cpu_now;
386 wall_last = wall_now;
388 return 0;
389 }
392 static int analysis_phase(int xc_handle, uint32_t domid, struct save_ctx *ctx,
393 unsigned long *arr, int runs)
394 {
395 long long start, now;
396 xc_shadow_op_stats_t stats;
397 int j;
398 struct domain_info_context *dinfo = &ctx->dinfo;
400 start = llgettimeofday();
402 for ( j = 0; j < runs; j++ )
403 {
404 int i;
406 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
407 arr, dinfo->p2m_size, NULL, 0, NULL);
408 DPRINTF("#Flush\n");
409 for ( i = 0; i < 40; i++ )
410 {
411 usleep(50000);
412 now = llgettimeofday();
413 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
414 NULL, 0, NULL, 0, &stats);
415 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
416 ((now-start)+500)/1000,
417 stats.fault_count, stats.dirty_count);
418 }
419 }
421 return -1;
422 }
424 static int suspend_and_state(int (*suspend)(void*), void* data,
425 int xc_handle, int io_fd, int dom,
426 xc_dominfo_t *info)
427 {
428 if ( !(*suspend)(data) )
429 {
430 ERROR("Suspend request failed");
431 return -1;
432 }
434 if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) ||
435 !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
436 {
437 ERROR("Domain not in suspended state");
438 return -1;
439 }
441 return 0;
442 }
444 /*
445 ** Map the top-level page of MFNs from the guest. The guest might not have
446 ** finished resuming from a previous restore operation, so we wait a while for
447 ** it to update the MFN to a reasonable value.
448 */
449 static void *map_frame_list_list(int xc_handle, uint32_t dom,
450 struct save_ctx *ctx,
451 shared_info_any_t *shinfo)
452 {
453 int count = 100;
454 void *p;
455 struct domain_info_context *dinfo = &ctx->dinfo;
456 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
458 while ( count-- && (fll == 0) )
459 {
460 usleep(10000);
461 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
462 }
464 if ( fll == 0 )
465 {
466 ERROR("Timed out waiting for frame list updated.");
467 return NULL;
468 }
470 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
471 if ( p == NULL )
472 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
474 return p;
475 }
477 /*
478 ** During transfer (or in the state file), all page-table pages must be
479 ** converted into a 'canonical' form where references to actual mfns
480 ** are replaced with references to the corresponding pfns.
481 **
482 ** This function performs the appropriate conversion, taking into account
483 ** which entries do not require canonicalization (in particular, those
484 ** entries which map the virtual address reserved for the hypervisor).
485 */
486 static int canonicalize_pagetable(struct save_ctx *ctx,
487 unsigned long type, unsigned long pfn,
488 const void *spage, void *dpage)
489 {
490 struct domain_info_context *dinfo = &ctx->dinfo;
491 int i, pte_last, xen_start, xen_end, race = 0;
492 uint64_t pte;
494 /*
495 ** We need to determine which entries in this page table hold
496 ** reserved hypervisor mappings. This depends on the current
497 ** page table type as well as the number of paging levels.
498 */
499 xen_start = xen_end = pte_last = PAGE_SIZE / ((ctx->pt_levels == 2) ? 4 : 8);
501 if ( (ctx->pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
502 xen_start = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT);
504 if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
505 xen_start = L3_PAGETABLE_ENTRIES_PAE;
507 /*
508 ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
509 ** We can spot this by looking for the guest's mapping of the m2p.
510 ** Guests must ensure that this check will fail for other L2s.
511 */
512 if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
513 {
514 int hstart;
515 uint64_t he;
517 hstart = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
518 he = ((const uint64_t *) spage)[hstart];
520 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
521 {
522 /* hvirt starts with xen stuff... */
523 xen_start = hstart;
524 }
525 else if ( ctx->hvirt_start != 0xf5800000 )
526 {
527 /* old L2s from before hole was shrunk... */
528 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
529 he = ((const uint64_t *) spage)[hstart];
530 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
531 xen_start = hstart;
532 }
533 }
535 if ( (ctx->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
536 {
537 /*
538 ** XXX SMH: should compute these from hvirt_start (which we have)
539 ** and hvirt_end (which we don't)
540 */
541 xen_start = 256;
542 xen_end = 272;
543 }
545 /* Now iterate through the page table, canonicalizing each PTE */
546 for (i = 0; i < pte_last; i++ )
547 {
548 unsigned long pfn, mfn;
550 if ( ctx->pt_levels == 2 )
551 pte = ((const uint32_t*)spage)[i];
552 else
553 pte = ((const uint64_t*)spage)[i];
555 if ( (i >= xen_start) && (i < xen_end) )
556 pte = 0;
558 if ( pte & _PAGE_PRESENT )
559 {
560 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
561 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
562 {
563 /* This will happen if the type info is stale which
564 is quite feasible under live migration */
565 pfn = 0; /* zap it - we'll retransmit this page later */
566 /* XXX: We can't spot Xen mappings in compat-mode L2es
567 * from 64-bit tools, but the only thing in them is the
568 * compat m2p, so we quietly zap them. This doesn't
569 * count as a race, so don't report it. */
570 if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
571 && sizeof (unsigned long) > dinfo->guest_width) )
572 race = 1; /* inform the caller; fatal if !live */
573 }
574 else
575 pfn = mfn_to_pfn(mfn);
577 pte &= ~MADDR_MASK_X86;
578 pte |= (uint64_t)pfn << PAGE_SHIFT;
580 /*
581 * PAE guest L3Es can contain these flags when running on
582 * a 64bit hypervisor. We zap these here to avoid any
583 * surprise at restore time...
584 */
585 if ( (ctx->pt_levels == 3) &&
586 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
587 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
588 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
589 }
591 if ( ctx->pt_levels == 2 )
592 ((uint32_t*)dpage)[i] = pte;
593 else
594 ((uint64_t*)dpage)[i] = pte;
595 }
597 return race;
598 }
600 xen_pfn_t *xc_map_m2p(int xc_handle,
601 unsigned long max_mfn,
602 int prot,
603 unsigned long *mfn0)
604 {
605 struct xen_machphys_mfn_list xmml;
606 privcmd_mmap_entry_t *entries;
607 unsigned long m2p_chunks, m2p_size;
608 xen_pfn_t *m2p;
609 xen_pfn_t *extent_start;
610 int i;
612 m2p = NULL;
613 m2p_size = M2P_SIZE(max_mfn);
614 m2p_chunks = M2P_CHUNKS(max_mfn);
616 xmml.max_extents = m2p_chunks;
618 extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
619 if ( !extent_start )
620 {
621 ERROR("failed to allocate space for m2p mfns");
622 goto err0;
623 }
624 set_xen_guest_handle(xmml.extent_start, extent_start);
626 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
627 (xmml.nr_extents != m2p_chunks) )
628 {
629 ERROR("xc_get_m2p_mfns");
630 goto err1;
631 }
633 entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
634 if (entries == NULL)
635 {
636 ERROR("failed to allocate space for mmap entries");
637 goto err1;
638 }
640 for ( i = 0; i < m2p_chunks; i++ )
641 entries[i].mfn = extent_start[i];
643 m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
644 m2p_size, prot, M2P_CHUNK_SIZE,
645 entries, m2p_chunks);
646 if (m2p == NULL)
647 {
648 ERROR("xc_mmap_foreign_ranges failed");
649 goto err2;
650 }
652 if (mfn0)
653 *mfn0 = entries[0].mfn;
655 err2:
656 free(entries);
657 err1:
658 free(extent_start);
660 err0:
661 return m2p;
662 }
665 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
666 int io_fd,
667 uint32_t dom,
668 struct save_ctx *ctx,
669 shared_info_any_t *live_shinfo)
670 {
671 vcpu_guest_context_any_t ctxt;
672 struct domain_info_context *dinfo = &ctx->dinfo;
674 /* Double and single indirect references to the live P2M table */
675 void *live_p2m_frame_list_list = NULL;
676 void *live_p2m_frame_list = NULL;
678 /* Copies of the above. */
679 xen_pfn_t *p2m_frame_list_list = NULL;
680 xen_pfn_t *p2m_frame_list = NULL;
682 /* The mapping of the live p2m table itself */
683 xen_pfn_t *p2m = NULL;
685 int i, success = 0;
687 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom, ctx,
688 live_shinfo);
689 if ( !live_p2m_frame_list_list )
690 goto out;
692 /* Get a local copy of the live_P2M_frame_list_list */
693 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
694 {
695 ERROR("Couldn't allocate p2m_frame_list_list array");
696 goto out;
697 }
698 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
700 /* Canonicalize guest's unsigned long vs ours */
701 if ( dinfo->guest_width > sizeof(unsigned long) )
702 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
703 if ( i < PAGE_SIZE/dinfo->guest_width )
704 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
705 else
706 p2m_frame_list_list[i] = 0;
707 else if ( dinfo->guest_width < sizeof(unsigned long) )
708 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
709 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
711 live_p2m_frame_list =
712 xc_map_foreign_pages(xc_handle, dom, PROT_READ,
713 p2m_frame_list_list,
714 P2M_FLL_ENTRIES);
715 if ( !live_p2m_frame_list )
716 {
717 ERROR("Couldn't map p2m_frame_list");
718 goto out;
719 }
721 /* Get a local copy of the live_P2M_frame_list */
722 if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
723 {
724 ERROR("Couldn't allocate p2m_frame_list array");
725 goto out;
726 }
727 memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
728 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
730 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
731 live_p2m_frame_list = NULL;
733 /* Canonicalize guest's unsigned long vs ours */
734 if ( dinfo->guest_width > sizeof(unsigned long) )
735 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
736 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
737 else if ( dinfo->guest_width < sizeof(unsigned long) )
738 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
739 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
742 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
743 the guest must not change which frames are used for this purpose.
744 (it's not clear why it would want to change them, and we'll be OK
745 from a safety POV anyhow.) */
747 p2m = xc_map_foreign_pages(xc_handle, dom, PROT_READ,
748 p2m_frame_list,
749 P2M_FL_ENTRIES);
750 if ( !p2m )
751 {
752 ERROR("Couldn't map p2m table");
753 goto out;
754 }
755 ctx->live_p2m = p2m; /* So that translation macros will work */
757 /* Canonicalise the pfn-to-mfn table frame-number list. */
758 for ( i = 0; i < dinfo->p2m_size; i += FPP )
759 {
760 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
761 {
762 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
763 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
764 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], ctx->max_mfn);
765 if ( p2m_frame_list[i/FPP] < ctx->max_mfn )
766 {
767 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
768 (uint64_t)p2m_frame_list[i/FPP],
769 (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]]);
770 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
771 (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]],
772 (uint64_t)p2m[ctx->live_m2p[p2m_frame_list[i/FPP]]]);
774 }
775 goto out;
776 }
777 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
778 }
780 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
781 {
782 ERROR("Could not get vcpu context");
783 goto out;
784 }
786 /*
787 * Write an extended-info structure to inform the restore code that
788 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
789 * slow paths in the restore code.
790 */
791 {
792 unsigned long signature = ~0UL;
793 uint32_t chunk1_sz = ((dinfo->guest_width==8)
794 ? sizeof(ctxt.x64)
795 : sizeof(ctxt.x32));
796 uint32_t chunk2_sz = 0;
797 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
798 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
799 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
800 write_exact(io_fd, "vcpu", 4) ||
801 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
802 write_exact(io_fd, &ctxt, chunk1_sz) ||
803 write_exact(io_fd, "extv", 4) ||
804 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
805 {
806 PERROR("write: extended info");
807 goto out;
808 }
809 }
811 if ( write_exact(io_fd, p2m_frame_list,
812 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
813 {
814 PERROR("write: p2m_frame_list");
815 goto out;
816 }
818 success = 1;
820 out:
822 if ( !success && p2m )
823 munmap(p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
825 if ( live_p2m_frame_list_list )
826 munmap(live_p2m_frame_list_list, PAGE_SIZE);
828 if ( live_p2m_frame_list )
829 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
831 if ( p2m_frame_list_list )
832 free(p2m_frame_list_list);
834 if ( p2m_frame_list )
835 free(p2m_frame_list);
837 return success ? p2m : NULL;
838 }
840 /* must be done AFTER suspend_and_state() */
841 static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
842 {
843 int marker = -7;
844 uint32_t tsc_mode, khz, incarn;
845 uint64_t nsec;
847 if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
848 &nsec, &khz, &incarn) < 0 ||
849 write_exact(io_fd, &marker, sizeof(marker)) ||
850 write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
851 write_exact(io_fd, &nsec, sizeof(nsec)) ||
852 write_exact(io_fd, &khz, sizeof(khz)) ||
853 write_exact(io_fd, &incarn, sizeof(incarn)) )
854 return -1;
855 return 0;
856 }
858 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
859 uint32_t max_factor, uint32_t flags,
860 struct save_callbacks* callbacks,
861 int hvm, void (*switch_qemu_logdirty)(int, unsigned))
862 {
863 xc_dominfo_t info;
864 DECLARE_DOMCTL;
866 int rc = 1, frc, i, j, last_iter = 0, iter = 0;
867 int live = (flags & XCFLAGS_LIVE);
868 int debug = (flags & XCFLAGS_DEBUG);
869 int race = 0, sent_last_iter, skip_this_iter;
870 int tmem_saved = 0;
872 /* The new domain's shared-info frame number. */
873 unsigned long shared_info_frame;
875 /* A copy of the CPU context of the guest. */
876 vcpu_guest_context_any_t ctxt;
878 /* A table containing the type of each PFN (/not/ MFN!). */
879 xen_pfn_t *pfn_type = NULL;
880 unsigned long *pfn_batch = NULL;
881 int *pfn_err = NULL;
883 /* A copy of one frame of guest memory. */
884 char page[PAGE_SIZE];
886 /* Live mapping of shared info structure */
887 shared_info_any_t *live_shinfo = NULL;
889 /* base of the region in which domain memory is mapped */
890 unsigned char *region_base = NULL;
892 /* bitmap of pages:
893 - that should be sent this iteration (unless later marked as skip);
894 - to skip this iteration because already dirty;
895 - to fixup by sending at the end if not already resent; */
896 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
898 xc_shadow_op_stats_t stats;
900 unsigned long needed_to_fix = 0;
901 unsigned long total_sent = 0;
903 uint64_t vcpumap = 1ULL;
905 /* HVM: a buffer for holding HVM context */
906 uint32_t hvm_buf_size = 0;
907 uint8_t *hvm_buf = NULL;
909 /* HVM: magic frames for ioreqs and xenstore comms. */
910 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
912 unsigned long mfn;
914 struct outbuf ob;
915 static struct save_ctx _ctx = {
916 .live_p2m = NULL,
917 .live_m2p = NULL,
918 };
919 static struct save_ctx *ctx = &_ctx;
920 struct domain_info_context *dinfo = &ctx->dinfo;
922 int completed = 0;
924 outbuf_init(&ob, OUTBUF_SIZE);
926 /* If no explicit control parameters given, use defaults */
927 max_iters = max_iters ? : DEF_MAX_ITERS;
928 max_factor = max_factor ? : DEF_MAX_FACTOR;
930 initialize_mbit_rate();
932 if ( !get_platform_info(xc_handle, dom,
933 &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
934 {
935 ERROR("Unable to get platform info.");
936 return 1;
937 }
939 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
940 {
941 ERROR("Could not get domain info");
942 return 1;
943 }
945 shared_info_frame = info.shared_info_frame;
947 /* Map the shared info frame */
948 if ( !hvm )
949 {
950 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
951 PROT_READ, shared_info_frame);
952 if ( !live_shinfo )
953 {
954 ERROR("Couldn't map live_shinfo");
955 goto out;
956 }
957 }
959 /* Get the size of the P2M table */
960 dinfo->p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
962 if ( dinfo->p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
963 {
964 ERROR("Cannot save this big a guest");
965 goto out;
966 }
968 /* Domain is still running at this point */
969 if ( live )
970 {
971 /* Live suspend. Enable log-dirty mode. */
972 if ( xc_shadow_control(xc_handle, dom,
973 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
974 NULL, 0, NULL, 0, NULL) < 0 )
975 {
976 /* log-dirty already enabled? There's no test op,
977 so attempt to disable then reenable it */
978 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
979 NULL, 0, NULL, 0, NULL);
980 if ( frc >= 0 )
981 {
982 frc = xc_shadow_control(xc_handle, dom,
983 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
984 NULL, 0, NULL, 0, NULL);
985 }
987 if ( frc < 0 )
988 {
989 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
990 goto out;
991 }
992 }
994 /* Enable qemu-dm logging dirty pages to xen */
995 if ( hvm )
996 switch_qemu_logdirty(dom, 1);
997 }
998 else
999 {
1000 /* This is a non-live suspend. Suspend the domain. */
1001 if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
1002 io_fd, dom, &info) )
1004 ERROR("Domain appears not to have suspended");
1005 goto out;
1009 last_iter = !live;
1011 /* pretend we sent all the pages last iteration */
1012 sent_last_iter = dinfo->p2m_size;
1014 /* Setup to_send / to_fix and to_skip bitmaps */
1015 to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
1016 to_fix = calloc(1, BITMAP_SIZE);
1017 to_skip = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
1019 if ( !to_send || !to_fix || !to_skip )
1021 ERROR("Couldn't allocate to_send array");
1022 goto out;
1025 memset(to_send, 0xff, BITMAP_SIZE);
1027 if ( lock_pages(to_send, BITMAP_SIZE) )
1029 ERROR("Unable to lock to_send");
1030 return 1;
1033 /* (to fix is local only) */
1034 if ( lock_pages(to_skip, BITMAP_SIZE) )
1036 ERROR("Unable to lock to_skip");
1037 return 1;
1040 if ( hvm )
1042 /* Need another buffer for HVM context */
1043 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
1044 if ( hvm_buf_size == -1 )
1046 ERROR("Couldn't get HVM context size from Xen");
1047 goto out;
1049 hvm_buf = malloc(hvm_buf_size);
1050 if ( !hvm_buf )
1052 ERROR("Couldn't allocate memory");
1053 goto out;
1057 analysis_phase(xc_handle, dom, ctx, to_skip, 0);
1059 pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
1060 MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
1061 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
1062 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
1064 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
1065 errno = ENOMEM;
1066 goto out;
1068 memset(pfn_type, 0,
1069 ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
1071 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
1073 ERROR("Unable to lock pfn_type array");
1074 goto out;
1077 /* Setup the mfn_to_pfn table mapping */
1078 if ( !(ctx->live_m2p = xc_map_m2p(xc_handle, ctx->max_mfn, PROT_READ, &ctx->m2p_mfn0)) )
1080 ERROR("Failed to map live M2P table");
1081 goto out;
1084 /* Start writing out the saved-domain record. */
1085 if ( write_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
1087 PERROR("write: p2m_size");
1088 goto out;
1091 if ( !hvm )
1093 int err = 0;
1095 /* Map the P2M table, and write the list of P2M frames */
1096 ctx->live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, ctx, live_shinfo);
1097 if ( ctx->live_p2m == NULL )
1099 ERROR("Failed to map/save the p2m frame list");
1100 goto out;
1103 /*
1104 * Quick belt and braces sanity check.
1105 */
1107 for ( i = 0; i < dinfo->p2m_size; i++ )
1109 mfn = pfn_to_mfn(i);
1110 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
1112 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
1113 mfn, mfn_to_pfn(mfn));
1114 err++;
1117 DPRINTF("Had %d unexplained entries in p2m table\n", err);
1120 print_stats(xc_handle, dom, 0, &stats, 0);
1122 tmem_saved = xc_tmem_save(xc_handle, dom, io_fd, live, -5);
1123 if ( tmem_saved == -1 )
1125 ERROR("Error when writing to state file (tmem)");
1126 goto out;
1129 if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
1131 ERROR("Error when writing to state file (tsc)");
1132 goto out;
1135 copypages:
1136 #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
1137 #ifdef ratewrite
1138 #undef ratewrite
1139 #endif
1140 #define ratewrite(fd, live, buf, len) ratewrite_buffer(last_iter, &ob, (fd), (live), (buf), (len))
1142 /* Now write out each data page, canonicalising page tables as we go... */
1143 for ( ; ; )
1145 unsigned int prev_pc, sent_this_iter, N, batch, run;
1147 iter++;
1148 sent_this_iter = 0;
1149 skip_this_iter = 0;
1150 prev_pc = 0;
1151 N = 0;
1153 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1155 while ( N < dinfo->p2m_size )
1157 unsigned int this_pc = (N * 100) / dinfo->p2m_size;
1159 if ( (this_pc - prev_pc) >= 5 )
1161 DPRINTF("\b\b\b\b%3d%%", this_pc);
1162 prev_pc = this_pc;
1165 if ( !last_iter )
1167 /* Slightly wasteful to peek the whole array every time,
1168 but this is fast enough for the moment. */
1169 frc = xc_shadow_control(
1170 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1171 dinfo->p2m_size, NULL, 0, NULL);
1172 if ( frc != dinfo->p2m_size )
1174 ERROR("Error peeking shadow bitmap");
1175 goto out;
1179 /* load pfn_type[] with the mfn of all the pages we're doing in
1180 this batch. */
1181 for ( batch = 0;
1182 (batch < MAX_BATCH_SIZE) && (N < dinfo->p2m_size);
1183 N++ )
1185 int n = N;
1187 if ( debug )
1189 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1190 iter, (unsigned long)n,
1191 hvm ? 0 : pfn_to_mfn(n),
1192 test_bit(n, to_send));
1193 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1194 DPRINTF(" [mfn]= %08lx",
1195 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1196 DPRINTF("\n");
1199 if ( completed )
1201 /* for sparse bitmaps, word-by-word may save time */
1202 if ( !to_send[N >> ORDER_LONG] )
1204 /* incremented again in for loop! */
1205 N += BITS_PER_LONG - 1;
1206 continue;
1209 if ( !test_bit(n, to_send) )
1210 continue;
1212 pfn_batch[batch] = n;
1213 if ( hvm )
1214 pfn_type[batch] = n;
1215 else
1216 pfn_type[batch] = pfn_to_mfn(n);
1218 else
1220 if ( !last_iter &&
1221 test_bit(n, to_send) &&
1222 test_bit(n, to_skip) )
1223 skip_this_iter++; /* stats keeping */
1225 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1226 (test_bit(n, to_send) && last_iter) ||
1227 (test_bit(n, to_fix) && last_iter)) )
1228 continue;
1230 /*
1231 ** we get here if:
1232 ** 1. page is marked to_send & hasn't already been re-dirtied
1233 ** 2. (ignore to_skip in last iteration)
1234 ** 3. add in pages that still need fixup (net bufs)
1235 */
1237 pfn_batch[batch] = n;
1239 /* Hypercall interfaces operate in PFNs for HVM guests
1240 * and MFNs for PV guests */
1241 if ( hvm )
1242 pfn_type[batch] = n;
1243 else
1244 pfn_type[batch] = pfn_to_mfn(n);
1246 if ( !is_mapped(pfn_type[batch]) )
1248 /*
1249 ** not currently in pseudo-physical map -- set bit
1250 ** in to_fix since we must send this page in last_iter
1251 ** unless its sent sooner anyhow, or it never enters
1252 ** pseudo-physical map (e.g. for ballooned down doms)
1253 */
1254 set_bit(n, to_fix);
1255 continue;
1258 if ( last_iter &&
1259 test_bit(n, to_fix) &&
1260 !test_bit(n, to_send) )
1262 needed_to_fix++;
1263 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1264 iter, n, pfn_type[batch]);
1267 clear_bit(n, to_fix);
1270 batch++;
1273 if ( batch == 0 )
1274 goto skip; /* vanishingly unlikely... */
1276 pfn_err = realloc(pfn_err, sizeof(int) * batch);
1277 region_base = xc_map_foreign_bulk(
1278 xc_handle, dom, PROT_READ, pfn_type, pfn_err, batch);
1279 if ( region_base == NULL )
1281 ERROR("map batch failed");
1282 goto out;
1285 if ( hvm )
1287 /* Look for and skip completely empty batches. */
1288 for ( j = 0; j < batch; j++ )
1290 if ( !pfn_err[j] )
1291 break;
1292 pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB;
1294 if ( j == batch )
1296 munmap(region_base, batch*PAGE_SIZE);
1297 continue; /* bail on this batch: no valid pages */
1299 for ( ; j < batch; j++ )
1300 if ( pfn_err[j] )
1301 pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB;
1303 else
1305 /* Get page types */
1306 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) )
1308 ERROR("get_pfn_type_batch failed");
1309 goto out;
1312 for ( j = 0; j < batch; j++ )
1314 unsigned long mfn = pfn_to_mfn(pfn_batch[j]);
1316 if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
1318 DPRINTF("type fail: page %i mfn %08lx\n",
1319 j, mfn);
1320 continue;
1323 if ( debug )
1324 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1325 " sum= %08lx\n",
1326 iter,
1327 pfn_type[j] | pfn_batch[j],
1328 mfn,
1329 mfn_to_pfn(mfn),
1330 csum_page(region_base + (PAGE_SIZE*j)));
1332 /* canonicalise mfn->pfn */
1333 pfn_type[j] |= pfn_batch[j];
1337 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1339 PERROR("Error when writing to state file (2)");
1340 goto out;
1343 if ( sizeof(unsigned long) < sizeof(*pfn_type) )
1344 for ( j = 0; j < batch; j++ )
1345 ((unsigned long *)pfn_type)[j] = pfn_type[j];
1346 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1348 PERROR("Error when writing to state file (3)");
1349 goto out;
1351 if ( sizeof(unsigned long) < sizeof(*pfn_type) )
1352 while ( --j >= 0 )
1353 pfn_type[j] = ((unsigned long *)pfn_type)[j];
1355 /* entering this loop, pfn_type is now in pfns (Not mfns) */
1356 run = 0;
1357 for ( j = 0; j < batch; j++ )
1359 unsigned long pfn, pagetype;
1360 void *spage = (char *)region_base + (PAGE_SIZE*j);
1362 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1363 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1365 if ( pagetype != 0 )
1367 /* If the page is not a normal data page, write out any
1368 run of pages we may have previously accumulated */
1369 if ( run )
1371 if ( ratewrite(io_fd, live,
1372 (char*)region_base+(PAGE_SIZE*(j-run)),
1373 PAGE_SIZE*run) != PAGE_SIZE*run )
1375 ERROR("Error when writing to state file (4a)"
1376 " (errno %d)", errno);
1377 goto out;
1379 run = 0;
1383 /* skip pages that aren't present */
1384 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1385 continue;
1387 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1389 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1390 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1392 /* We have a pagetable page: need to rewrite it. */
1393 race =
1394 canonicalize_pagetable(ctx, pagetype, pfn, spage, page);
1396 if ( race && !live )
1398 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1399 pagetype);
1400 goto out;
1403 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1405 ERROR("Error when writing to state file (4b)"
1406 " (errno %d)", errno);
1407 goto out;
1410 else
1412 /* We have a normal page: accumulate it for writing. */
1413 run++;
1415 } /* end of the write out for this batch */
1417 if ( run )
1419 /* write out the last accumulated run of pages */
1420 if ( ratewrite(io_fd, live,
1421 (char*)region_base+(PAGE_SIZE*(j-run)),
1422 PAGE_SIZE*run) != PAGE_SIZE*run )
1424 ERROR("Error when writing to state file (4c)"
1425 " (errno %d)", errno);
1426 goto out;
1430 sent_this_iter += batch;
1432 munmap(region_base, batch*PAGE_SIZE);
1434 } /* end of this while loop for this iteration */
1436 skip:
1438 total_sent += sent_this_iter;
1440 DPRINTF("\r %d: sent %d, skipped %d, ",
1441 iter, sent_this_iter, skip_this_iter );
1443 if ( last_iter )
1445 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1447 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1448 total_sent, ((float)total_sent)/dinfo->p2m_size );
1449 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1452 if ( last_iter && debug )
1454 int minusone = -1;
1455 memset(to_send, 0xff, BITMAP_SIZE);
1456 debug = 0;
1457 DPRINTF("Entering debug resend-all mode\n");
1459 /* send "-1" to put receiver into debug mode */
1460 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1462 PERROR("Error when writing to state file (6)");
1463 goto out;
1466 continue;
1469 if ( last_iter )
1470 break;
1472 if ( live )
1474 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1475 (iter >= max_iters) ||
1476 (sent_this_iter+skip_this_iter < 50) ||
1477 (total_sent > dinfo->p2m_size*max_factor) )
1479 DPRINTF("Start last iteration\n");
1480 last_iter = 1;
1482 if ( suspend_and_state(callbacks->suspend, callbacks->data,
1483 xc_handle, io_fd, dom, &info) )
1485 ERROR("Domain appears not to have suspended");
1486 goto out;
1489 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1490 if ( (tmem_saved > 0) &&
1491 (xc_tmem_save_extra(xc_handle,dom,io_fd,-6) == -1) )
1493 ERROR("Error when writing to state file (tmem)");
1494 goto out;
1497 if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
1499 ERROR("Error when writing to state file (tsc)");
1500 goto out;
1506 if ( xc_shadow_control(xc_handle, dom,
1507 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1508 dinfo->p2m_size, NULL, 0, &stats) != dinfo->p2m_size )
1510 ERROR("Error flushing shadow PT");
1511 goto out;
1514 sent_last_iter = sent_this_iter;
1516 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1519 } /* end of infinite for loop */
1521 DPRINTF("All memory is saved\n");
1524 struct {
1525 int minustwo;
1526 int max_vcpu_id;
1527 uint64_t vcpumap;
1528 } chunk = { -2, info.max_vcpu_id };
1530 if ( info.max_vcpu_id >= 64 )
1532 ERROR("Too many VCPUS in guest!");
1533 goto out;
1536 for ( i = 1; i <= info.max_vcpu_id; i++ )
1538 xc_vcpuinfo_t vinfo;
1539 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1540 vinfo.online )
1541 vcpumap |= 1ULL << i;
1544 chunk.vcpumap = vcpumap;
1545 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1547 PERROR("Error when writing to state file");
1548 goto out;
1552 if ( hvm )
1554 struct {
1555 int id;
1556 uint32_t pad;
1557 uint64_t data;
1558 } chunk = { 0, };
1560 chunk.id = -3;
1561 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
1562 (unsigned long *)&chunk.data);
1564 if ( (chunk.data != 0) &&
1565 write_exact(io_fd, &chunk, sizeof(chunk)) )
1567 PERROR("Error when writing the ident_pt for EPT guest");
1568 goto out;
1571 chunk.id = -4;
1572 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
1573 (unsigned long *)&chunk.data);
1575 if ( (chunk.data != 0) &&
1576 write_exact(io_fd, &chunk, sizeof(chunk)) )
1578 PERROR("Error when writing the vm86 TSS for guest");
1579 goto out;
1583 /* Zero terminate */
1584 i = 0;
1585 if ( write_exact(io_fd, &i, sizeof(int)) )
1587 PERROR("Error when writing to state file (6')");
1588 goto out;
1591 if ( hvm )
1593 uint32_t rec_size;
1595 /* Save magic-page locations. */
1596 memset(magic_pfns, 0, sizeof(magic_pfns));
1597 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1598 (unsigned long *)&magic_pfns[0]);
1599 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1600 (unsigned long *)&magic_pfns[1]);
1601 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1602 (unsigned long *)&magic_pfns[2]);
1603 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1605 PERROR("Error when writing to state file (7)");
1606 goto out;
1609 /* Get HVM context from Xen and save it too */
1610 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1611 hvm_buf_size)) == -1 )
1613 ERROR("HVM:Could not get hvm buffer");
1614 goto out;
1617 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1619 PERROR("error write hvm buffer size");
1620 goto out;
1623 if ( write_exact(io_fd, hvm_buf, rec_size) )
1625 PERROR("write HVM info failed!\n");
1626 goto out;
1629 /* HVM guests are done now */
1630 rc = 0;
1631 goto out;
1634 /* PV guests only from now on */
1636 /* Send through a list of all the PFNs that were not in map at the close */
1638 unsigned int i,j;
1639 unsigned long pfntab[1024];
1641 for ( i = 0, j = 0; i < dinfo->p2m_size; i++ )
1643 if ( !is_mapped(pfn_to_mfn(i)) )
1644 j++;
1647 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1649 PERROR("Error when writing to state file (6a)");
1650 goto out;
1653 for ( i = 0, j = 0; i < dinfo->p2m_size; )
1655 if ( !is_mapped(pfn_to_mfn(i)) )
1656 pfntab[j++] = i;
1658 i++;
1659 if ( (j == 1024) || (i == dinfo->p2m_size) )
1661 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1663 PERROR("Error when writing to state file (6b)");
1664 goto out;
1666 j = 0;
1671 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
1673 ERROR("Could not get vcpu context");
1674 goto out;
1677 /* Canonicalise the suspend-record frame number. */
1678 mfn = GET_FIELD(&ctxt, user_regs.edx);
1679 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1681 ERROR("Suspend record is not in range of pseudophys map");
1682 goto out;
1684 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1686 for ( i = 0; i <= info.max_vcpu_id; i++ )
1688 if ( !(vcpumap & (1ULL << i)) )
1689 continue;
1691 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
1693 ERROR("No context for VCPU%d", i);
1694 goto out;
1697 /* Canonicalise each GDT frame number. */
1698 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1700 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1701 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1703 ERROR("GDT frame is not in range of pseudophys map");
1704 goto out;
1706 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1709 /* Canonicalise the page table base pointer. */
1710 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(
1711 GET_FIELD(&ctxt, ctrlreg[3]))) )
1713 ERROR("PT base is not in range of pseudophys map");
1714 goto out;
1716 SET_FIELD(&ctxt, ctrlreg[3],
1717 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3])))));
1719 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1720 if ( (ctx->pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1722 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
1724 ERROR("PT base is not in range of pseudophys map");
1725 goto out;
1727 /* Least-significant bit means 'valid PFN'. */
1728 ctxt.x64.ctrlreg[1] = 1 |
1729 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
1732 if ( write_exact(io_fd, &ctxt, ((dinfo->guest_width==8)
1733 ? sizeof(ctxt.x64)
1734 : sizeof(ctxt.x32))) )
1736 PERROR("Error when writing to state file (1)");
1737 goto out;
1740 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1741 domctl.domain = dom;
1742 domctl.u.ext_vcpucontext.vcpu = i;
1743 if ( xc_domctl(xc_handle, &domctl) < 0 )
1745 ERROR("No extended context for VCPU%d", i);
1746 goto out;
1748 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1750 PERROR("Error when writing to state file (2)");
1751 goto out;
1755 /*
1756 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1757 */
1758 memcpy(page, live_shinfo, PAGE_SIZE);
1759 SET_FIELD(((shared_info_any_t *)page),
1760 arch.pfn_to_mfn_frame_list_list, 0);
1761 if ( write_exact(io_fd, page, PAGE_SIZE) )
1763 PERROR("Error when writing to state file (1)");
1764 goto out;
1767 /* Success! */
1768 rc = 0;
1770 out:
1771 completed = 1;
1773 if ( !rc && callbacks->postcopy )
1774 callbacks->postcopy(callbacks->data);
1776 /* Flush last write and discard cache for file. */
1777 if ( outbuf_flush(&ob, io_fd) < 0 ) {
1778 ERROR("Error when flushing output buffer\n");
1779 rc = 1;
1782 discard_file_cache(io_fd, 1 /* flush */);
1784 /* checkpoint_cb can spend arbitrarily long in between rounds */
1785 if (!rc && callbacks->checkpoint &&
1786 callbacks->checkpoint(callbacks->data) > 0)
1788 /* reset stats timer */
1789 print_stats(xc_handle, dom, 0, &stats, 0);
1791 rc = 1;
1792 /* last_iter = 1; */
1793 if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
1794 io_fd, dom, &info) )
1796 ERROR("Domain appears not to have suspended");
1797 goto out;
1799 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1800 print_stats(xc_handle, dom, 0, &stats, 1);
1802 if ( xc_shadow_control(xc_handle, dom,
1803 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1804 dinfo->p2m_size, NULL, 0, &stats) != dinfo->p2m_size )
1806 ERROR("Error flushing shadow PT");
1809 goto copypages;
1812 if ( tmem_saved != 0 && live )
1813 xc_tmem_save_done(xc_handle, dom);
1815 if ( live )
1817 if ( xc_shadow_control(xc_handle, dom,
1818 XEN_DOMCTL_SHADOW_OP_OFF,
1819 NULL, 0, NULL, 0, NULL) < 0 )
1820 DPRINTF("Warning - couldn't disable shadow mode");
1821 if ( hvm )
1822 switch_qemu_logdirty(dom, 0);
1825 if ( live_shinfo )
1826 munmap(live_shinfo, PAGE_SIZE);
1828 if ( ctx->live_p2m )
1829 munmap(ctx->live_p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
1831 if ( ctx->live_m2p )
1832 munmap(ctx->live_m2p, M2P_SIZE(ctx->max_mfn));
1834 free(pfn_type);
1835 free(pfn_batch);
1836 free(pfn_err);
1837 free(to_send);
1838 free(to_fix);
1839 free(to_skip);
1841 DPRINTF("Save exit rc=%d\n",rc);
1843 return !!rc;
1846 /*
1847 * Local variables:
1848 * mode: C
1849 * c-set-style: "BSD"
1850 * c-basic-offset: 4
1851 * tab-width: 4
1852 * indent-tabs-mode: nil
1853 * End:
1854 */
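
For orientation, a minimal hypothetical caller of the API defined above (not part of this file): it assumes an already-opened libxc handle and that xc_domain_save() and struct save_callbacks are declared in xenguest.h in this tree, and it performs a non-live save of a PV guest with the default limits (the zero max_iters/max_factor arguments select DEF_MAX_ITERS/DEF_MAX_FACTOR). The suspend callback is a stub; a real toolstack must actually suspend the domain there and return non-zero only on success.

    #include <fcntl.h>
    #include <unistd.h>
    #include <stdint.h>
    #include "xenguest.h"        /* assumed home of xc_domain_save(), struct save_callbacks */

    /* Stub suspend callback: a real caller must suspend the domain here
       (e.g. via the xenstore control node) before returning non-zero. */
    static int demo_suspend(void *data)
    {
        (void)data;
        return 1;
    }

    /* Save PV domain 'domid' to 'path', non-live. Returns 0 on success. */
    static int demo_save(int xc_handle, uint32_t domid, const char *path)
    {
        struct save_callbacks cb = { .suspend = demo_suspend, .data = NULL };
        int io_fd, rc;

        io_fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if ( io_fd < 0 )
            return 1;

        /* flags = 0: non-live; hvm = 0, so no qemu log-dirty switch is
           needed and switch_qemu_logdirty may be NULL. */
        rc = xc_domain_save(xc_handle, io_fd, domid, 0, 0, 0, &cb, 0, NULL);

        close(io_fd);
        return rc;
    }
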