debuggers.hg

view tools/libxc/xc_linux_save.c @ 6641:f27205ea60ef

merge?
author cl349@firebug.cl.cam.ac.uk
date Sat Sep 03 16:58:50 2005 +0000 (2005-09-03)
parents dd668f7527cb a1de77c1486c
children 29808fef9148 2e2611af05c6
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
17 #include <xen/linux/suspend.h>
18 #include <xen/io/domain_controller.h>
/* Pages are collected and written out in batches of 1024 (4MB). */
#define BATCH_SIZE      (1 << 10)

/* Upper clamp applied to the adaptive transmit rate (mbit_rate). */
#define MAX_MBIT_RATE   500

/*
 * Tuning defaults.  A caller of xc_linux_save() may replace either one by
 * passing a non-zero value for the corresponding argument.
 *
 * XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
 */
#define DEF_MAX_ITERS   29  /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR  3   /* never send more than 3x nr_pfns */

/* Bits of the 'flags' argument of xc_linux_save(). */
#define XCFLAGS_LIVE    (1 << 0)
#define XCFLAGS_DEBUG   (1 << 1)
/* Set to 1 to compile in the DPRINTF() debugging output. */
#define DEBUG 0

/*
 * ERR(): print a diagnostic on stderr and flush it immediately.
 * Flip the '#if 1' to silence error output completely.
 */
#if 1
#define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* DPRINTF(): debug trace on stderr, compiled out unless DEBUG is non-zero. */
#if DEBUG
#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/*
 * PPRINTF(): progress output on stderr, compiled out unless PROGRESS is
 * non-zero.  The disabled form expands to ((void)0), like the other
 * disabled macros here, so it stays a valid expression wherever the
 * enabled fprintf() form would be (e.g. as a comma-expression operand).
 */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Relies on 'live_mfn_to_pfn_table', 'live_pfn_to_mfn_table' and 'nr_pfns'
 * being visible at the point of use.  The argument is evaluated several
 * times, so it must be free of side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    (((_mfn) < (1024*1024)) &&                                            \
     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                          \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))


/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * On success the frame number stored at *_pmfn is overwritten with the
 * corresponding PFN; on failure *_pmfn is left untouched.
 *
 * The temporaries carry a '_tm_' prefix so that they cannot capture a
 * caller's variable (a plain 'mfn' temporary would turn
 * translate_mfn_to_pfn(&mfn) into a read of an uninitialised object), and
 * the pointer argument is evaluated exactly once.
 */
#define translate_mfn_to_pfn(_pmfn)                                       \
({                                                                        \
    __typeof__(_pmfn) _tm_ptr = (_pmfn);                                  \
    unsigned long _tm_mfn = *_tm_ptr;                                     \
    int _tm_res = 1;                                                      \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(_tm_mfn) )                             \
        _tm_res = 0;                                                      \
    else                                                                  \
        *_tm_ptr = live_mfn_to_pfn_table[_tm_mfn];                        \
    _tm_res;                                                              \
})

/* TRUE unless bit 31 of the pfn->mfn table entry is set. */
#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return the value (0 or 1) of bit 'nr' in the bitmap at 'addr', which is
 * treated as an array of unsigned long words.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    const size_t   word_bits = sizeof(unsigned long) * 8;
    unsigned long *bitmap    = (unsigned long *)addr;
    unsigned long  word      = bitmap[nr / word_bits];

    return (word >> (nr % word_bits)) & 1;
}
/*
 * Clear bit 'nr' in the bitmap at 'addr', which is treated as an array of
 * unsigned long words.
 *
 * The mask is built from 1UL rather than int 1: with an int, bit 31
 * overflows and any higher bit of a 64-bit word is an out-of-range shift,
 * so the wrong bits would be cleared.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &=
        ~(1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/*
 * Set bit 'nr' in the bitmap at 'addr', which is treated as an array of
 * unsigned long words.
 *
 * The mask is built from 1UL rather than int 1: with an int, bit 31
 * overflows (and sign-extends into the upper half of a 64-bit word) and
 * any higher bit is an out-of-range shift.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |=
        (1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/*
 * Hamming weight (number of set bits) of a 32-bit word.
 *
 * Adjacent bit fields are summed in parallel, doubling the field width on
 * every pass: 1-bit fields into 2-bit sums, then 4, 8, 16 and finally 32.
 */
static inline unsigned int hweight32(unsigned int w)
{
    static const unsigned int field_mask[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int sum = w;
    int pass;

    for ( pass = 0; pass < 5; pass++ )
        sum = (sum & field_mask[pass]) +
              ((sum >> (1 << pass)) & field_mask[pass]);

    return sum;
}
/*
 * Count the set bits in the first 'nr' bits of the bitmap at 'addr'.
 *
 * Only whole unsigned long words are examined (nr is rounded down to a
 * multiple of the word size); the caller is expected to pad the bitmap to
 * an unsigned long boundary.
 *
 * hweight32() takes a 32-bit value, so each word is counted in 32-bit
 * halves.  Passing the word straight through would silently drop the upper
 * half where unsigned long is 64 bits wide.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;

    /* We know that the array is padded to unsigned long. */
    for ( i = 0; i < nr/(sizeof(unsigned long)*8); i++, p++ )
    {
        unsigned long word = *p;

        count += hweight32( (unsigned int)(word & 0xFFFFFFFFUL) );

        /* Two 16-bit shifts stay well-defined even when long is 32 bits. */
        if ( sizeof(unsigned long) > 4 )
            count += hweight32( (unsigned int)((word >> 16) >> 16) );
    }

    return count;
}
/*
 * Map index 'i' (0 <= i < nr) onto another index in [0, nr) such that the
 * mapping is a permutation of that range.
 *
 * i        - index to permute
 * nr       - number of valid indices
 * order_nr - number of bits needed to represent nr-1
 *            (e.g. nr->order 15->4 16->4 17->5; a 512MB domain has
 *            128k pages, order 17)
 */
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* The index is rotated left by 10 bits within an order_nr-bit field:
       the top 10 bits move to the bottom and the remaining low bits move
       up.  When the field is 10 bits wide or narrower the rotation count
       is reduced modulo the field width; shifting by (order_nr - 10)
       would otherwise be a negative, undefined shift. */
    int rot, mask;

    if ( order_nr <= 0 )
        return i; /* at most one index: nothing to permute */

    rot  = 10 % order_nr;
    mask = (1 << order_nr) - 1;

    /* Re-rotate until the result lands inside [0, nr). */
    do { i = ((i >> (order_nr - rot)) | (i << rot)) & mask; }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Convert a struct timeval to a count of microseconds.
 *
 * The seconds field is widened to long long before scaling; multiplying in
 * the native type of tv_sec overflows for present-day timestamps wherever
 * that type is 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/*
 * Read the wall clock with gettimeofday() and return it as a single
 * microsecond count (see tv_to_us()).
 */
static long long llgettimeofday( void )
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds.
 *
 * Both differences are computed in long long.  Scaling the seconds
 * difference in the native type of tv_sec overflows for intervals longer
 * than about 35 minutes wherever that type is 32 bits wide, for example
 * when 'old' is still zero-initialised.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long secs  = (long long)new->tv_sec  - (long long)old->tv_sec;
    long long usecs = (long long)new->tv_usec - (long long)old->tv_usec;

    return (secs * 1000000LL) + usecs;
}
/*
 * Initial transmit rate.  While this is 0, ratewrite() does no rate
 * limiting at all and behaves exactly like write().
 */
#define START_MBIT_RATE 0 //ioctxt->resource

/* Current rate, and the rate for which burst_time_us was last computed. */
static int mbit_rate, ombit_rate = 0;
/* Length of one burst slot in microseconds; set on first use. */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate
/* Bytes that may be written per burst slot. */
#define BURST_BUDGET (100*1024)

/*
 * Slot time, in microseconds, needed to send BURST_BUDGET bytes at
 * MBIT_RATE (taking 1 Mbit as 1024*1024 bits):
 *
 *   1000000 * BURST_BUDGET / (MBIT_RATE * 1024*1024 / 8)
 *     = 1000000 * 100 * 8 / (MBIT_RATE * 1024)
 *     = 781250 / MBIT_RATE
 *
 * e.g. 7812us at 100 Mbit/s.
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us

/*
 * write() wrapper that throttles output to MBIT_RATE.
 *
 * Every call spends 'n' bytes of a budget.  Once the budget is exhausted,
 * it is refilled with one BURST_BUDGET per BURST_TIME_US slot elapsed since
 * 'last_put', and the function sleeps until the budget is positive again.
 *
 * Returns whatever write(io_fd, buf, n) returns.
 */
static int
ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    /* Rate limiting disabled. */
    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        /* Recompute the slot length whenever the target rate has changed. */
        if (MBIT_RATE != ombit_rate) {
            BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
            ombit_rate = MBIT_RATE;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
        }
        if (last_put.tv_sec == 0) {
            /* First throttled write: grant one burst and start the clock. */
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                /* Credit one burst for every complete slot that elapsed. */
                while (delta > BURST_TIME_US) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += BURST_TIME_US;
                    /* Carry into seconds; tv_usec must stay below 1000000,
                       hence '>=' rather than '>'. */
                    if (last_put.tv_usec >= 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= BURST_TIME_US;
                }
                if (budget > 0)
                    break;
                /* Sleep out the remainder of the current slot; nanosleep()
                   is restarted with the remaining time if it returns
                   non-zero. */
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
247 static int print_stats( int xc_handle, u32 domid,
248 int pages_sent, xc_shadow_control_stats_t *stats,
249 int print )
250 {
251 static struct timeval wall_last;
252 static long long d0_cpu_last;
253 static long long d1_cpu_last;
255 struct timeval wall_now;
256 long long wall_delta;
257 long long d0_cpu_now, d0_cpu_delta;
258 long long d1_cpu_now, d1_cpu_delta;
260 gettimeofday(&wall_now, NULL);
262 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
263 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
265 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
266 fprintf(stderr, "ARRHHH!!\n");
268 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
270 if ( wall_delta == 0 ) wall_delta = 1;
272 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
273 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
275 if ( print )
276 fprintf(stderr,
277 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
278 "dirtied %dMb/s %" PRId32 " pages\n",
279 wall_delta,
280 (int)((d0_cpu_delta*100)/wall_delta),
281 (int)((d1_cpu_delta*100)/wall_delta),
282 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
283 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
284 stats->dirty_count);
286 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
287 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
288 + 50;
289 if (mbit_rate > MAX_MBIT_RATE)
290 mbit_rate = MAX_MBIT_RATE;
291 }
293 d0_cpu_last = d0_cpu_now;
294 d1_cpu_last = d1_cpu_now;
295 wall_last = wall_now;
297 return 0;
298 }
300 static int analysis_phase( int xc_handle, u32 domid,
301 int nr_pfns, unsigned long *arr, int runs )
302 {
303 long long start, now;
304 xc_shadow_control_stats_t stats;
305 int j;
307 start = llgettimeofday();
309 for (j = 0; j < runs; j++)
310 {
311 int i;
313 xc_shadow_control( xc_handle, domid,
314 DOM0_SHADOW_CONTROL_OP_CLEAN,
315 arr, nr_pfns, NULL);
316 fprintf(stderr, "#Flush\n");
317 for ( i = 0; i < 40; i++ )
318 {
319 usleep(50000);
320 now = llgettimeofday();
321 xc_shadow_control( xc_handle, domid,
322 DOM0_SHADOW_CONTROL_OP_PEEK,
323 NULL, 0, &stats);
325 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
326 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
327 ((now-start)+500)/1000,
328 stats.fault_count, stats.dirty_count,
329 stats.dirty_net_count, stats.dirty_block_count);
330 }
331 }
333 return -1;
334 }
337 static int suspend_and_state(int xc_handle, int io_fd, int dom,
338 xc_dominfo_t *info,
339 vcpu_guest_context_t *ctxt)
340 {
341 int i = 0;
342 char ans[30];
344 printf("suspend\n");
345 fflush(stdout);
346 if (fgets(ans, sizeof(ans), stdin) == NULL) {
347 ERR("failed reading suspend reply");
348 return -1;
349 }
350 if (strncmp(ans, "done\n", 5)) {
351 ERR("suspend reply incorrect: %s", ans);
352 return -1;
353 }
355 retry:
357 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
358 {
359 ERR("Could not get domain info");
360 return -1;
361 }
363 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
364 ctxt) )
365 {
366 ERR("Could not get vcpu context");
367 }
369 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
370 {
371 return 0; // success
372 }
374 if ( info->paused )
375 {
376 // try unpausing domain, wait, and retest
377 xc_domain_unpause( xc_handle, dom );
379 ERR("Domain was paused. Wait and re-test.");
380 usleep(10000); // 10ms
382 goto retry;
383 }
386 if( ++i < 100 )
387 {
388 ERR("Retry suspend domain.");
389 usleep(10000); // 10ms
390 goto retry;
391 }
393 ERR("Unable to suspend domain.");
395 return -1;
396 }
398 int xc_linux_save(int xc_handle, int io_fd, u32 dom, u32 max_iters,
399 u32 max_factor, u32 flags)
400 {
401 xc_dominfo_t info;
403 int rc = 1, i, j, k, last_iter, iter = 0;
404 unsigned long mfn;
405 int live = (flags & XCFLAGS_LIVE);
406 int debug = (flags & XCFLAGS_DEBUG);
407 int sent_last_iter, skip_this_iter;
409 /* The new domain's shared-info frame number. */
410 unsigned long shared_info_frame;
412 /* A copy of the CPU context of the guest. */
413 vcpu_guest_context_t ctxt;
415 /* A table containg the type of each PFN (/not/ MFN!). */
416 unsigned long *pfn_type = NULL;
417 unsigned long *pfn_batch = NULL;
419 /* A temporary mapping, and a copy, of one frame of guest memory. */
420 unsigned long page[1024];
422 /* A copy of the pfn-to-mfn table frame list. */
423 unsigned long *live_pfn_to_mfn_frame_list = NULL;
424 unsigned long pfn_to_mfn_frame_list[1024];
426 /* Live mapping of the table mapping each PFN to its current MFN. */
427 unsigned long *live_pfn_to_mfn_table = NULL;
428 /* Live mapping of system MFN to PFN table. */
429 unsigned long *live_mfn_to_pfn_table = NULL;
430 unsigned long mfn_to_pfn_table_start_mfn;
432 /* Live mapping of shared info structure */
433 shared_info_t *live_shinfo = NULL;
435 /* base of the region in which domain memory is mapped */
436 unsigned char *region_base = NULL;
438 /* A temporary mapping, and a copy, of the guest's suspend record. */
439 suspend_record_t *p_srec = NULL;
441 /* number of pages we're dealing with */
442 unsigned long nr_pfns;
444 /* power of 2 order of nr_pfns */
445 int order_nr;
447 /* bitmap of pages:
448 - that should be sent this iteration (unless later marked as skip);
449 - to skip this iteration because already dirty;
450 - to fixup by sending at the end if not already resent; */
451 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
453 xc_shadow_control_stats_t stats;
455 int needed_to_fix = 0;
456 int total_sent = 0;
458 MBIT_RATE = START_MBIT_RATE;
461 /* If no explicit control parameters given, use defaults */
462 if(!max_iters)
463 max_iters = DEF_MAX_ITERS;
464 if(!max_factor)
465 max_factor = DEF_MAX_FACTOR;
468 DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false");
470 if (mlock(&ctxt, sizeof(ctxt))) {
471 ERR("Unable to mlock ctxt");
472 return 1;
473 }
475 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
476 {
477 ERR("Could not get domain info");
478 goto out;
479 }
480 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
481 &ctxt) )
482 {
483 ERR("Could not get vcpu context");
484 goto out;
485 }
486 shared_info_frame = info.shared_info_frame;
488 /* A cheesy test to see whether the domain contains valid state. */
489 if ( ctxt.ctrlreg[3] == 0 ){
490 ERR("Domain is not in a valid Linux guest OS state");
491 goto out;
492 }
494 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
496 /* cheesy sanity check */
497 if ( nr_pfns > 1024*1024 )
498 {
499 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
500 goto out;
501 }
503 /* Map the shared info frame */
504 live_shinfo = xc_map_foreign_range(xc_handle, dom,
505 PAGE_SIZE, PROT_READ,
506 shared_info_frame);
508 if (!live_shinfo){
509 ERR("Couldn't map live_shinfo");
510 goto out;
511 }
513 /* the pfn_to_mfn_frame_list fits in a single page */
514 live_pfn_to_mfn_frame_list =
515 xc_map_foreign_range(xc_handle, dom,
516 PAGE_SIZE, PROT_READ,
517 live_shinfo->arch.pfn_to_mfn_frame_list );
519 if (!live_pfn_to_mfn_frame_list){
520 ERR("Couldn't map pfn_to_mfn_frame_list");
521 goto out;
522 }
525 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
526 the guest must not change which frames are used for this purpose.
527 (its not clear why it would want to change them, and we'll be OK
528 from a safety POV anyhow. */
530 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
531 PROT_READ,
532 live_pfn_to_mfn_frame_list,
533 (nr_pfns+1023)/1024 );
534 if( !live_pfn_to_mfn_table ){
535 ERR("Couldn't map pfn_to_mfn table");
536 goto out;
537 }
539 /* Setup the mfn_to_pfn table mapping */
540 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
542 live_mfn_to_pfn_table =
543 xc_map_foreign_range(xc_handle, DOMID_XEN,
544 PAGE_SIZE*1024, PROT_READ,
545 mfn_to_pfn_table_start_mfn );
547 /* Canonicalise the pfn-to-mfn table frame-number list. */
548 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
550 for ( i = 0; i < nr_pfns; i += 1024 ){
551 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
552 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
553 goto out;
554 }
555 }
558 /* Domain is still running at this point */
560 if( live )
561 {
562 if ( xc_shadow_control( xc_handle, dom,
563 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
564 NULL, 0, NULL ) < 0 ) {
565 ERR("Couldn't enable shadow mode");
566 goto out;
567 }
569 last_iter = 0;
570 } else{
571 /* This is a non-live suspend. Issue the call back to get the
572 domain suspended */
574 last_iter = 1;
576 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
577 {
578 ERR("Domain appears not to have suspended");
579 goto out;
580 }
582 }
583 sent_last_iter = 1<<20; /* 4GB of pages */
585 /* calculate the power of 2 order of nr_pfns, e.g.
586 15->4 16->4 17->5 */
587 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
589 /* Setup to_send bitmap */
590 {
591 /* size these for a maximal 4GB domain, to make interaction
592 with balloon driver easier. It's only user space memory,
593 ater all... (3x 128KB) */
595 int sz = ( 1<<20 ) / 8;
597 to_send = malloc( sz );
598 to_fix = calloc( 1, sz );
599 to_skip = malloc( sz );
601 if (!to_send || !to_fix || !to_skip){
602 ERR("Couldn't allocate to_send array");
603 goto out;
604 }
606 memset( to_send, 0xff, sz );
608 if ( mlock( to_send, sz ) ){
609 ERR("Unable to mlock to_send");
610 return 1;
611 }
613 /* (to fix is local only) */
615 if ( mlock( to_skip, sz ) ){
616 ERR("Unable to mlock to_skip");
617 return 1;
618 }
620 }
622 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
624 /* We want zeroed memory so use calloc rather than malloc. */
625 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
626 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
628 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
629 errno = ENOMEM;
630 goto out;
631 }
633 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
634 ERR("Unable to mlock");
635 goto out;
636 }
639 /*
640 * Quick belt and braces sanity check.
641 */
642 #if DEBUG
643 {
644 int err=0;
645 for ( i = 0; i < nr_pfns; i++ )
646 {
647 mfn = live_pfn_to_mfn_table[i];
649 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
650 {
651 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
652 i,mfn,live_mfn_to_pfn_table[mfn]);
653 err++;
654 }
655 }
656 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
657 }
658 #endif
661 /* Start writing out the saved-domain record. */
663 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
664 sizeof(unsigned long)) {
665 ERR("write: nr_pfns");
666 goto out;
667 }
668 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
669 ERR("write: pfn_to_mfn_frame_list");
670 goto out;
671 }
673 /* Map the suspend-record MFN to pin it. The page must be owned by
674 dom for this to succeed. */
675 p_srec = xc_map_foreign_range(xc_handle, dom,
676 sizeof(*p_srec), PROT_READ | PROT_WRITE,
677 ctxt.user_regs.esi);
678 if (!p_srec){
679 ERR("Couldn't map suspend record");
680 goto out;
681 }
683 /* Canonicalize store mfn. */
684 if ( !translate_mfn_to_pfn(&p_srec->resume_info.store_mfn) ) {
685 ERR("Store frame is not in range of pseudophys map");
686 goto out;
687 }
689 print_stats( xc_handle, dom, 0, &stats, 0 );
691 /* Now write out each data page, canonicalising page tables as we go... */
693 while(1){
694 unsigned int prev_pc, sent_this_iter, N, batch;
696 iter++;
697 sent_this_iter = 0;
698 skip_this_iter = 0;
699 prev_pc = 0;
700 N=0;
702 DPRINTF("Saving memory pages: iter %d 0%%", iter);
704 while( N < nr_pfns ){
705 unsigned int this_pc = (N * 100) / nr_pfns;
707 if ( (this_pc - prev_pc) >= 5 ){
708 DPRINTF("\b\b\b\b%3d%%", this_pc);
709 prev_pc = this_pc;
710 }
712 /* slightly wasteful to peek the whole array evey time,
713 but this is fast enough for the moment. */
715 if ( !last_iter &&
716 xc_shadow_control(xc_handle, dom,
717 DOM0_SHADOW_CONTROL_OP_PEEK,
718 to_skip, nr_pfns, NULL) != nr_pfns )
719 {
720 ERR("Error peeking shadow bitmap");
721 goto out;
722 }
725 /* load pfn_type[] with the mfn of all the pages we're doing in
726 this batch. */
728 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
729 {
730 int n = permute(N, nr_pfns, order_nr );
732 if ( 0 && debug ) {
733 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
734 " [mfn]= %08lx\n",
735 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
736 test_bit(n,to_send),
737 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
738 0xFFFFF]);
739 }
741 if ( !last_iter &&
742 test_bit(n, to_send) &&
743 test_bit(n, to_skip) ) {
744 skip_this_iter++; /* stats keeping */
745 }
747 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
748 (test_bit(n, to_send) && last_iter) ||
749 (test_bit(n, to_fix) && last_iter)) ) {
750 continue;
751 }
753 /* we get here if:
754 1. page is marked to_send & hasn't already been re-dirtied
755 2. (ignore to_skip in last iteration)
756 3. add in pages that still need fixup (net bufs)
757 */
759 pfn_batch[batch] = n;
760 pfn_type[batch] = live_pfn_to_mfn_table[n];
762 if( ! is_mapped(pfn_type[batch]) )
763 {
764 /* not currently in pusedo-physical map -- set bit
765 in to_fix that we must send this page in last_iter
766 unless its sent sooner anyhow */
768 set_bit( n, to_fix );
769 if( iter>1 )
770 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
771 iter,n,pfn_type[batch]);
772 continue;
773 }
775 if ( last_iter &&
776 test_bit(n, to_fix) &&
777 !test_bit(n, to_send) )
778 {
779 needed_to_fix++;
780 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
781 iter,n,pfn_type[batch]);
782 }
784 clear_bit(n, to_fix);
786 batch++;
787 }
789 if ( batch == 0 )
790 goto skip; /* vanishingly unlikely... */
792 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
793 PROT_READ,
794 pfn_type,
795 batch)) == 0 ){
796 ERR("map batch failed");
797 goto out;
798 }
800 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
801 ERR("get_pfn_type_batch failed");
802 goto out;
803 }
805 for ( j = 0; j < batch; j++ ){
806 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
807 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
808 continue;
809 }
811 if ( 0 && debug )
812 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
813 " sum= %08lx\n",
814 iter,
815 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
816 pfn_type[j],
817 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
818 csum_page(region_base + (PAGE_SIZE*j)));
820 /* canonicalise mfn->pfn */
821 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
822 }
824 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
825 ERR("Error when writing to state file (2)");
826 goto out;
827 }
829 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
830 sizeof(unsigned long)*j) {
831 ERR("Error when writing to state file (3)");
832 goto out;
833 }
835 /* entering this loop, pfn_type is now in pfns (Not mfns) */
836 for( j = 0; j < batch; j++ ){
837 /* write out pages in batch */
838 if( (pfn_type[j] & LTAB_MASK) == XTAB){
839 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
840 continue;
841 }
843 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
844 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
845 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
847 for ( k = 0;
848 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
849 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
850 1024);
851 k++ ){
852 unsigned long pfn;
854 if ( !(page[k] & _PAGE_PRESENT) )
855 continue;
857 mfn = page[k] >> PAGE_SHIFT;
858 pfn = live_mfn_to_pfn_table[mfn];
860 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
861 {
862 /* I don't think this should ever happen */
863 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
864 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
865 j, pfn_type[j], k,
866 page[k], mfn, live_mfn_to_pfn_table[mfn],
867 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
868 live_pfn_to_mfn_table[
869 live_mfn_to_pfn_table[mfn]] :
870 0xdeadbeef);
872 pfn = 0; /* be suspicious */
873 }
875 page[k] &= PAGE_SIZE - 1;
876 page[k] |= pfn << PAGE_SHIFT;
878 #if 0
879 fprintf(stderr,
880 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
881 "xpfn=%d\n",
882 pfn_type[j]>>28,
883 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
884 #endif
886 } /* end of page table rewrite for loop */
888 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
889 ERR("Error when writing to state file (4)");
890 goto out;
891 }
893 } /* end of it's a PT page */ else { /* normal page */
895 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
896 PAGE_SIZE) != PAGE_SIZE) {
897 ERR("Error when writing to state file (5)");
898 goto out;
899 }
900 }
901 } /* end of the write out for this batch */
903 sent_this_iter += batch;
905 } /* end of this while loop for this iteration */
907 munmap(region_base, batch*PAGE_SIZE);
909 skip:
911 total_sent += sent_this_iter;
913 DPRINTF("\r %d: sent %d, skipped %d, ",
914 iter, sent_this_iter, skip_this_iter );
916 if ( last_iter ) {
917 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
919 DPRINTF("Total pages sent= %d (%.2fx)\n",
920 total_sent, ((float)total_sent)/nr_pfns );
921 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
922 }
924 if (last_iter && debug){
925 int minusone = -1;
926 memset( to_send, 0xff, (nr_pfns+8)/8 );
927 debug = 0;
928 fprintf(stderr, "Entering debug resend-all mode\n");
930 /* send "-1" to put receiver into debug mode */
931 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
932 ERR("Error when writing to state file (6)");
933 goto out;
934 }
936 continue;
937 }
939 if ( last_iter ) break;
941 if ( live )
942 {
943 if (
944 ( ( sent_this_iter > sent_last_iter ) &&
945 (mbit_rate == MAX_MBIT_RATE ) ) ||
946 (iter >= max_iters) ||
947 (sent_this_iter+skip_this_iter < 50) ||
948 (total_sent > nr_pfns*max_factor) )
949 {
950 DPRINTF("Start last iteration\n");
951 last_iter = 1;
953 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
954 {
955 ERR("Domain appears not to have suspended");
956 goto out;
957 }
959 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
960 info.shared_info_frame,
961 ctxt.user_regs.eip, ctxt.user_regs.esi);
962 }
964 if ( xc_shadow_control( xc_handle, dom,
965 DOM0_SHADOW_CONTROL_OP_CLEAN,
966 to_send, nr_pfns, &stats ) != nr_pfns )
967 {
968 ERR("Error flushing shadow PT");
969 goto out;
970 }
972 sent_last_iter = sent_this_iter;
974 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
976 }
979 } /* end of while 1 */
981 DPRINTF("All memory is saved\n");
983 /* Success! */
984 rc = 0;
986 /* Zero terminate */
987 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
988 ERR("Error when writing to state file (6)");
989 goto out;
990 }
992 /* Send through a list of all the PFNs that were not in map at the close */
993 {
994 unsigned int i,j;
995 unsigned int pfntab[1024];
997 for ( i = 0, j = 0; i < nr_pfns; i++ )
998 {
999 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
1000 j++;
1003 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
1004 ERR("Error when writing to state file (6a)");
1005 goto out;
1008 for ( i = 0, j = 0; i < nr_pfns; )
1010 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
1012 pfntab[j++] = i;
1014 i++;
1015 if ( j == 1024 || i == nr_pfns )
1017 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
1018 sizeof(unsigned long)*j) {
1019 ERR("Error when writing to state file (6b)");
1020 goto out;
1022 j = 0;
1027 if (nr_pfns != p_srec->nr_pfns )
1029 ERR("Suspend record nr_pfns unexpected (%ld != %ld)",
1030 p_srec->nr_pfns, nr_pfns);
1031 goto out;
1034 /* Canonicalise the suspend-record frame number. */
1035 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1036 ERR("Suspend record is not in range of pseudophys map");
1037 goto out;
1040 /* Canonicalise each GDT frame number. */
1041 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1042 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1043 ERR("GDT frame is not in range of pseudophys map");
1044 goto out;
1048 /* Canonicalise the page table base pointer. */
1049 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1050 ERR("PT base is not in range of pseudophys map");
1051 goto out;
1053 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1054 PAGE_SHIFT;
1056 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1057 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1058 ERR("Error when writing to state file (1)");
1059 goto out;
1062 out:
1064 if(live_shinfo)
1065 munmap(live_shinfo, PAGE_SIZE);
1067 if(p_srec)
1068 munmap(p_srec, sizeof(*p_srec));
1070 if(live_pfn_to_mfn_frame_list)
1071 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1073 if(live_pfn_to_mfn_table)
1074 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1076 if(live_mfn_to_pfn_table)
1077 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1079 free(pfn_type);
1080 free(pfn_batch);
1081 free(to_send);
1082 free(to_fix);
1083 free(to_skip);
1085 DPRINTF("Save exit rc=%d\n",rc);
1086 return !!rc;