debuggers.hg — tools/libxc/xc_domain_restore.c @ 22906:700ac6445812

Now add KDB to the non-kdb tree.

author:  Mukesh Rathor
date:    Thu Feb 03 15:42:41 2011 -0800
parents: 8b6e7f43683e
/******************************************************************************
 * xc_domain_restore.c
 *
 * Restore the state of a guest session.
 *
 * Copyright (c) 2003, K A Fraser.
 * Copyright (c) 2006, Intel Corporation
 * Copyright (c) 2007, XenSource Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <stdlib.h>
#include <unistd.h>

#include "xg_private.h"
#include "xg_save_restore.h"
#include "xc_dom.h"

#include <xen/hvm/ioreq.h>
#include <xen/hvm/params.h>

struct restore_ctx {
    unsigned long max_mfn; /* max mfn of the current host machine */
    unsigned long hvirt_start; /* virtual starting address of the hypervisor */
    unsigned int pt_levels; /* #levels of page tables used by the current guest */
    unsigned long nr_pfns; /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
    xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
    xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
    xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
    int completed; /* Set when a consistent image is available */
    int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
    struct domain_info_context dinfo;
};

#define HEARTBEAT_MS 1000
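/*
 * Once a first consistent image has been received (ctx->completed), the
 * sender is expected to keep the stream alive; rdexact() below treats a
 * gap of more than HEARTBEAT_MS between bytes as a timed-out checkpoint
 * stream and fails with ETIMEDOUT.
 */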
#ifndef __MINIOS__
static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
                       int fd, void* buf, size_t size)
{
    size_t offset = 0;
    ssize_t len;
    struct timeval tv;
    fd_set rfds;

    while ( offset < size )
    {
        if ( ctx->completed ) {
            /* expect a heartbeat every HEARTBEAT_MS ms maximum */
            tv.tv_sec = HEARTBEAT_MS / 1000;
            tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000;

            FD_ZERO(&rfds);
            FD_SET(fd, &rfds);
            len = select(fd + 1, &rfds, NULL, NULL, &tv);
            if ( len == -1 && errno == EINTR )
                continue;
            if ( !FD_ISSET(fd, &rfds) ) {
                ERROR("read_exact_timed failed (select returned %zd)", len);
                errno = ETIMEDOUT;
                return -1;
            }
        }

        len = read(fd, buf + offset, size - offset);
        if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
            continue;
        if ( len == 0 ) {
            ERROR("0-length read");
            errno = 0;
        }
        if ( len <= 0 ) {
            ERROR("read_exact_timed failed (read rc: %zd, errno: %d)", len, errno);
            return -1;
        }
        offset += len;
    }

    return 0;
}

#define RDEXACT(fd,buf,size) rdexact(xch, ctx, fd, buf, size)
#else
#define RDEXACT read_exact
#endif
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
** This function inverts that operation, replacing the pfn values with
** the (now known) appropriate mfn values.
*/
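/*
** Concretely, a present 64-bit PTE that arrives in the stream as
**     (pfn << PAGE_SHIFT) | flags
** leaves this function as
**     (p2m[pfn] << PAGE_SHIFT) | flags
** allocating an MFN for the pfn first if one has not been populated yet.
*/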
static int uncanonicalize_pagetable(
    xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
{
    int i, pte_last, nr_mfns = 0;
    unsigned long pfn;
    uint64_t pte;
    struct domain_info_context *dinfo = &ctx->dinfo;

    pte_last = PAGE_SIZE / ((ctx->pt_levels == 2)? 4 : 8);

    /* First pass: work out how many (if any) MFNs we need to alloc */
    for ( i = 0; i < pte_last; i++ )
    {
        if ( ctx->pt_levels == 2 )
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if ( !(pte & _PAGE_PRESENT) )
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if ( pfn >= dinfo->p2m_size )
        {
            /* This "page table page" is probably not one; bail. */
            ERROR("Frame number in page table is out of range: "
                  "i=%d pfn=0x%lx p2m_size=%lu",
                  i, pfn, dinfo->p2m_size);
            return 0;
        }

        if ( ctx->p2m[pfn] == INVALID_P2M_ENTRY )
        {
            /* Have a 'valid' PFN without a matching MFN - need to alloc */
            ctx->p2m_batch[nr_mfns++] = pfn;
            ctx->p2m[pfn]--;
        }
    }

    /* Allocate the requisite number of mfns. */
    if ( nr_mfns &&
         (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
                                           ctx->p2m_batch) != 0) )
    {
        ERROR("Failed to allocate memory for batch!");
        errno = ENOMEM;
        return 0;
    }

    /* Second pass: uncanonicalize each present PTE */
    nr_mfns = 0;
    for ( i = 0; i < pte_last; i++ )
    {
        if ( ctx->pt_levels == 2 )
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if ( !(pte & _PAGE_PRESENT) )
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if ( ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
            ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++];

        pte &= ~MADDR_MASK_X86;
        pte |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT;

        if ( ctx->pt_levels == 2 )
            ((uint32_t *)page)[i] = (uint32_t)pte;
        else
            ((uint64_t *)page)[i] = (uint64_t)pte;
    }

    return 1;
}
/* Load the p2m frame list, plus potential extended info chunk */
static xen_pfn_t *load_p2m_frame_list(
    xc_interface *xch, struct restore_ctx *ctx,
    int io_fd, int *pae_extended_cr3, int *ext_vcpucontext,
    int *vcpuextstate, uint32_t *vcpuextstate_size)
{
    xen_pfn_t *p2m_frame_list;
    vcpu_guest_context_any_t ctxt;
    xen_pfn_t p2m_fl_zero;
    struct domain_info_context *dinfo = &ctx->dinfo;
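    /*
     * On-the-wire layout handled below:
     *   either  [ p2m_frame_list[0] ]
     *   or      [ ~0UL ][ tot_bytes ][ chunk ]...[ p2m_frame_list[0] ]
     * where each chunk is a 4-character signature ("vcpu", "extv" or
     * "xcnt"), a 4-byte length, and that many payload bytes.
     */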
    /* Read first entry of P2M list, or extended-info signature (~0UL). */
    if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(long)) )
    {
        PERROR("read extended-info signature failed");
        return NULL;
    }

    if ( p2m_fl_zero == ~0UL )
    {
        uint32_t tot_bytes;

        /* Next 4 bytes: total size of following extended info. */
        if ( RDEXACT(io_fd, &tot_bytes, sizeof(tot_bytes)) )
        {
            PERROR("read extended-info size failed");
            return NULL;
        }

        while ( tot_bytes )
        {
            uint32_t chunk_bytes;
            char     chunk_sig[4];

            /* 4-character chunk signature + 4-byte remaining chunk size. */
            if ( RDEXACT(io_fd, chunk_sig, sizeof(chunk_sig)) ||
                 RDEXACT(io_fd, &chunk_bytes, sizeof(chunk_bytes)) ||
                 (tot_bytes < (chunk_bytes + 8)) )
            {
                PERROR("read extended-info chunk signature failed");
                return NULL;
            }
            tot_bytes -= 8;

            /* VCPU context structure? */
            if ( !strncmp(chunk_sig, "vcpu", 4) )
            {
                /* Pick a guest word-size and PT depth from the ctxt size */
                if ( chunk_bytes == sizeof (ctxt.x32) )
                {
                    dinfo->guest_width = 4;
                    if ( ctx->pt_levels > 2 )
                        ctx->pt_levels = 3;
                }
                else if ( chunk_bytes == sizeof (ctxt.x64) )
                {
                    dinfo->guest_width = 8;
                    ctx->pt_levels = 4;
                }
                else
                {
                    ERROR("bad extended-info context size %d", chunk_bytes);
                    return NULL;
                }

                if ( RDEXACT(io_fd, &ctxt, chunk_bytes) )
                {
                    PERROR("read extended-info vcpu context failed");
                    return NULL;
                }
                tot_bytes -= chunk_bytes;
                chunk_bytes = 0;

                if ( GET_FIELD(&ctxt, vm_assist)
                     & (1UL << VMASST_TYPE_pae_extended_cr3) )
                    *pae_extended_cr3 = 1;
            }
            else if ( !strncmp(chunk_sig, "extv", 4) )
            {
                *ext_vcpucontext = 1;
            }
            else if ( !strncmp(chunk_sig, "xcnt", 4) )
            {
                *vcpuextstate = 1;
                RDEXACT(io_fd, vcpuextstate_size, sizeof(*vcpuextstate_size));
                tot_bytes -= chunk_bytes;
                chunk_bytes = 0;
            }

            /* Any remaining bytes of this chunk: read and discard. */
            while ( chunk_bytes )
            {
                unsigned long sz = MIN(chunk_bytes, sizeof(xen_pfn_t));
                if ( RDEXACT(io_fd, &p2m_fl_zero, sz) )
                {
                    PERROR("read-and-discard extended-info chunk bytes failed");
                    return NULL;
                }
                chunk_bytes -= sz;
                tot_bytes   -= sz;
            }
        }

        /* Now read the real first entry of P2M list. */
        if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
        {
            PERROR("read first entry of p2m_frame_list failed");
            return NULL;
        }
    }

    /* Now that we know the guest's word-size, can safely allocate
     * the p2m frame list */
    if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
    {
        ERROR("Couldn't allocate p2m_frame_list array");
        return NULL;
    }

    /* First entry has already been read. */
    p2m_frame_list[0] = p2m_fl_zero;
    if ( RDEXACT(io_fd, &p2m_frame_list[1],
                 (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
    {
        PERROR("read p2m_frame_list failed");
        return NULL;
    }

    return p2m_frame_list;
}
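/*
 * Buffered image tail. For a PV guest this is the unpopulated-pfn table,
 * the vcpu contexts and the shared-info page; for an HVM guest it is the
 * three "magic" pfns, the HVM context record and the device-model (QEMU)
 * state. The tail is buffered whole so that an incomplete checkpoint can
 * be discarded without touching the previously committed image.
 */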
typedef struct {
    int ishvm;
    union {
        struct tailbuf_pv {
            unsigned int pfncount;
            unsigned long* pfntab;
            unsigned int vcpucount;
            unsigned char* vcpubuf;
            unsigned char shared_info_page[PAGE_SIZE];
        } pv;
        struct tailbuf_hvm {
            uint64_t magicpfns[3];
            uint32_t hvmbufsize, reclen;
            uint8_t* hvmbuf;
            struct {
                uint32_t magic;
                uint32_t version;
                uint64_t len;
            } qemuhdr;
            uint32_t qemubufsize;
            uint8_t* qemubuf;
        } hvm;
    } u;
} tailbuf_t;
/* read stream until EOF, growing buffer as necessary */
static int compat_buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
                              int fd, struct tailbuf_hvm *buf)
{
    uint8_t *qbuf, *tmp;
    int blen = 0, dlen = 0;
    int rc;

    /* currently save records tend to be about 7K */
    blen = 8192;
    if ( !(qbuf = malloc(blen)) ) {
        ERROR("Error allocating QEMU buffer");
        return -1;
    }

    while ( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
        DPRINTF("Read %d bytes of QEMU data\n", rc);
        dlen += rc;

        if (dlen == blen) {
            DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
            blen += 4096;
            tmp = realloc(qbuf, blen);
            if ( !tmp ) {
                ERROR("Error growing QEMU buffer to %d bytes", blen);
                free(qbuf);
                return -1;
            }
            qbuf = tmp;
        }
    }

    if ( rc < 0 ) {
        ERROR("Error reading QEMU data");
        free(qbuf);
        return -1;
    }

    if ( memcmp(qbuf, "QEVM", 4) ) {
        ERROR("Invalid QEMU magic: 0x%08x", *(uint32_t *)qbuf);
        free(qbuf);
        return -1;
    }

    buf->qemubuf = qbuf;
    buf->qemubufsize = dlen;

    return 0;
}
static int buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
                       int fd, struct tailbuf_hvm *buf)
{
    uint32_t qlen;
    uint8_t *tmp;

    if ( RDEXACT(fd, &qlen, sizeof(qlen)) ) {
        PERROR("Error reading QEMU header length");
        return -1;
    }

    if ( qlen > buf->qemubufsize ) {
        if ( buf->qemubuf) {
            tmp = realloc(buf->qemubuf, qlen);
            if ( tmp )
                buf->qemubuf = tmp;
            else {
                ERROR("Error reallocating QEMU state buffer");
                return -1;
            }
        } else {
            buf->qemubuf = malloc(qlen);
            if ( !buf->qemubuf ) {
                ERROR("Error allocating QEMU state buffer");
                return -1;
            }
        }
    }
    buf->qemubufsize = qlen;

    if ( RDEXACT(fd, buf->qemubuf, buf->qemubufsize) ) {
        PERROR("Error reading QEMU state");
        return -1;
    }

    return 0;
}
static int dump_qemu(xc_interface *xch, uint32_t dom, struct tailbuf_hvm *buf)
{
    int saved_errno;
    char path[256];
    FILE *fp;

    sprintf(path, XC_DEVICE_MODEL_RESTORE_FILE".%u", dom);
    fp = fopen(path, "wb");
    if ( !fp )
        return -1;

    DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
    if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
        saved_errno = errno;
        fclose(fp);
        errno = saved_errno;
        return -1;
    }

    fclose(fp);

    return 0;
}
static int buffer_tail_hvm(xc_interface *xch, struct restore_ctx *ctx,
                           struct tailbuf_hvm *buf, int fd,
                           unsigned int max_vcpu_id, uint64_t vcpumap,
                           int ext_vcpucontext,
                           int vcpuextstate, uint32_t vcpuextstate_size)
{
    uint8_t *tmp;
    unsigned char qemusig[21];

    if ( RDEXACT(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
        PERROR("Error reading magic PFNs");
        return -1;
    }

    if ( RDEXACT(fd, &buf->reclen, sizeof(buf->reclen)) ) {
        PERROR("Error reading HVM params size");
        return -1;
    }

    if ( buf->reclen > buf->hvmbufsize ) {
        if ( buf->hvmbuf) {
            tmp = realloc(buf->hvmbuf, buf->reclen);
            if ( tmp ) {
                buf->hvmbuf = tmp;
                buf->hvmbufsize = buf->reclen;
            } else {
                ERROR("Error reallocating HVM param buffer");
                return -1;
            }
        } else {
            buf->hvmbuf = malloc(buf->reclen);
            if ( !buf->hvmbuf ) {
                ERROR("Error allocating HVM param buffer");
                return -1;
            }
            buf->hvmbufsize = buf->reclen;
        }
    }

    if ( RDEXACT(fd, buf->hvmbuf, buf->reclen) ) {
        PERROR("Error reading HVM params");
        return -1;
    }

    if ( RDEXACT(fd, qemusig, sizeof(qemusig)) ) {
        PERROR("Error reading QEMU signature");
        return -1;
    }

    /* The legacy live-migration QEMU record has no length information.
     * Short of reimplementing the QEMU parser, we're forced to just read
     * until EOF.
     *
     * We get around this by sending different signatures for the new
     * live-migration QEMU record and for Remus, both of which include a
     * length prefix.
     */
    if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
        return compat_buffer_qemu(xch, ctx, fd, buf);
    else if ( !memcmp(qemusig, "DeviceModelRecord0002", sizeof(qemusig)) ||
              !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
        return buffer_qemu(xch, ctx, fd, buf);

    qemusig[20] = '\0';
    ERROR("Invalid QEMU signature: %s", qemusig);
    return -1;
}
static int buffer_tail_pv(xc_interface *xch, struct restore_ctx *ctx,
                          struct tailbuf_pv *buf, int fd,
                          unsigned int max_vcpu_id, uint64_t vcpumap,
                          int ext_vcpucontext,
                          int vcpuextstate,
                          uint32_t vcpuextstate_size)
{
    unsigned int i;
    size_t pfnlen, vcpulen;
    struct domain_info_context *dinfo = &ctx->dinfo;

    /* TODO: handle changing pfntab and vcpu counts */
    /* PFN tab */
    if ( RDEXACT(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
         (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
    {
        PERROR("Error when reading pfn count");
        return -1;
    }
    pfnlen = sizeof(unsigned long) * buf->pfncount;
    if ( !(buf->pfntab) ) {
        if ( !(buf->pfntab = malloc(pfnlen)) ) {
            ERROR("Error allocating PFN tail buffer");
            return -1;
        }
    }
    // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
    if ( RDEXACT(fd, buf->pfntab, pfnlen) ) {
        PERROR("Error when reading pfntab");
        goto free_pfntab;
    }

    /* VCPU contexts */
    buf->vcpucount = 0;
    for (i = 0; i <= max_vcpu_id; i++) {
        // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap, i, (vcpumap % (1ULL << i)));
        if ( (!(vcpumap & (1ULL << i))) )
            continue;
        buf->vcpucount++;
    }
    // DPRINTF("VCPU count: %d\n", buf->vcpucount);
    vcpulen = ((dinfo->guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
               : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
    if ( ext_vcpucontext )
        vcpulen += 128 * buf->vcpucount;
    if ( vcpuextstate ) {
        vcpulen += vcpuextstate_size * buf->vcpucount;
    }

    if ( !(buf->vcpubuf) ) {
        if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
            ERROR("Error allocating VCPU ctxt tail buffer");
            goto free_pfntab;
        }
    }
    // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
    if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
        PERROR("Error when reading ctxt");
        goto free_vcpus;
    }

    /* load shared_info_page */
    // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
    if ( RDEXACT(fd, buf->shared_info_page, PAGE_SIZE) ) {
        PERROR("Error when reading shared info page");
        goto free_vcpus;
    }

    return 0;

 free_vcpus:
    if (buf->vcpubuf) {
        free (buf->vcpubuf);
        buf->vcpubuf = NULL;
    }
 free_pfntab:
    if (buf->pfntab) {
        free (buf->pfntab);
        buf->pfntab = NULL;
    }

    return -1;
}
static int buffer_tail(xc_interface *xch, struct restore_ctx *ctx,
                       tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
                       uint64_t vcpumap, int ext_vcpucontext,
                       int vcpuextstate, uint32_t vcpuextstate_size)
{
    if ( buf->ishvm )
        return buffer_tail_hvm(xch, ctx, &buf->u.hvm, fd, max_vcpu_id, vcpumap,
                               ext_vcpucontext, vcpuextstate,
                               vcpuextstate_size);
    else
        return buffer_tail_pv(xch, ctx, &buf->u.pv, fd, max_vcpu_id, vcpumap,
                              ext_vcpucontext, vcpuextstate,
                              vcpuextstate_size);
}

static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
{
    if ( buf->hvmbuf ) {
        free(buf->hvmbuf);
        buf->hvmbuf = NULL;
    }
    if ( buf->qemubuf ) {
        free(buf->qemubuf);
        buf->qemubuf = NULL;
    }
}

static void tailbuf_free_pv(struct tailbuf_pv *buf)
{
    if ( buf->vcpubuf ) {
        free(buf->vcpubuf);
        buf->vcpubuf = NULL;
    }
    if ( buf->pfntab ) {
        free(buf->pfntab);
        buf->pfntab = NULL;
    }
}

static void tailbuf_free(tailbuf_t *buf)
{
    if ( buf->ishvm )
        tailbuf_free_hvm(&buf->u.hvm);
    else
        tailbuf_free_pv(&buf->u.pv);
}
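/*
 * One buffered batch worth of guest pages, plus metadata picked up from
 * the XC_SAVE_ID_* records that the save stream interleaves with page
 * batches (vcpu map, HVM magic-page locations, ...). nr_physpages counts
 * the pages actually present in 'pages'; nr_pages additionally counts
 * XTAB (unbacked) entries, which carry no page data.
 */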
typedef struct {
    void* pages;
    /* pages is of length nr_physpages, pfn_types is of length nr_pages */
    unsigned int nr_physpages, nr_pages;

    /* Types of the pfns in the current region */
    unsigned long* pfn_types;

    int verify;

    int new_ctxt_format;
    int max_vcpu_id;
    uint64_t vcpumap;
    uint64_t identpt;
    uint64_t vm86_tss;
    uint64_t console_pfn;
    uint64_t acpi_ioport_location;
} pagebuf_t;

static int pagebuf_init(pagebuf_t* buf)
{
    memset(buf, 0, sizeof(*buf));
    return 0;
}

static void pagebuf_free(pagebuf_t* buf)
{
    if (buf->pages) {
        free(buf->pages);
        buf->pages = NULL;
    }
    if(buf->pfn_types) {
        free(buf->pfn_types);
        buf->pfn_types = NULL;
    }
}
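/*
 * Each batch on the wire starts with a 4-byte count: 0 terminates the
 * page stream, a negative value is an XC_SAVE_ID_* metadata marker
 * (handled in the switch below), and a positive value is followed by
 * 'count' pfn/type words and then the corresponding page data.
 */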
static int pagebuf_get_one(xc_interface *xch, struct restore_ctx *ctx,
                           pagebuf_t* buf, int fd, uint32_t dom)
{
    int count, countpages, oldcount, i;
    void* ptmp;

    if ( RDEXACT(fd, &count, sizeof(count)) )
    {
        PERROR("Error when reading batch size");
        return -1;
    }

    // DPRINTF("reading batch of %d pages\n", count);

    switch ( count )
    {
    case 0:
        // DPRINTF("Last batch read\n");
        return 0;

    case XC_SAVE_ID_ENABLE_VERIFY_MODE:
        DPRINTF("Entering page verify mode\n");
        buf->verify = 1;
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_VCPU_INFO:
        buf->new_ctxt_format = 1;
        if ( RDEXACT(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
             buf->max_vcpu_id >= 64 || RDEXACT(fd, &buf->vcpumap,
                                               sizeof(uint64_t)) ) {
            PERROR("Error when reading max_vcpu_id");
            return -1;
        }
        // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap);
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_HVM_IDENT_PT:
        /* Skip padding 4 bytes then read the EPT identity PT location. */
        if ( RDEXACT(fd, &buf->identpt, sizeof(uint32_t)) ||
             RDEXACT(fd, &buf->identpt, sizeof(uint64_t)) )
        {
            PERROR("error reading the address of the EPT identity map");
            return -1;
        }
        // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_HVM_VM86_TSS:
        /* Skip padding 4 bytes then read the vm86 TSS location. */
        if ( RDEXACT(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
             RDEXACT(fd, &buf->vm86_tss, sizeof(uint64_t)) )
        {
            PERROR("error reading the address of the vm86 TSS");
            return -1;
        }
        // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_TMEM:
        DPRINTF("xc_domain_restore start tmem\n");
        if ( xc_tmem_restore(xch, dom, fd) ) {
            PERROR("error reading/restoring tmem");
            return -1;
        }
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_TMEM_EXTRA:
        if ( xc_tmem_restore_extra(xch, dom, fd) ) {
            PERROR("error reading/restoring tmem extra");
            return -1;
        }
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_TSC_INFO:
    {
        uint32_t tsc_mode, khz, incarn;
        uint64_t nsec;
        if ( RDEXACT(fd, &tsc_mode, sizeof(uint32_t)) ||
             RDEXACT(fd, &nsec, sizeof(uint64_t)) ||
             RDEXACT(fd, &khz, sizeof(uint32_t)) ||
             RDEXACT(fd, &incarn, sizeof(uint32_t)) ||
             xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
            PERROR("error reading/restoring tsc info");
            return -1;
        }
        return pagebuf_get_one(xch, ctx, buf, fd, dom);
    }

    case XC_SAVE_ID_HVM_CONSOLE_PFN :
        /* Skip padding 4 bytes then read the console pfn location. */
        if ( RDEXACT(fd, &buf->console_pfn, sizeof(uint32_t)) ||
             RDEXACT(fd, &buf->console_pfn, sizeof(uint64_t)) )
        {
            PERROR("error reading the address of the console pfn");
            return -1;
        }
        // DPRINTF("console pfn location: %llx\n", buf->console_pfn);
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_LAST_CHECKPOINT:
        ctx->last_checkpoint = 1;
        // DPRINTF("last checkpoint indication received");
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    case XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION:
        /* Skip padding 4 bytes then read the acpi ioport location. */
        if ( RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint32_t)) ||
             RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint64_t)) )
        {
            PERROR("error reading the acpi ioport location");
            return -1;
        }
        return pagebuf_get_one(xch, ctx, buf, fd, dom);

    default:
        if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
            ERROR("Max batch size exceeded (%d). Giving up.", count);
            errno = EMSGSIZE;
            return -1;
        }
        break;
    }

    oldcount = buf->nr_pages;
    buf->nr_pages += count;
    if (!buf->pfn_types) {
        if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
            ERROR("Could not allocate PFN type buffer");
            return -1;
        }
    } else {
        if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
            ERROR("Could not reallocate PFN type buffer");
            return -1;
        }
        buf->pfn_types = ptmp;
    }
    if ( RDEXACT(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
        PERROR("Error when reading region pfn types");
        return -1;
    }

    countpages = count;
    for (i = oldcount; i < buf->nr_pages; ++i)
        if ((buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB)
            --countpages;

    if (!countpages)
        return count;

    oldcount = buf->nr_physpages;
    buf->nr_physpages += countpages;
    if (!buf->pages) {
        if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
            ERROR("Could not allocate page buffer");
            return -1;
        }
    } else {
        if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
            ERROR("Could not reallocate page buffer");
            return -1;
        }
        buf->pages = ptmp;
    }
    if ( RDEXACT(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
        PERROR("Error when reading pages");
        return -1;
    }

    return count;
}
static int pagebuf_get(xc_interface *xch, struct restore_ctx *ctx,
                       pagebuf_t* buf, int fd, uint32_t dom)
{
    int rc;

    buf->nr_physpages = buf->nr_pages = 0;

    do {
        rc = pagebuf_get_one(xch, ctx, buf, fd, dom);
    } while (rc > 0);

    if (rc < 0)
        pagebuf_free(buf);

    return rc;
}
static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
                       xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
                       unsigned int hvm, struct xc_mmu* mmu,
                       pagebuf_t* pagebuf, int curbatch)
{
    int i, j, curpage, nr_mfns;
    /* used by debug verify code */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
    /* Our mapping of the current region (batch) */
    char *region_base;
    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *page = NULL;
    int nraces = 0;
    struct domain_info_context *dinfo = &ctx->dinfo;
    int* pfn_err = NULL;
    int rc = -1;

    unsigned long mfn, pfn, pagetype;

    j = pagebuf->nr_pages - curbatch;
    if (j > MAX_BATCH_SIZE)
        j = MAX_BATCH_SIZE;

    /* First pass for this batch: work out how much memory to alloc */
    nr_mfns = 0;
    for ( i = 0; i < j; i++ )
    {
        unsigned long pfn, pagetype;
        pfn      = pagebuf->pfn_types[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
        pagetype = pagebuf->pfn_types[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

        if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
             (ctx->p2m[pfn] == INVALID_P2M_ENTRY) )
        {
            /* Have a live PFN which hasn't had an MFN allocated */
            ctx->p2m_batch[nr_mfns++] = pfn;
            ctx->p2m[pfn]--;
        }
    }

    /* Now allocate a bunch of mfns for this batch */
    if ( nr_mfns &&
         (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0,
                                           0, ctx->p2m_batch) != 0) )
    {
        ERROR("Failed to allocate memory for batch!");
        errno = ENOMEM;
        return -1;
    }

    /* Second pass for this batch: update p2m[] and region_mfn[] */
    nr_mfns = 0;
    for ( i = 0; i < j; i++ )
    {
        unsigned long pfn, pagetype;
        pfn      = pagebuf->pfn_types[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
        pagetype = pagebuf->pfn_types[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

        if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
            region_mfn[i] = ~0UL; /* map will fail but we don't care */
        else
        {
            if ( ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
            {
                /* We just allocated a new mfn above; update p2m */
                ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++];
                ctx->nr_pfns++;
            }

            /* setup region_mfn[] for batch map.
             * For HVM guests, this interface takes PFNs, not MFNs */
            region_mfn[i] = hvm ? pfn : ctx->p2m[pfn];
        }
    }

    /* Map relevant mfns */
    pfn_err = calloc(j, sizeof(*pfn_err));
    region_base = xc_map_foreign_bulk(
        xch, dom, PROT_WRITE, region_mfn, pfn_err, j);

    if ( region_base == NULL )
    {
        PERROR("map batch failed");
        free(pfn_err);
        return -1;
    }

    for ( i = 0, curpage = -1; i < j; i++ )
    {
        pfn      = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
        pagetype = pagebuf->pfn_types[i + curbatch] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

        if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
            /* a bogus/unmapped page: skip it */
            continue;

        if (pfn_err[i])
        {
            ERROR("unexpected PFN mapping failure");
            goto err_mapped;
        }

        ++curpage;

        if ( pfn > dinfo->p2m_size )
        {
            ERROR("pfn out of range");
            goto err_mapped;
        }

        pfn_type[pfn] = pagetype;

        mfn = ctx->p2m[pfn];

        /* In verify mode, we use a copy; otherwise we work in place */
        page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);

        memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);

        pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

        if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
             (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
        {
            /*
            ** A page table page - need to 'uncanonicalize' it, i.e.
            ** replace all the references to pfns with the corresponding
            ** mfns for the new domain.
            **
            ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
            ** so we may need to update the p2m after the main loop.
            ** Hence we defer canonicalization of L1s until then.
            */
            if ((ctx->pt_levels != 3) ||
                pae_extended_cr3 ||
                (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {

                if (!uncanonicalize_pagetable(xch, dom, ctx, page)) {
                    /*
                    ** Failing to uncanonicalize a page table can be ok
                    ** under live migration since the pages type may have
                    ** changed by now (and we'll get an update later).
                    */
                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
                            pagetype >> 28, pfn, mfn);
                    nraces++;
                    continue;
                }
            }
        }
        else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
        {
            ERROR("Bogus page type %lx page table is out of range: "
                  "i=%d p2m_size=%lu", pagetype, i, dinfo->p2m_size);
            goto err_mapped;
        }

        if ( pagebuf->verify )
        {
            int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
            if ( res )
            {
                int v;

                DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
                        "actualcs=%08lx\n", pfn, pagebuf->pfn_types[pfn],
                        csum_page(region_base + (i + curbatch)*PAGE_SIZE),
                        csum_page(buf));

                for ( v = 0; v < 4; v++ )
                {
                    unsigned long *p = (unsigned long *)
                        (region_base + i*PAGE_SIZE);
                    if ( buf[v] != p[v] )
                        DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
                }
            }
        }

        if ( !hvm &&
             xc_add_mmu_update(xch, mmu,
                               (((unsigned long long)mfn) << PAGE_SHIFT)
                               | MMU_MACHPHYS_UPDATE, pfn) )
        {
            PERROR("failed machphys update mfn=%lx pfn=%lx", mfn, pfn);
            goto err_mapped;
        }
    } /* end of 'batch' for loop */

    rc = nraces;

 err_mapped:
    munmap(region_base, j*PAGE_SIZE);
    free(pfn_err);

    return rc;
}
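/*
 * Top-level restore: read the image header and p2m frame list (PV),
 * stream page batches into freshly populated frames, then apply the
 * buffered tail - pin page tables and rebuild vcpus/shared info for PV,
 * or set the HVM params and hand the device-model record to QEMU for
 * HVM. Under Remus-style checkpointing, complete checkpoints keep being
 * buffered until the stream ends, then the last consistent one is
 * committed.
 */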
int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
                      unsigned int store_evtchn, unsigned long *store_mfn,
                      unsigned int console_evtchn, unsigned long *console_mfn,
                      unsigned int hvm, unsigned int pae, int superpages)
{
    DECLARE_DOMCTL;
    int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
    int vcpuextstate = 0;
    uint32_t vcpuextstate_size = 0;
    unsigned long mfn, pfn;
    unsigned int prev_pc;
    int nraces = 0;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_any_t *old_shared_info =
        (shared_info_any_t *)shared_info_page;
    shared_info_any_t *new_shared_info;

    /* A copy of the CPU context of the guest. */
    DECLARE_HYPERCALL_BUFFER(vcpu_guest_context_any_t, ctxt);

    /* A copy of the CPU eXtended States of the guest. */
    DECLARE_HYPERCALL_BUFFER(void, buffer);

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    xen_pfn_t *region_mfn = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_any_t *start_info;

    /* Our mapping of the current region (batch) */
    char *region_base;

    struct xc_mmu *mmu = NULL;

    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins;

    uint64_t vcpumap = 1ULL;
    unsigned int max_vcpu_id = 0;
    int new_ctxt_format = 0;

    pagebuf_t pagebuf;
    tailbuf_t tailbuf, tmptail;
    void* vcpup;
    uint64_t console_pfn = 0;

    int orig_io_fd_flags;

    static struct restore_ctx _ctx = {
        .live_p2m = NULL,
        .p2m = NULL,
    };
    static struct restore_ctx *ctx = &_ctx;
    struct domain_info_context *dinfo = &ctx->dinfo;

    pagebuf_init(&pagebuf);
    memset(&tailbuf, 0, sizeof(tailbuf));
    tailbuf.ishvm = hvm;

    /* For info only */
    ctx->nr_pfns = 0;

    if ( superpages )
        return 1;

    ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));

    if ( ctxt == NULL )
    {
        PERROR("Unable to allocate VCPU ctxt buffer");
        return 1;
    }

    if ( (orig_io_fd_flags = fcntl(io_fd, F_GETFL, 0)) < 0 ) {
        PERROR("unable to read IO FD flags");
        goto out;
    }

    if ( read_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
    {
        PERROR("read: p2m_size");
        goto out;
    }
    DPRINTF("xc_domain_restore start: p2m_size = %lx\n", dinfo->p2m_size);
    if ( !get_platform_info(xch, dom,
                            &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
    {
        ERROR("Unable to get platform info.");
        return 1;
    }

    /* The *current* word size of the guest isn't very interesting; for now
     * assume the guest will be the same as we are.  We'll fix that later
     * if we discover otherwise. */
    dinfo->guest_width = sizeof(unsigned long);
    ctx->pt_levels = (dinfo->guest_width == 8) ? 4 : (ctx->pt_levels == 2) ? 2 : 3;

    if ( !hvm )
    {
        /* Load the p2m frame list, plus potential extended info chunk */
        p2m_frame_list = load_p2m_frame_list(xch, ctx,
            io_fd, &pae_extended_cr3, &ext_vcpucontext,
            &vcpuextstate, &vcpuextstate_size);

        if ( !p2m_frame_list )
            goto out;

        /* Now that we know the word size, tell Xen about it */
        memset(&domctl, 0, sizeof(domctl));
        domctl.domain = dom;
        domctl.cmd    = XEN_DOMCTL_set_address_size;
        domctl.u.address_size.size = dinfo->guest_width * 8;
        frc = do_domctl(xch, &domctl);
        if ( frc != 0 )
        {
            PERROR("Unable to set guest address size.");
            goto out;
        }
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    ctx->p2m = calloc(dinfo->p2m_size, sizeof(xen_pfn_t));
    pfn_type = calloc(dinfo->p2m_size, sizeof(unsigned long));

    region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
    ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));

    if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
         (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
    {
        ERROR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    memset(region_mfn, 0,
           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
    memset(ctx->p2m_batch, 0,
           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));

    /* Get the domain's shared-info frame. */
    domctl.cmd = XEN_DOMCTL_getdomaininfo;
    domctl.domain = (domid_t)dom;
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;

    /* Mark all PFNs as invalid; we allocate on demand */
    for ( pfn = 0; pfn < dinfo->p2m_size; pfn++ )
        ctx->p2m[pfn] = INVALID_P2M_ENTRY;

    mmu = xc_alloc_mmu_updates(xch, dom);
    if ( mmu == NULL )
    {
        PERROR("Could not initialise for MMU updates");
        goto out;
    }

    xc_report_progress_start(xch, "Reloading memory pages", dinfo->p2m_size);

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
    prev_pc = 0;

    n = m = 0;
 loadpages:
    for ( ; ; )
    {
        int j, curbatch;

        xc_report_progress_step(xch, n, dinfo->p2m_size);
        if ( !ctx->completed ) {
            pagebuf.nr_physpages = pagebuf.nr_pages = 0;
            if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) {
                PERROR("Error when reading batch");
                goto out;
            }
        }
        j = pagebuf.nr_pages;

        DBGPRINTF("batch %d\n",j);

        if ( j == 0 ) {
            /* catch vcpu updates */
            if (pagebuf.new_ctxt_format) {
                vcpumap = pagebuf.vcpumap;
                max_vcpu_id = pagebuf.max_vcpu_id;
            }
            /* should this be deferred? does it change? */
            if ( pagebuf.identpt )
                xc_set_hvm_param(xch, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
            if ( pagebuf.vm86_tss )
                xc_set_hvm_param(xch, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
            if ( pagebuf.console_pfn )
                console_pfn = pagebuf.console_pfn;
            break;  /* our work here is done */
        }

        /* break pagebuf into batches */
        curbatch = 0;
        while ( curbatch < j ) {
            int brc;

            brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
                              pae_extended_cr3, hvm, mmu, &pagebuf, curbatch);
            if ( brc < 0 )
                goto out;

            nraces += brc;

            curbatch += MAX_BATCH_SIZE;
        }

        pagebuf.nr_physpages = pagebuf.nr_pages = 0;

        n += j; /* crude stats */

        /*
         * Discard cache for portion of file read so far up to last
         * page boundary every 16MB or so.
         */
        m += j;
        if ( m > MAX_PAGECACHE_USAGE )
        {
            discard_file_cache(xch, io_fd, 0 /* no flush */);
            m = 0;
        }
    }

    /*
     * Ensure we flush all machphys updates before potential PAE-specific
     * reallocations below.
     */
    if ( !hvm && xc_flush_mmu_updates(xch, mmu) )
    {
        PERROR("Error doing flush_mmu_updates()");
        goto out;
    }

    // DPRINTF("Received all pages (%d races)\n", nraces);

    if ( !ctx->completed ) {

        if ( buffer_tail(xch, ctx, &tailbuf, io_fd, max_vcpu_id, vcpumap,
                         ext_vcpucontext, vcpuextstate, vcpuextstate_size) < 0 ) {
            ERROR ("error buffering image tail");
            goto out;
        }

        ctx->completed = 1;

        /*
         * If more checkpoints are expected then shift into
         * nonblocking mode for the remainder.
         */
        if ( !ctx->last_checkpoint )
            fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK);
    }

    if (pagebuf.acpi_ioport_location == 1) {
        DBGPRINTF("Use new firmware ioport from the checkpoint\n");
        xc_set_hvm_param(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, 1);
    } else if (pagebuf.acpi_ioport_location == 0) {
        DBGPRINTF("Use old firmware ioport from the checkpoint\n");
    } else {
        ERROR("Error, unknown acpi ioport location (%i)", pagebuf.acpi_ioport_location);
    }

    if ( ctx->last_checkpoint )
    {
        // DPRINTF("Last checkpoint, finishing\n");
        goto finish;
    }

    // DPRINTF("Buffered checkpoint\n");

    if ( pagebuf_get(xch, ctx, &pagebuf, io_fd, dom) ) {
        PERROR("error when buffering batch, finishing");
        goto finish;
    }
    memset(&tmptail, 0, sizeof(tmptail));
    tmptail.ishvm = hvm;
    if ( buffer_tail(xch, ctx, &tmptail, io_fd, max_vcpu_id, vcpumap,
                     ext_vcpucontext, vcpuextstate, vcpuextstate_size) < 0 ) {
        ERROR ("error buffering image tail, finishing");
        goto finish;
    }
    tailbuf_free(&tailbuf);
    memcpy(&tailbuf, &tmptail, sizeof(tailbuf));

    goto loadpages;
 finish:
    if ( hvm )
        goto finish_hvm;

    if ( (ctx->pt_levels == 3) && !pae_extended_cr3 )
    {
        /*
        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
        ** is a little awkward and involves (a) finding all such PGDs and
        ** replacing them with 'lowmem' versions; (b) updating the p2m[]
        ** with the new info; and (c) canonicalizing all the L1s using the
        ** (potentially updated) p2m[].
        **
        ** This is relatively slow (and currently involves two passes through
        ** the pfn_type[] array), but at least seems to be correct. May wish
        ** to consider more complex approaches to optimize this later.
        */

        int j, k;

        /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
        for ( i = 0; i < dinfo->p2m_size; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L3TAB) &&
                 (ctx->p2m[i] > 0xfffffUL) )
            {
                unsigned long new_mfn;
                uint64_t l3ptes[4];
                uint64_t *l3tab;

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xch, dom, PAGE_SIZE,
                                         PROT_READ, ctx->p2m[i]);

                for ( j = 0; j < 4; j++ )
                    l3ptes[j] = l3tab[j];

                munmap(l3tab, PAGE_SIZE);

                new_mfn = xc_make_page_below_4G(xch, dom, ctx->p2m[i]);
                if ( !new_mfn )
                {
                    PERROR("Couldn't get a page below 4GB :-(");
                    goto out;
                }

                ctx->p2m[i] = new_mfn;
                if ( xc_add_mmu_update(xch, mmu,
                                       (((unsigned long long)new_mfn)
                                        << PAGE_SHIFT) |
                                       MMU_MACHPHYS_UPDATE, i) )
                {
                    PERROR("Couldn't m2p on PAE root pgdir");
                    goto out;
                }

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xch, dom, PAGE_SIZE,
                                         PROT_READ | PROT_WRITE, ctx->p2m[i]);

                for ( j = 0; j < 4; j++ )
                    l3tab[j] = l3ptes[j];

                munmap(l3tab, PAGE_SIZE);
            }
        }

        /* Second pass: find all L1TABs and uncanonicalize them */
        j = 0;

        for ( i = 0; i < dinfo->p2m_size; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L1TAB) )
            {
                region_mfn[j] = ctx->p2m[i];
                j++;
            }

            if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) )
            {
                region_base = xc_map_foreign_pages(
                    xch, dom, PROT_READ | PROT_WRITE, region_mfn, j);
                if ( region_base == NULL )
                {
                    PERROR("map batch failed");
                    goto out;
                }

                for ( k = 0; k < j; k++ )
                {
                    if ( !uncanonicalize_pagetable(
                        xch, dom, ctx,
                        region_base + k*PAGE_SIZE) )
                    {
                        ERROR("failed uncanonicalize pt!");
                        goto out;
                    }
                }

                munmap(region_base, j*PAGE_SIZE);
                j = 0;
            }
        }

        if ( xc_flush_mmu_updates(xch, mmu) )
        {
            PERROR("Error doing xc_flush_mmu_updates()");
            goto out;
        }
    }
    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    nr_pins = 0;
    for ( i = 0; i < dinfo->p2m_size; i++ )
    {
        if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
            continue;

        switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
        {
        case XEN_DOMCTL_PFINFO_L1TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L2TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L3TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L4TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
            break;

        default:
            continue;
        }

        pin[nr_pins].arg1.mfn = ctx->p2m[i];
        nr_pins++;

        /* Batch full? Then flush. */
        if ( nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xch, pin, nr_pins, dom) < 0 )
            {
                PERROR("Failed to pin batch of %d page tables", nr_pins);
                goto out;
            }
            nr_pins = 0;
        }
    }

    /* Flush final partial batch. */
    if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, dom) < 0) )
    {
        PERROR("Failed to pin batch of %d page tables", nr_pins);
        goto out;
    }

    DPRINTF("Memory reloaded (%ld pages)\n", ctx->nr_pfns);
    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        int nr_frees = 0;

        for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
        {
            unsigned long pfn = tailbuf.u.pv.pfntab[i];

            if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY )
            {
                /* pfn is not in physmap now, but was at some point during
                   the save/migration process - need to free it */
                tailbuf.u.pv.pfntab[nr_frees++] = ctx->p2m[pfn];
                ctx->p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
            }
        }

        if ( nr_frees > 0 )
        {
            if ( (frc = xc_domain_decrease_reservation(xch, dom, nr_frees, 0, tailbuf.u.pv.pfntab)) != nr_frees )
            {
                PERROR("Could not decrease reservation : %d", frc);
                goto out;
            }
            else
                DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
        }
    }
    vcpup = tailbuf.u.pv.vcpubuf;
    for ( i = 0; i <= max_vcpu_id; i++ )
    {
        if ( !(vcpumap & (1ULL << i)) )
            continue;

        memcpy(ctxt, vcpup, ((dinfo->guest_width == 8) ? sizeof(ctxt->x64)
                             : sizeof(ctxt->x32)));
        vcpup += (dinfo->guest_width == 8) ? sizeof(ctxt->x64) : sizeof(ctxt->x32);

        DPRINTF("read VCPU %d\n", i);

        if ( !new_ctxt_format )
            SET_FIELD(ctxt, flags, GET_FIELD(ctxt, flags) | VGCF_online);

        if ( i == 0 )
        {
            /*
             * Uncanonicalise the suspend-record frame number and poke
             * resume record.
             */
            pfn = GET_FIELD(ctxt, user_regs.edx);
            if ( (pfn >= dinfo->p2m_size) ||
                 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
            {
                ERROR("Suspend record frame number is bad");
                goto out;
            }
            mfn = ctx->p2m[pfn];
            SET_FIELD(ctxt, user_regs.edx, mfn);
            start_info = xc_map_foreign_range(
                xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
            SET_FIELD(start_info, nr_pages, dinfo->p2m_size);
            SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT);
            SET_FIELD(start_info, flags, 0);
            *store_mfn = ctx->p2m[GET_FIELD(start_info, store_mfn)];
            SET_FIELD(start_info, store_mfn, *store_mfn);
            SET_FIELD(start_info, store_evtchn, store_evtchn);
            *console_mfn = ctx->p2m[GET_FIELD(start_info, console.domU.mfn)];
            SET_FIELD(start_info, console.domU.mfn, *console_mfn);
            SET_FIELD(start_info, console.domU.evtchn, console_evtchn);
            munmap(start_info, PAGE_SIZE);
        }
        /* Uncanonicalise each GDT frame number. */
        if ( GET_FIELD(ctxt, gdt_ents) > 8192 )
        {
            ERROR("GDT entry count out of range");
            goto out;
        }

        for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents); j++ )
        {
            pfn = GET_FIELD(ctxt, gdt_frames[j]);
            if ( (pfn >= dinfo->p2m_size) ||
                 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
            {
                ERROR("GDT frame number %i (0x%lx) is bad",
                      j, (unsigned long)pfn);
                goto out;
            }
            SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn]);
        }
        /* Uncanonicalise the page table base pointer. */
        pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3]));

        if ( pfn >= dinfo->p2m_size )
        {
            ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
                  pfn, dinfo->p2m_size, pfn_type[pfn]);
            goto out;
        }

        if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
             ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
        {
            ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                  pfn, dinfo->p2m_size, pfn_type[pfn],
                  (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
            goto out;
        }
        SET_FIELD(ctxt, ctrlreg[3], FOLD_CR3(ctx->p2m[pfn]));

        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
        if ( (ctx->pt_levels == 4) && (ctxt->x64.ctrlreg[1] & 1) )
        {
            pfn = UNFOLD_CR3(ctxt->x64.ctrlreg[1] & ~1);
            if ( pfn >= dinfo->p2m_size )
            {
                ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
                      pfn, dinfo->p2m_size);
                goto out;
            }
            if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
                 ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
            {
                ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                      pfn, dinfo->p2m_size, pfn_type[pfn],
                      (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
                goto out;
            }
            ctxt->x64.ctrlreg[1] = FOLD_CR3(ctx->p2m[pfn]);
        }
        domctl.cmd = XEN_DOMCTL_setvcpucontext;
        domctl.domain = (domid_t)dom;
        domctl.u.vcpucontext.vcpu = i;
        set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
        frc = xc_domctl(xch, &domctl);
        if ( frc != 0 )
        {
            PERROR("Couldn't build vcpu%d", i);
            goto out;
        }

        if ( !ext_vcpucontext )
            goto vcpu_ext_state_restore;
        memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
        vcpup += 128;
        domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
        domctl.domain = dom;
        frc = xc_domctl(xch, &domctl);
        if ( frc != 0 )
        {
            PERROR("Couldn't set extended vcpu%d info", i);
            goto out;
        }

 vcpu_ext_state_restore:
        if ( !vcpuextstate )
            continue;

        memcpy(&domctl.u.vcpuextstate.xfeature_mask, vcpup,
               sizeof(domctl.u.vcpuextstate.xfeature_mask));
        vcpup += sizeof(domctl.u.vcpuextstate.xfeature_mask);
        memcpy(&domctl.u.vcpuextstate.size, vcpup,
               sizeof(domctl.u.vcpuextstate.size));
        vcpup += sizeof(domctl.u.vcpuextstate.size);

        buffer = xc_hypercall_buffer_alloc(xch, buffer,
                                           domctl.u.vcpuextstate.size);
        if ( !buffer )
        {
            PERROR("Could not allocate buffer to restore eXtended States");
            goto out;
        }
        memcpy(buffer, vcpup, domctl.u.vcpuextstate.size);
        vcpup += domctl.u.vcpuextstate.size;

        domctl.cmd = XEN_DOMCTL_setvcpuextstate;
        domctl.domain = dom;
        domctl.u.vcpuextstate.vcpu = i;
        set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
        frc = xc_domctl(xch, &domctl);
        if ( frc != 0 )
        {
            PERROR("Couldn't set eXtended States for vcpu%d", i);
            goto out;
        }
        xc_hypercall_buffer_free(xch, buffer);
    }
    memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);

    DPRINTF("Completed checkpoint load\n");

    /* Restore contents of shared-info page. No checking needed. */
    new_shared_info = xc_map_foreign_range(
        xch, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);

    /* restore saved vcpu_info and arch specific info */
    MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info);
    MEMCPY_FIELD(new_shared_info, old_shared_info, arch);

    /* clear any pending events and the selector */
    MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0);
    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
        SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0);

    /* mask event channels */
    MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff);

    /* leave wallclock time. set by hypervisor */
    munmap(new_shared_info, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < P2M_FL_ENTRIES; i++ )
    {
        pfn = p2m_frame_list[i];
        if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
            goto out;
        }
        p2m_frame_list[i] = ctx->p2m[pfn];
    }

    /* Copy the P2M we've constructed to the 'live' P2M */
    if ( !(ctx->live_p2m = xc_map_foreign_pages(xch, dom, PROT_WRITE,
                                                p2m_frame_list, P2M_FL_ENTRIES)) )
    {
        PERROR("Couldn't map p2m table");
        goto out;
    }

    /* If the domain we're restoring has a different word size to ours,
     * we need to adjust the live_p2m assignment appropriately */
    if ( dinfo->guest_width > sizeof (xen_pfn_t) )
        for ( i = dinfo->p2m_size - 1; i >= 0; i-- )
            ((int64_t *)ctx->live_p2m)[i] = (long)ctx->p2m[i];
    else if ( dinfo->guest_width < sizeof (xen_pfn_t) )
        for ( i = 0; i < dinfo->p2m_size; i++ )
            ((uint32_t *)ctx->live_p2m)[i] = ctx->p2m[i];
    else
        memcpy(ctx->live_p2m, ctx->p2m, dinfo->p2m_size * sizeof(xen_pfn_t));
    munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);

    DPRINTF("Domain ready to be built.\n");
    rc = 0;
    goto out;
 finish_hvm:
    /* Dump the QEMU state to a state file for QEMU to load */
    if ( dump_qemu(xch, dom, &tailbuf.u.hvm) ) {
        PERROR("Error dumping QEMU state to file");
        goto out;
    }

    /* These comms pages need to be zeroed at the start of day */
    if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) ||
         xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) ||
         xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) )
    {
        PERROR("error zeroing magic pages");
        goto out;
    }

    if ( (frc = xc_set_hvm_param(xch, dom,
                                 HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
         || (frc = xc_set_hvm_param(xch, dom,
                                    HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
         || (frc = xc_set_hvm_param(xch, dom,
                                    HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
         || (frc = xc_set_hvm_param(xch, dom,
                                    HVM_PARAM_PAE_ENABLED, pae))
         || (frc = xc_set_hvm_param(xch, dom,
                                    HVM_PARAM_STORE_EVTCHN,
                                    store_evtchn)) )
    {
        PERROR("error setting HVM params: %i", frc);
        goto out;
    }
    *store_mfn = tailbuf.u.hvm.magicpfns[2];

    if ( console_pfn ) {
        if ( xc_clear_domain_page(xch, dom, console_pfn) ) {
            PERROR("error zeroing console page");
            goto out;
        }
        if ( (frc = xc_set_hvm_param(xch, dom,
                                     HVM_PARAM_CONSOLE_PFN, console_pfn)) ) {
            PERROR("error setting HVM param: %i", frc);
            goto out;
        }
        *console_mfn = console_pfn;
    }

    frc = xc_domain_hvm_setcontext(xch, dom, tailbuf.u.hvm.hvmbuf,
                                   tailbuf.u.hvm.reclen);
    if ( frc )
    {
        PERROR("error setting the HVM context");
        goto out;
    }

    /* HVM success! */
    rc = 0;

 out:
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xch, dom);
    xc_hypercall_buffer_free(xch, ctxt);
    free(mmu);
    free(ctx->p2m);
    free(pfn_type);
    tailbuf_free(&tailbuf);

    /* discard cache for save file */
    discard_file_cache(xch, io_fd, 1 /*flush*/);

    fcntl(io_fd, F_SETFL, orig_io_fd_flags);

    DPRINTF("Restore exit with rc=%d\n", rc);

    return rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */