debuggers.hg

annotate tools/libxc/xc_domain_restore.c @ 20983:a948403c8f99

Remus: increase failover timeout from 500ms to 1s

500ms is aggressive enough to trigger split-brain under fairly
ordinary workloads, particularly for HVM. The long-term fix is to
integrate with a real HA monitor like Linux-HA.

Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Feb 12 09:23:10 2010 +0000 (2010-02-12)
parents fbe8f32fa257
children 779c0ef9682c
rev   line source
mjw@1661 1 /******************************************************************************
Tim@14782 2 * xc_domain_restore.c
kaf24@9698 3 *
Tim@14782 4 * Restore the state of a guest session.
kaf24@9698 5 *
mjw@1661 6 * Copyright (c) 2003, K A Fraser.
Tim@14782 7 * Copyright (c) 2006, Intel Corporation
Tim@14782 8 * Copyright (c) 2007, XenSource Inc.
Tim@14782 9 *
Tim@14782 10 * This program is free software; you can redistribute it and/or modify it
Tim@14782 11 * under the terms and conditions of the GNU General Public License,
Tim@14782 12 * version 2, as published by the Free Software Foundation.
Tim@14782 13 *
Tim@14782 14 * This program is distributed in the hope it will be useful, but WITHOUT
Tim@14782 15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
Tim@14782 16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
Tim@14782 17 * more details.
Tim@14782 18 *
Tim@14782 19 * You should have received a copy of the GNU General Public License along with
Tim@14782 20 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Tim@14782 21 * Place - Suite 330, Boston, MA 02111-1307 USA.
Tim@14782 22 *
mjw@1661 23 */
mjw@1661 24
cl349@6427 25 #include <stdlib.h>
cl349@6427 26 #include <unistd.h>
smh22@7740 27
cl349@6427 28 #include "xg_private.h"
smh22@7740 29 #include "xg_save_restore.h"
keir@14138 30 #include "xc_dom.h"
smh22@7740 31
Tim@14782 32 #include <xen/hvm/ioreq.h>
Tim@14782 33 #include <xen/hvm/params.h>
Tim@14782 34
keir@20587 35 struct restore_ctx {
keir@20587 36 unsigned long max_mfn; /* max mfn of the current host machine */
keir@20587 37 unsigned long hvirt_start; /* virtual starting address of the hypervisor */
keir@20587 38 unsigned int pt_levels; /* #levels of page tables used by the current guest */
keir@20587 39 unsigned long nr_pfns; /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
keir@20587 40 xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
keir@20587 41 xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
keir@20587 42 unsigned no_superpage_mem; /* Set when there is not enough contiguous memory for superpage allocation */
keir@20589 43 struct domain_info_context dinfo;
keir@20587 44 };
steven@14732 45
smh22@7740 46 /*
keir@19677 47 ** A 2M superpage covers SUPERPAGE_NR_PFNS (512) contiguous 4K PFNs;
keir@19677 48 ** the constants below describe that extent.
keir@19677 49 */
keir@19677 50 #define SUPERPAGE_PFN_SHIFT 9
keir@19677 51 #define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
keir@19677 52
keir@20164 53 /*
keir@20164 54 * Setting bit 31 forces allocation of a superpage even if not all PFNs
keir@20164 55 * have arrived; bit 30 indicates that we are not tracking a superpage.
keir@20164 56 */
keir@20164 57 #define FORCE_SP_SHIFT 31
keir@20164 58 #define FORCE_SP_MASK (1UL << FORCE_SP_SHIFT)
keir@19677 59
keir@20164 60 #define INVALID_SUPER_PAGE ((1UL << 30) + 1)
keir@20164 61 #define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
keir@20164 62 #define SUPER_PAGE_TRACKING(pfn) ( (pfn) != INVALID_SUPER_PAGE )
keir@20164 63 #define SUPER_PAGE_DONE(pfn) ( SUPER_PAGE_START(pfn) )
keir@19677 64
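A minimal sketch of the macro arithmetic above (illustrative asserts only, assuming 4K base pages):

    assert(SUPERPAGE_NR_PFNS == 512);                 /* 2M = 512 * 4K */
    assert(SUPER_PAGE_START(0x200));                  /* pfn 512 is 2M-aligned */
    assert(!SUPER_PAGE_START(0x201));                 /* pfn 513 is not */
    assert(!SUPER_PAGE_TRACKING(INVALID_SUPER_PAGE)); /* sentinel ends tracking */
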
keir@20591 65 static int super_page_populated(struct restore_ctx *ctx, unsigned long pfn)
keir@20164 66 {
keir@20164 67 int i;
keir@20164 68 pfn &= ~(SUPERPAGE_NR_PFNS - 1);
keir@20164 69 for ( i = pfn; i < pfn + SUPERPAGE_NR_PFNS; i++ )
keir@20164 70 {
keir@20587 71 if ( ctx->p2m[i] != INVALID_P2M_ENTRY )
keir@20164 72 return 1;
keir@20164 73 }
keir@20164 74 return 0;
keir@20164 75 }
keir@19677 76
keir@20164 77 /*
keir@20164 78 * Break a 2M page and move contents of [extent start, next_pfn-1] to
keir@20164 79 * some new allocated 4K pages
keir@20164 80 */
keir@20164 81 static int break_super_page(int xc_handle,
keir@20164 82 uint32_t dom,
keir@20591 83 struct restore_ctx *ctx,
keir@20164 84 xen_pfn_t next_pfn)
keir@20164 85 {
keir@20164 86 xen_pfn_t *page_array, start_pfn, mfn;
keir@20164 87 uint8_t *ram_base, *save_buf;
keir@20164 88 unsigned long i;
keir@20164 89 int tot_pfns, rc = 0;
keir@20164 90
keir@20164 91 tot_pfns = (next_pfn & (SUPERPAGE_NR_PFNS - 1));
keir@20164 92
keir@20164 93 start_pfn = next_pfn & ~(SUPERPAGE_NR_PFNS - 1);
keir@20164 94 for ( i = start_pfn; i < start_pfn + SUPERPAGE_NR_PFNS; i++ )
keir@20164 95 {
keir@20164 96 /* check that the whole 2M page is populated */
keir@20587 97 if ( ctx->p2m[i] == INVALID_P2M_ENTRY ) {
keir@20164 98 DPRINTF("Previous superpage was not fully populated!\n");
keir@19677 99 return 1;
keir@19677 100 }
keir@20164 101 }
keir@20164 102
keir@20164 103 page_array = (xen_pfn_t*)malloc(tot_pfns * sizeof(xen_pfn_t));
keir@20164 104 save_buf = (uint8_t*)malloc(tot_pfns * PAGE_SIZE);
keir@20164 105
keir@20164 106 if ( !page_array || !save_buf )
keir@20164 107 {
keir@20164 108 ERROR("allocating page_array/save_buf failed\n");
keir@20164 109 errno = ENOMEM;
keir@20164 110 rc = 1;
keir@20164 111 goto out;
keir@20164 112 }
keir@20164 113
keir@20164 114 /* save previous super page contents */
keir@20164 115 for ( i = 0; i < tot_pfns; i++ )
keir@20164 116 {
keir@20164 117 /* HVM only: use the PFN directly, as the MFN of the 2M page is unknown */
keir@20164 118 page_array[i] = start_pfn + i;
keir@20164 119 }
keir@20164 120
keir@20837 121 ram_base = xc_map_foreign_pages(xc_handle, dom, PROT_READ,
keir@20164 122 page_array, tot_pfns);
keir@20164 123
keir@20164 124 if ( ram_base == NULL )
keir@20164 125 {
keir@20164 126 ERROR("map batch failed\n");
keir@20164 127 rc = 1;
keir@20164 128 goto out;
keir@20164 129 }
keir@20164 130
keir@20164 131 memcpy(save_buf, ram_base, tot_pfns * PAGE_SIZE);
keir@20164 132 munmap(ram_base, tot_pfns * PAGE_SIZE);
keir@20164 133
keir@20164 134 /* free the super page */
keir@20164 135 if ( xc_domain_memory_decrease_reservation(xc_handle, dom, 1,
keir@20164 136 SUPERPAGE_PFN_SHIFT, &start_pfn) != 0 )
keir@20164 137 {
keir@20164 138 ERROR("free 2M page failure @ 0x%lx.\n", next_pfn);
keir@20164 139 rc = 1;
keir@20164 140 goto out;
keir@20164 141 }
keir@20164 142
keir@20164 143 start_pfn = next_pfn & ~(SUPERPAGE_NR_PFNS - 1);
keir@20164 144 for ( i = start_pfn; i < start_pfn + SUPERPAGE_NR_PFNS; i++ )
keir@20164 145 {
keir@20587 146 ctx->p2m[i] = INVALID_P2M_ENTRY;
keir@20164 147 }
keir@20164 148
keir@20164 149 for ( i = start_pfn; i < start_pfn + tot_pfns; i++ )
keir@20164 150 {
keir@20164 151 mfn = i;
keir@20164 152 if (xc_domain_memory_populate_physmap(xc_handle, dom, 1, 0,
keir@20164 153 0, &mfn) != 0)
keir@19677 154 {
keir@20164 155 ERROR("Failed to allocate physical memory!\n");
keir@20164 156 errno = ENOMEM;
keir@20164 157 rc = 1;
keir@20164 158 goto out;
keir@20164 159 }
keir@20587 160 ctx->p2m[i] = mfn;
keir@20164 161 }
keir@20164 162
keir@20164 163 /* restore contents */
keir@20164 164 for ( i = 0; i < tot_pfns; i++ )
keir@20164 165 {
keir@20164 166 page_array[i] = start_pfn + i;
keir@20164 167 }
keir@20164 168
keir@20837 169 ram_base = xc_map_foreign_pages(xc_handle, dom, PROT_WRITE,
keir@20164 170 page_array, tot_pfns);
keir@20164 171 if ( ram_base == NULL )
keir@20164 172 {
keir@20164 173 ERROR("map batch failed\n");
keir@20164 174 rc = 1;
keir@20164 175 goto out;
keir@20164 176 }
keir@20164 177
keir@20164 178 memcpy(ram_base, save_buf, tot_pfns * PAGE_SIZE);
keir@20164 179 munmap(ram_base, tot_pfns * PAGE_SIZE);
keir@20164 180
keir@20164 181 out:
keir@20164 182 free(page_array);
keir@20164 183 free(save_buf);
keir@20164 184 return rc;
keir@20164 185 }
keir@20164 186
keir@20164 187
keir@20164 188 /*
keir@20164 189 * Allocate pages according to the pfn list: either one 2M page or a series
keir@20164 190 * of 4K pages. Also optimistically allocate a 2M page even when not all
keir@20164 191 * pages in the 2M extent have arrived, and fix it up in the next batch:
keir@20164 192 * if the new pages fill the holes in the 2M extent, do nothing; otherwise
keir@20164 193 * replace the original 2M page with a set of 4K pages.
keir@20164 194 */
keir@20164 195 static int allocate_mfn_list(int xc_handle,
keir@20164 196 uint32_t dom,
keir@20591 197 struct restore_ctx *ctx,
keir@20164 198 unsigned long nr_extents,
keir@20164 199 xen_pfn_t *batch_buf,
keir@20164 200 xen_pfn_t *next_pfn,
keir@20164 201 int superpages)
keir@20164 202 {
keir@20164 203 unsigned int i;
keir@20164 204 unsigned long mfn, pfn, sp_pfn;
keir@20164 205
keir@20164 206 /* Check the force-superpage bit, then clear it */
keir@20164 207 unsigned force_super_page = !!(*next_pfn & FORCE_SP_MASK);
keir@20164 208 *next_pfn &= ~FORCE_SP_MASK;
keir@20164 209
keir@20164 210 sp_pfn = *next_pfn;
keir@20164 211
keir@20164 212 if ( !superpages ||
keir@20587 213 ctx->no_superpage_mem ||
keir@20164 214 !SUPER_PAGE_TRACKING(sp_pfn) )
keir@20164 215 goto normal_page;
keir@20164 216
keir@20164 217 if ( !batch_buf )
keir@20164 218 {
keir@20164 219 /* Break the previous 2M page if its 512 pages straddle a batch boundary */
keir@20164 220 if ( SUPER_PAGE_TRACKING(sp_pfn) &&
keir@20164 221 !SUPER_PAGE_DONE(sp_pfn))
keir@20164 222 {
keir@20164 223 /* break the previously allocated superpage */
keir@20591 224 if ( break_super_page(xc_handle, dom, ctx, sp_pfn) != 0 )
keir@20164 225 {
keir@20164 226 ERROR("Failed to break the previous superpage!\n");
keir@20164 227 return 1;
keir@20164 228 }
keir@20164 229 }
keir@20164 230
keir@20164 231 /* following pages fit, in order, within the 2M extent */
keir@20164 232 return 0;
keir@20164 233 }
keir@20164 234
keir@20164 235 /*
keir@20164 236 * We try to allocate a 2M page only when:
keir@20164 237 * the user requested it (superpages),
keir@20164 238 * AND there is enough memory,
keir@20164 239 * AND we are tracking a superpage,
keir@20164 240 * AND all pages in the 2M extent are tracked, OR a partial extent is being
keir@20164 241 * allocated speculatively, AND no page in the extent is already populated.
keir@20164 242 */
keir@20164 243 if ( !SUPER_PAGE_DONE(sp_pfn) && !force_super_page )
keir@20164 244 goto normal_page;
keir@20164 245
keir@20164 246 pfn = batch_buf[0] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20591 247 if ( super_page_populated(ctx, pfn) )
keir@20164 248 goto normal_page;
keir@20164 249
keir@20164 250 pfn &= ~(SUPERPAGE_NR_PFNS - 1);
keir@20164 251 mfn = pfn;
keir@20164 252
keir@20164 253 if ( xc_domain_memory_populate_physmap(xc_handle, dom, 1,
keir@20164 254 SUPERPAGE_PFN_SHIFT, 0, &mfn) == 0)
keir@20164 255 {
keir@20164 256 for ( i = pfn; i < pfn + SUPERPAGE_NR_PFNS; i++, mfn++ )
keir@20164 257 {
keir@20587 258 ctx->p2m[i] = mfn;
keir@20164 259 }
keir@20164 260 return 0;
keir@20164 261 }
keir@20164 262 DPRINTF("No 2M page available for pfn 0x%lx; falling back to 4K pages.\n",
keir@20164 263 pfn);
keir@20587 264 ctx->no_superpage_mem = 1;
keir@20164 265
keir@20164 266 normal_page:
keir@20164 267 if ( !batch_buf )
keir@20164 268 return 0;
keir@20164 269
keir@20164 270 /* End the tracking if we wanted a 2M page but ended up with 4K pages */
keir@20164 271 *next_pfn = INVALID_SUPER_PAGE;
keir@20164 272
keir@20164 273 for ( i = 0; i < nr_extents; i++ )
keir@20164 274 {
keir@20164 275 unsigned long pagetype = batch_buf[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20164 276 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
keir@20164 277 continue;
keir@20164 278
keir@20164 279 pfn = mfn = batch_buf[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20587 280 if ( ctx->p2m[pfn] == INVALID_P2M_ENTRY )
keir@20164 281 {
keir@20164 282 if (xc_domain_memory_populate_physmap(xc_handle, dom, 1, 0,
keir@20164 283 0, &mfn) != 0)
keir@20164 284 {
keir@20164 285 ERROR("Failed to allocate physical memory! pfn=0x%lx, mfn=0x%lx.\n",
keir@20164 286 pfn, mfn);
keir@20164 287 errno = ENOMEM;
keir@20164 288 return 1;
keir@20164 289 }
keir@20587 290 ctx->p2m[pfn] = mfn;
keir@19677 291 }
keir@19677 292 }
keir@20164 293
keir@19677 294 return 0;
keir@19677 295 }
keir@19677 296
keir@19677 297 static int allocate_physmem(int xc_handle, uint32_t dom,
keir@20591 298 struct restore_ctx *ctx,
keir@19677 299 unsigned long *region_pfn_type, int region_size,
keir@19677 300 unsigned int hvm, xen_pfn_t *region_mfn, int superpages)
keir@19677 301 {
keir@20164 302 int i;
keir@19677 303 unsigned long pfn;
keir@19677 304 unsigned long pagetype;
keir@19677 305
keir@20164 306 /* Next expected pfn in order to track a possible 2M page */
keir@20164 307 static unsigned long required_pfn = INVALID_SUPER_PAGE;
keir@20164 308
keir@20164 309 /* Buffer of pfn list for 2M page, or series of 4K pages */
keir@20164 310 xen_pfn_t *batch_buf;
keir@20164 311 unsigned int batch_buf_len;
keir@20589 312 struct domain_info_context *dinfo = &ctx->dinfo;
keir@20164 313
keir@20164 314 if ( !superpages )
keir@20164 315 {
keir@20164 316 batch_buf = &region_pfn_type[0];
keir@20164 317 batch_buf_len = region_size;
keir@20164 318 goto alloc_page;
keir@20164 319 }
keir@20164 320
keir@20164 321 batch_buf = NULL;
keir@20164 322 batch_buf_len = 0;
keir@20164 323 /* This loop tracks the possible 2M page */
keir@20164 324 for (i = 0; i < region_size; i++)
keir@20164 325 {
keir@20164 326 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20164 327 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20164 328
keir@20164 329 if (pagetype == XEN_DOMCTL_PFINFO_XTAB)
keir@20164 330 {
keir@20164 331 /* Do not start collecting pfns until we see a valid pfn */
keir@20164 332 if ( batch_buf_len != 0 )
keir@20164 333 batch_buf_len++;
keir@20164 334 continue;
keir@20164 335 }
keir@20164 336
keir@20164 337 if ( SUPER_PAGE_START(pfn) )
keir@20164 338 {
keir@20164 339 /* Start of a 2M extent; populate the previous buffer */
keir@20591 340 if ( allocate_mfn_list(xc_handle, dom, ctx,
keir@20164 341 batch_buf_len, batch_buf,
keir@20164 342 &required_pfn, superpages) != 0 )
keir@20164 343 {
keir@20164 344 errno = ENOMEM;
keir@20164 345 return 1;
keir@20164 346 }
keir@20164 347
keir@20164 348 /* start new tracking for 2M page */
keir@20164 349 batch_buf = &region_pfn_type[i];
keir@20164 350 batch_buf_len = 1;
keir@20164 351 required_pfn = pfn + 1;
keir@20164 352 }
keir@20164 353 else if ( pfn == required_pfn )
keir@20164 354 {
keir@20164 355 /* this page fits the 2M extent in order */
keir@20164 356 batch_buf_len++;
keir@20164 357 required_pfn++;
keir@20164 358 }
keir@20164 359 else if ( SUPER_PAGE_TRACKING(required_pfn) )
keir@20164 360 {
keir@20164 361 /* break in the 2M extent; populate the previous buffer */
keir@20591 362 if ( allocate_mfn_list(xc_handle, dom, ctx,
keir@20164 363 batch_buf_len, batch_buf,
keir@20164 364 &required_pfn, superpages) != 0 )
keir@20164 365 {
keir@20164 366 errno = ENOMEM;
keir@20164 367 return 1;
keir@20164 368 }
keir@20164 369 /* start new tracking for a series of 4K pages */
keir@20164 370 batch_buf = &region_pfn_type[i];
keir@20164 371 batch_buf_len = 1;
keir@20164 372 required_pfn = INVALID_SUPER_PAGE;
keir@20164 373 }
keir@20164 374 else
keir@20164 375 {
keir@20164 376 /* this page is 4K */
keir@20164 377 if ( !batch_buf )
keir@20164 378 batch_buf = &region_pfn_type[i];
keir@20164 379 batch_buf_len++;
keir@20164 380 }
keir@20164 381 }
keir@20164 382
keir@20164 383 /*
keir@20164 384 * Populate the rest of batch_buf at the end.
keir@20164 385 * We speculatively allocate a 2M page even when we have not seen all the
keir@20164 386 * pages in order (set bit 31). If superpage support is not required,
keir@20164 387 * we skip the tracking loop and come here directly.
keir@20164 388 * Speculative allocation can't be used for PV guests, as we have no MFN
keir@20164 389 * with which to map the previous 2M range should we need to break it.
keir@20164 390 */
keir@20164 391 if ( SUPER_PAGE_TRACKING(required_pfn) &&
keir@20164 392 !SUPER_PAGE_DONE(required_pfn) )
keir@20164 393 {
keir@20164 394 if (hvm)
keir@20164 395 required_pfn |= FORCE_SP_MASK;
keir@20164 396 else
keir@20164 397 required_pfn = INVALID_SUPER_PAGE;
keir@20164 398 }
keir@20164 399
keir@20164 400 alloc_page:
keir@20164 401 if ( batch_buf )
keir@20164 402 {
keir@20591 403 if ( allocate_mfn_list(xc_handle, dom, ctx,
keir@20164 404 batch_buf_len, batch_buf,
keir@20164 405 &required_pfn,
keir@20164 406 superpages) != 0 )
keir@20164 407 {
keir@20164 408 errno = ENOMEM;
keir@20164 409 return 1;
keir@20164 410 }
keir@20164 411 }
keir@20164 412
keir@19677 413 for (i = 0; i < region_size; i++)
keir@19677 414 {
keir@19677 415 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@19677 416 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@19677 417
keir@20585 418 if ( pfn >= dinfo->p2m_size )
keir@19677 419 {
keir@19677 420 ERROR("pfn out of range");
keir@19677 421 return 1;
keir@19677 422 }
keir@19677 423 if (pagetype == XEN_DOMCTL_PFINFO_XTAB)
keir@19677 424 {
keir@19677 425 region_mfn[i] = ~0UL;
keir@19677 426 }
keir@19677 427 else
keir@19677 428 {
keir@20587 429 if (ctx->p2m[pfn] == INVALID_P2M_ENTRY)
keir@19677 430 {
keir@20164 431 DPRINTF("Warning: pfn 0x%lx is not allocated!\n", pfn);
keir@20164 432 /* XXX: allocate this page? */
keir@19677 433 }
keir@19677 434
keir@19677 435 /* set up region_mfn[] for batch map.
keir@19677 436 * For HVM guests, this interface takes PFNs, not MFNs */
keir@20587 437 region_mfn[i] = hvm ? pfn : ctx->p2m[pfn];
keir@19677 438 }
keir@19677 439 }
keir@19677 440 return 0;
keir@19677 441 }
keir@19677 442
keir@19677 443
keir@20453 444 /* set when a consistent image is available */
keir@20453 445 static int completed = 0;
keir@20453 446
keir@20983 447 #define HEARTBEAT_MS 1000
keir@20453 448
keir@20453 449 #ifndef __MINIOS__
keir@20453 450 static ssize_t read_exact_timed(int fd, void* buf, size_t size)
keir@20453 451 {
keir@20453 452 size_t offset = 0;
keir@20453 453 ssize_t len;
keir@20453 454 struct timeval tv;
keir@20453 455 fd_set rfds;
keir@20453 456
keir@20453 457 while ( offset < size )
keir@20453 458 {
keir@20453 459 if ( completed ) {
keir@20453 460 /* expect a heartbeat within HEARTBEAT_MS ms at most */
keir@20983 461 tv.tv_sec = HEARTBEAT_MS / 1000;
keir@20983 462 tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000;
keir@20453 463
keir@20453 464 FD_ZERO(&rfds);
keir@20453 465 FD_SET(fd, &rfds);
keir@20453 466 len = select(fd + 1, &rfds, NULL, NULL, &tv);
keir@20453 467 if ( !FD_ISSET(fd, &rfds) ) {
keir@20453 468 fprintf(stderr, "read_exact_timed failed (select returned %zd)\n", len);
keir@20453 469 return -1;
keir@20453 470 }
keir@20453 471 }
keir@20453 472
keir@20453 473 len = read(fd, buf + offset, size - offset);
keir@20453 474 if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
keir@20453 475 continue;
keir@20453 476 if ( len <= 0 )
keir@20453 477 return -1;
keir@20453 478 offset += len;
keir@20453 479 }
keir@20453 480
keir@20453 481 return 0;
keir@20453 482 }
keir@20453 483
keir@20453 484 #define read_exact read_exact_timed
keir@20453 485
keir@20453 486 #else
keir@20453 487 #define read_exact_timed read_exact
keir@20453 488 #endif
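A minimal sketch of the failover arithmetic (values from the commit message above): the old 500ms timeout gave select() a window of tv = {0, 500000}; with HEARTBEAT_MS at 1000 it becomes tv = {1, 0}, so once a consistent image exists the backup tolerates up to a full second of silence before failing over.

    /* Sketch: how HEARTBEAT_MS decomposes into the struct timeval above. */
    struct timeval tv = {
        .tv_sec  = HEARTBEAT_MS / 1000,          /* 1 second */
        .tv_usec = (HEARTBEAT_MS % 1000) * 1000, /* 0 microseconds */
    };
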
keir@19677 489 /*
kaf24@9698 490 ** In the state file (or during transfer), all page-table pages are
kaf24@9698 491 ** converted into a 'canonical' form where references to actual mfns
kaf24@9698 492 ** are replaced with references to the corresponding pfns.
kaf24@9698 493 ** This function inverts that operation, replacing the pfn values with
kaf24@9698 494 ** the (now known) appropriate mfn values.
smh22@7740 495 */
keir@20591 496 static int uncanonicalize_pagetable(int xc_handle, uint32_t dom, struct restore_ctx *ctx,
keir@20837 497 void *page, int superpages)
kaf24@9698 498 {
kaf24@9698 499 int i, pte_last;
kaf24@9698 500 unsigned long pfn;
kaf24@9698 501 uint64_t pte;
keir@20589 502 struct domain_info_context *dinfo = &ctx->dinfo;
smh22@7740 503
keir@20587 504 pte_last = PAGE_SIZE / ((ctx->pt_levels == 2)? 4 : 8);
smh22@7740 505
keir@14809 506 for ( i = 0; i < pte_last; i++ )
keir@14809 507 {
keir@20587 508 if ( ctx->pt_levels == 2 )
kaf24@9698 509 pte = ((uint32_t *)page)[i];
kaf24@9698 510 else
kaf24@9698 511 pte = ((uint64_t *)page)[i];
steven@13424 512
steven@13424 513 /* XXX SMH: below needs fixing for PROT_NONE etc */
keir@14809 514 if ( !(pte & _PAGE_PRESENT) )
steven@13424 515 continue;
steven@13424 516
kfraser@14005 517 pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
kfraser@14819 518
keir@19677 519 /* Allocate mfn if necessary */
keir@20587 520 if ( ctx->p2m[pfn] == INVALID_P2M_ENTRY )
keir@19677 521 {
keir@20164 522 unsigned long force_pfn = superpages ? FORCE_SP_MASK : pfn;
keir@20591 523 if (allocate_mfn_list(xc_handle, dom, ctx,
keir@20164 524 1, &pfn, &force_pfn, superpages) != 0)
keir@19677 525 return 0;
keir@19677 526 }
kfraser@14005 527 pte &= ~MADDR_MASK_X86;
keir@20587 528 pte |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT;
smh22@7886 529
keir@20587 530 if ( ctx->pt_levels == 2 )
steven@13424 531 ((uint32_t *)page)[i] = (uint32_t)pte;
steven@13424 532 else
steven@13424 533 ((uint64_t *)page)[i] = (uint64_t)pte;
smh22@7740 534 }
kaf24@9698 535
kaf24@9698 536 return 1;
smh22@7740 537 }
smh22@7740 538
steven@14732 539
Tim@14782 540 /* Load the p2m frame list, plus potential extended info chunk */
keir@20591 541 static xen_pfn_t *load_p2m_frame_list(struct restore_ctx *ctx,
keir@16257 542 int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
Tim@14782 543 {
Tim@14782 544 xen_pfn_t *p2m_frame_list;
keir@17918 545 vcpu_guest_context_any_t ctxt;
Tim@15955 546 xen_pfn_t p2m_fl_zero;
keir@20589 547 struct domain_info_context *dinfo = &ctx->dinfo;
Tim@14782 548
Tim@14782 549 /* Read first entry of P2M list, or extended-info signature (~0UL). */
keir@16408 550 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(long)) )
keir@14809 551 {
keir@14809 552 ERROR("read extended-info signature failed");
keir@14809 553 return NULL;
keir@14809 554 }
Tim@14782 555
Tim@15955 556 if ( p2m_fl_zero == ~0UL )
keir@14809 557 {
Tim@14782 558 uint32_t tot_bytes;
Tim@14782 559
Tim@14782 560 /* Next 4 bytes: total size of following extended info. */
keir@16408 561 if ( read_exact(io_fd, &tot_bytes, sizeof(tot_bytes)) )
keir@14809 562 {
Tim@14782 563 ERROR("read extended-info size failed");
Tim@14782 564 return NULL;
Tim@14782 565 }
Tim@14782 566
keir@14809 567 while ( tot_bytes )
keir@14809 568 {
Tim@14782 569 uint32_t chunk_bytes;
Tim@14782 570 char chunk_sig[4];
Tim@14782 571
Tim@14782 572 /* 4-character chunk signature + 4-byte remaining chunk size. */
keir@16408 573 if ( read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
keir@16408 574 read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes)) ||
keir@16257 575 (tot_bytes < (chunk_bytes + 8)) )
keir@14809 576 {
Tim@14782 577 ERROR("read extended-info chunk signature failed");
Tim@14782 578 return NULL;
Tim@14782 579 }
Tim@14782 580 tot_bytes -= 8;
keir@14809 581
Tim@14782 582 /* VCPU context structure? */
keir@14809 583 if ( !strncmp(chunk_sig, "vcpu", 4) )
keir@14809 584 {
Tim@15955 585 /* Pick a guest word-size and PT depth from the ctxt size */
Tim@15955 586 if ( chunk_bytes == sizeof (ctxt.x32) )
Tim@15955 587 {
keir@20585 588 dinfo->guest_width = 4;
keir@20587 589 if ( ctx->pt_levels > 2 )
keir@20587 590 ctx->pt_levels = 3;
Tim@15955 591 }
Tim@15955 592 else if ( chunk_bytes == sizeof (ctxt.x64) )
Tim@15955 593 {
keir@20585 594 dinfo->guest_width = 8;
keir@20587 595 ctx->pt_levels = 4;
Tim@15955 596 }
Tim@15955 597 else
Tim@15955 598 {
Tim@15955 599 ERROR("bad extended-info context size %d", chunk_bytes);
Tim@15955 600 return NULL;
Tim@15955 601 }
Tim@15955 602
keir@16408 603 if ( read_exact(io_fd, &ctxt, chunk_bytes) )
keir@14809 604 {
Tim@14782 605 ERROR("read extended-info vcpu context failed");
Tim@14782 606 return NULL;
Tim@14782 607 }
Tim@15955 608 tot_bytes -= chunk_bytes;
Tim@15955 609 chunk_bytes = 0;
Tim@15955 610
Tim@15955 611 if ( GET_FIELD(&ctxt, vm_assist)
Tim@15955 612 & (1UL << VMASST_TYPE_pae_extended_cr3) )
Tim@14782 613 *pae_extended_cr3 = 1;
Tim@14782 614 }
keir@16257 615 else if ( !strncmp(chunk_sig, "extv", 4) )
keir@16257 616 {
keir@16257 617 *ext_vcpucontext = 1;
keir@16257 618 }
Tim@14782 619
Tim@14782 620 /* Any remaining bytes of this chunk: read and discard. */
keir@14809 621 while ( chunk_bytes )
keir@14809 622 {
Tim@15955 623 unsigned long sz = MIN(chunk_bytes, sizeof(xen_pfn_t));
keir@16408 624 if ( read_exact(io_fd, &p2m_fl_zero, sz) )
keir@14809 625 {
Tim@14782 626 ERROR("read-and-discard extended-info chunk bytes failed");
Tim@14782 627 return NULL;
Tim@14782 628 }
Tim@14782 629 chunk_bytes -= sz;
Tim@14782 630 tot_bytes -= sz;
Tim@14782 631 }
Tim@14782 632 }
keir@14809 633
Tim@14782 634 /* Now read the real first entry of P2M list. */
keir@16408 635 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
keir@14809 636 {
Tim@14782 637 ERROR("read first entry of p2m_frame_list failed");
Tim@14782 638 return NULL;
Tim@14782 639 }
Tim@14782 640 }
keir@14809 641
Tim@15955 642 /* Now that we know the guest's word-size, we can safely allocate
Tim@15955 643 * the p2m frame list */
keir@17074 644 if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
Tim@15955 645 {
Tim@15955 646 ERROR("Couldn't allocate p2m_frame_list array");
Tim@15955 647 return NULL;
Tim@15955 648 }
Tim@15955 649
Tim@15955 650 /* First entry has already been read. */
Tim@15955 651 p2m_frame_list[0] = p2m_fl_zero;
keir@16408 652 if ( read_exact(io_fd, &p2m_frame_list[1],
keir@16408 653 (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
keir@14809 654 {
keir@14809 655 ERROR("read p2m_frame_list failed");
keir@14809 656 return NULL;
Tim@14782 657 }
Tim@14782 658
Tim@14782 659 return p2m_frame_list;
Tim@14782 660 }
Tim@14782 661
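For reference, a sketch of the extended-info layout consumed above (field order inferred from the reads in load_p2m_frame_list(); PV only):

    /*
     * unsigned long signature;      -- ~0UL marks extended info
     * uint32_t      tot_bytes;      -- total size of all chunks
     * repeated chunks:
     *   char     sig[4];            -- "vcpu" (guest context) or "extv" (flag)
     *   uint32_t chunk_bytes;       -- body length; unknown chunks are skipped
     *   uint8_t  body[chunk_bytes];
     * ...then the real first entry of the P2M frame list follows.
     */
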
keir@20452 662 typedef struct {
keir@20457 663 int ishvm;
keir@20457 664 union {
keir@20457 665 struct tailbuf_pv {
keir@20457 666 unsigned int pfncount;
keir@20457 667 unsigned long* pfntab;
keir@20457 668 unsigned int vcpucount;
keir@20457 669 unsigned char* vcpubuf;
keir@20457 670 unsigned char shared_info_page[PAGE_SIZE];
keir@20457 671 } pv;
keir@20457 672 struct tailbuf_hvm {
keir@20457 673 uint64_t magicpfns[3];
keir@20457 674 uint32_t hvmbufsize, reclen;
keir@20457 675 uint8_t* hvmbuf;
keir@20457 676 struct {
keir@20457 677 uint32_t magic;
keir@20457 678 uint32_t version;
keir@20457 679 uint64_t len;
keir@20457 680 } qemuhdr;
keir@20457 681 uint32_t qemubufsize;
keir@20457 682 uint8_t* qemubuf;
keir@20457 683 } hvm;
keir@20457 684 } u;
keir@20452 685 } tailbuf_t;
keir@20452 686
keir@20457 687 /* read stream until EOF, growing buffer as necessary */
keir@20457 688 static int compat_buffer_qemu(int fd, struct tailbuf_hvm *buf)
keir@20457 689 {
keir@20457 690 uint8_t *qbuf, *tmp;
keir@20457 691 int blen = 0, dlen = 0;
keir@20457 692 int rc;
keir@20457 693
keir@20457 694 /* save records currently tend to be about 7K */
keir@20457 695 blen = 8192;
keir@20457 696 if ( !(qbuf = malloc(blen)) ) {
keir@20457 697 ERROR("Error allocating QEMU buffer");
keir@20457 698 return -1;
keir@20457 699 }
keir@20457 700
keir@20457 701 while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
keir@20457 702 DPRINTF("Read %d bytes of QEMU data\n", rc);
keir@20457 703 dlen += rc;
keir@20457 704
keir@20457 705 if (dlen == blen) {
keir@20457 706 DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
keir@20457 707 blen += 4096;
keir@20457 708 tmp = realloc(qbuf, blen);
keir@20457 709 if ( !tmp ) {
keir@20457 710 ERROR("Error growing QEMU buffer to %d bytes", blen);
keir@20457 711 free(qbuf);
keir@20457 712 return -1;
keir@20457 713 }
keir@20457 714 qbuf = tmp;
keir@20457 715 }
keir@20457 716 }
keir@20457 717
keir@20457 718 if ( rc < 0 ) {
keir@20457 719 ERROR("Error reading QEMU data");
keir@20457 720 free(qbuf);
keir@20457 721 return -1;
keir@20457 722 }
keir@20457 723
keir@20457 724 if ( memcmp(qbuf, "QEVM", 4) ) {
keir@20457 725 ERROR("Invalid QEMU magic: 0x%08x", *(uint32_t *)qbuf);
keir@20457 726 free(qbuf);
keir@20457 727 return -1;
keir@20457 728 }
keir@20457 729
keir@20457 730 buf->qemubuf = qbuf;
keir@20457 731 buf->qemubufsize = dlen;
keir@20457 732
keir@20457 733 return 0;
keir@20457 734 }
keir@20457 735
keir@20457 736 static int buffer_qemu(int fd, struct tailbuf_hvm *buf)
keir@20457 737 {
keir@20457 738 uint32_t qlen;
keir@20457 739 uint8_t *tmp;
keir@20457 740
keir@20457 741 if ( read_exact(fd, &qlen, sizeof(qlen)) ) {
keir@20457 742 ERROR("Error reading QEMU header length");
keir@20457 743 return -1;
keir@20457 744 }
keir@20457 745
keir@20457 746 if ( qlen > buf->qemubufsize ) {
keir@20457 747 if ( buf->qemubuf) {
keir@20457 748 tmp = realloc(buf->qemubuf, qlen);
keir@20457 749 if ( tmp )
keir@20457 750 buf->qemubuf = tmp;
keir@20457 751 else {
keir@20457 752 ERROR("Error reallocating QEMU state buffer");
keir@20457 753 return -1;
keir@20457 754 }
keir@20457 755 } else {
keir@20457 756 buf->qemubuf = malloc(qlen);
keir@20457 757 if ( !buf->qemubuf ) {
keir@20457 758 ERROR("Error allocating QEMU state buffer");
keir@20457 759 return -1;
keir@20457 760 }
keir@20457 761 }
keir@20457 762 }
keir@20457 763 buf->qemubufsize = qlen;
keir@20457 764
keir@20457 765 if ( read_exact(fd, buf->qemubuf, buf->qemubufsize) ) {
keir@20457 766 ERROR("Error reading QEMU state");
keir@20457 767 return -1;
keir@20457 768 }
keir@20457 769
keir@20457 770 return 0;
keir@20457 771 }
keir@20457 772
keir@20457 773 static int dump_qemu(uint32_t dom, struct tailbuf_hvm *buf)
keir@20457 774 {
keir@20457 775 int saved_errno;
keir@20457 776 char path[256];
keir@20457 777 FILE *fp;
keir@20457 778
keir@20457 779 sprintf(path, "/var/lib/xen/qemu-save.%u", dom);
keir@20457 780 fp = fopen(path, "wb");
keir@20457 781 if ( !fp )
keir@20457 782 return -1;
keir@20457 783
keir@20457 784 DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
keir@20457 785 if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
keir@20457 786 saved_errno = errno;
keir@20457 787 fclose(fp);
keir@20457 788 errno = saved_errno;
keir@20457 789 return -1;
keir@20457 790 }
keir@20457 791
keir@20457 792 fclose(fp);
keir@20457 793
keir@20457 794 return 0;
keir@20457 795 }
keir@20457 796
keir@20591 797 static int buffer_tail_hvm(struct restore_ctx *ctx, struct tailbuf_hvm *buf, int fd,
keir@20457 798 unsigned int max_vcpu_id, uint64_t vcpumap,
keir@20457 799 int ext_vcpucontext)
keir@20457 800 {
keir@20457 801 uint8_t *tmp;
keir@20457 802 unsigned char qemusig[21];
keir@20457 803
keir@20457 804 if ( read_exact(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
keir@20457 805 ERROR("Error reading magic PFNs");
keir@20457 806 return -1;
keir@20457 807 }
keir@20457 808
keir@20457 809 if ( read_exact(fd, &buf->reclen, sizeof(buf->reclen)) ) {
keir@20457 810 ERROR("Error reading HVM params size");
keir@20457 811 return -1;
keir@20457 812 }
keir@20457 813
keir@20457 814 if ( buf->reclen > buf->hvmbufsize ) {
keir@20457 815 if ( buf->hvmbuf) {
keir@20457 816 tmp = realloc(buf->hvmbuf, buf->reclen);
keir@20457 817 if ( tmp ) {
keir@20457 818 buf->hvmbuf = tmp;
keir@20457 819 buf->hvmbufsize = buf->reclen;
keir@20457 820 } else {
keir@20457 821 ERROR("Error reallocating HVM param buffer");
keir@20457 822 return -1;
keir@20457 823 }
keir@20457 824 } else {
keir@20457 825 buf->hvmbuf = malloc(buf->reclen);
keir@20457 826 if ( !buf->hvmbuf ) {
keir@20457 827 ERROR("Error allocating HVM param buffer");
keir@20457 828 return -1;
keir@20457 829 }
keir@20457 830 buf->hvmbufsize = buf->reclen;
keir@20457 831 }
keir@20457 832 }
keir@20457 833
keir@20457 834 if ( read_exact(fd, buf->hvmbuf, buf->reclen) ) {
keir@20457 835 ERROR("Error reading HVM params");
keir@20457 836 return -1;
keir@20457 837 }
keir@20457 838
keir@20457 839 if ( read_exact(fd, qemusig, sizeof(qemusig)) ) {
keir@20457 840 ERROR("Error reading QEMU signature");
keir@20457 841 return -1;
keir@20457 842 }
keir@20457 843
keir@20457 844 /* The normal live-migration QEMU record has no length information.
keir@20457 845 * Short of reimplementing the QEMU parser, we're forced to just read
keir@20457 846 * until EOF. Remus gets around this by sending a different signature
keir@20457 847 * which includes a length prefix */
keir@20457 848 if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
keir@20457 849 return compat_buffer_qemu(fd, buf);
keir@20457 850 else if ( !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
keir@20457 851 return buffer_qemu(fd, buf);
keir@20457 852
keir@20457 853 qemusig[20] = '\0';
keir@20457 854 ERROR("Invalid QEMU signature: %s", qemusig);
keir@20457 855 return -1;
keir@20457 856 }
keir@20457 857
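A sketch of the two device-model framings distinguished in buffer_tail_hvm() above (layout inferred from compat_buffer_qemu() and buffer_qemu(); not a separate spec):

    /*
     * "QemuDeviceModelRecord": no length field; the reader consumes the
     *     stream to EOF and checks only the leading "QEVM" magic.
     * "RemusDeviceModelState": length-prefixed, so repeated checkpoints
     *     can share one stream:
     *       uint32_t qlen;          -- size of the QEMU state blob
     *       uint8_t  state[qlen];   -- opaque QEMU save data
     */
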
keir@20591 858 static int buffer_tail_pv(struct restore_ctx *ctx, struct tailbuf_pv *buf, int fd,
keir@20457 859 unsigned int max_vcpu_id, uint64_t vcpumap,
keir@20457 860 int ext_vcpucontext)
keir@20452 861 {
keir@20452 862 unsigned int i;
keir@20452 863 size_t pfnlen, vcpulen;
keir@20589 864 struct domain_info_context *dinfo = &ctx->dinfo;
keir@20452 865
keir@20452 866 /* TODO: handle changing pfntab and vcpu counts */
keir@20452 867 /* PFN tab */
keir@20452 868 if ( read_exact(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
keir@20452 869 (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
keir@20452 870 {
keir@20452 871 ERROR("Error when reading pfn count");
keir@20452 872 return -1;
keir@20452 873 }
keir@20452 874 pfnlen = sizeof(unsigned long) * buf->pfncount;
keir@20452 875 if ( !(buf->pfntab) ) {
keir@20452 876 if ( !(buf->pfntab = malloc(pfnlen)) ) {
keir@20452 877 ERROR("Error allocating PFN tail buffer");
keir@20452 878 return -1;
keir@20452 879 }
keir@20452 880 }
keir@20452 881 // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
keir@20452 882 if ( read_exact(fd, buf->pfntab, pfnlen) ) {
keir@20452 883 ERROR("Error when reading pfntab");
keir@20452 884 goto free_pfntab;
keir@20452 885 }
keir@20452 886
keir@20452 887 /* VCPU contexts */
keir@20452 888 buf->vcpucount = 0;
keir@20452 889 for (i = 0; i <= max_vcpu_id; i++) {
keir@20452 890 // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap, i, (vcpumap % (1ULL << i)));
keir@20452 891 if ( (!(vcpumap & (1ULL << i))) )
keir@20452 892 continue;
keir@20452 893 buf->vcpucount++;
keir@20452 894 }
keir@20452 895 // DPRINTF("VCPU count: %d\n", buf->vcpucount);
keir@20585 896 vcpulen = ((dinfo->guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
keir@20452 897 : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
keir@20452 898 if ( ext_vcpucontext )
keir@20452 899 vcpulen += 128 * buf->vcpucount;
keir@20452 900
keir@20452 901 if ( !(buf->vcpubuf) ) {
keir@20452 902 if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
keir@20452 903 ERROR("Error allocating VCPU ctxt tail buffer");
keir@20452 904 goto free_pfntab;
keir@20452 905 }
keir@20452 906 }
keir@20452 907 // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
keir@20452 908 if ( read_exact(fd, buf->vcpubuf, vcpulen) ) {
keir@20452 909 ERROR("Error when reading ctxt");
keir@20452 910 goto free_vcpus;
keir@20452 911 }
keir@20452 912
keir@20452 913 /* load shared_info_page */
keir@20452 914 // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
keir@20452 915 if ( read_exact(fd, buf->shared_info_page, PAGE_SIZE) ) {
keir@20452 916 ERROR("Error when reading shared info page");
keir@20452 917 goto free_vcpus;
keir@20452 918 }
keir@20452 919
keir@20452 920 return 0;
keir@20452 921
keir@20452 922 free_vcpus:
keir@20452 923 if (buf->vcpubuf) {
keir@20452 924 free (buf->vcpubuf);
keir@20452 925 buf->vcpubuf = NULL;
keir@20452 926 }
keir@20452 927 free_pfntab:
keir@20452 928 if (buf->pfntab) {
keir@20452 929 free (buf->pfntab);
keir@20452 930 buf->pfntab = NULL;
keir@20452 931 }
keir@20452 932
keir@20452 933 return -1;
keir@20452 934 }
keir@20452 935
keir@20591 936 static int buffer_tail(struct restore_ctx *ctx, tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
keir@20457 937 uint64_t vcpumap, int ext_vcpucontext)
keir@20457 938 {
keir@20457 939 if ( buf->ishvm )
keir@20591 940 return buffer_tail_hvm(ctx, &buf->u.hvm, fd, max_vcpu_id, vcpumap,
keir@20457 941 ext_vcpucontext);
keir@20457 942 else
keir@20591 943 return buffer_tail_pv(ctx, &buf->u.pv, fd, max_vcpu_id, vcpumap,
keir@20457 944 ext_vcpucontext);
keir@20457 945 }
keir@20457 946
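For reference, a sketch of the tail layout buffered above (summarized from buffer_tail_pv() and buffer_tail_hvm()):

    /*
     * PV : uint32_t pfncount; unsigned long pfntab[pfncount];
     *      one vcpu context per bit set in vcpumap (plus 128 bytes each
     *      when ext_vcpucontext); one PAGE_SIZE shared-info page.
     * HVM: uint64_t magicpfns[3]; uint32_t reclen; HVM params[reclen];
     *      21-byte qemu signature; device-model state (see above).
     */
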
keir@20457 947 static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
keir@20452 948 {
keir@20457 949 if ( buf->hvmbuf ) {
keir@20457 950 free(buf->hvmbuf);
keir@20457 951 buf->hvmbuf = NULL;
keir@20457 952 }
keir@20457 953 if ( buf->qemubuf ) {
keir@20457 954 free(buf->qemubuf);
keir@20457 955 buf->qemubuf = NULL;
keir@20457 956 }
keir@20457 957 }
keir@20457 958
keir@20457 959 static void tailbuf_free_pv(struct tailbuf_pv *buf)
keir@20457 960 {
keir@20457 961 if ( buf->vcpubuf ) {
keir@20452 962 free(buf->vcpubuf);
keir@20452 963 buf->vcpubuf = NULL;
keir@20452 964 }
keir@20457 965 if ( buf->pfntab ) {
keir@20452 966 free(buf->pfntab);
keir@20452 967 buf->pfntab = NULL;
keir@20452 968 }
keir@20452 969 }
keir@20452 970
keir@20457 971 static void tailbuf_free(tailbuf_t *buf)
keir@20457 972 {
keir@20457 973 if ( buf->ishvm )
keir@20457 974 tailbuf_free_hvm(&buf->u.hvm);
keir@20457 975 else
keir@20457 976 tailbuf_free_pv(&buf->u.pv);
keir@20457 977 }
keir@20457 978
keir@20452 979 typedef struct {
keir@20452 980 void* pages;
keir@20452 981 /* pages is of length nr_physpages, pfn_types is of length nr_pages */
keir@20452 982 unsigned int nr_physpages, nr_pages;
keir@20452 983
keir@20452 984 /* Types of the pfns in the current region */
keir@20452 985 unsigned long* pfn_types;
keir@20452 986
keir@20452 987 int verify;
keir@20452 988
keir@20452 989 int new_ctxt_format;
keir@20452 990 int max_vcpu_id;
keir@20452 991 uint64_t vcpumap;
keir@20452 992 uint64_t identpt;
keir@20452 993 uint64_t vm86_tss;
keir@20452 994 } pagebuf_t;
keir@20452 995
keir@20452 996 static int pagebuf_init(pagebuf_t* buf)
keir@20452 997 {
keir@20452 998 memset(buf, 0, sizeof(*buf));
keir@20452 999 return 0;
keir@20452 1000 }
keir@20452 1001
keir@20452 1002 static void pagebuf_free(pagebuf_t* buf)
keir@20452 1003 {
keir@20452 1004 if (buf->pages) {
keir@20452 1005 free(buf->pages);
keir@20452 1006 buf->pages = NULL;
keir@20452 1007 }
keir@20452 1008 if(buf->pfn_types) {
keir@20452 1009 free(buf->pfn_types);
keir@20452 1010 buf->pfn_types = NULL;
keir@20452 1011 }
keir@20452 1012 }
keir@20452 1013
keir@20452 1014 static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
keir@20452 1015 {
keir@20452 1016 int count, countpages, oldcount, i;
keir@20452 1017 void* ptmp;
keir@20452 1018
keir@20452 1019 if ( read_exact(fd, &count, sizeof(count)) )
keir@20452 1020 {
keir@20452 1021 ERROR("Error when reading batch size");
keir@20452 1022 return -1;
keir@20452 1023 }
keir@20452 1024
keir@20452 1025 // DPRINTF("reading batch of %d pages\n", count);
keir@20452 1026
keir@20452 1027 if (!count) {
keir@20452 1028 // DPRINTF("Last batch read\n");
keir@20452 1029 return 0;
keir@20452 1030 } else if (count == -1) {
keir@20452 1031 DPRINTF("Entering page verify mode\n");
keir@20452 1032 buf->verify = 1;
keir@20452 1033 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1034 } else if (count == -2) {
keir@20452 1035 buf->new_ctxt_format = 1;
keir@20452 1036 if ( read_exact(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
keir@20452 1037 buf->max_vcpu_id >= 64 || read_exact(fd, &buf->vcpumap,
keir@20452 1038 sizeof(uint64_t)) ) {
keir@20452 1039 ERROR("Error when reading max_vcpu_id");
keir@20452 1040 return -1;
keir@20452 1041 }
keir@20452 1042 // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap);
keir@20452 1043 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1044 } else if (count == -3) {
keir@20452 1045 /* Skip padding 4 bytes then read the EPT identity PT location. */
keir@20452 1046 if ( read_exact(fd, &buf->identpt, sizeof(uint32_t)) ||
keir@20452 1047 read_exact(fd, &buf->identpt, sizeof(uint64_t)) )
keir@20452 1048 {
keir@20452 1049 ERROR("error reading the address of the EPT identity map");
keir@20452 1050 return -1;
keir@20452 1051 }
keir@20452 1052 // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
keir@20452 1053 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1054 } else if ( count == -4 ) {
keir@20452 1055 /* Skip padding 4 bytes then read the vm86 TSS location. */
keir@20452 1056 if ( read_exact(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
keir@20452 1057 read_exact(fd, &buf->vm86_tss, sizeof(uint64_t)) )
keir@20452 1058 {
keir@20452 1059 ERROR("error reading the address of the vm86 TSS");
keir@20452 1060 return -1;
keir@20452 1061 }
keir@20452 1062 // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
keir@20452 1063 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1064 } else if ( count == -5 ) {
keir@20452 1065 DPRINTF("xc_domain_restore start tmem\n");
keir@20452 1066 if ( xc_tmem_restore(xch, dom, fd) ) {
keir@20452 1067 ERROR("error reading/restoring tmem");
keir@20452 1068 return -1;
keir@20452 1069 }
keir@20452 1070 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1071 }
keir@20452 1072 else if ( count == -6 ) {
keir@20452 1073 if ( xc_tmem_restore_extra(xch, dom, fd) ) {
keir@20452 1074 ERROR("error reading/restoring tmem extra");
keir@20452 1075 return -1;
keir@20452 1076 }
keir@20452 1077 return pagebuf_get_one(buf, fd, xch, dom);
keir@20539 1078 } else if ( count == -7 ) {
keir@20539 1079 uint32_t tsc_mode, khz, incarn;
keir@20539 1080 uint64_t nsec;
keir@20539 1081 if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
keir@20539 1082 read_exact(fd, &nsec, sizeof(uint64_t)) ||
keir@20539 1083 read_exact(fd, &khz, sizeof(uint32_t)) ||
keir@20539 1084 read_exact(fd, &incarn, sizeof(uint32_t)) ||
keir@20539 1085 xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
keir@20539 1086 ERROR("error reading/restoring tsc info");
keir@20539 1087 return -1;
keir@20539 1088 }
keir@20539 1089 return pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1090 } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
keir@20452 1091 ERROR("Max batch size exceeded (%d). Giving up.", count);
keir@20452 1092 return -1;
keir@20452 1093 }
keir@20452 1094
keir@20452 1095 oldcount = buf->nr_pages;
keir@20452 1096 buf->nr_pages += count;
keir@20452 1097 if (!buf->pfn_types) {
keir@20452 1098 if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
keir@20452 1099 ERROR("Could not allocate PFN type buffer");
keir@20452 1100 return -1;
keir@20452 1101 }
keir@20452 1102 } else {
keir@20452 1103 if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
keir@20452 1104 ERROR("Could not reallocate PFN type buffer");
keir@20452 1105 return -1;
keir@20452 1106 }
keir@20452 1107 buf->pfn_types = ptmp;
keir@20452 1108 }
keir@20452 1109 if ( read_exact(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
keir@20452 1110 ERROR("Error when reading region pfn types");
keir@20452 1111 return -1;
keir@20452 1112 }
keir@20452 1113
keir@20452 1114 countpages = count;
keir@20452 1115 for (i = oldcount; i < buf->nr_pages; ++i)
keir@20452 1116 if ((buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB)
keir@20452 1117 --countpages;
keir@20452 1118
keir@20452 1119 if (!countpages)
keir@20452 1120 return count;
keir@20452 1121
keir@20452 1122 oldcount = buf->nr_physpages;
keir@20452 1123 buf->nr_physpages += countpages;
keir@20452 1124 if (!buf->pages) {
keir@20452 1125 if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
keir@20452 1126 ERROR("Could not allocate page buffer");
keir@20452 1127 return -1;
keir@20452 1128 }
keir@20452 1129 } else {
keir@20452 1130 if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
keir@20452 1131 ERROR("Could not reallocate page buffer");
keir@20452 1132 return -1;
keir@20452 1133 }
keir@20452 1134 buf->pages = ptmp;
keir@20452 1135 }
keir@20452 1136 if ( read_exact(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
keir@20452 1137 ERROR("Error when reading pages");
keir@20452 1138 return -1;
keir@20452 1139 }
keir@20452 1140
keir@20452 1141 return count;
keir@20452 1142 }
keir@20452 1143
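For reference, the batch-header values understood by pagebuf_get_one() above (summarized from the code; the wire format itself is produced by the save side):

    /*
     *  count  meaning
     *  -----  --------------------------------------------------
     *   > 0   batch of 'count' pfn_type entries, then the pages
     *     0   end of page data
     *    -1   enter page-verify mode
     *    -2   max_vcpu_id and vcpumap follow
     *    -3   EPT identity-map address follows (4 pad bytes first)
     *    -4   vm86 TSS address follows (4 pad bytes first)
     *    -5   tmem state follows
     *    -6   tmem extra state follows
     *    -7   tsc info follows
     */
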
keir@20452 1144 static int pagebuf_get(pagebuf_t* buf, int fd, int xch, uint32_t dom)
keir@20452 1145 {
keir@20452 1146 int rc;
keir@20452 1147
keir@20452 1148 buf->nr_physpages = buf->nr_pages = 0;
keir@20452 1149
keir@20452 1150 do {
keir@20452 1151 rc = pagebuf_get_one(buf, fd, xch, dom);
keir@20452 1152 } while (rc > 0);
keir@20452 1153
keir@20452 1154 if (rc < 0)
keir@20452 1155 pagebuf_free(buf);
keir@20452 1156
keir@20452 1157 return rc;
keir@20452 1158 }
keir@20452 1159
keir@20591 1160 static int apply_batch(int xc_handle, uint32_t dom, struct restore_ctx *ctx,
keir@20591 1161 xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
keir@20452 1162 unsigned int hvm, struct xc_mmu* mmu,
keir@20452 1163 pagebuf_t* pagebuf, int curbatch, int superpages)
keir@20452 1164 {
keir@20452 1165 int i, j, curpage;
keir@20452 1166 /* used by debug verify code */
keir@20452 1167 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
keir@20452 1168 /* Our mapping of the current region (batch) */
keir@20452 1169 char *region_base;
keir@20452 1170 /* A temporary mapping, and a copy, of one frame of guest memory. */
keir@20452 1171 unsigned long *page = NULL;
keir@20452 1172 int nraces = 0;
keir@20589 1173 struct domain_info_context *dinfo = &ctx->dinfo;
keir@20877 1174 int* pfn_err = NULL;
keir@20877 1175 int rc = -1;
keir@20452 1176
keir@20452 1177 unsigned long mfn, pfn, pagetype;
keir@20452 1178
keir@20452 1179 j = pagebuf->nr_pages - curbatch;
keir@20452 1180 if (j > MAX_BATCH_SIZE)
keir@20452 1181 j = MAX_BATCH_SIZE;
keir@20452 1182
keir@20591 1183 if (allocate_physmem(xc_handle, dom, ctx, &pagebuf->pfn_types[curbatch],
keir@20452 1184 j, hvm, region_mfn, superpages) != 0)
keir@20452 1185 {
keir@20452 1186 ERROR("allocate_physmem() failed\n");
keir@20452 1187 return -1;
keir@20452 1188 }
keir@20452 1189
keir@20452 1190 /* Map relevant mfns */
keir@20877 1191 pfn_err = calloc(j, sizeof(*pfn_err));
keir@20877 1192 region_base = xc_map_foreign_bulk(
keir@20877 1193 xc_handle, dom, PROT_WRITE, region_mfn, pfn_err, j);
keir@20452 1194
keir@20452 1195 if ( region_base == NULL )
keir@20452 1196 {
keir@20452 1197 ERROR("map batch failed");
keir@20877 1198 free(pfn_err);
keir@20452 1199 return -1;
keir@20452 1200 }
keir@20452 1201
keir@20452 1202 for ( i = 0, curpage = -1; i < j; i++ )
keir@20452 1203 {
keir@20452 1204 pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20452 1205 pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
keir@20452 1206
keir@20452 1207 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
keir@20452 1208 /* a bogus/unmapped page: skip it */
keir@20452 1209 continue;
keir@20452 1210
keir@20877 1211 if (pfn_err[i])
keir@20877 1212 {
keir@20877 1213 ERROR("unexpected PFN mapping failure");
keir@20877 1214 goto err_mapped;
keir@20877 1215 }
keir@20877 1216
keir@20452 1217 ++curpage;
keir@20452 1218
keir@20585 1219 if ( pfn >= dinfo->p2m_size )
keir@20452 1220 {
keir@20452 1221 ERROR("pfn out of range");
keir@20877 1222 goto err_mapped;
keir@20452 1223 }
keir@20452 1224
keir@20452 1225 pfn_type[pfn] = pagetype;
keir@20452 1226
keir@20587 1227 mfn = ctx->p2m[pfn];
keir@20452 1228
keir@20452 1229 /* In verify mode, we use a copy; otherwise we work in place */
keir@20452 1230 page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
keir@20452 1231
keir@20452 1232 memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
keir@20452 1233
keir@20452 1234 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
keir@20452 1235
keir@20452 1236 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
keir@20452 1237 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
keir@20452 1238 {
keir@20452 1239 /*
keir@20452 1240 ** A page table page - need to 'uncanonicalize' it, i.e.
keir@20452 1241 ** replace all the references to pfns with the corresponding
keir@20452 1242 ** mfns for the new domain.
keir@20452 1243 **
keir@20452 1244 ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
keir@20452 1245 ** so we may need to update the p2m after the main loop.
keir@20452 1246 ** Hence we defer canonicalization of L1s until then.
keir@20452 1247 */
keir@20587 1248 if ((ctx->pt_levels != 3) ||
keir@20452 1249 pae_extended_cr3 ||
keir@20452 1250 (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
keir@20452 1251
keir@20591 1252 if (!uncanonicalize_pagetable(xc_handle, dom, ctx,
keir@20837 1253 page, superpages)) {
keir@20452 1254 /*
keir@20452 1255 ** Failing to uncanonicalize a page table can be ok
keir@20452 1256 ** under live migration since the pages type may have
keir@20452 1257 ** changed by now (and we'll get an update later).
keir@20452 1258 */
keir@20452 1259 DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
keir@20452 1260 pagetype >> 28, pfn, mfn);
keir@20452 1261 nraces++;
keir@20452 1262 continue;
keir@20452 1263 }
keir@20452 1264 }
keir@20452 1265 }
keir@20452 1266 else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
keir@20452 1267 {
keir@20452 1268 ERROR("Bogus page type %lx; page table is out of range: "
keir@20585 1269 "i=%d p2m_size=%lu", pagetype, i, dinfo->p2m_size);
keir@20877 1270 goto err_mapped;
keir@20452 1271 }
keir@20452 1272
keir@20452 1273 if ( pagebuf->verify )
keir@20452 1274 {
keir@20452 1275 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
keir@20452 1276 if ( res )
keir@20452 1277 {
keir@20452 1278 int v;
keir@20452 1279
keir@20452 1280 DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
keir@20452 1281 "actualcs=%08lx\n", pfn, pagebuf->pfn_types[pfn],
keir@20452 1282 csum_page(region_base + (i + curbatch)*PAGE_SIZE),
keir@20452 1283 csum_page(buf));
keir@20452 1284
keir@20452 1285 for ( v = 0; v < 4; v++ )
keir@20452 1286 {
keir@20452 1287 unsigned long *p = (unsigned long *)
keir@20452 1288 (region_base + i*PAGE_SIZE);
keir@20452 1289 if ( buf[v] != p[v] )
keir@20452 1290 DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
keir@20452 1291 }
keir@20452 1292 }
keir@20452 1293 }
keir@20452 1294
keir@20452 1295 if ( !hvm &&
keir@20452 1296 xc_add_mmu_update(xc_handle, mmu,
keir@20452 1297 (((unsigned long long)mfn) << PAGE_SHIFT)
keir@20452 1298 | MMU_MACHPHYS_UPDATE, pfn) )
keir@20452 1299 {
keir@20452 1300 ERROR("failed machphys update mfn=%lx pfn=%lx", mfn, pfn);
keir@20877 1301 goto err_mapped;
keir@20452 1302 }
keir@20452 1303 } /* end of 'batch' for loop */
keir@20452 1304
keir@20877 1305 rc = nraces;
keir@20452 1306
keir@20877 1307 err_mapped:
keir@20877 1308 munmap(region_base, j*PAGE_SIZE);
keir@20877 1309 free(pfn_err);
keir@20877 1310
keir@20877 1311 return rc;
keir@20452 1312 }
keir@20452 1313
Tim@14782 1314 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
Tim@14782 1315 unsigned int store_evtchn, unsigned long *store_mfn,
Tim@14782 1316 unsigned int console_evtchn, unsigned long *console_mfn,
keir@19677 1317 unsigned int hvm, unsigned int pae, int superpages)
mjw@1661 1318 {
kfraser@11295 1319 DECLARE_DOMCTL;
keir@16257 1320 int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
kaf24@9698 1321 unsigned long mfn, pfn;
mjw@1661 1322 unsigned int prev_pc, this_pc;
kaf24@9698 1323 int nraces = 0;
mjw@1661 1324
mjw@1661 1325 /* The new domain's shared-info frame number. */
mjw@1661 1326 unsigned long shared_info_frame;
cl349@2964 1327 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
keir@17918 1328 shared_info_any_t *old_shared_info =
keir@17918 1329 (shared_info_any_t *)shared_info_page;
keir@17918 1330 shared_info_any_t *new_shared_info;
kaf24@9698 1331
mjw@1661 1332 /* A copy of the CPU context of the guest. */
keir@17918 1333 vcpu_guest_context_any_t ctxt;
mjw@1661 1334
smh22@7740 1335 /* A table containing the type of each PFN (/not/ MFN!). */
mjw@1661 1336 unsigned long *pfn_type = NULL;
mjw@1661 1337
mjw@1661 1338 /* A table of MFNs to map in the current region */
kaf24@10314 1339 xen_pfn_t *region_mfn = NULL;
mjw@1661 1340
mjw@1661 1341 /* A copy of the pfn-to-mfn table frame list. */
kaf24@10314 1342 xen_pfn_t *p2m_frame_list = NULL;
Tim@14782 1343
cl349@6659 1344 /* A temporary mapping of the guest's start_info page. */
keir@17918 1345 start_info_any_t *start_info;
mjw@1661 1346
steven@13424 1347 /* Our mapping of the current region (batch) */
cl349@5014 1348 char *region_base;
mjw@1661 1349
kfraser@14808 1350 struct xc_mmu *mmu = NULL;
mjw@1661 1351
kaf24@4457 1352 struct mmuext_op pin[MAX_PIN_BATCH];
kaf24@9698 1353 unsigned int nr_pins;
kaf24@4457 1354
kfraser@14236 1355 uint64_t vcpumap = 1ULL;
kfraser@14236 1356 unsigned int max_vcpu_id = 0;
kfraser@14388 1357 int new_ctxt_format = 0;
smh22@7740 1358
keir@20452 1359 pagebuf_t pagebuf;
keir@20452 1360 tailbuf_t tailbuf, tmptail;
keir@20452 1361 void* vcpup;
keir@20452 1362
keir@20591 1363 static struct restore_ctx _ctx = {
keir@20591 1364 .live_p2m = NULL,
keir@20591 1365 .p2m = NULL,
keir@20591 1366 .no_superpage_mem = 0,
keir@20591 1367 };
keir@20591 1368 static struct restore_ctx *ctx = &_ctx;
keir@20589 1369 struct domain_info_context *dinfo = &ctx->dinfo;
keir@20589 1370
keir@20452 1371 pagebuf_init(&pagebuf);
keir@20452 1372 memset(&tailbuf, 0, sizeof(tailbuf));
keir@20457 1373 tailbuf.ishvm = hvm;
keir@20452 1374
steven@14732 1375 /* For info only */
keir@20587 1376 ctx->nr_pfns = 0;
steven@14732 1377
keir@20164 1378 /* Always try to allocate 2M pages for HVM */
keir@20164 1379 if ( hvm )
keir@20164 1380 superpages = 1;
keir@20164 1381
keir@20585 1382 if ( read_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
kfraser@14754 1383 {
kfraser@14754 1384 ERROR("read: p2m_size");
kfraser@14754 1385 goto out;
kfraser@14754 1386 }
keir@20585 1387 DPRINTF("xc_domain_restore start: p2m_size = %lx\n", dinfo->p2m_size);
smh22@7740 1388
Tim@15955 1389 if ( !get_platform_info(xc_handle, dom,
keir@20587 1390 &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
Tim@14782 1391 {
Tim@15955 1392 ERROR("Unable to get platform info.");
Tim@15955 1393 return 1;
Tim@15955 1394 }
Tim@15955 1395
Tim@15955 1396 /* The *current* word size of the guest isn't very interesting; for now
Tim@15955 1397 * assume the guest will be the same as we are. We'll fix that later
Tim@15955 1398 * if we discover otherwise. */
keir@20585 1399 dinfo->guest_width = sizeof(unsigned long);
keir@20587 1400 ctx->pt_levels = (dinfo->guest_width == 8) ? 4 : (ctx->pt_levels == 2) ? 2 : 3;
Tim@15955 1401
Tim@15955 1402 if ( !hvm )
Tim@15955 1403 {
Tim@15955 1404 /* Load the p2m frame list, plus potential extended info chunk */
keir@20591 1405 p2m_frame_list = load_p2m_frame_list(ctx,
keir@16257 1406 io_fd, &pae_extended_cr3, &ext_vcpucontext);
Tim@15955 1407 if ( !p2m_frame_list )
Tim@15955 1408 goto out;
Tim@15955 1409
Tim@15955 1410 /* Now that we know the word size, tell Xen about it */
Tim@14782 1411 memset(&domctl, 0, sizeof(domctl));
Tim@14782 1412 domctl.domain = dom;
Tim@14782 1413 domctl.cmd = XEN_DOMCTL_set_address_size;
keir@20585 1414 domctl.u.address_size.size = dinfo->guest_width * 8;
keir@16257 1415 frc = do_domctl(xc_handle, &domctl);
keir@16257 1416 if ( frc != 0 )
keir@14809 1417 {
Tim@14782 1418 ERROR("Unable to set guest address size.");
Tim@14782 1419 goto out;
Tim@14782 1420 }
Christian@14235 1421 }
smh22@7740 1422
cl349@5091 1423 /* We want zeroed memory so use calloc rather than malloc. */
keir@20587 1424 ctx->p2m = calloc(dinfo->p2m_size, sizeof(xen_pfn_t));
keir@20585 1425 pfn_type = calloc(dinfo->p2m_size, sizeof(unsigned long));
keir@17462 1426
keir@20887 1427 region_mfn = xc_memalign(PAGE_SIZE, ROUNDUP(
keir@17462 1428 MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
mjw@1661 1429
keir@20587 1430 if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
keir@19677 1431 (region_mfn == NULL) )
keir@14809 1432 {
kfraser@11814 1433 ERROR("memory alloc failed");
mjw@1661 1434 errno = ENOMEM;
mjw@1661 1435 goto out;
mjw@1661 1436 }
kaf24@9698 1437
keir@17462 1438 memset(region_mfn, 0,
keir@17462 1439 ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
keir@17462 1440
keir@14809 1441 if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
keir@14809 1442 {
kfraser@11895 1443 ERROR("Could not lock region_mfn");
iap10@2202 1444 goto out;
iap10@2202 1445 }
mjw@1661 1446
mjw@1661 1447 /* Get the domain's shared-info frame. */
kfraser@11295 1448 domctl.cmd = XEN_DOMCTL_getdomaininfo;
kfraser@11295 1449 domctl.domain = (domid_t)dom;
keir@14809 1450 if ( xc_domctl(xc_handle, &domctl) < 0 )
keir@14809 1451 {
kfraser@11814 1452 ERROR("Could not get information on new domain");
mjw@1661 1453 goto out;
mjw@1661 1454 }
kfraser@11295 1455 shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
mjw@1661 1456
steven@13424 1457 /* Mark all PFNs as invalid; we allocate on demand */
keir@20585 1458 for ( pfn = 0; pfn < dinfo->p2m_size; pfn++ )
keir@20587 1459 ctx->p2m[pfn] = INVALID_P2M_ENTRY;
smh22@7886 1460
kfraser@14808 1461 mmu = xc_alloc_mmu_updates(xc_handle, dom);
kfraser@14808 1462 if ( mmu == NULL )
kfraser@14808 1463 {
kfraser@11814 1464 ERROR("Could not initialise for MMU updates");
mjw@1661 1465 goto out;
mjw@1661 1466 }
mjw@1661 1467
smh22@7740 1468 DPRINTF("Reloading memory pages: 0%%\n");
mjw@1661 1469
mjw@1661 1470 /*
mjw@1661 1471 * Now simply read each saved frame into its new machine frame.
mjw@1661 1472 * We uncanonicalise page tables as we go.
mjw@1661 1473 */
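mjw@1661 1473     /*
mjw@1661 1473      * ("Canonical" page-table entries in the image carry PFNs rather than
mjw@1661 1473      * MFNs. For each present entry, uncanonicalize_pagetable() does roughly
mjw@1661 1473      * the following, with the masks from xg_save_restore.h:
mjw@1661 1473      *
mjw@1661 1473      *     pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
mjw@1661 1473      *     pte &= ~MADDR_MASK_X86;
mjw@1661 1473      *     pte |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT;
mjw@1661 1473      *
mjw@1661 1473      * i.e. the frame-number field is rewritten with the MFN just allocated
mjw@1661 1473      * for that PFN.)
mjw@1661 1473      */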
mjw@1661 1474 prev_pc = 0;
mjw@1661 1475
keir@14142 1476 n = m = 0;
keir@20452 1477 loadpages:
keir@14809 1478 for ( ; ; )
keir@14809 1479 {
keir@20452 1480 int j, curbatch;
mjw@1661 1481
keir@20585 1482 this_pc = (n * 100) / dinfo->p2m_size;
kaf24@1683 1483 if ( (this_pc - prev_pc) >= 5 )
kaf24@1683 1484 {
cl349@5091 1485 PPRINTF("\b\b\b\b%3d%%", this_pc);
mjw@1661 1486 prev_pc = this_pc;
mjw@1661 1487 }
mjw@1661 1488
keir@20452 1489 if ( !completed ) {
keir@20452 1490 pagebuf.nr_physpages = pagebuf.nr_pages = 0;
keir@20452 1491 if ( pagebuf_get_one(&pagebuf, io_fd, xc_handle, dom) < 0 ) {
keir@20452 1492             ERROR("Error when reading batch");
keir@20452 1493 goto out;
keir@20452 1494 }
mjw@1661 1495 }
keir@20452 1496 j = pagebuf.nr_pages;
mjw@1661 1497
cl349@5091 1498         PPRINTF("batch %d\n", j);
kaf24@9698 1499
keir@20452 1500 if ( j == 0 ) {
keir@20452 1501 /* catch vcpu updates */
keir@20452 1502 if (pagebuf.new_ctxt_format) {
keir@20452 1503 vcpumap = pagebuf.vcpumap;
keir@20452 1504 max_vcpu_id = pagebuf.max_vcpu_id;
keir@18929 1505 }
keir@20452 1506 /* should this be deferred? does it change? */
keir@20452 1507 if ( pagebuf.identpt )
keir@20452 1508 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
keir@20452 1509 if ( pagebuf.vm86_tss )
keir@20452 1510 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
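keir@20452 1510             /* (Both parameters back VMX real-mode emulation: an
keir@20452 1510              * identity-map page table and a vm86 TSS are needed while the
keir@20452 1510              * guest runs unpaged or in real mode on hardware without
keir@20452 1510              * unrestricted-guest support.) */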
kaf24@1683 1511 break; /* our work here is done */
mjw@1661 1512 }
mjw@1661 1513
keir@20452 1514 /* break pagebuf into batches */
keir@20452 1515 curbatch = 0;
keir@20452 1516 while ( curbatch < j ) {
keir@20452 1517 int brc;
kaf24@9698 1518
keir@20591 1519 brc = apply_batch(xc_handle, dom, ctx, region_mfn, pfn_type,
keir@20452 1520 pae_extended_cr3, hvm, mmu, &pagebuf, curbatch, superpages);
keir@20452 1521 if ( brc < 0 )
mjw@1661 1522 goto out;
mjw@1661 1523
keir@20452 1524 nraces += brc;
smh22@7740 1525
keir@20452 1526 curbatch += MAX_BATCH_SIZE;
keir@20452 1527 }
mjw@1661 1528
keir@20452 1529 pagebuf.nr_physpages = pagebuf.nr_pages = 0;
mjw@1661 1530
keir@20452 1531 n += j; /* crude stats */
keir@14142 1532
keir@14142 1533         /*
keir@14142 1534          * Every 16MB or so, discard the page cache for the portion of the
keir@14142 1535          * file read so far, up to the last page boundary.
keir@14142 1536          */
keir@14142 1537 m += j;
keir@14142 1538 if ( m > MAX_PAGECACHE_USAGE )
keir@14142 1539 {
keir@14142 1540 discard_file_cache(io_fd, 0 /* no flush */);
keir@14142 1541 m = 0;
keir@14142 1542 }
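keir@14142 1542         /* (discard_file_cache() presumably maps to posix_fadvise(...,
keir@14142 1542          * POSIX_FADV_DONTNEED), keeping a multi-GB image from evicting
keir@14142 1542          * the rest of the host's page cache.) */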
mjw@1661 1543 }
mjw@1661 1544
kfraser@10383 1545 /*
kfraser@10383 1546 * Ensure we flush all machphys updates before potential PAE-specific
kfraser@10383 1547 * reallocations below.
kfraser@10383 1548 */
keir@14809 1549 if ( !hvm && xc_flush_mmu_updates(xc_handle, mmu) )
keir@14809 1550 {
kfraser@14808 1551 ERROR("Error doing flush_mmu_updates()");
kfraser@10383 1552 goto out;
kfraser@10383 1553 }
kfraser@10383 1554
keir@20452 1555 // DPRINTF("Received all pages (%d races)\n", nraces);
mjw@1661 1556
keir@20452 1557 if ( !completed ) {
keir@20453 1558 int flags = 0;
keir@20453 1559
keir@20591 1560 if ( buffer_tail(ctx, &tailbuf, io_fd, max_vcpu_id, vcpumap,
keir@20452 1561 ext_vcpucontext) < 0 ) {
keir@20452 1562             ERROR("error buffering image tail");
keir@20452 1563 goto out;
keir@20452 1564 }
keir@20452 1565 completed = 1;
keir@20453 1566 /* shift into nonblocking mode for the remainder */
keir@20453 1567 if ( (flags = fcntl(io_fd, F_GETFL,0)) < 0 )
keir@20453 1568 flags = 0;
keir@20453 1569 fcntl(io_fd, F_SETFL, flags | O_NONBLOCK);
keir@20452 1570 }
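keir@20452 1570     /* (From here on the stream is a sequence of Remus checkpoints.
keir@20452 1570      * With O_NONBLOCK set, a dead or stalled primary shows up as a
keir@20452 1570      * failed read below, and we fail over to the last complete
keir@20452 1570      * checkpoint buffered above rather than blocking indefinitely.) */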
keir@20452 1571
keir@20452 1572 // DPRINTF("Buffered checkpoint\n");
keir@20452 1573
keir@20452 1574 if ( pagebuf_get(&pagebuf, io_fd, xc_handle, dom) ) {
keir@20452 1575         ERROR("error when buffering batch, finishing");
keir@20452 1576 goto finish;
keir@20452 1577 }
keir@20452 1578 memset(&tmptail, 0, sizeof(tmptail));
keir@20457 1579 tmptail.ishvm = hvm;
keir@20591 1580 if ( buffer_tail(ctx, &tmptail, io_fd, max_vcpu_id, vcpumap,
keir@20452 1581 ext_vcpucontext) < 0 ) {
keir@20452 1582         ERROR("error buffering image tail, finishing");
keir@20452 1583 goto finish;
keir@20452 1584 }
keir@20452 1585 tailbuf_free(&tailbuf);
keir@20452 1586 memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
keir@20452 1587
keir@20452 1588 goto loadpages;
keir@20452 1589
keir@20452 1590 finish:
keir@20457 1591 if ( hvm )
keir@20457 1592 goto finish_hvm;
keir@20452 1593
keir@20587 1594 if ( (ctx->pt_levels == 3) && !pae_extended_cr3 )
keir@14809 1595 {
kaf24@10304 1596 /*
kaf24@10304 1597 ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
kaf24@10304 1598 ** is a little awkward and involves (a) finding all such PGDs and
kaf24@10304 1599     ** replacing them with 'lowmem' versions; (b) updating the p2m[]
kaf24@10304 1600     ** with the new info; and (c) uncanonicalizing all the L1s using the
kaf24@10304 1601 ** (potentially updated) p2m[].
kaf24@10304 1602 **
kaf24@10304 1603 ** This is relatively slow (and currently involves two passes through
kaf24@10304 1604 ** the pfn_type[] array), but at least seems to be correct. May wish
kaf24@10304 1605 ** to consider more complex approaches to optimize this later.
kaf24@10304 1606 */
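kaf24@10304 1606         /* (The 4G restriction exists because a PAE guest's CR3 holds only
kaf24@10304 1606          * a 32-bit address for its top-level page directory unless the
kaf24@10304 1606          * guest understands Xen's extended-cr3 format, tested above.) */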
kaf24@10304 1607
kaf24@10304 1608 int j, k;
steven@13424 1609
kaf24@10304 1610         /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
keir@20585 1611 for ( i = 0; i < dinfo->p2m_size; i++ )
kfraser@11295 1612 {
kfraser@11295 1613 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
kfraser@11295 1614 XEN_DOMCTL_PFINFO_L3TAB) &&
keir@20587 1615 (ctx->p2m[i] > 0xfffffUL) )
kfraser@11295 1616 {
kaf24@10304 1617 unsigned long new_mfn;
kaf24@10304 1618 uint64_t l3ptes[4];
kaf24@10304 1619 uint64_t *l3tab;
kaf24@10304 1620
kaf24@10304 1621 l3tab = (uint64_t *)
kaf24@10304 1622 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
keir@20587 1623 PROT_READ, ctx->p2m[i]);
kaf24@10304 1624
keir@14809 1625 for ( j = 0; j < 4; j++ )
kaf24@10304 1626 l3ptes[j] = l3tab[j];
kaf24@10304 1627
kaf24@10304 1628 munmap(l3tab, PAGE_SIZE);
kaf24@10304 1629
keir@20587 1630 new_mfn = xc_make_page_below_4G(xc_handle, dom, ctx->p2m[i]);
keir@14809 1631 if ( !new_mfn )
keir@14809 1632 {
kfraser@11814 1633 ERROR("Couldn't get a page below 4GB :-(");
kaf24@10304 1634 goto out;
kaf24@10304 1635 }
kaf24@10304 1636
keir@20587 1637 ctx->p2m[i] = new_mfn;
keir@14809 1638 if ( xc_add_mmu_update(xc_handle, mmu,
keir@14809 1639 (((unsigned long long)new_mfn)
keir@14809 1640 << PAGE_SHIFT) |
keir@14809 1641 MMU_MACHPHYS_UPDATE, i) )
keir@14809 1642 {
kfraser@11814 1643 ERROR("Couldn't m2p on PAE root pgdir");
kaf24@10304 1644 goto out;
kaf24@10304 1645 }
kaf24@10304 1646
kaf24@10304 1647 l3tab = (uint64_t *)
kaf24@10304 1648 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
keir@20587 1649 PROT_READ | PROT_WRITE, ctx->p2m[i]);
kaf24@10304 1650
keir@14809 1651 for ( j = 0; j < 4; j++ )
kaf24@10304 1652 l3tab[j] = l3ptes[j];
kaf24@10304 1653
kaf24@10304 1654 munmap(l3tab, PAGE_SIZE);
kaf24@10304 1655 }
kaf24@10304 1656 }
kaf24@10304 1657
kaf24@10304 1658 /* Second pass: find all L1TABs and uncanonicalize them */
kaf24@10304 1659 j = 0;
kaf24@10304 1660
keir@20585 1661 for ( i = 0; i < dinfo->p2m_size; i++ )
kfraser@11295 1662 {
kfraser@11295 1663 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
kfraser@11295 1664 XEN_DOMCTL_PFINFO_L1TAB) )
kfraser@11295 1665 {
keir@20587 1666 region_mfn[j] = ctx->p2m[i];
kaf24@10304 1667 j++;
kaf24@10304 1668 }
kaf24@10304 1669
keir@20585 1670 if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) )
keir@14809 1671 {
keir@20837 1672 region_base = xc_map_foreign_pages(
keir@14809 1673 xc_handle, dom, PROT_READ | PROT_WRITE, region_mfn, j);
keir@14809 1674 if ( region_base == NULL )
keir@14809 1675 {
kfraser@11814 1676 ERROR("map batch failed");
kaf24@10304 1677 goto out;
kaf24@10304 1678 }
kaf24@10304 1679
keir@14809 1680 for ( k = 0; k < j; k++ )
keir@14809 1681 {
keir@14809 1682 if ( !uncanonicalize_pagetable(
keir@20837 1683 xc_handle, dom, ctx,
keir@19677 1684 region_base + k*PAGE_SIZE, superpages) )
keir@14809 1685 {
kfraser@11814 1686 ERROR("failed uncanonicalize pt!");
kaf24@10304 1687 goto out;
kaf24@10304 1688 }
kaf24@10304 1689 }
kaf24@10304 1690
kaf24@10304 1691 munmap(region_base, j*PAGE_SIZE);
kaf24@10304 1692 j = 0;
kaf24@10304 1693 }
kaf24@10304 1694 }
kaf24@10304 1695
keir@14809 1696 if ( xc_flush_mmu_updates(xc_handle, mmu) )
keir@14809 1697 {
kfraser@14808 1698 ERROR("Error doing xc_flush_mmu_updates()");
kfraser@10383 1699 goto out;
kfraser@10383 1700 }
kaf24@10304 1701 }
kaf24@10304 1702
mjw@1661 1703 /*
mjw@1661 1704 * Pin page tables. Do this after writing to them as otherwise Xen
mjw@1661 1705 * will barf when doing the type-checking.
mjw@1661 1706 */
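mjw@1661 1706     /* (Pinning makes Xen validate the frame as an L1..L4 table and take a
mjw@1661 1706      * type reference so it stays a page table; validation can only succeed
mjw@1661 1706      * once the entries hold real MFNs, hence pinning comes last.) */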
kaf24@9698 1707 nr_pins = 0;
keir@20585 1708 for ( i = 0; i < dinfo->p2m_size; i++ )
kfraser@11295 1709 {
kfraser@11295 1710 if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
kaf24@4457 1711 continue;
smh22@7886 1712
kfraser@11295 1713 switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
kfraser@11295 1714 {
kfraser@11295 1715 case XEN_DOMCTL_PFINFO_L1TAB:
kaf24@4457 1716 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
kaf24@9698 1717 break;
kaf24@9698 1718
kfraser@11295 1719 case XEN_DOMCTL_PFINFO_L2TAB:
kaf24@4457 1720 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
kaf24@9698 1721 break;
kaf24@9698 1722
kfraser@11295 1723 case XEN_DOMCTL_PFINFO_L3TAB:
smh22@7740 1724 pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
kaf24@9698 1725 break;
smh22@7740 1726
kfraser@11295 1727 case XEN_DOMCTL_PFINFO_L4TAB:
smh22@7740 1728 pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
kaf24@9698 1729 break;
kaf24@9698 1730
kaf24@9698 1731 default:
kaf24@9698 1732 continue;
smh22@7740 1733 }
smh22@7740 1734
keir@20587 1735 pin[nr_pins].arg1.mfn = ctx->p2m[i];
kaf24@9698 1736 nr_pins++;
smh22@7886 1737
kaf24@10513 1738 /* Batch full? Then flush. */
keir@14809 1739 if ( nr_pins == MAX_PIN_BATCH )
keir@14809 1740 {
keir@14809 1741 if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
keir@14809 1742 {
kfraser@11814 1743 ERROR("Failed to pin batch of %d page tables", nr_pins);
kaf24@10513 1744 goto out;
kaf24@10513 1745 }
kaf24@10513 1746 nr_pins = 0;
kaf24@10513 1747 }
kaf24@10513 1748 }
kaf24@10513 1749
kaf24@10513 1750 /* Flush final partial batch. */
keir@14809 1751 if ( (nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
keir@14809 1752 {
kfraser@11814 1753 ERROR("Failed to pin batch of %d page tables", nr_pins);
kaf24@10513 1754 goto out;
iap10@2507 1755 }
mjw@1661 1756
cl349@5091 1757 DPRINTF("\b\b\b\b100%%\n");
keir@20587 1758 DPRINTF("Memory reloaded (%ld pages)\n", ctx->nr_pfns);
mjw@1661 1759
iap10@2291 1760     /* Get the list of PFNs that are not in the pseudo-phys map */
iap10@2291 1761 {
keir@20452 1762 int nr_frees = 0;
kaf24@2624 1763
keir@20457 1764 for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
keir@14809 1765 {
keir@20457 1766 unsigned long pfn = tailbuf.u.pv.pfntab[i];
smh22@7740 1767
keir@20587 1768 if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY )
keir@14809 1769 {
keir@20452 1770 /* pfn is not in physmap now, but was at some point during
steven@13424 1771 the save/migration process - need to free it */
keir@20587 1772 tailbuf.u.pv.pfntab[nr_frees++] = ctx->p2m[pfn];
keir@20587 1773 ctx->p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
steven@13424 1774 }
kaf24@6775 1775 }
kaf24@9698 1776
keir@14809 1777 if ( nr_frees > 0 )
keir@14809 1778 {
kaf24@6506 1779 struct xen_memory_reservation reservation = {
steven@13424 1780 .nr_extents = nr_frees,
kaf24@6506 1781 .extent_order = 0,
kaf24@6510 1782 .domid = dom
kaf24@6506 1783 };
keir@20457 1784 set_xen_guest_handle(reservation.extent_start, tailbuf.u.pv.pfntab);
smh22@7740 1785
keir@16257 1786 if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
keir@16257 1787 &reservation)) != nr_frees )
keir@14809 1788 {
keir@16257 1789 ERROR("Could not decrease reservation : %d", frc);
kaf24@6775 1790 goto out;
keir@14809 1791 }
keir@14809 1792 else
keir@20457 1793 DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
kaf24@9698 1794 }
iap10@2291 1795 }
iap10@2291 1796
keir@17462 1797 if ( lock_pages(&ctxt, sizeof(ctxt)) )
keir@17462 1798 {
keir@17462 1799 ERROR("Unable to lock ctxt");
keir@17462 1800         goto out;
keir@17462 1801 }
keir@17462 1802
keir@20457 1803 vcpup = tailbuf.u.pv.vcpubuf;
keir@14809 1804 for ( i = 0; i <= max_vcpu_id; i++ )
keir@14809 1805 {
keir@14809 1806 if ( !(vcpumap & (1ULL << i)) )
kfraser@14236 1807 continue;
kfraser@14236 1808
keir@20585 1809 memcpy(&ctxt, vcpup, ((dinfo->guest_width == 8) ? sizeof(ctxt.x64)
keir@20452 1810 : sizeof(ctxt.x32)));
keir@20585 1811 vcpup += (dinfo->guest_width == 8) ? sizeof(ctxt.x64) : sizeof(ctxt.x32);
keir@20452 1812
keir@20452 1813 DPRINTF("read VCPU %d\n", i);
kfraser@14236 1814
kfraser@14388 1815 if ( !new_ctxt_format )
Tim@15955 1816 SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
kfraser@14388 1817
keir@14809 1818 if ( i == 0 )
keir@14809 1819 {
kfraser@14236 1820 /*
kfraser@14236 1821 * Uncanonicalise the suspend-record frame number and poke
kfraser@14236 1822 * resume record.
kfraser@14236 1823 */
Tim@15955 1824 pfn = GET_FIELD(&ctxt, user_regs.edx);
keir@20585 1825 if ( (pfn >= dinfo->p2m_size) ||
keir@14809 1826 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
keir@14809 1827 {
kfraser@14236 1828 ERROR("Suspend record frame number is bad");
kfraser@14236 1829 goto out;
kfraser@14236 1830 }
keir@20587 1831 mfn = ctx->p2m[pfn];
Tim@15955 1832 SET_FIELD(&ctxt, user_regs.edx, mfn);
kfraser@14236 1833 start_info = xc_map_foreign_range(
kfraser@14236 1834 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
keir@20585 1835 SET_FIELD(start_info, nr_pages, dinfo->p2m_size);
Tim@15955 1836 SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT);
Tim@15955 1837 SET_FIELD(start_info, flags, 0);
keir@20587 1838 *store_mfn = ctx->p2m[GET_FIELD(start_info, store_mfn)];
Tim@15955 1839 SET_FIELD(start_info, store_mfn, *store_mfn);
Tim@15955 1840 SET_FIELD(start_info, store_evtchn, store_evtchn);
keir@20587 1841 *console_mfn = ctx->p2m[GET_FIELD(start_info, console.domU.mfn)];
Tim@15955 1842 SET_FIELD(start_info, console.domU.mfn, *console_mfn);
Tim@15955 1843 SET_FIELD(start_info, console.domU.evtchn, console_evtchn);
kfraser@14236 1844 munmap(start_info, PAGE_SIZE);
kfraser@14236 1845 }
kfraser@14236 1846 /* Uncanonicalise each GDT frame number. */
Tim@15955 1847 if ( GET_FIELD(&ctxt, gdt_ents) > 8192 )
keir@14809 1848 {
kfraser@14236 1849 ERROR("GDT entry count out of range");
kfraser@14236 1850 goto out;
kfraser@14236 1851 }
kfraser@14236 1852
Tim@15955 1853 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
keir@14809 1854 {
Tim@15955 1855 pfn = GET_FIELD(&ctxt, gdt_frames[j]);
keir@20585 1856 if ( (pfn >= dinfo->p2m_size) ||
keir@14809 1857 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
keir@14809 1858 {
Tim@15955 1859 ERROR("GDT frame number %i (0x%lx) is bad",
Tim@15955 1860 j, (unsigned long)pfn);
kfraser@14236 1861 goto out;
kfraser@14236 1862 }
keir@20587 1863 SET_FIELD(&ctxt, gdt_frames[j], ctx->p2m[pfn]);
kfraser@14236 1864 }
kfraser@14236 1865 /* Uncanonicalise the page table base pointer. */
keir@17074 1866 pfn = UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3]));
kfraser@14236 1867
keir@20585 1868 if ( pfn >= dinfo->p2m_size )
keir@14809 1869 {
steven@14732 1870             ERROR("PT base is bad: pfn=%lu p2m_size=%lu",
keir@20585 1871                   pfn, dinfo->p2m_size);
kfraser@14236 1872 goto out;
kfraser@14236 1873 }
kfraser@14236 1874
kfraser@14236 1875 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
keir@20587 1876 ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
keir@14809 1877 {
kfraser@14236 1878 ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
keir@20585 1879 pfn, dinfo->p2m_size, pfn_type[pfn],
keir@20587 1880 (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
kfraser@14236 1881 goto out;
kfraser@14236 1882 }
keir@20587 1883 SET_FIELD(&ctxt, ctrlreg[3], FOLD_CR3(ctx->p2m[pfn]));
kfraser@14236 1884
keir@14254 1885 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
keir@20587 1886 if ( (ctx->pt_levels == 4) && (ctxt.x64.ctrlreg[1] & 1) )
keir@14254 1887 {
keir@17074 1888 pfn = UNFOLD_CR3(ctxt.x64.ctrlreg[1] & ~1);
keir@20585 1889 if ( pfn >= dinfo->p2m_size )
keir@14809 1890 {
Tim@15955 1891 ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
keir@20585 1892 pfn, dinfo->p2m_size);
keir@14254 1893 goto out;
keir@14254 1894 }
keir@14254 1895 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
keir@20587 1896 ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
keir@14809 1897 {
keir@14254 1898 ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
keir@20585 1899 pfn, dinfo->p2m_size, pfn_type[pfn],
keir@20587 1900 (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
keir@14254 1901 goto out;
keir@14254 1902 }
keir@20587 1903 ctxt.x64.ctrlreg[1] = FOLD_CR3(ctx->p2m[pfn]);
keir@14254 1904 }
kfraser@14236 1905 domctl.cmd = XEN_DOMCTL_setvcpucontext;
kfraser@14236 1906 domctl.domain = (domid_t)dom;
kfraser@14236 1907 domctl.u.vcpucontext.vcpu = i;
Tim@15955 1908 set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt.c);
keir@16257 1909 frc = xc_domctl(xc_handle, &domctl);
keir@16257 1910 if ( frc != 0 )
keir@14809 1911 {
kfraser@14236 1912 ERROR("Couldn't build vcpu%d", i);
kfraser@14236 1913 goto out;
kfraser@14236 1914 }
keir@16257 1915
keir@16257 1916 if ( !ext_vcpucontext )
keir@16257 1917 continue;
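keir@16257 1917         /* (The extended vcpu context appears in the stream as a fixed
keir@16257 1917          * 128-byte record, hence the hard-coded size here.) */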
keir@20452 1918 memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
keir@20452 1919 vcpup += 128;
keir@16257 1920 domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
keir@16257 1921 domctl.domain = dom;
keir@16257 1922 frc = xc_domctl(xc_handle, &domctl);
keir@16257 1923 if ( frc != 0 )
keir@16257 1924 {
keir@16257 1925             ERROR("Couldn't set extended vcpu%d info", i);
keir@16257 1926 goto out;
keir@16257 1927 }
mjw@1661 1928 }
mjw@1661 1929
keir@20457 1930 memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
keir@20452 1931
keir@20452 1932 DPRINTF("Completed checkpoint load\n");
mjw@1661 1933
ian@15744 1934 /* Restore contents of shared-info page. No checking needed. */
ian@15744 1935 new_shared_info = xc_map_foreign_range(
ian@15744 1936 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
ian@15744 1937
ian@15744 1938 /* restore saved vcpu_info and arch specific info */
Tim@15955 1939 MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info);
Tim@15955 1940 MEMCPY_FIELD(new_shared_info, old_shared_info, arch);
mjw@1661 1941
ian@15744 1942 /* clear any pending events and the selector */
Tim@15955 1943 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0);
keir@19826 1944 for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
Tim@15955 1945 SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0);
ian@15744 1946
ian@15744 1947 /* mask event channels */
Tim@15955 1948 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff);
ian@15744 1949
ian@15744 1950     /* leave wallclock time alone; it is set by the hypervisor */
ian@15744 1951 munmap(new_shared_info, PAGE_SIZE);
kaf24@9698 1952
mjw@1661 1953 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
keir@14809 1954 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
keir@14809 1955 {
smh22@7740 1956 pfn = p2m_frame_list[i];
keir@20585 1957 if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
keir@14809 1958 {
Tim@15955 1959 ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
mjw@1661 1960 goto out;
mjw@1661 1961 }
keir@20587 1962 p2m_frame_list[i] = ctx->p2m[pfn];
mjw@1661 1963 }
kaf24@9698 1964
smh22@7740 1965 /* Copy the P2M we've constructed to the 'live' P2M */
keir@20837 1966 if ( !(ctx->live_p2m = xc_map_foreign_pages(xc_handle, dom, PROT_WRITE,
keir@14809 1967 p2m_frame_list, P2M_FL_ENTRIES)) )
keir@14809 1968 {
kfraser@11814 1969 ERROR("Couldn't map p2m table");
mjw@1661 1970 goto out;
mjw@1661 1971 }
mjw@1661 1972
Tim@15955 1973     /* If the domain we're restoring has a different word size from ours,
keir@18367 1974      * we need to adjust the live_p2m assignment appropriately */
keir@20585 1975 if ( dinfo->guest_width > sizeof (xen_pfn_t) )
keir@20585 1976 for ( i = dinfo->p2m_size - 1; i >= 0; i-- )
keir@20587 1977 ((int64_t *)ctx->live_p2m)[i] = (long)ctx->p2m[i];
keir@20585 1978 else if ( dinfo->guest_width < sizeof (xen_pfn_t) )
keir@20585 1979 for ( i = 0; i < dinfo->p2m_size; i++ )
keir@20587 1980 ((uint32_t *)ctx->live_p2m)[i] = ctx->p2m[i];
keir@18367 1981 else
keir@20587 1982 memcpy(ctx->live_p2m, ctx->p2m, dinfo->p2m_size * sizeof(xen_pfn_t));
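keir@20587 1982     /* (E.g. when 32-bit tools restore a 64-bit guest, each 4-byte
keir@20587 1982      * xen_pfn_t is widened with sign-extension into the guest's 8-byte
keir@20587 1982      * entries, so INVALID_P2M_ENTRY (~0) stays all-ones; the converse
keir@20587 1982      * case narrows each entry to 32 bits instead.) */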
keir@20587 1983 munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
mjw@1661 1984
cl349@5091 1985 DPRINTF("Domain ready to be built.\n");
Tim@14782 1986 rc = 0;
keir@20457 1987 goto out;
keir@20457 1988
keir@20457 1989 finish_hvm:
keir@20457 1990 /* Dump the QEMU state to a state file for QEMU to load */
keir@20457 1991 if ( dump_qemu(dom, &tailbuf.u.hvm) ) {
keir@20457 1992 ERROR("Error dumping QEMU state to file");
keir@20457 1993 goto out;
keir@20457 1994 }
keir@20457 1995
keir@20457 1996 /* These comms pages need to be zeroed at the start of day */
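keir@20457 1996     /* (magicpfns[0..2] are the ioreq, buffered-ioreq and xenstore pages,
keir@20457 1996      * matching the HVM_PARAM settings below.) */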
keir@20457 1997 if ( xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[0]) ||
keir@20457 1998 xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[1]) ||
keir@20457 1999 xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[2]) )
keir@20457 2000 {
keir@20457 2001 ERROR("error zeroing magic pages");
keir@20457 2002 goto out;
keir@20457 2003 }
keir@20457 2004
keir@20457 2005 if ( (frc = xc_set_hvm_param(xc_handle, dom,
keir@20457 2006 HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
keir@20457 2007 || (frc = xc_set_hvm_param(xc_handle, dom,
keir@20457 2008 HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
keir@20457 2009 || (frc = xc_set_hvm_param(xc_handle, dom,
keir@20457 2010 HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
keir@20457 2011 || (frc = xc_set_hvm_param(xc_handle, dom,
keir@20457 2012 HVM_PARAM_PAE_ENABLED, pae))
keir@20457 2013 || (frc = xc_set_hvm_param(xc_handle, dom,
keir@20457 2014 HVM_PARAM_STORE_EVTCHN,
keir@20457 2015 store_evtchn)) )
keir@20457 2016 {
keir@20457 2017 ERROR("error setting HVM params: %i", frc);
keir@20457 2018 goto out;
keir@20457 2019 }
keir@20457 2020 *store_mfn = tailbuf.u.hvm.magicpfns[2];
keir@20457 2021
keir@20457 2022 frc = xc_domain_hvm_setcontext(xc_handle, dom, tailbuf.u.hvm.hvmbuf,
keir@20457 2023 tailbuf.u.hvm.reclen);
keir@20457 2024 if ( frc )
keir@20457 2025 {
keir@20457 2026 ERROR("error setting the HVM context");
keir@20457 2027 goto out;
keir@20457 2028 }
keir@20457 2029
keir@20457 2030 /* HVM success! */
keir@20457 2031 rc = 0;
cl349@2791 2032
mjw@1661 2033 out:
kaf24@1683 2034 if ( (rc != 0) && (dom != 0) )
mjw@1661 2035 xc_domain_destroy(xc_handle, dom);
vh249@6159 2036 free(mmu);
keir@20587 2037 free(ctx->p2m);
vh249@6159 2038 free(pfn_type);
keir@20457 2039 tailbuf_free(&tailbuf);
mjw@1661 2040
keir@14142 2041 /* discard cache for save file */
keir@14142 2042 discard_file_cache(io_fd, 1 /*flush*/);
keir@14142 2043
cl349@5091 2044 DPRINTF("Restore exit with rc=%d\n", rc);
steven@13424 2045
mjw@1661 2046 return rc;
mjw@1661 2047 }
keir@19677 2048 /*
keir@19677 2049 * Local variables:
keir@19677 2050 * mode: C
keir@19677 2051 * c-set-style: "BSD"
keir@19677 2052 * c-basic-offset: 4
keir@19677 2053 * tab-width: 4
keir@19677 2054 * indent-tabs-mode: nil
keir@19677 2055 * End:
keir@19677 2056 */