#include <assert.h>
#include <limits.h>

#include "xc_sr_common_x86_pv.h"

/* Check a 64 bit virtual address for being canonical. */
static inline bool is_canonical_address(xen_vaddr_t vaddr)
{
    return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
}
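
/*
 * Illustrative note (added, not from the original source): with 48 bit
 * virtual addresses, "canonical" means bits 63:47 all equal bit 47, so e.g.
 *   is_canonical_address(0x00007fffffffffffULL) -> true
 *   is_canonical_address(0xffff800000000000ULL) -> true
 *   is_canonical_address(0x0000800000000000ULL) -> false (bit 47 set, but
 *                                                         the upper bits are
 *                                                         clear)
 */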

/*
 * Maps the guest's shared info page.
 */
static int map_shinfo(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;

    ctx->x86.pv.shinfo = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
    if ( !ctx->x86.pv.shinfo )
    {
        PERROR("Failed to map shared info frame at mfn %#lx",
               ctx->dominfo.shared_info_frame);
        return -1;
    }

    return 0;
}

/*
 * Copy a list of mfns from a guest, accounting for differences between guest
 * and toolstack width. Can fail if truncation would occur.
 */
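/*
 * Reader note (added): ctx->x86.pv.width is the guest's word size in bytes
 * (4 for 32bit guests, 8 for 64bit guests), which is also the size of a
 * guest-side mfn entry.
 */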
static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
                                xen_pfn_t *dst, const void *src, size_t count)
{
    size_t x;

    if ( ctx->x86.pv.width == sizeof(unsigned long) )
        memcpy(dst, src, count * sizeof(*dst));
    else
    {
        for ( x = 0; x < count; ++x )
        {
#ifdef __x86_64__
            /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
            uint32_t s = ((uint32_t *)src)[x];

            dst[x] = s == ~0U ? INVALID_MFN : s;
#else
            /*
             * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
             * if any other truncation would occur.
             *
             * This will only occur on hosts where a PV guest has ram above
             * the 16TB boundary. A 32bit dom0 is unlikely to have
             * successfully booted on a system this large.
             */
            uint64_t s = ((uint64_t *)src)[x];

            if ( (s != ~0ULL) && ((s >> 32) != 0) )
            {
                errno = E2BIG;
                return -1;
            }

            dst[x] = s;
#endif
        }
    }

    return 0;
}

/*
 * Map the p2m leaf pages and build an array of their pfns.
 */
static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
                          size_t n_mfns)
{
    xc_interface *xch = ctx->xch;
    unsigned int x;

    ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
                                           mfns, n_mfns);
    if ( !ctx->x86.pv.p2m )
    {
        PERROR("Failed to map p2m frames");
        return -1;
    }

    ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
    ctx->x86.pv.p2m_frames = n_mfns;
    ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
    if ( !ctx->x86.pv.p2m_pfns )
    {
        ERROR("Cannot allocate %zu bytes for p2m pfns list",
              n_mfns * sizeof(*mfns));
        return -1;
    }

    /* Convert leaf frames from mfns to pfns. */
    for ( x = 0; x < n_mfns; ++x )
    {
        if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
        {
            ERROR("Bad mfn in p2m_frame_list[%u]", x);
            dump_bad_pseudophysmap_entry(ctx, mfns[x]);
            errno = ERANGE;
            return -1;
        }

        ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
    }

    return 0;
}

/*
 * Walk the guest's frame list list and frame list to identify and map the
 * frames making up the guest's p2m table. Construct a list of pfns making up
 * the table.
 */
static int map_p2m_tree(struct xc_sr_context *ctx)
{
    /* Terminology:
     *
     * fll   - frame list list, top level p2m, list of fl mfns
     * fl    - frame list, mid level p2m, list of leaf mfns
     * local - own allocated buffers, adjusted for bitness
     * guest - mappings into the domain
     */
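
    /*
     * Rough shape of the structure walked below (added as a reading aid;
     * figures assume a 64bit guest, i.e. fpp == 512 eight-byte entries per
     * page):
     *
     *   shinfo->arch.pfn_to_mfn_frame_list_list   (fll, a single page)
     *     -> up to fpp fl mfns                    (mid level pages)
     *        -> up to fpp leaf mfns per fl page   (p2m leaf pages)
     *           -> fpp p2m entries per leaf page
     *
     * As the fll is a single page it holds at most fpp entries, and each of
     * them covers fpp * fpp pfns (1GiB of guest memory with 4kB pages when
     * fpp == 512).
     */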
    xc_interface *xch = ctx->xch;
    int rc = -1;
    unsigned int x, saved_x, fpp, fll_entries, fl_entries;
    xen_pfn_t fll_mfn, saved_mfn, max_pfn;

    xen_pfn_t *local_fll = NULL;
    void *guest_fll = NULL;
    size_t local_fll_size;

    xen_pfn_t *local_fl = NULL;
    void *guest_fl = NULL;
    size_t local_fl_size;

    fpp = PAGE_SIZE / ctx->x86.pv.width;
    fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
    if ( fll_entries > fpp )
    {
        ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
        goto err;
    }

    fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
                        ctx->x86.pv.width);
    if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
    {
        ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
        goto err;
    }

    /* Map the guest top p2m. */
    guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
                                     PROT_READ, fll_mfn);
    if ( !guest_fll )
    {
        PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
        goto err;
    }

    local_fll_size = fll_entries * sizeof(*local_fll);
    local_fll = malloc(local_fll_size);
    if ( !local_fll )
    {
        ERROR("Cannot allocate %zu bytes for local p2m frame list list",
              local_fll_size);
        goto err;
    }

    if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
    {
        ERROR("Truncation detected copying p2m frame list list");
        goto err;
    }

    /* Check for bad mfns in frame list list. */
    saved_mfn = 0;
    saved_x = 0;
    for ( x = 0; x < fll_entries; ++x )
    {
        if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn )
        {
            ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list",
                  local_fll[x], x, fll_entries);
            goto err;
        }
        if ( local_fll[x] != saved_mfn )
        {
            saved_mfn = local_fll[x];
            saved_x = x;
        }
    }

    /*
     * Check for an effectively lower max_pfn:
     * If the trailing entries of the frame list list are all identical, we
     * can assume they all reference mid pages which in turn reference p2m
     * pages containing only invalid entries. Otherwise multiple pfns would
     * reference the same mfn, which cannot work across migration, as such
     * sharing would be broken by the migration process.
     * Adjust max_pfn where possible to avoid allocating larger areas than
     * needed for the p2m and the logdirty map.
     */
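    /*
     * Worked example (added comment, assuming fpp == 512): if the last
     * distinct fll entry sits at saved_x == 3, max_pfn below becomes
     * 4 * 512 * 512 - 1 == 0xfffff, trimming the p2m to cover 4GiB of guest
     * memory.
     */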
    max_pfn = (saved_x + 1) * fpp * fpp - 1;
    if ( max_pfn < ctx->x86.pv.max_pfn )
    {
        ctx->x86.pv.max_pfn = max_pfn;
        fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
    }
    ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
    DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
            ctx->x86.pv.p2m_frames);
    fl_entries = (ctx->x86.pv.max_pfn / fpp) + 1;

    /* Map the guest mid p2m frames. */
    guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
                                    local_fll, fll_entries);
    if ( !guest_fl )
    {
        PERROR("Failed to map p2m frame list");
        goto err;
    }

    local_fl_size = fl_entries * sizeof(*local_fl);
    local_fl = malloc(local_fl_size);
    if ( !local_fl )
    {
        ERROR("Cannot allocate %zu bytes for local p2m frame list",
              local_fl_size);
        goto err;
    }

    if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) )
    {
        ERROR("Truncation detected copying p2m frame list");
        goto err;
    }

    for ( x = 0; x < fl_entries; ++x )
    {
        if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn )
        {
            ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list",
                  local_fl[x], x, fl_entries);
            goto err;
        }
    }

    /* Map the p2m leaves themselves. */
    rc = map_p2m_leaves(ctx, local_fl, fl_entries);

 err:
    free(local_fl);
    if ( guest_fl )
        munmap(guest_fl, fll_entries * PAGE_SIZE);

    free(local_fll);
    if ( guest_fll )
        munmap(guest_fll, PAGE_SIZE);

    return rc;
}

/*
 * Get p2m_generation count.
 * Returns an error if the generation count has changed since the last call.
 */
static int get_p2m_generation(struct xc_sr_context *ctx)
{
    uint64_t p2m_generation;
    int rc;

    p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation,
                               ctx->x86.pv.width);

    rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1;
    ctx->x86.pv.p2m_generation = p2m_generation;

    return rc;
}

static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc;

    if ( !ctx->save.live )
        return 0;

    rc = get_p2m_generation(ctx);
    if ( rc )
        ERROR("p2m generation count changed. Migration aborted.");

    return rc;
}

/*
 * Map the guest p2m frames specified via a cr3 value, a virtual address, and
 * the maximum pfn. PTE entries are 64 bits wide for both 32 and 64 bit
 * guests, as in the 32 bit case only PAE guests are supported.
 */
static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3)
{
    xc_interface *xch = ctx->xch;
    xen_vaddr_t p2m_vaddr, p2m_end, mask, off;
    xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn;
    uint64_t *ptes = NULL;
    xen_pfn_t *mfns = NULL;
    unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx;
    int rc = -1;

    p2m_mfn = cr3_to_mfn(ctx, p2m_cr3);
    assert(p2m_mfn != 0);
    if ( p2m_mfn > ctx->x86.pv.max_mfn )
    {
        ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3);
        errno = ERANGE;
        goto err;
    }

    get_p2m_generation(ctx);

    p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr,
                          ctx->x86.pv.width);
    fpp = PAGE_SIZE / ctx->x86.pv.width;
    ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1;
    p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;

    if ( ctx->x86.pv.width == 8 )
    {
        mask = 0x0000ffffffffffffULL;
        if ( !is_canonical_address(p2m_vaddr) ||
             !is_canonical_address(p2m_end) ||
             p2m_end < p2m_vaddr ||
             (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 &&
              p2m_end > HYPERVISOR_VIRT_START_X86_64) )
        {
            ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
                  p2m_vaddr, p2m_end);
            errno = ERANGE;
            goto err;
        }
    }
    else
    {
        mask = 0x00000000ffffffffULL;
        if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr ||
             (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 &&
              p2m_end > HYPERVISOR_VIRT_START_X86_32) )
        {
            ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
                  p2m_vaddr, p2m_end);
            errno = ERANGE;
            goto err;
        }
    }

    DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx",
            p2m_vaddr, p2m_end, p2m_mfn);
    DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
            ctx->x86.pv.p2m_frames);

    mfns = malloc(sizeof(*mfns));
    if ( !mfns )
    {
        ERROR("Cannot allocate memory for array of %u mfns", 1);
        goto err;
    }
    mfns[0] = p2m_mfn;
    off = 0;
    saved_mfn = 0;
    idx_start = idx_end = saved_idx = 0;

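    /*
     * Reading aid (added comment): each pass of the loop below maps the page
     * table pages collected for the current level, works out which of their
     * entries cover the virtual range [p2m_vaddr, p2m_end], and gathers the
     * mfns those entries point at for the next level down. After the final
     * pass, mfns[] holds the p2m leaf frames themselves, ready for
     * map_p2m_leaves().
     */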
    for ( level = ctx->x86.pv.levels; level > 0; level-- )
    {
        n_pages = idx_end - idx_start + 1;
        ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages);
        if ( !ptes )
        {
            PERROR("Failed to map %u page table pages for p2m list", n_pages);
            goto err;
        }
        free(mfns);

        shift = level * 9 + 3;
        idx_start = ((p2m_vaddr - off) & mask) >> shift;
        idx_end = ((p2m_end - off) & mask) >> shift;
        idx = idx_end - idx_start + 1;
        mfns = malloc(sizeof(*mfns) * idx);
        if ( !mfns )
        {
            ERROR("Cannot allocate memory for array of %u mfns", idx);
            goto err;
        }

        for ( idx = idx_start; idx <= idx_end; idx++ )
        {
            mfn = pte_to_frame(ptes[idx]);
            if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn )
            {
                ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list",
                      mfn, off + ((xen_vaddr_t)idx << shift), level);
                errno = ERANGE;
                goto err;
            }
            mfns[idx - idx_start] = mfn;

            /* Maximum pfn check at level 2. Same reasoning as for p2m tree. */
            if ( level == 2 )
            {
                if ( mfn != saved_mfn )
                {
                    saved_mfn = mfn;
                    saved_idx = idx - idx_start;
                }
            }
        }

        if ( level == 2 )
        {
            if ( saved_idx == idx_end )
                saved_idx++;
            max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1;
            if ( max_pfn < ctx->x86.pv.max_pfn )
            {
                ctx->x86.pv.max_pfn = max_pfn;
                ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
                p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
                idx_end = idx_start + saved_idx;
            }
        }

        munmap(ptes, n_pages * PAGE_SIZE);
        ptes = NULL;
        off = p2m_vaddr & ((mask >> shift) << shift);
    }

    /* Map the p2m leaves themselves. */
    rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1);

 err:
    free(mfns);
    if ( ptes )
        munmap(ptes, n_pages * PAGE_SIZE);

    return rc;
}

/*
 * Map the guest p2m frames.
 * Depending on guest support this might either be a virtually mapped linear
 * list (preferred format) or a 3 level tree linked via mfns.
 */
static int map_p2m(struct xc_sr_context *ctx)
{
    uint64_t p2m_cr3;

    ctx->x86.pv.p2m_generation = ~0ULL;
    ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn,
                                    ctx->x86.pv.width) - 1;
    p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width);

    return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx);
}

/*
 * Obtain a specific vcpu's basic state and write an X86_PV_VCPU_BASIC record
 * into the stream. Performs mfn->pfn conversion on architectural state.
 */
static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t mfn, pfn;
    unsigned int i, gdt_count;
    int rc = -1;
    vcpu_guest_context_any_t vcpu;
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_BASIC,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };

    if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
    {
        PERROR("Failed to get vcpu%u context", id);
        goto err;
    }

    /* Vcpu0 is special: Convert the suspend record to a pfn. */
    if ( id == 0 )
    {
        mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for suspend record");
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn),
                  ctx->x86.pv.width);
    }

    gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width);
    if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
    {
        ERROR("GDT entry count (%u) out of range (max %u)",
              gdt_count, FIRST_RESERVED_GDT_ENTRY);
        errno = ERANGE;
        goto err;
    }
    gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */

    /* Convert GDT frames to pfns. */
    for ( i = 0; i < gdt_count; ++i )
    {
        mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn),
                  ctx->x86.pv.width);
    }

    /* Convert CR3 to a pfn. */
    mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width));
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Bad mfn for vcpu%u's cr3", id);
        dump_bad_pseudophysmap_entry(ctx, mfn);
        errno = ERANGE;
        goto err;
    }
    pfn = mfn_to_pfn(ctx, mfn);
    SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width);

    /* 64bit guests: Convert CR1 (guest pagetables) to pfn. */
    if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] )
    {
        mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for vcpu%u's cr1", id);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        pfn = mfn_to_pfn(ctx, mfn);
        vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
    }

    if ( ctx->x86.pv.width == 8 )
        rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64));
    else
        rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32));

 err:
    return rc;
}

/*
 * Obtain a specific vcpu's extended state and write an X86_PV_VCPU_EXTENDED
 * record into the stream.
 */
static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_EXTENDED,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_get_ext_vcpucontext,
        .domain = ctx->domid,
        .u.ext_vcpucontext.vcpu = id,
    };

    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u extended context", id);
        return -1;
    }

    /* No content? Skip the record. */
    if ( domctl.u.ext_vcpucontext.size == 0 )
        return 0;

    return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext,
                              domctl.u.ext_vcpucontext.size);
}

/*
 * Query to see whether a specific vcpu has xsave state and if so, write an
 * X86_PV_VCPU_XSAVE record into the stream.
 */
static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    int rc = -1;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_XSAVE,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_getvcpuextstate,
        .domain = ctx->domid,
        .u.vcpuextstate.vcpu = id,
    };

    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's xsave context", id);
        goto err;
    }

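    /*
     * Note (added comment): the domctl above, issued without a buffer, only
     * reports xfeature_mask and the size of the xsave area; the data itself
     * is fetched by the second domctl below, once a hypercall buffer of that
     * size has been allocated.
     */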
    /* No xsave state? skip this record. */
    if ( !domctl.u.vcpuextstate.xfeature_mask )
        goto out;

    buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
    if ( !buffer )
    {
        ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context",
              domctl.u.vcpuextstate.size, id);
        goto err;
    }

    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's xsave context", id);
        goto err;
    }

    /* No xsave state? Skip this record. */
    if ( domctl.u.vcpuextstate.size == 0 )
        goto out;

    rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size);
    if ( rc )
        goto err;

 out:
    rc = 0;

 err:
    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * Query to see whether a specific vcpu has msr state and if so, write an
 * X86_PV_VCPU_MSRS record into the stream.
 */
static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    int rc = -1;
    size_t buffersz;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_MSRS,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_get_vcpu_msrs,
        .domain = ctx->domid,
        .u.vcpu_msrs.vcpu = id,
    };

    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's msrs", id);
        goto err;
    }

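    /*
     * Note (added comment): as with xsave above, the first domctl only
     * reports msr_count; the MSR values themselves are fetched by the second
     * domctl below, into the allocated hypercall buffer.
     */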
    /* No MSRs? skip this record. */
    if ( !domctl.u.vcpu_msrs.msr_count )
        goto out;

    buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t);
    buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
    if ( !buffer )
    {
        ERROR("Unable to allocate %zu bytes for vcpu%u's msrs",
              buffersz, id);
        goto err;
    }

    set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's msrs", id);
        goto err;
    }

    /* No MSRs? Skip this record. */
    if ( domctl.u.vcpu_msrs.msr_count == 0 )
        goto out;

    rc = write_split_record(ctx, &rec, buffer,
                            domctl.u.vcpu_msrs.msr_count *
                            sizeof(xen_domctl_vcpu_msr_t));
    if ( rc )
        goto err;

 out:
    rc = 0;

 err:
    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * For each vcpu, if it is online, write its state into the stream.
 */
static int write_all_vcpu_information(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    xc_vcpuinfo_t vinfo;
    unsigned int i;
    int rc;

    for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
    {
        rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
        if ( rc )
        {
            PERROR("Failed to get vcpu%u information", i);
            return rc;
        }

        /* Vcpu offline? skip all these records. */
        if ( !vinfo.online )
            continue;

        rc = write_one_vcpu_basic(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_extended(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_xsave(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_msrs(ctx, i);
        if ( rc )
            return rc;
    }

    return 0;
}

/*
 * Writes an X86_PV_INFO record into the stream.
 */
static int write_x86_pv_info(struct xc_sr_context *ctx)
{
    struct xc_sr_rec_x86_pv_info info = {
        .guest_width = ctx->x86.pv.width,
        .pt_levels = ctx->x86.pv.levels,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_INFO,
        .length = sizeof(info),
        .data = &info,
    };

    return write_record(ctx, &rec);
}

/*
 * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list
 * of pfns making up the p2m table.
 */
static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc; unsigned int i;
    size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t);
    uint64_t *data = NULL;
    struct xc_sr_rec_x86_pv_p2m_frames hdr = {
        .end_pfn = ctx->x86.pv.max_pfn,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_P2M_FRAMES,
        .length = sizeof(hdr),
        .data = &hdr,
    };

    /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). */
    if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) )
    {
        if ( !(data = malloc(datasz)) )
        {
            ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data",
                  datasz);
            return -1;
        }

        for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
            data[i] = ctx->x86.pv.p2m_pfns[i];
    }
    else
        data = (uint64_t *)ctx->x86.pv.p2m_pfns;

    rc = write_split_record(ctx, &rec, data, datasz);

    if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns )
        free(data);

    return rc;
}

/*
 * Writes a SHARED_INFO record into the stream.
 */
static int write_shared_info(struct xc_sr_context *ctx)
{
    struct xc_sr_record rec = {
        .type = REC_TYPE_SHARED_INFO,
        .length = PAGE_SIZE,
        .data = ctx->x86.pv.shinfo,
    };

    return write_record(ctx, &rec);
}

/*
 * Normalise a pagetable for the migration stream. Performs mfn->pfn
 * conversions on the ptes.
 */
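/*
 * Usage note (added): the caller, x86_pv_normalise_page() below, passes the
 * mapped guest page as src and a freshly allocated scratch page as dst, so
 * the guest's own copy is never modified.
 */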
static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src,
                               uint64_t *dst, unsigned long type)
{
    xc_interface *xch = ctx->xch;
    uint64_t pte;
    unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    if ( ctx->x86.pv.levels == 4 )
    {
        /* 64bit guests only have Xen mappings in their L4 tables. */
        if ( type == XEN_DOMCTL_PFINFO_L4TAB )
        {
            xen_first = (HYPERVISOR_VIRT_START_X86_64 >>
                         L4_PAGETABLE_SHIFT_X86_64) & 511;
            xen_last = (HYPERVISOR_VIRT_END_X86_64 >>
                        L4_PAGETABLE_SHIFT_X86_64) & 511;
        }
    }
    else
    {
        switch ( type )
        {
        case XEN_DOMCTL_PFINFO_L4TAB:
            ERROR("??? Found L4 table for 32bit guest");
            errno = EINVAL;
            return -1;

        case XEN_DOMCTL_PFINFO_L3TAB:
            /* 32bit guests can only use the first 4 entries of their L3
             * tables. All others are potentially used by Xen. */
            xen_first = 4;
            xen_last = 511;
            break;

        case XEN_DOMCTL_PFINFO_L2TAB:
            /* It is hard to spot Xen mappings in a 32bit guest's L2. Most
             * are normal but only a few will have Xen mappings.
             */
            i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511;
            if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 )
            {
                xen_first = i;
                xen_last = (HYPERVISOR_VIRT_END_X86_32 >>
                            L2_PAGETABLE_SHIFT_PAE) & 511;
            }
            break;
        }
    }

    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        xen_pfn_t mfn;

        pte = src[i];

        /* Remove Xen mappings: Xen will reconstruct on the other side. */
        if ( i >= xen_first && i <= xen_last )
            pte = 0;

        /*
         * Errors during the live part of migration are expected as a result
         * of split pagetable updates, page type changes, active grant
         * mappings etc. The pagetable will need to be resent after pausing.
         * In such cases we fail with EAGAIN.
         *
         * For domains which are already paused, errors are fatal.
         */
        if ( pte & _PAGE_PRESENT )
        {
            mfn = pte_to_frame(pte);

#ifdef __i386__
            if ( mfn == INVALID_MFN )
            {
                if ( !ctx->dominfo.paused )
                    errno = EAGAIN;
                else
                {
                    ERROR("PTE truncation detected. L%lu[%u] = %016"PRIx64,
                          type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                    errno = E2BIG;
                }
                return -1;
            }
#endif

            if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) )
            {
                ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")",
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                errno = E2BIG;
                return -1;
            }

            if ( !mfn_in_pseudophysmap(ctx, mfn) )
            {
                if ( !ctx->dominfo.paused )
                    errno = EAGAIN;
                else
                {
                    ERROR("Bad mfn for L%lu[%u]",
                          type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
                    dump_bad_pseudophysmap_entry(ctx, mfn);
                    errno = ERANGE;
                }
                return -1;
            }

            pte = merge_pte(pte, mfn_to_pfn(ctx, mfn));
        }

        dst[i] = pte;
    }

    return 0;
}

static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx,
                                   xen_pfn_t pfn)
{
    assert(pfn <= ctx->x86.pv.max_pfn);

    return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
}


/*
 * save_ops function. Performs pagetable normalisation on appropriate pages.
 */
static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
                                 void **page)
{
    xc_interface *xch = ctx->xch;
    void *local_page;
    int rc;

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
        return 0;

    local_page = malloc(PAGE_SIZE);
    if ( !local_page )
    {
        ERROR("Unable to allocate scratch page");
        rc = -1;
        goto out;
    }

    rc = normalise_pagetable(ctx, *page, local_page, type);
    *page = local_page;

 out:
    return rc;
}

/*
 * save_ops function. Queries domain information and maps the Xen m2p and the
 * guest's shinfo and p2m table.
 */
static int x86_pv_setup(struct xc_sr_context *ctx)
{
    int rc;

    rc = x86_pv_domain_info(ctx);
    if ( rc )
        return rc;

    rc = x86_pv_map_m2p(ctx);
    if ( rc )
        return rc;

    rc = map_shinfo(ctx);
    if ( rc )
        return rc;

    rc = map_p2m(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_static_data(struct xc_sr_context *ctx)
{
    int rc;

    rc = write_x86_pv_info(ctx);
    if ( rc )
        return rc;

    rc = write_x86_cpu_policy_records(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_start_of_stream(struct xc_sr_context *ctx)
{
    int rc;

    /*
     * Ideally should be able to change during migration. Currently
     * corruption will occur if the contents or location of the P2M changes
     * during the live migration loop. If one is very lucky, the breakage
     * will not be subtle.
     */
    rc = write_x86_pv_p2m_frames(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx)
{
    return 0;
}

static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx)
{
    int rc;

    rc = write_x86_tsc_info(ctx);
    if ( rc )
        return rc;

    rc = write_shared_info(ctx);
    if ( rc )
        return rc;

    rc = write_all_vcpu_information(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_check_vm_state(struct xc_sr_context *ctx)
{
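    /*
     * Added comment: p2m_generation is only maintained for the linear p2m
     * list. It stays at its ~0ULL initial value when the 3 level p2m tree is
     * in use, in which case there is nothing to recheck here.
     */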
    if ( ctx->x86.pv.p2m_generation == ~0ULL )
        return 0;

    return x86_pv_check_vm_state_p2m_list(ctx);
}

static int x86_pv_cleanup(struct xc_sr_context *ctx)
{
    free(ctx->x86.pv.p2m_pfns);

    if ( ctx->x86.pv.p2m )
        munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);

    if ( ctx->x86.pv.shinfo )
        munmap(ctx->x86.pv.shinfo, PAGE_SIZE);

    if ( ctx->x86.pv.m2p )
        munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);

    return 0;
}

struct xc_sr_save_ops save_ops_x86_pv =
{
    .pfn_to_gfn          = x86_pv_pfn_to_gfn,
    .normalise_page      = x86_pv_normalise_page,
    .setup               = x86_pv_setup,
    .static_data         = x86_pv_static_data,
    .start_of_stream     = x86_pv_start_of_stream,
    .start_of_checkpoint = x86_pv_start_of_checkpoint,
    .end_of_checkpoint   = x86_pv_end_of_checkpoint,
    .check_vm_state      = x86_pv_check_vm_state,
    .cleanup             = x86_pv_cleanup,
};
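
/*
 * Reading aid (added comment; see xc_sr_save.c for the authoritative
 * sequence): the save framework is expected to invoke these hooks roughly as
 *   setup() -> static_data() -> start_of_stream() ->
 *   { start_of_checkpoint() -> memory -> end_of_checkpoint() }* -> cleanup(),
 * with check_vm_state() consulted during the live migration loop.
 */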

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */