#include <assert.h>
#include <limits.h>

#include "xc_sr_common_x86_pv.h"

/* Check a 64 bit virtual address for being canonical. */
static inline bool is_canonical_address(xen_vaddr_t vaddr)
{
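    /* Canonical <=> bits 63:47 are all copies of bit 47 (sign extension). */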
    return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
}

/*
 * Maps the guest's shared info page.
 */
static int map_shinfo(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;

    ctx->x86.pv.shinfo = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
    if ( !ctx->x86.pv.shinfo )
    {
        PERROR("Failed to map shared info frame at mfn %#lx",
               ctx->dominfo.shared_info_frame);
        return -1;
    }

    return 0;
}

/*
 * Copy a list of mfns from a guest, accounting for differences between guest
 * and toolstack width.  Can fail if truncation would occur.
 */
static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
                                xen_pfn_t *dst, const void *src, size_t count)
{
    size_t x;

    if ( ctx->x86.pv.width == sizeof(unsigned long) )
        memcpy(dst, src, count * sizeof(*dst));
    else
    {
        for ( x = 0; x < count; ++x )
        {
#ifdef __x86_64__
            /* 64bit toolstack, 32bit guest.  Expand any INVALID_MFN. */
            uint32_t s = ((uint32_t *)src)[x];

            dst[x] = s == ~0U ? INVALID_MFN : s;
#else
            /*
             * 32bit toolstack, 64bit guest.  Truncate INVALID_MFN, but bail
             * if any other truncation would occur.
             *
             * This will only occur on hosts where a PV guest has ram above
             * the 16TB boundary.  A 32bit dom0 is unlikely to have
             * successfully booted on a system this large.
             */
            uint64_t s = ((uint64_t *)src)[x];

            if ( (s != ~0ULL) && ((s >> 32) != 0) )
            {
                errno = E2BIG;
                return -1;
            }

            dst[x] = s;
#endif
        }
    }

    return 0;
}

/*
 * Map the p2m leaf pages and build an array of their pfns.
 */
static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
                          size_t n_mfns)
{
    xc_interface *xch = ctx->xch;
    unsigned int x;

    ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
                                           mfns, n_mfns);
    if ( !ctx->x86.pv.p2m )
    {
        PERROR("Failed to map p2m frames");
        return -1;
    }

    ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
    ctx->x86.pv.p2m_frames = n_mfns;
    ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
    if ( !ctx->x86.pv.p2m_pfns )
    {
        ERROR("Cannot allocate %zu bytes for p2m pfns list",
              n_mfns * sizeof(*mfns));
        return -1;
    }

    /* Convert leaf frames from mfns to pfns. */
    for ( x = 0; x < n_mfns; ++x )
    {
        if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
        {
            ERROR("Bad mfn in p2m_frame_list[%u]", x);
            dump_bad_pseudophysmap_entry(ctx, mfns[x]);
            errno = ERANGE;
            return -1;
        }

        ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
    }

    return 0;
}

/*
 * Walk the guest's frame list list and frame list to identify and map the
 * frames making up the guest's p2m table.  Construct a list of pfns making up
 * the table.
 */
static int map_p2m_tree(struct xc_sr_context *ctx)
{
    /* Terminology:
     *
     * fll   - frame list list, top level p2m, list of fl mfns
     * fl    - frame list, mid level p2m, list of leaf mfns
     * local - own allocated buffers, adjusted for bitness
     * guest - mappings into the domain
     */
    xc_interface *xch = ctx->xch;
    int rc = -1;
    unsigned int x, saved_x, fpp, fll_entries, fl_entries;
    xen_pfn_t fll_mfn, saved_mfn, max_pfn;

    xen_pfn_t *local_fll = NULL;
    void *guest_fll = NULL;
    size_t local_fll_size;

    xen_pfn_t *local_fl = NULL;
    void *guest_fl = NULL;
    size_t local_fl_size;

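    /* fpp: number of p2m entries held in one guest page.  Each fl page
     * therefore covers fpp pfns, and each fll entry covers fpp * fpp pfns. */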
    fpp = PAGE_SIZE / ctx->x86.pv.width;
    fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
    if ( fll_entries > fpp )
    {
        ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
        goto err;
    }

    fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
                        ctx->x86.pv.width);
    if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
    {
        ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
        goto err;
    }

    /* Map the guest top p2m. */
    guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
                                     PROT_READ, fll_mfn);
    if ( !guest_fll )
    {
        PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
        goto err;
    }

    local_fll_size = fll_entries * sizeof(*local_fll);
    local_fll = malloc(local_fll_size);
    if ( !local_fll )
    {
        ERROR("Cannot allocate %zu bytes for local p2m frame list list",
              local_fll_size);
        goto err;
    }

    if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
    {
        ERROR("Truncation detected copying p2m frame list list");
        goto err;
    }

    /* Check for bad mfns in frame list list. */
    saved_mfn = 0;
    saved_x = 0;
    for ( x = 0; x < fll_entries; ++x )
    {
        if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn )
        {
            ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list",
                  local_fll[x], x, fll_entries);
            goto err;
        }
        if ( local_fll[x] != saved_mfn )
        {
            saved_mfn = local_fll[x];
            saved_x = x;
        }
    }

    /*
     * Check for an effectively lower max_pfn:
     * If the trailing entries of the frame list list are all identical, we can
     * assume they all reference mid pages which in turn reference p2m pages
     * containing only invalid entries.  Otherwise multiple pfns would
     * reference the same mfn, which cannot work across migration, as this
     * sharing would be broken by the migration process.
     * Adjust max_pfn if possible to avoid allocating much larger areas than
     * needed for the p2m and logdirty map.
     */
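    /* saved_x is the last fll index which introduced a new mfn; every entry
     * beyond it repeats the same mfn and so covers no additional pfns. */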
    max_pfn = (saved_x + 1) * fpp * fpp - 1;
    if ( max_pfn < ctx->x86.pv.max_pfn )
    {
        ctx->x86.pv.max_pfn = max_pfn;
        fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
    }
    ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
    DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
            ctx->x86.pv.p2m_frames);
    fl_entries  = (ctx->x86.pv.max_pfn / fpp) + 1;

    /* Map the guest mid p2m frames. */
    guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
                                    local_fll, fll_entries);
    if ( !guest_fl )
    {
        PERROR("Failed to map p2m frame list");
        goto err;
    }

    local_fl_size = fl_entries * sizeof(*local_fl);
    local_fl = malloc(local_fl_size);
    if ( !local_fl )
    {
        ERROR("Cannot allocate %zu bytes for local p2m frame list",
              local_fl_size);
        goto err;
    }

    if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) )
    {
        ERROR("Truncation detected copying p2m frame list");
        goto err;
    }

    for ( x = 0; x < fl_entries; ++x )
    {
        if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn )
        {
            ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list",
                  local_fl[x], x, fl_entries);
            goto err;
        }
    }

    /* Map the p2m leaves themselves. */
    rc = map_p2m_leaves(ctx, local_fl, fl_entries);

 err:
    free(local_fl);
    if ( guest_fl )
        munmap(guest_fl, fll_entries * PAGE_SIZE);

    free(local_fll);
    if ( guest_fll )
        munmap(guest_fll, PAGE_SIZE);

    return rc;
}

/*
 * Get p2m_generation count.
 * Returns an error if the generation count has changed since the last call.
 */
static int get_p2m_generation(struct xc_sr_context *ctx)
{
    uint64_t p2m_generation;
    int rc;

    p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation,
                               ctx->x86.pv.width);

    rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1;
    ctx->x86.pv.p2m_generation = p2m_generation;

    return rc;
}

static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc;

    if ( !ctx->save.live )
        return 0;

    rc = get_p2m_generation(ctx);
    if ( rc )
        ERROR("p2m generation count changed. Migration aborted.");

    return rc;
}

/*
 * Map the guest p2m frames specified via a cr3 value, a virtual address, and
 * the maximum pfn.  PTEs are 64 bits wide for both 32 and 64 bit guests, as
 * in the 32 bit case we support PAE guests only.
 */
static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3)
{
    xc_interface *xch = ctx->xch;
    xen_vaddr_t p2m_vaddr, p2m_end, mask, off;
    xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn;
    uint64_t *ptes = NULL;
    xen_pfn_t *mfns = NULL;
    unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx;
    int rc = -1;

    p2m_mfn = cr3_to_mfn(ctx, p2m_cr3);
    assert(p2m_mfn != 0);
    if ( p2m_mfn > ctx->x86.pv.max_mfn )
    {
        ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3);
        errno = ERANGE;
        goto err;
    }

    get_p2m_generation(ctx);

    p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr,
                          ctx->x86.pv.width);
    fpp = PAGE_SIZE / ctx->x86.pv.width;
    ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1;
    p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;

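    /* The p2m list must lie in guest-visible virtual space and must not
     * overlap the hypervisor reserved area. */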
    if ( ctx->x86.pv.width == 8 )
    {
        mask = 0x0000ffffffffffffULL;
        if ( !is_canonical_address(p2m_vaddr) ||
             !is_canonical_address(p2m_end) ||
             p2m_end < p2m_vaddr ||
             (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 &&
              p2m_end > HYPERVISOR_VIRT_START_X86_64) )
        {
            ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
                  p2m_vaddr, p2m_end);
            errno = ERANGE;
            goto err;
        }
    }
    else
    {
        mask = 0x00000000ffffffffULL;
        if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr ||
             (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 &&
              p2m_end > HYPERVISOR_VIRT_START_X86_32) )
        {
            ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
                  p2m_vaddr, p2m_end);
            errno = ERANGE;
            goto err;
        }
    }

    DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx",
            p2m_vaddr, p2m_end, p2m_mfn);
    DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
            ctx->x86.pv.p2m_frames);

    mfns = malloc(sizeof(*mfns));
    if ( !mfns )
    {
        ERROR("Cannot allocate memory for array of %u mfns", 1);
        goto err;
    }
    mfns[0] = p2m_mfn;
    off = 0;
    saved_mfn = 0;
    idx_start = idx_end = saved_idx = 0;

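    /*
     * Walk the page tables top down, starting from the root frame (p2m_cr3).
     * At each level, map the frames gathered so far and pull out the PTEs
     * covering the virtual range [p2m_vaddr, p2m_end], collecting the mfns
     * for the next level.  After the final iteration mfns[] holds the p2m
     * leaf frames themselves.
     */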
    for ( level = ctx->x86.pv.levels; level > 0; level-- )
    {
        n_pages = idx_end - idx_start + 1;
        ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages);
        if ( !ptes )
        {
            PERROR("Failed to map %u page table pages for p2m list", n_pages);
            goto err;
        }
        free(mfns);

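        /* One entry at this level maps 2^shift bytes of virtual address
         * space: 12 bits of page offset plus 9 index bits per lower level. */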
        shift = level * 9 + 3;
        idx_start = ((p2m_vaddr - off) & mask) >> shift;
        idx_end = ((p2m_end - off) & mask) >> shift;
        idx = idx_end - idx_start + 1;
        mfns = malloc(sizeof(*mfns) * idx);
        if ( !mfns )
        {
            ERROR("Cannot allocate memory for array of %u mfns", idx);
            goto err;
        }

        for ( idx = idx_start; idx <= idx_end; idx++ )
        {
            mfn = pte_to_frame(ptes[idx]);
            if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn )
            {
                ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list",
                      mfn, off + ((xen_vaddr_t)idx << shift), level);
                errno = ERANGE;
                goto err;
            }
            mfns[idx - idx_start] = mfn;

            /* Maximum pfn check at level 2. Same reasoning as for p2m tree. */
            if ( level == 2 )
            {
                if ( mfn != saved_mfn )
                {
                    saved_mfn = mfn;
                    saved_idx = idx - idx_start;
                }
            }
        }

        if ( level == 2 )
        {
            if ( saved_idx == idx_end )
                saved_idx++;
            max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1;
            if ( max_pfn < ctx->x86.pv.max_pfn )
            {
                ctx->x86.pv.max_pfn = max_pfn;
                ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
                p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
                idx_end = idx_start + saved_idx;
            }
        }

        munmap(ptes, n_pages * PAGE_SIZE);
        ptes = NULL;
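        /* Align down to the range covered by one entry at this level; the
         * next iteration computes its indices relative to this base. */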
        off = p2m_vaddr & ((mask >> shift) << shift);
    }

    /* Map the p2m leaves themselves. */
    rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1);

 err:
    free(mfns);
    if ( ptes )
        munmap(ptes, n_pages * PAGE_SIZE);

    return rc;
}

/*
 * Map the guest p2m frames.
 * Depending on guest support this might either be a virtually mapped linear
 * list (preferred format) or a 3 level tree linked via mfns.
 */
static int map_p2m(struct xc_sr_context *ctx)
{
    uint64_t p2m_cr3;

    ctx->x86.pv.p2m_generation = ~0ULL;
    ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn,
                                    ctx->x86.pv.width) - 1;
    p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width);

    return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx);
}

/*
 * Obtain a specific vcpu's basic state and write an X86_PV_VCPU_BASIC record
 * into the stream.  Performs mfn->pfn conversion on architectural state.
 */
static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t mfn, pfn;
    unsigned int i, gdt_count;
    int rc = -1;
    vcpu_guest_context_any_t vcpu;
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_BASIC,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };

    if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
    {
        PERROR("Failed to get vcpu%u context", id);
        goto err;
    }

    /* Vcpu0 is special: Convert the suspend record to a pfn. */
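    /* (The suspend record in %edx is expected to be the mfn of the guest's
     * start_info frame, handed to the suspend hypercall.) */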
    if ( id == 0 )
    {
        mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for suspend record");
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn),
                  ctx->x86.pv.width);
    }

    gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width);
    if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
    {
        ERROR("GDT entry count (%u) out of range (max %u)",
              gdt_count, FIRST_RESERVED_GDT_ENTRY);
        errno = ERANGE;
        goto err;
    }
    gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */

    /* Convert GDT frames to pfns. */
    for ( i = 0; i < gdt_count; ++i )
    {
        mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn),
                  ctx->x86.pv.width);
    }

    /* Convert CR3 to a pfn. */
    mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width));
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Bad mfn for vcpu%u's cr3", id);
        dump_bad_pseudophysmap_entry(ctx, mfn);
        errno = ERANGE;
        goto err;
    }
    pfn = mfn_to_pfn(ctx, mfn);
    SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width);

    /* 64bit guests: Convert CR1 (guest pagetables) to pfn. */
    if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] )
    {
        mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("Bad mfn for vcpu%u's cr1", id);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            errno = ERANGE;
            goto err;
        }
        pfn = mfn_to_pfn(ctx, mfn);
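        /* Bit 0 appears to act as a validity marker for the user pagetable
         * pointer; keep it set and replace only the frame number. */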
        vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
    }

    if ( ctx->x86.pv.width == 8 )
        rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64));
    else
        rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32));

 err:
    return rc;
}

/*
 * Obtain a specific vcpu's extended state and write an X86_PV_VCPU_EXTENDED
 * record into the stream.
 */
static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_EXTENDED,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_get_ext_vcpucontext,
        .domain = ctx->domid,
        .u.ext_vcpucontext.vcpu = id,
    };

    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u extended context", id);
        return -1;
    }

    /* No content? Skip the record. */
    if ( domctl.u.ext_vcpucontext.size == 0 )
        return 0;

    return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext,
                              domctl.u.ext_vcpucontext.size);
}

/*
 * Query to see whether a specific vcpu has xsave state and if so, write an
 * X86_PV_VCPU_XSAVE record into the stream.
 */
static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    int rc = -1;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_XSAVE,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_getvcpuextstate,
        .domain = ctx->domid,
        .u.vcpuextstate.vcpu = id,
    };

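    /* First pass, with no buffer attached: used to learn the size of the
     * xsave area and the xfeature mask. */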
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's xsave context", id);
        goto err;
    }

    /* No xsave state? skip this record. */
    if ( !domctl.u.vcpuextstate.xfeature_mask )
        goto out;

    buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
    if ( !buffer )
    {
        ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context",
              domctl.u.vcpuextstate.size, id);
        goto err;
    }

    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's xsave context", id);
        goto err;
    }

    /* No xsave state? Skip this record. */
    if ( domctl.u.vcpuextstate.size == 0 )
        goto out;

    rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size);
    if ( rc )
        goto err;

 out:
    rc = 0;

 err:
    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * Query to see whether a specific vcpu has msr state and if so, write an
 * X86_PV_VCPU_MSRS record into the stream.
 */
static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id)
{
    xc_interface *xch = ctx->xch;
    int rc = -1;
    size_t buffersz;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
    struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
        .vcpu_id = id,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_VCPU_MSRS,
        .length = sizeof(vhdr),
        .data = &vhdr,
    };
    struct xen_domctl domctl = {
        .cmd = XEN_DOMCTL_get_vcpu_msrs,
        .domain = ctx->domid,
        .u.vcpu_msrs.vcpu = id,
    };

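    /* First pass, with msr_count 0 and no buffer: used to learn how many
     * MSRs there are to save. */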
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's msrs", id);
        goto err;
    }

    /* No MSRs? skip this record. */
    if ( !domctl.u.vcpu_msrs.msr_count )
        goto out;

    buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t);
    buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
    if ( !buffer )
    {
        ERROR("Unable to allocate %zu bytes for vcpu%u's msrs",
              buffersz, id);
        goto err;
    }

    set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
    if ( xc_domctl(xch, &domctl) < 0 )
    {
        PERROR("Unable to get vcpu%u's msrs", id);
        goto err;
    }

    /* No MSRs? Skip this record. */
    if ( domctl.u.vcpu_msrs.msr_count == 0 )
        goto out;

    rc = write_split_record(ctx, &rec, buffer,
                            domctl.u.vcpu_msrs.msr_count *
                            sizeof(xen_domctl_vcpu_msr_t));
    if ( rc )
        goto err;

 out:
    rc = 0;

 err:
    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * For each vcpu, if it is online, write its state into the stream.
 */
static int write_all_vcpu_information(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    xc_vcpuinfo_t vinfo;
    unsigned int i;
    int rc;

    for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
    {
        rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
        if ( rc )
        {
            PERROR("Failed to get vcpu%u information", i);
            return rc;
        }

        /* Vcpu offline? skip all these records. */
        if ( !vinfo.online )
            continue;

        rc = write_one_vcpu_basic(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_extended(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_xsave(ctx, i);
        if ( rc )
            return rc;

        rc = write_one_vcpu_msrs(ctx, i);
        if ( rc )
            return rc;
    }

    return 0;
}

/*
 * Writes an X86_PV_INFO record into the stream.
 */
static int write_x86_pv_info(struct xc_sr_context *ctx)
{
    struct xc_sr_rec_x86_pv_info info = {
        .guest_width = ctx->x86.pv.width,
        .pt_levels = ctx->x86.pv.levels,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_INFO,
        .length = sizeof(info),
        .data = &info,
    };

    return write_record(ctx, &rec);
}

/*
 * Writes an X86_PV_P2M_FRAMES record into the stream.  This contains the list
 * of pfns making up the p2m table.
 */
static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc; unsigned int i;
    size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t);
    uint64_t *data = NULL;
    struct xc_sr_rec_x86_pv_p2m_frames hdr = {
        .end_pfn = ctx->x86.pv.max_pfn,
    };
    struct xc_sr_record rec = {
        .type = REC_TYPE_X86_PV_P2M_FRAMES,
        .length = sizeof(hdr),
        .data = &hdr,
    };

    /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). */
    if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) )
    {
        if ( !(data = malloc(datasz)) )
        {
            ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data",
                  datasz);
            return -1;
        }

        for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
            data[i] = ctx->x86.pv.p2m_pfns[i];
    }
    else
        data = (uint64_t *)ctx->x86.pv.p2m_pfns;

    rc = write_split_record(ctx, &rec, data, datasz);

    if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns )
        free(data);

    return rc;
}

/*
 * Writes a SHARED_INFO record into the stream.
 */
static int write_shared_info(struct xc_sr_context *ctx)
{
    struct xc_sr_record rec = {
        .type = REC_TYPE_SHARED_INFO,
        .length = PAGE_SIZE,
        .data = ctx->x86.pv.shinfo,
    };

    return write_record(ctx, &rec);
}

/*
 * Normalise a pagetable for the migration stream.  Performs mfn->pfn
 * conversions on the ptes.
 */
static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src,
                               uint64_t *dst, unsigned long type)
{
    xc_interface *xch = ctx->xch;
    uint64_t pte;
    unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    if ( ctx->x86.pv.levels == 4 )
    {
        /* 64bit guests only have Xen mappings in their L4 tables. */
        if ( type == XEN_DOMCTL_PFINFO_L4TAB )
        {
            xen_first = (HYPERVISOR_VIRT_START_X86_64 >>
                         L4_PAGETABLE_SHIFT_X86_64) & 511;
            xen_last = (HYPERVISOR_VIRT_END_X86_64 >>
                        L4_PAGETABLE_SHIFT_X86_64) & 511;
        }
    }
    else
    {
        switch ( type )
        {
        case XEN_DOMCTL_PFINFO_L4TAB:
            ERROR("??? Found L4 table for 32bit guest");
            errno = EINVAL;
            return -1;

        case XEN_DOMCTL_PFINFO_L3TAB:
            /* 32bit guests can only use the first 4 entries of their L3 tables.
             * All others are potentially used by Xen. */
            xen_first = 4;
            xen_last = 511;
            break;

        case XEN_DOMCTL_PFINFO_L2TAB:
            /* It is hard to spot Xen mappings in a 32bit guest's L2.  Most
             * are normal but only a few will have Xen mappings.
             */
            i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511;
            if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 )
            {
                xen_first = i;
                xen_last = (HYPERVISOR_VIRT_END_X86_32 >>
                            L2_PAGETABLE_SHIFT_PAE) & 511;
            }
            break;
        }
    }

    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        xen_pfn_t mfn;

        pte = src[i];

        /* Remove Xen mappings: Xen will reconstruct on the other side. */
        if ( i >= xen_first && i <= xen_last )
            pte = 0;

        /*
         * Errors during the live part of migration are expected as a result
         * of split pagetable updates, page type changes, active grant
         * mappings etc.  The pagetable will need to be resent after pausing.
         * In such cases we fail with EAGAIN.
         *
         * For domains which are already paused, errors are fatal.
         */
        if ( pte & _PAGE_PRESENT )
        {
            mfn = pte_to_frame(pte);

#ifdef __i386__
            if ( mfn == INVALID_MFN )
            {
                if ( !ctx->dominfo.paused )
                    errno = EAGAIN;
                else
                {
                    ERROR("PTE truncation detected.  L%lu[%u] = %016"PRIx64,
                          type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                    errno = E2BIG;
                }
                return -1;
            }
#endif

            if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) )
            {
                ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")",
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                errno = E2BIG;
                return -1;
            }

            if ( !mfn_in_pseudophysmap(ctx, mfn) )
            {
                if ( !ctx->dominfo.paused )
                    errno = EAGAIN;
                else
                {
                    ERROR("Bad mfn for L%lu[%u]",
                          type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
                    dump_bad_pseudophysmap_entry(ctx, mfn);
                    errno = ERANGE;
                }
                return -1;
            }

            pte = merge_pte(pte, mfn_to_pfn(ctx, mfn));
        }

        dst[i] = pte;
    }

    return 0;
}

static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx,
                                   xen_pfn_t pfn)
{
    assert(pfn <= ctx->x86.pv.max_pfn);

    return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
}


/*
 * save_ops function.  Performs pagetable normalisation on appropriate pages.
 */
static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
                                 void **page)
{
    xc_interface *xch = ctx->xch;
    void *local_page;
    int rc;

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
        return 0;

    local_page = malloc(PAGE_SIZE);
    if ( !local_page )
    {
        ERROR("Unable to allocate scratch page");
        rc = -1;
        goto out;
    }

    rc = normalise_pagetable(ctx, *page, local_page, type);
    *page = local_page;

 out:
    return rc;
}

/*
 * save_ops function.  Queries domain information and maps the Xen m2p and the
 * guest's shinfo and p2m table.
 */
static int x86_pv_setup(struct xc_sr_context *ctx)
{
    int rc;

    rc = x86_pv_domain_info(ctx);
    if ( rc )
        return rc;

    rc = x86_pv_map_m2p(ctx);
    if ( rc )
        return rc;

    rc = map_shinfo(ctx);
    if ( rc )
        return rc;

    rc = map_p2m(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_static_data(struct xc_sr_context *ctx)
{
    int rc;

    rc = write_x86_pv_info(ctx);
    if ( rc )
        return rc;

    rc = write_x86_cpu_policy_records(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_start_of_stream(struct xc_sr_context *ctx)
{
    int rc;

    /*
     * Ideally should be able to change during migration.  Currently
     * corruption will occur if the contents or location of the P2M changes
     * during the live migration loop.  If one is very lucky, the breakage
     * will not be subtle.
     */
    rc = write_x86_pv_p2m_frames(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx)
{
    return 0;
}

static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx)
{
    int rc;

    rc = write_x86_tsc_info(ctx);
    if ( rc )
        return rc;

    rc = write_shared_info(ctx);
    if ( rc )
        return rc;

    rc = write_all_vcpu_information(ctx);
    if ( rc )
        return rc;

    return 0;
}

static int x86_pv_check_vm_state(struct xc_sr_context *ctx)
{
    if ( ctx->x86.pv.p2m_generation == ~0ULL )
        return 0;

    return x86_pv_check_vm_state_p2m_list(ctx);
}

static int x86_pv_cleanup(struct xc_sr_context *ctx)
{
    free(ctx->x86.pv.p2m_pfns);

    if ( ctx->x86.pv.p2m )
        munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);

    if ( ctx->x86.pv.shinfo )
        munmap(ctx->x86.pv.shinfo, PAGE_SIZE);

    if ( ctx->x86.pv.m2p )
        munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);

    return 0;
}

struct xc_sr_save_ops save_ops_x86_pv =
{
    .pfn_to_gfn          = x86_pv_pfn_to_gfn,
    .normalise_page      = x86_pv_normalise_page,
    .setup               = x86_pv_setup,
    .static_data         = x86_pv_static_data,
    .start_of_stream     = x86_pv_start_of_stream,
    .start_of_checkpoint = x86_pv_start_of_checkpoint,
    .end_of_checkpoint   = x86_pv_end_of_checkpoint,
    .check_vm_state      = x86_pv_check_vm_state,
    .cleanup             = x86_pv_cleanup,
};

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */