1 #include <assert.h>
2 
3 #include "xc_sr_common_x86_pv.h"
4 
/*
 * Convert a pfn into its current mfn, using the locally rebuilt p2m.
 * Callers must have range-checked pfn against max_pfn beforehand; an
 * out-of-range lookup here is a logic error, hence the assert.
 */
static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn)
{
    assert(pfn <= ctx->x86.pv.max_pfn);

    /* The p2m entry width depends on whether the guest is 32 or 64 bit. */
    return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
}
11 
/*
 * Expand our local tracking information for the p2m table and domains maximum
 * size.  Normally this will be called once to expand from 0 to max_pfn, but
 * is liable to expand multiple times if the domain grows on the sending side
 * after migration has started.
 *
 * Grows three parallel allocations: the p2m itself, the per-pfn type array,
 * and the list of frames backing the guest's p2m table.  Returns 0 on
 * success, -1 on allocation failure.
 */
static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
{
    xc_interface *xch = ctx->xch;
    unsigned long old_max = ctx->x86.pv.max_pfn, i;
    unsigned int fpp = PAGE_SIZE / ctx->x86.pv.width;  /* pfn entries per frame */
    unsigned long end_frame = (max_pfn / fpp) + 1;
    unsigned long old_end_frame = (old_max / fpp) + 1;
    xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
    uint32_t *pfn_types = NULL;
    size_t p2msz, p2m_pfnsz, pfn_typesz;

    /* Only ever called to grow the tracked range. */
    assert(max_pfn > old_max);

    /*
     * On realloc failure the old, smaller allocations are still owned by
     * ctx and remain valid, so a plain return leaks nothing; teardown
     * frees them via the ctx pointers.
     */
    p2msz = (max_pfn + 1) * ctx->x86.pv.width;
    p2m = realloc(ctx->x86.pv.p2m, p2msz);
    if ( !p2m )
    {
        ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
        return -1;
    }
    ctx->x86.pv.p2m = p2m;

    pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
    pfn_types = realloc(ctx->x86.pv.restore.pfn_types, pfn_typesz);
    if ( !pfn_types )
    {
        ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
        return -1;
    }
    ctx->x86.pv.restore.pfn_types = pfn_types;

    p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
    p2m_pfns = realloc(ctx->x86.pv.p2m_pfns, p2m_pfnsz);
    if ( !p2m_pfns )
    {
        ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
        return -1;
    }
    ctx->x86.pv.p2m_frames = end_frame;
    ctx->x86.pv.p2m_pfns = p2m_pfns;

    ctx->x86.pv.max_pfn = max_pfn;

    /* Mark every newly tracked pfn as absent until stream data arrives. */
    for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
    {
        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
        ctx->restore.ops.set_page_type(ctx, i, 0);
    }

    /*
     * NOTE(review): old_end_frame is always >= 1 here, so on the first
     * expansion this loop starts at frame 2, leaving entries 0 and 1
     * without an INVALID_MFN fill.  Presumably they are populated by
     * X86_PV_P2M_FRAMES records before use — confirm against callers.
     */
    for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i )
        ctx->x86.pv.p2m_pfns[i] = INVALID_MFN;

    DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn);
    return 0;
}
72 
73 /*
74  * Pin all of the pagetables.
75  */
pin_pagetables(struct xc_sr_context * ctx)76 static int pin_pagetables(struct xc_sr_context *ctx)
77 {
78     xc_interface *xch = ctx->xch;
79     unsigned long i, nr_pins;
80     struct mmuext_op pin[MAX_PIN_BATCH];
81 
82     for ( i = nr_pins = 0; i <= ctx->x86.pv.max_pfn; ++i )
83     {
84         if ( (ctx->x86.pv.restore.pfn_types[i] &
85               XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
86             continue;
87 
88         switch ( (ctx->x86.pv.restore.pfn_types[i] &
89                   XEN_DOMCTL_PFINFO_LTABTYPE_MASK) )
90         {
91         case XEN_DOMCTL_PFINFO_L1TAB:
92             pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
93             break;
94         case XEN_DOMCTL_PFINFO_L2TAB:
95             pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
96             break;
97         case XEN_DOMCTL_PFINFO_L3TAB:
98             pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
99             break;
100         case XEN_DOMCTL_PFINFO_L4TAB:
101             pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
102             break;
103         default:
104             continue;
105         }
106 
107         pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i);
108         nr_pins++;
109 
110         if ( nr_pins == MAX_PIN_BATCH )
111         {
112             if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 )
113             {
114                 PERROR("Failed to pin batch of pagetables");
115                 return -1;
116             }
117             nr_pins = 0;
118         }
119     }
120 
121     if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) )
122     {
123         PERROR("Failed to pin batch of pagetables");
124         return -1;
125     }
126 
127     return 0;
128 }
129 
/*
 * Update details in a guests start_info structure.
 *
 * Translates the start_info frame (stashed in vcpu0's edx), and the
 * xenstore and console frames referenced from within start_info, from
 * pfns to their new mfns, and refreshes the event channel, nr_pages,
 * shared_info and flags fields.  Returns 0 on success, -1 on error.
 */
static int process_start_info(struct xc_sr_context *ctx,
                              vcpu_guest_context_any_t *vcpu)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t pfn, mfn;
    start_info_any_t *guest_start_info = NULL;
    int rc = -1;

    /* By convention, the start_info pfn is carried in vcpu0's edx. */
    pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86.pv.width);

    if ( pfn > ctx->x86.pv.max_pfn )
    {
        ERROR("Start Info pfn %#lx out of range", pfn);
        goto err;
    }

    /* start_info must live in a plain data frame, not a pagetable. */
    if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
    {
        ERROR("Start Info pfn %#lx has bad type %u", pfn,
              (ctx->x86.pv.restore.pfn_types[pfn] >>
               XEN_DOMCTL_PFINFO_LTAB_SHIFT));
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Start Info has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    /* Point vcpu0 at the translated frame, then map it for editing. */
    SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86.pv.width);
    guest_start_info = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    if ( !guest_start_info )
    {
        PERROR("Failed to map Start Info at mfn %#lx", mfn);
        goto err;
    }

    /* Deal with xenstore stuff */
    pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86.pv.width);
    if ( pfn > ctx->x86.pv.max_pfn )
    {
        ERROR("XenStore pfn %#lx out of range", pfn);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("XenStore pfn has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    /* Record the ring gfn for later, and patch the guest's view. */
    ctx->restore.xenstore_gfn = mfn;
    SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86.pv.width);
    SET_FIELD(guest_start_info, store_evtchn,
              ctx->restore.xenstore_evtchn, ctx->x86.pv.width);

    /* Deal with console stuff */
    pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86.pv.width);
    if ( pfn > ctx->x86.pv.max_pfn )
    {
        ERROR("Console pfn %#lx out of range", pfn);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Console pfn has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    ctx->restore.console_gfn = mfn;
    SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86.pv.width);
    SET_FIELD(guest_start_info, console.domU.evtchn,
              ctx->restore.console_evtchn, ctx->x86.pv.width);

    /* Set other information */
    SET_FIELD(guest_start_info, nr_pages,
              ctx->x86.pv.max_pfn + 1, ctx->x86.pv.width);
    SET_FIELD(guest_start_info, shared_info,
              ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86.pv.width);
    SET_FIELD(guest_start_info, flags, 0, ctx->x86.pv.width);

    rc = 0;

 err:
    if ( guest_start_info )
        munmap(guest_start_info, PAGE_SIZE);

    return rc;
}
231 
/*
 * Process one stashed vcpu worth of basic state and send to Xen.
 *
 * Rewrites every guest-frame reference in the stashed context from pfn to
 * mfn — start_info (vcpu0 only), the GDT frames, CR3, and for 64bit guests
 * the user pagetable in ctrlreg[1] — then loads the fixed-up context with
 * xc_vcpu_setcontext().  Returns 0 on success, nonzero on error.
 */
static int process_vcpu_basic(struct xc_sr_context *ctx,
                              unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    vcpu_guest_context_any_t *vcpu = ctx->x86.pv.restore.vcpus[vcpuid].basic.ptr;
    xen_pfn_t pfn, mfn;
    unsigned int i, gdt_count;
    int rc = -1;

    /* Vcpu 0 is special: Convert the suspend record to an mfn. */
    if ( vcpuid == 0 )
    {
        rc = process_start_info(ctx, vcpu);
        if ( rc )
            return rc;
        rc = -1;  /* Re-arm the default error return. */
    }

    /* The vcpu must come back online when the domain is unpaused. */
    SET_FIELD(vcpu, flags,
              GET_FIELD(vcpu, flags, ctx->x86.pv.width) | VGCF_online,
              ctx->x86.pv.width);

    gdt_count = GET_FIELD(vcpu, gdt_ents, ctx->x86.pv.width);
    if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
    {
        ERROR("GDT entry count (%u) out of range (max %u)",
              gdt_count, FIRST_RESERVED_GDT_ENTRY);
        errno = ERANGE;
        goto err;
    }
    /* 512 descriptors per frame; round up. */
    gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */

    /* Convert GDT frames to mfns. */
    for ( i = 0; i < gdt_count; ++i )
    {
        pfn = GET_FIELD(vcpu, gdt_frames[i], ctx->x86.pv.width);
        if ( pfn > ctx->x86.pv.max_pfn )
        {
            ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
            goto err;
        }

        /* GDT frames must be plain data frames. */
        if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn,
                  (ctx->x86.pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT));
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("GDT frame %u has bad mfn", i);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        SET_FIELD(vcpu, gdt_frames[i], mfn, ctx->x86.pv.width);
    }

    /* Convert CR3 to an mfn. */
    pfn = cr3_to_mfn(ctx, GET_FIELD(vcpu, ctrlreg[3], ctx->x86.pv.width));
    if ( pfn > ctx->x86.pv.max_pfn )
    {
        ERROR("cr3 (pfn %#lx) out of range", pfn);
        goto err;
    }

    /* cr3 must reference a top-level pagetable for this guest's depth. */
    if ( (ctx->x86.pv.restore.pfn_types[pfn] &
          XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
         (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
    {
        ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn,
              (ctx->x86.pv.restore.pfn_types[pfn] >>
               XEN_DOMCTL_PFINFO_LTAB_SHIFT),
              ctx->x86.pv.levels);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("cr3 has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    SET_FIELD(vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86.pv.width);

    /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */
    /* Bit 0 of ctrlreg[1] indicates a user pagetable is present. */
    if ( ctx->x86.pv.levels == 4 && (vcpu->x64.ctrlreg[1] & 1) )
    {
        pfn = vcpu->x64.ctrlreg[1] >> PAGE_SHIFT;

        if ( pfn > ctx->x86.pv.max_pfn )
        {
            ERROR("cr1 (pfn %#lx) out of range", pfn);
            goto err;
        }

        if ( (ctx->x86.pv.restore.pfn_types[pfn] &
              XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
             (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
        {
            ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn,
                  (ctx->x86.pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT),
                  ctx->x86.pv.levels);
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("cr1 has bad mfn");
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        vcpu->x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
    }

    /* Hand the fixed-up context to Xen. */
    if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, vcpu) )
    {
        PERROR("Failed to set vcpu%u's basic info", vcpuid);
        goto err;
    }

    rc = 0;

 err:
    return rc;
}
369 
370 /*
371  * Process one stashed vcpu worth of extended state and send to Xen.
372  */
process_vcpu_extended(struct xc_sr_context * ctx,unsigned int vcpuid)373 static int process_vcpu_extended(struct xc_sr_context *ctx,
374                                  unsigned int vcpuid)
375 {
376     xc_interface *xch = ctx->xch;
377     struct xc_sr_x86_pv_restore_vcpu *vcpu =
378         &ctx->x86.pv.restore.vcpus[vcpuid];
379     DECLARE_DOMCTL;
380 
381     domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
382     domctl.domain = ctx->domid;
383     memcpy(&domctl.u.ext_vcpucontext, vcpu->extd.ptr, vcpu->extd.size);
384 
385     if ( xc_domctl(xch, &domctl) != 0 )
386     {
387         PERROR("Failed to set vcpu%u's extended info", vcpuid);
388         return -1;
389     }
390 
391     return 0;
392 }
393 
394 /*
395  * Process one stashed vcpu worth of xsave state and send to Xen.
396  */
process_vcpu_xsave(struct xc_sr_context * ctx,unsigned int vcpuid)397 static int process_vcpu_xsave(struct xc_sr_context *ctx,
398                               unsigned int vcpuid)
399 {
400     xc_interface *xch = ctx->xch;
401     struct xc_sr_x86_pv_restore_vcpu *vcpu =
402         &ctx->x86.pv.restore.vcpus[vcpuid];
403     int rc;
404     DECLARE_DOMCTL;
405     DECLARE_HYPERCALL_BUFFER(void, buffer);
406 
407     buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsave.size);
408     if ( !buffer )
409     {
410         ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
411               vcpu->xsave.size);
412         return -1;
413     }
414 
415     domctl.cmd = XEN_DOMCTL_setvcpuextstate;
416     domctl.domain = ctx->domid;
417     domctl.u.vcpuextstate.vcpu = vcpuid;
418     domctl.u.vcpuextstate.size = vcpu->xsave.size;
419     set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
420 
421     memcpy(buffer, vcpu->xsave.ptr, vcpu->xsave.size);
422 
423     rc = xc_domctl(xch, &domctl);
424     if ( rc )
425         PERROR("Failed to set vcpu%u's xsave info", vcpuid);
426 
427     xc_hypercall_buffer_free(xch, buffer);
428 
429     return rc;
430 }
431 
/*
 * Process one stashed vcpu worth of msr state and send to Xen.
 *
 * The stashed blob is an array of xen_domctl_vcpu_msr_t entries (its size
 * was validated as a whole multiple on receipt).  It is copied into a
 * hypercall-safe buffer and handed to XEN_DOMCTL_set_vcpu_msrs.
 */
static int process_vcpu_msrs(struct xc_sr_context *ctx,
                             unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_x86_pv_restore_vcpu *vcpu =
        &ctx->x86.pv.restore.vcpus[vcpuid];
    int rc;
    DECLARE_DOMCTL;
    DECLARE_HYPERCALL_BUFFER(void, buffer);

    buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msr.size);
    if ( !buffer )
    {
        ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
              vcpu->msr.size);
        return -1;
    }

    domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
    domctl.domain = ctx->domid;
    domctl.u.vcpu_msrs.vcpu = vcpuid;
    domctl.u.vcpu_msrs.msr_count = vcpu->msr.size / sizeof(xen_domctl_vcpu_msr_t);
    set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);

    memcpy(buffer, vcpu->msr.ptr, vcpu->msr.size);

    rc = xc_domctl(xch, &domctl);
    if ( rc )
        PERROR("Failed to set vcpu%u's msrs", vcpuid);

    /* Free the buffer whether or not the hypercall succeeded. */
    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}
469 
470 /*
471  * Process all stashed vcpu context and send to Xen.
472  */
update_vcpu_context(struct xc_sr_context * ctx)473 static int update_vcpu_context(struct xc_sr_context *ctx)
474 {
475     xc_interface *xch = ctx->xch;
476     struct xc_sr_x86_pv_restore_vcpu *vcpu;
477     unsigned int i;
478     int rc = 0;
479 
480     for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
481     {
482         vcpu = &ctx->x86.pv.restore.vcpus[i];
483 
484         if ( vcpu->basic.ptr )
485         {
486             rc = process_vcpu_basic(ctx, i);
487             if ( rc )
488                 return rc;
489         }
490         else if ( i == 0 )
491         {
492             ERROR("Sender didn't send vcpu0's basic state");
493             return -1;
494         }
495 
496         if ( vcpu->extd.ptr )
497         {
498             rc = process_vcpu_extended(ctx, i);
499             if ( rc )
500                 return rc;
501         }
502 
503         if ( vcpu->xsave.ptr )
504         {
505             rc = process_vcpu_xsave(ctx, i);
506             if ( rc )
507                 return rc;
508         }
509 
510         if ( vcpu->msr.ptr )
511         {
512             rc = process_vcpu_msrs(ctx, i);
513             if ( rc )
514                 return rc;
515         }
516     }
517 
518     return rc;
519 }
520 
/*
 * Copy the p2m which has been constructed locally as memory has been
 * allocated, over the p2m in guest, so the guest can find its memory again on
 * resume.
 *
 * Returns 0 on success, -1 on error.
 */
static int update_guest_p2m(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t mfn, pfn, *guest_p2m = NULL;
    unsigned int i;
    int rc = -1;

    /*
     * Translate each frame backing the guest's p2m table from the pfn
     * recorded in the stream to the mfn it now occupies.  Note this
     * rewrites ctx->x86.pv.p2m_pfns in place.
     */
    for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
    {
        pfn = ctx->x86.pv.p2m_pfns[i];

        if ( pfn > ctx->x86.pv.max_pfn )
        {
            ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
                  pfn, i);
            goto err;
        }

        /* p2m frames must be plain data frames. */
        if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
                  (ctx->x86.pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT));
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("p2m_frame_list[%u] has bad mfn", i);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        ctx->x86.pv.p2m_pfns[i] = mfn;
    }

    /* Map the p2m frames as one contiguous region and copy our p2m over. */
    guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
                                     ctx->x86.pv.p2m_pfns,
                                     ctx->x86.pv.p2m_frames);
    if ( !guest_p2m )
    {
        PERROR("Failed to map p2m frames");
        goto err;
    }

    memcpy(guest_p2m, ctx->x86.pv.p2m,
           (ctx->x86.pv.max_pfn + 1) * ctx->x86.pv.width);
    rc = 0;

 err:
    if ( guest_p2m )
        munmap(guest_p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);

    return rc;
}
582 
583 /*
584  * The valid width/pt_levels values in X86_PV_INFO are inextricably linked.
585  * Cross-check the legitimate combinations.
586  */
valid_x86_pv_info_combination(const struct xc_sr_rec_x86_pv_info * info)587 static bool valid_x86_pv_info_combination(
588     const struct xc_sr_rec_x86_pv_info *info)
589 {
590     switch ( info->guest_width )
591     {
592     case 4:  return info->pt_levels == 3;
593     case 8:  return info->pt_levels == 4;
594     default: return false;
595     }
596 }
597 
/*
 * Process an X86_PV_INFO record.
 *
 * Validates the record and the guest_width/pt_levels pairing, switches the
 * domain's address size in Xen if it differs from the incoming stream
 * (refreshing our cached domain information afterwards), and cross-checks
 * the final settings.  Must be seen at most once per stream.
 */
static int handle_x86_pv_info(struct xc_sr_context *ctx,
                              struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_info *info = rec->data;

    /* Duplicate records indicate a malformed stream. */
    if ( ctx->x86.pv.restore.seen_pv_info )
    {
        ERROR("Already received X86_PV_INFO record");
        return -1;
    }

    if ( rec->length < sizeof(*info) )
    {
        ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
              rec->length, sizeof(*info));
        return -1;
    }

    if ( !valid_x86_pv_info_combination(info) )
    {
        ERROR("Invalid X86_PV_INFO combination: width %u, pt_levels %u",
              info->guest_width, info->pt_levels);
        return -1;
    }

    /*
     * PV domains default to native width.  For an incomming compat domain, we
     * will typically be the first entity to inform Xen.
     */
    if ( info->guest_width != ctx->x86.pv.width )
    {
        struct xen_domctl domctl = {
            .domain = ctx->domid,
            .cmd    = XEN_DOMCTL_set_address_size,
            .u.address_size.size = info->guest_width * 8,  /* bytes -> bits */
        };
        int rc = do_domctl(xch, &domctl);

        if ( rc != 0 )
        {
            ERROR("Failed to update d%d address size to %u",
                  ctx->domid, info->guest_width * 8);
            return -1;
        }

        /* Domain's information changed, better to refresh. */
        rc = x86_pv_domain_info(ctx);
        if ( rc != 0 )
        {
            ERROR("Unable to refresh guest information");
            return -1;
        }
    }

    /* Sanity check (possibly new) domain settings. */
    if ( (info->guest_width != ctx->x86.pv.width) ||
         (info->pt_levels   != ctx->x86.pv.levels) )
    {
        ERROR("X86_PV_INFO width/pt_levels settings %u/%u mismatch with d%d %u/%u",
              info->guest_width, info->pt_levels, ctx->domid,
              ctx->x86.pv.width, ctx->x86.pv.levels);
        return -1;
    }

    ctx->x86.pv.restore.seen_pv_info = true;
    return 0;
}
669 
670 /*
671  * Process an X86_PV_P2M_FRAMES record.  Takes care of expanding the local p2m
672  * state if needed.
673  */
handle_x86_pv_p2m_frames(struct xc_sr_context * ctx,struct xc_sr_record * rec)674 static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
675                                     struct xc_sr_record *rec)
676 {
677     xc_interface *xch = ctx->xch;
678     struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
679     unsigned int start, end, x, fpp = PAGE_SIZE / ctx->x86.pv.width;
680     int rc;
681 
682     /* v2 compat.  Infer the position of STATIC_DATA_END. */
683     if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
684     {
685         rc = handle_static_data_end(ctx);
686         if ( rc )
687         {
688             ERROR("Inferred STATIC_DATA_END record failed");
689             return rc;
690         }
691     }
692 
693     if ( !ctx->restore.seen_static_data_end )
694     {
695         ERROR("No STATIC_DATA_END seen");
696         return -1;
697     }
698 
699     if ( !ctx->x86.pv.restore.seen_pv_info )
700     {
701         ERROR("Not yet received X86_PV_INFO record");
702         return -1;
703     }
704 
705     if ( rec->length < sizeof(*data) )
706     {
707         ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
708               rec->length, sizeof(*data) + sizeof(uint64_t));
709         return -1;
710     }
711 
712     if ( data->start_pfn > data->end_pfn )
713     {
714         ERROR("End pfn in stream (%#x) exceeds Start (%#x)",
715               data->end_pfn, data->start_pfn);
716         return -1;
717     }
718 
719     start =  data->start_pfn / fpp;
720     end = data->end_pfn / fpp + 1;
721 
722     if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
723     {
724         ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
725               ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
726               data->start_pfn, data->end_pfn, rec->length,
727               sizeof(*data), end, start, sizeof(uint64_t));
728         return -1;
729     }
730 
731     if ( data->end_pfn > ctx->x86.pv.max_pfn )
732     {
733         rc = expand_p2m(ctx, data->end_pfn);
734         if ( rc )
735             return rc;
736     }
737 
738     for ( x = 0; x < (end - start); ++x )
739         ctx->x86.pv.p2m_pfns[start + x] = data->p2m_pfns[x];
740 
741     return 0;
742 }
743 
/*
 * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
 * The blobs are all stashed to one side as they need to be deferred until the
 * very end of the stream, rather than being send to Xen at the point they
 * arrive in the stream.  It performs all pre-hypercall size validation.
 */
static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
                                   struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
    struct xc_sr_x86_pv_restore_vcpu *vcpu;
    const char *rec_name;
    size_t blobsz;
    struct xc_sr_blob *blob = NULL;
    int rc = -1;

    /* Resolve the record type to a human-readable name for diagnostics. */
    switch ( rec->type )
    {
    case REC_TYPE_X86_PV_VCPU_BASIC:
        rec_name = "X86_PV_VCPU_BASIC";
        break;

    case REC_TYPE_X86_PV_VCPU_EXTENDED:
        rec_name = "X86_PV_VCPU_EXTENDED";
        break;

    case REC_TYPE_X86_PV_VCPU_XSAVE:
        rec_name = "X86_PV_VCPU_XSAVE";
        break;

    case REC_TYPE_X86_PV_VCPU_MSRS:
        rec_name = "X86_PV_VCPU_MSRS";
        break;

    default:
        ERROR("Unrecognised vcpu blob record %s (%u)",
              rec_type_to_str(rec->type), rec->type);
        goto out;
    }

    /* Confirm that there is a complete header. */
    if ( rec->length < sizeof(*vhdr) )
    {
        ERROR("%s record truncated: length %u, header size %zu",
              rec_name, rec->length, sizeof(*vhdr));
        goto out;
    }

    blobsz = rec->length - sizeof(*vhdr);

    /*
     * Tolerate empty records.  Older sending sides used to accidentally
     * generate them.
     */
    if ( blobsz == 0 )
    {
        DBGPRINTF("Skipping empty %s record for vcpu %u\n",
                  rec_type_to_str(rec->type), vhdr->vcpu_id);
        rc = 0;
        goto out;
    }

    /* Check that the vcpu id is within range. */
    if ( vhdr->vcpu_id >= ctx->x86.pv.restore.nr_vcpus )
    {
        ERROR("%s record vcpu_id (%u) exceeds domain max (%u)",
              rec_name, vhdr->vcpu_id, ctx->x86.pv.restore.nr_vcpus - 1);
        goto out;
    }

    vcpu = &ctx->x86.pv.restore.vcpus[vhdr->vcpu_id];

    /*
     * Further per-record checks, where possible.  No default case is
     * needed: the first switch already rejected any other type, so blob is
     * always assigned before use.
     */
    switch ( rec->type )
    {
    case REC_TYPE_X86_PV_VCPU_BASIC:
    {
        size_t vcpusz = ctx->x86.pv.width == 8 ?
            sizeof(vcpu_guest_context_x86_64_t) :
            sizeof(vcpu_guest_context_x86_32_t);

        if ( blobsz != vcpusz )
        {
            ERROR("%s record wrong size: expected %zu, got %u",
                  rec_name, sizeof(*vhdr) + vcpusz, rec->length);
            goto out;
        }
        blob = &vcpu->basic;
        break;
    }

    case REC_TYPE_X86_PV_VCPU_EXTENDED:
        /* NOTE(review): 128 is presumably an upper bound tied to the
         * ext_vcpucontext domctl payload size — confirm before changing. */
        if ( blobsz > 128 )
        {
            ERROR("%s record too long: max %zu, got %u",
                  rec_name, sizeof(*vhdr) + 128, rec->length);
            goto out;
        }
        blob = &vcpu->extd;
        break;

    case REC_TYPE_X86_PV_VCPU_XSAVE:
        /* NOTE(review): 16 looks like a minimum xsave header size — confirm. */
        if ( blobsz < 16 )
        {
            ERROR("%s record too short: min %zu, got %u",
                  rec_name, sizeof(*vhdr) + 16, rec->length);
            goto out;
        }
        blob = &vcpu->xsave;
        break;

    case REC_TYPE_X86_PV_VCPU_MSRS:
        /* The blob must be a whole number of MSR entries. */
        if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
        {
            ERROR("%s record payload size %zu expected to be a multiple of %zu",
                  rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t));
            goto out;
        }
        blob = &vcpu->msr;
        break;
    }

    /* Stash a copy of the payload for processing at end-of-stream. */
    rc = update_blob(blob, vhdr->context, blobsz);
    if ( rc )
        ERROR("Unable to allocate %zu bytes for vcpu%u %s blob",
              blobsz, vhdr->vcpu_id, rec_name);

 out:
    return rc;
}
875 
/*
 * Process a SHARED_INFO record from the stream.
 *
 * Copies the vcpu_info and arch portions of the incoming page over the
 * domain's live shared info page, then neutralises state which must not
 * survive the move: the p2m frame-list-list pointer, pending event channel
 * state, and the event channel masks (all channels masked).
 */
static int handle_shared_info(struct xc_sr_context *ctx,
                              struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    unsigned int i;
    int rc = -1;
    shared_info_any_t *guest_shinfo = NULL;
    const shared_info_any_t *old_shinfo = rec->data;

    /* Guest width is needed to interpret the page layout. */
    if ( !ctx->x86.pv.restore.seen_pv_info )
    {
        ERROR("Not yet received X86_PV_INFO record");
        return -1;
    }

    if ( rec->length != PAGE_SIZE )
    {
        ERROR("X86_PV_SHARED_INFO record wrong size: length %u"
              ", expected 4096", rec->length);
        goto err;
    }

    guest_shinfo = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
        ctx->dominfo.shared_info_frame);
    if ( !guest_shinfo )
    {
        PERROR("Failed to map Shared Info at mfn %#lx",
               ctx->dominfo.shared_info_frame);
        goto err;
    }

    /* Carry the vcpu_info and arch substructures over from the stream. */
    MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86.pv.width);
    MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86.pv.width);

    /* The old p2m frame-list-list pointer is meaningless on this host. */
    SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list,
              0, ctx->x86.pv.width);

    /* Clear all pending events and per-vcpu pending selectors. */
    MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86.pv.width);
    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
        SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel,
                  0, ctx->x86.pv.width);

    /* Mask every event channel. */
    MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86.pv.width);

    rc = 0;

 err:
    if ( guest_shinfo )
        munmap(guest_shinfo, PAGE_SIZE);

    return rc;
}
932 
/* restore_ops function.  Query whether a pfn is within the tracked p2m. */
static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
{
    return pfn <= ctx->x86.pv.max_pfn;
}
938 
/* restore_ops function.  Record the page type for a pfn. */
static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
                                 unsigned long type)
{
    /* Callers must have expanded the p2m far enough to cover pfn. */
    assert(pfn <= ctx->x86.pv.max_pfn);

    ctx->x86.pv.restore.pfn_types[pfn] = type;
}
947 
/* restore_ops function.  Update the local p2m with a pfn -> mfn mapping. */
static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
                           xen_pfn_t mfn)
{
    /* Callers must have expanded the p2m far enough to cover pfn. */
    assert(pfn <= ctx->x86.pv.max_pfn);

    if ( ctx->x86.pv.width == sizeof(uint64_t) )
        /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
        ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
    else
        /* 32 bit guest.  Can truncate INVALID_MFN for 64 bit toolstacks. */
        ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn;
}
961 
/*
 * restore_ops function.  Convert pfns back to mfns in pagetables.  Possibly
 * needs to populate new frames if a PTE is found referring to a frame which
 * hasn't yet been seen from PAGE_DATA records.
 */
static int x86_pv_localise_page(struct xc_sr_context *ctx,
                                uint32_t type, void *page)
{
    xc_interface *xch = ctx->xch;
    uint64_t *table = page;
    uint64_t pte;
    unsigned int i, to_populate;
    /* Worst case: every PTE in the page refers to an unpopulated frame. */
    xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))];

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    /* Only page tables need localisation. */
    if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
        return 0;

    /*
     * First pass: check to see whether we need to populate any new frames.
     * Batch the unpopulated pfns so a single populate_pfns() call suffices.
     */
    for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        pte = table[i];

        if ( pte & _PAGE_PRESENT )
        {
            xen_pfn_t pfn = pte_to_frame(pte);

#ifdef __i386__
            /*
             * On a 32 bit toolstack, xen_pfn_t is narrower than the frame
             * field in a 64 bit PTE, so pte_to_frame() can yield a truncated
             * value indistinguishable from INVALID_MFN.  Refuse the stream
             * rather than silently corrupting the pagetable.
             */
            if ( pfn == INVALID_MFN )
            {
                ERROR("PTE truncation detected.  L%u[%u] = %016"PRIx64,
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                errno = E2BIG;
                return -1;
            }
#endif

            if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
                pfns[to_populate++] = pfn;
        }
    }

    if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
        return -1;

    /*
     * Second pass: all referenced frames now exist, so rewrite each present
     * PTE's frame from pfn to mfn.
     */
    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        pte = table[i];

        if ( pte & _PAGE_PRESENT )
        {
            xen_pfn_t mfn, pfn;

            pfn = pte_to_frame(pte);
            mfn = pfn_to_mfn(ctx, pfn);

            if ( !mfn_in_pseudophysmap(ctx, mfn) )
            {
                ERROR("Bad mfn for L%u[%u] - pte %"PRIx64,
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                dump_bad_pseudophysmap_entry(ctx, mfn);
                errno = ERANGE;
                return -1;
            }

            table[i] = merge_pte(pte, mfn);
        }
    }

    return 0;
}
1035 
1036 /*
1037  * restore_ops function.  Confirm that the incoming stream matches the type of
1038  * domain we are attempting to restore into.
1039  */
x86_pv_setup(struct xc_sr_context * ctx)1040 static int x86_pv_setup(struct xc_sr_context *ctx)
1041 {
1042     xc_interface *xch = ctx->xch;
1043     int rc;
1044 
1045     if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
1046     {
1047         ERROR("Unable to restore %s domain into an x86_pv domain",
1048               dhdr_type_to_str(ctx->restore.guest_type));
1049         return -1;
1050     }
1051 
1052     if ( ctx->restore.guest_page_size != PAGE_SIZE )
1053     {
1054         ERROR("Invalid page size %d for x86_pv domains",
1055               ctx->restore.guest_page_size);
1056         return -1;
1057     }
1058 
1059     rc = x86_pv_domain_info(ctx);
1060     if ( rc )
1061         return rc;
1062 
1063     ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
1064     ctx->x86.pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu),
1065                                        ctx->x86.pv.restore.nr_vcpus);
1066     if ( !ctx->x86.pv.restore.vcpus )
1067     {
1068         errno = ENOMEM;
1069         return -1;
1070     }
1071 
1072     rc = x86_pv_map_m2p(ctx);
1073     if ( rc )
1074         return rc;
1075 
1076     return rc;
1077 }
1078 
1079 /*
1080  * restore_ops function.
1081  */
x86_pv_process_record(struct xc_sr_context * ctx,struct xc_sr_record * rec)1082 static int x86_pv_process_record(struct xc_sr_context *ctx,
1083                                  struct xc_sr_record *rec)
1084 {
1085     switch ( rec->type )
1086     {
1087     case REC_TYPE_X86_PV_INFO:
1088         return handle_x86_pv_info(ctx, rec);
1089 
1090     case REC_TYPE_X86_PV_P2M_FRAMES:
1091         return handle_x86_pv_p2m_frames(ctx, rec);
1092 
1093     case REC_TYPE_X86_PV_VCPU_BASIC:
1094     case REC_TYPE_X86_PV_VCPU_EXTENDED:
1095     case REC_TYPE_X86_PV_VCPU_XSAVE:
1096     case REC_TYPE_X86_PV_VCPU_MSRS:
1097         return handle_x86_pv_vcpu_blob(ctx, rec);
1098 
1099     case REC_TYPE_SHARED_INFO:
1100         return handle_shared_info(ctx, rec);
1101 
1102     case REC_TYPE_X86_TSC_INFO:
1103         return handle_x86_tsc_info(ctx, rec);
1104 
1105     case REC_TYPE_X86_CPUID_POLICY:
1106         return handle_x86_cpuid_policy(ctx, rec);
1107 
1108     case REC_TYPE_X86_MSR_POLICY:
1109         return handle_x86_msr_policy(ctx, rec);
1110 
1111     default:
1112         return RECORD_NOT_PROCESSED;
1113     }
1114 }
1115 
1116 /*
1117  * restore_ops function.  Update the vcpu context in Xen, pin the pagetables,
1118  * rewrite the p2m and seed the grant table.
1119  */
x86_pv_stream_complete(struct xc_sr_context * ctx)1120 static int x86_pv_stream_complete(struct xc_sr_context *ctx)
1121 {
1122     xc_interface *xch = ctx->xch;
1123     int rc;
1124 
1125     rc = update_vcpu_context(ctx);
1126     if ( rc )
1127         return rc;
1128 
1129     rc = pin_pagetables(ctx);
1130     if ( rc )
1131         return rc;
1132 
1133     rc = update_guest_p2m(ctx);
1134     if ( rc )
1135         return rc;
1136 
1137     rc = xc_dom_gnttab_seed(xch, ctx->domid, false,
1138                             ctx->restore.console_gfn,
1139                             ctx->restore.xenstore_gfn,
1140                             ctx->restore.console_domid,
1141                             ctx->restore.xenstore_domid);
1142     if ( rc )
1143     {
1144         PERROR("Failed to seed grant table");
1145         return rc;
1146     }
1147 
1148     return rc;
1149 }
1150 
1151 /*
1152  * restore_ops function.
1153  */
x86_pv_cleanup(struct xc_sr_context * ctx)1154 static int x86_pv_cleanup(struct xc_sr_context *ctx)
1155 {
1156     free(ctx->x86.pv.p2m);
1157     free(ctx->x86.pv.p2m_pfns);
1158 
1159     if ( ctx->x86.pv.restore.vcpus )
1160     {
1161         unsigned int i;
1162 
1163         for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
1164         {
1165             struct xc_sr_x86_pv_restore_vcpu *vcpu =
1166                 &ctx->x86.pv.restore.vcpus[i];
1167 
1168             free(vcpu->basic.ptr);
1169             free(vcpu->extd.ptr);
1170             free(vcpu->xsave.ptr);
1171             free(vcpu->msr.ptr);
1172         }
1173 
1174         free(ctx->x86.pv.restore.vcpus);
1175     }
1176 
1177     free(ctx->x86.pv.restore.pfn_types);
1178 
1179     if ( ctx->x86.pv.m2p )
1180         munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
1181 
1182     free(ctx->x86.restore.cpuid.ptr);
1183     free(ctx->x86.restore.msr.ptr);
1184 
1185     return 0;
1186 }
1187 
/* restore_ops function table for x86 PV domains. */
struct xc_sr_restore_ops restore_ops_x86_pv =
{
    .pfn_is_valid    = x86_pv_pfn_is_valid,
    .pfn_to_gfn      = pfn_to_mfn,
    .set_page_type   = x86_pv_set_page_type,
    .set_gfn         = x86_pv_set_gfn,
    .localise_page   = x86_pv_localise_page,
    .setup           = x86_pv_setup,
    .process_record  = x86_pv_process_record,
    .static_data_complete = x86_static_data_complete,
    .stream_complete = x86_pv_stream_complete,
    .cleanup         = x86_pv_cleanup,
};
1201 
1202 /*
1203  * Local variables:
1204  * mode: C
1205  * c-file-style: "BSD"
1206  * c-basic-offset: 4
1207  * tab-width: 4
1208  * indent-tabs-mode: nil
1209  * End:
1210  */
1211