1 /******************************************************************************
2  * arch/x86/pv/emul-priv-op.c
3  *
4  * Emulate privileged instructions for PV guests
5  *
6  * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <xen/domain_page.h>
23 #include <xen/event.h>
24 #include <xen/guest_access.h>
25 #include <xen/iocap.h>
26 
27 #include <asm/amd.h>
28 #include <asm/debugreg.h>
29 #include <asm/hpet.h>
30 #include <asm/hypercall.h>
31 #include <asm/mc146818rtc.h>
32 #include <asm/pv/domain.h>
33 #include <asm/shared.h>
34 
35 #include <xsm/xsm.h>
36 
37 #include "../x86_64/mmconfig.h"
38 #include "emulate.h"
39 #include "mm.h"
40 
41 struct priv_op_ctxt {
42     struct x86_emulate_ctxt ctxt;
43     struct {
44         unsigned long base, limit;
45     } cs;
46     char *io_emul_stub;
47     unsigned int bpmatch;
48 };
49 
50 /* I/O emulation helpers.  Use non-standard calling conventions. */
51 void nocall load_guest_gprs(struct cpu_user_regs *);
52 void nocall save_guest_gprs(void);
53 
54 typedef void io_emul_stub_t(struct cpu_user_regs *);
55 
56 static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
57                                           unsigned int port, unsigned int bytes)
58 {
59     /*
60      * Construct a stub for IN/OUT emulation.
61      *
62      * Some platform drivers communicate with the SMM handler using GPRs as a
63      * mailbox.  Therefore, we must perform the emulation with the hardware
64      * domain's registers in view.
65      *
66      * We write a stub of the following form, using the guest load/save
67      * helpers (non-standard ABI), and one of several possible stubs
68      * performing the real I/O.
69      */
70     static const char prologue[] = {
71         0x53,       /* push %rbx */
72         0x55,       /* push %rbp */
73         0x41, 0x54, /* push %r12 */
74         0x41, 0x55, /* push %r13 */
75         0x41, 0x56, /* push %r14 */
76         0x41, 0x57, /* push %r15 */
77         0x57,       /* push %rdi (param for save_guest_gprs) */
78     };              /* call load_guest_gprs */
79                     /* <I/O stub> */
80                     /* call save_guest_gprs */
81     static const char epilogue[] = {
82         0x5f,       /* pop %rdi  */
83         0x41, 0x5f, /* pop %r15  */
84         0x41, 0x5e, /* pop %r14  */
85         0x41, 0x5d, /* pop %r13  */
86         0x41, 0x5c, /* pop %r12  */
87         0x5d,       /* pop %rbp  */
88         0x5b,       /* pop %rbx  */
89         0xc3,       /* ret       */
90     };
91 
92     struct stubs *this_stubs = &this_cpu(stubs);
93     unsigned long stub_va = this_stubs->addr + STUB_BUF_SIZE / 2;
94     unsigned int quirk_bytes = 0;
95     char *p;
96 
97     /* Helpers - Read outer scope but only modify p. */
98 #define APPEND_BUFF(b) ({ memcpy(p, b, sizeof(b)); p += sizeof(b); })
99 #define APPEND_CALL(f)                                                  \
100     ({                                                                  \
101         long disp = (long)(f) - (stub_va + p - ctxt->io_emul_stub + 5); \
102         BUG_ON((int32_t)disp != disp);                                  \
103         *p++ = 0xe8;                                                    \
104         *(int32_t *)p = disp; p += 4;                                   \
105     })
106 
107     if ( !ctxt->io_emul_stub )
108         ctxt->io_emul_stub =
109             map_domain_page(_mfn(this_stubs->mfn)) + (stub_va & ~PAGE_MASK);
110 
111     p = ctxt->io_emul_stub;
112 
113     APPEND_BUFF(prologue);
114     APPEND_CALL(load_guest_gprs);
115 
116     /* Some platforms might need to quirk the stub for specific inputs. */
117     if ( unlikely(ioemul_handle_quirk) )
118     {
119         quirk_bytes = ioemul_handle_quirk(opcode, p, ctxt->ctxt.regs);
120         p += quirk_bytes;
121     }
122 
123     /* Default I/O stub. */
124     if ( likely(!quirk_bytes) )
125     {
126         *p++ = (bytes != 2) ? 0x90 : 0x66;  /* data16 or nop */
127         *p++ = opcode;                      /* <opcode>      */
128         *p++ = !(opcode & 8) ? port : 0x90; /* imm8 or nop   */
129     }
130 
131     APPEND_CALL(save_guest_gprs);
132     APPEND_BUFF(epilogue);
133 
134     /* Build-time best effort attempt to catch problems. */
135     BUILD_BUG_ON(STUB_BUF_SIZE / 2 <
136                  (sizeof(prologue) + sizeof(epilogue) + 10 /* 2x call */ +
137                   MAX(3 /* default stub */, IOEMUL_QUIRK_STUB_BYTES)));
138     /* Runtime confirmation that we haven't clobbered an adjacent stub. */
139     BUG_ON(STUB_BUF_SIZE / 2 < (p - ctxt->io_emul_stub));
140 
141     /* Handy function-typed pointer to the stub. */
142     return (void *)stub_va;
143 
144 #undef APPEND_CALL
145 #undef APPEND_BUFF
146 }
147 
148 
149 /* Perform an IOPL check between the vcpu's shadowed IOPL and the assumed cpl. */
150 static bool iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
151 {
152     unsigned int cpl = guest_kernel_mode(v, regs) ?
153         (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
154 
155     ASSERT((v->arch.pv.iopl & ~X86_EFLAGS_IOPL) == 0);
156 
157     return IOPL(cpl) <= v->arch.pv.iopl;
158 }
159 
160 /* Has the guest requested sufficient permission for this I/O access? */
161 static bool guest_io_okay(unsigned int port, unsigned int bytes,
162                           struct vcpu *v, struct cpu_user_regs *regs)
163 {
164     /* If in user mode, switch to kernel mode just to read I/O bitmap. */
165     const bool user_mode = !(v->arch.flags & TF_kernel_mode);
166 
167     if ( iopl_ok(v, regs) )
168         return true;
169 
170     if ( (port + bytes) <= v->arch.pv.iobmp_limit )
171     {
172         union { uint8_t bytes[2]; uint16_t mask; } x;
173 
174         /*
175          * Grab permission bytes from guest space. Inaccessible bytes are
176          * read as 0xff (no access allowed).
177          */
178         if ( user_mode )
179             toggle_guest_pt(v);
180 
181         switch ( __copy_from_guest_offset(x.bytes, v->arch.pv.iobmp,
182                                           port>>3, 2) )
183         {
184         default: x.bytes[0] = ~0;
185             /* fallthrough */
186         case 1:  x.bytes[1] = ~0;
187             /* fallthrough */
188         case 0:  break;
189         }
190 
191         if ( user_mode )
192             toggle_guest_pt(v);
193 
194         if ( (x.mask & (((1 << bytes) - 1) << (port & 7))) == 0 )
195             return true;
196     }
197 
198     return false;
199 }
200 
201 /* Has the administrator granted sufficient permission for this I/O access? */
202 static bool admin_io_okay(unsigned int port, unsigned int bytes,
203                           const struct domain *d)
204 {
205     /*
206      * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
207      * We never permit direct access to that register.
208      */
209     if ( (port == 0xcf8) && (bytes == 4) )
210         return false;
211 
212     /* We also never permit direct access to the RTC/CMOS registers. */
213     if ( ((port & ~1) == RTC_PORT(0)) )
214         return false;
215 
216     return ioports_access_permitted(d, port, port + bytes - 1);
217 }
218 
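/*
 * Check whether a hardware domain access via the 0xcf8/0xcfc config space
 * mechanism may be forwarded.  Writes to devices marked read-only are
 * refused, AMD extended config space offsets are only honoured when enabled
 * in MSR_AMD64_NB_CFG, and reads are additionally subject to an XSM check.
 */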
219 static bool pci_cfg_ok(struct domain *currd, unsigned int start,
220                        unsigned int size, uint32_t *write)
221 {
222     uint32_t machine_bdf;
223 
224     if ( !is_hardware_domain(currd) )
225         return false;
226 
227     if ( !CF8_ENABLED(currd->arch.pci_cf8) )
228         return true;
229 
230     machine_bdf = CF8_BDF(currd->arch.pci_cf8);
231     if ( write )
232     {
233         const unsigned long *ro_map = pci_get_ro_map(0);
234 
235         if ( ro_map && test_bit(machine_bdf, ro_map) )
236             return false;
237     }
238     start |= CF8_ADDR_LO(currd->arch.pci_cf8);
239     /* AMD extended configuration space access? */
240     if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
241          boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
242          boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 < 0x17 )
243     {
244         uint64_t msr_val;
245 
246         if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
247             return false;
248         if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
249             start |= CF8_ADDR_HI(currd->arch.pci_cf8);
250     }
251 
252     return !write ?
253            xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
254                                      start, start + size - 1, 0) == 0 :
255            pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
256 }
257 
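/*
 * Port read on behalf of the guest.  Ports the domain may access directly
 * are read from hardware; the PIT, RTC/CMOS and PCI config space ports are
 * emulated; everything else reads as all ones.
 */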
258 static uint32_t guest_io_read(unsigned int port, unsigned int bytes,
259                               struct domain *currd)
260 {
261     uint32_t data = 0;
262     unsigned int shift = 0;
263 
264     if ( admin_io_okay(port, bytes, currd) )
265     {
266         switch ( bytes )
267         {
268         case 1: return inb(port);
269         case 2: return inw(port);
270         case 4: return inl(port);
271         }
272     }
273 
274     while ( bytes != 0 )
275     {
276         unsigned int size = 1;
277         uint32_t sub_data = ~0;
278 
279         if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
280         {
281             sub_data = pv_pit_handler(port, 0, 0);
282         }
283         else if ( port == RTC_PORT(0) || port == RTC_PORT(1) )
284         {
285             sub_data = rtc_guest_read(port);
286         }
287         else if ( (port == 0xcf8) && (bytes == 4) )
288         {
289             size = 4;
290             sub_data = currd->arch.pci_cf8;
291         }
292         else if ( (port & 0xfffc) == 0xcfc )
293         {
294             size = min(bytes, 4 - (port & 3));
295             if ( size == 3 )
296                 size = 2;
297             if ( pci_cfg_ok(currd, port & 3, size, NULL) )
298                 sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
299         }
300 
301         if ( size == 4 )
302             return sub_data;
303 
304         data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
305         shift += size * 8;
306         port += size;
307         bytes -= size;
308     }
309 
310     return data;
311 }
312 
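/*
 * Return a DR6-style mask of the emulated I/O breakpoints overlapping the
 * range [port, port + len).  I/O breakpoints are only armed when the guest
 * has CR4.DE set.
 */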
313 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
314                                               unsigned int port,
315                                               unsigned int len)
316 {
317     unsigned int width, i, match = 0;
318     unsigned long start;
319 
320     if ( !v->arch.pv.dr7_emul || !(v->arch.pv.ctrlreg[4] & X86_CR4_DE) )
321         return 0;
322 
323     for ( i = 0; i < 4; i++ )
324     {
325         if ( !(v->arch.pv.dr7_emul & (3 << (i * DR_ENABLE_SIZE))) )
326             continue;
327 
328         start = v->arch.dr[i];
329         width = 0;
330 
331         switch ( (v->arch.dr7 >>
332                   (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
333         {
334         case DR_LEN_1: width = 1; break;
335         case DR_LEN_2: width = 2; break;
336         case DR_LEN_4: width = 4; break;
337         case DR_LEN_8: width = 8; break;
338         }
339 
340         if ( (start < (port + len)) && ((start + width) > port) )
341             match |= 1u << i;
342     }
343 
344     return match;
345 }
346 
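/*
 * x86_emulate() hook for IN.  Ports the domain may access directly are
 * handled via a stub executed with the guest's GPRs in place; everything
 * else goes through guest_io_read().
 */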
347 static int read_io(unsigned int port, unsigned int bytes,
348                    unsigned long *val, struct x86_emulate_ctxt *ctxt)
349 {
350     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
351     struct vcpu *curr = current;
352     struct domain *currd = current->domain;
353 
354     /* INS must not come here. */
355     ASSERT((ctxt->opcode & ~9) == 0xe4);
356 
357     if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
358         return X86EMUL_UNHANDLEABLE;
359 
360     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
361 
362     if ( admin_io_okay(port, bytes, currd) )
363     {
364         io_emul_stub_t *io_emul =
365             io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
366 
367         io_emul(ctxt->regs);
368         return X86EMUL_DONE;
369     }
370 
371     *val = guest_io_read(port, bytes, currd);
372 
373     return X86EMUL_OKAY;
374 }
375 
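/*
 * Port write on behalf of the guest.  Ports the domain may access directly
 * are written to hardware; the PIT, RTC/CMOS and PCI config space ports are
 * emulated; everything else is silently dropped.
 */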
376 static void guest_io_write(unsigned int port, unsigned int bytes,
377                            uint32_t data, struct domain *currd)
378 {
379     if ( admin_io_okay(port, bytes, currd) )
380     {
381         switch ( bytes )
382         {
383         case 1:
384             outb((uint8_t)data, port);
385             if ( amd_acpi_c1e_quirk )
386                 amd_check_disable_c1e(port, (uint8_t)data);
387             break;
388         case 2:
389             outw((uint16_t)data, port);
390             break;
391         case 4:
392             outl(data, port);
393             break;
394         }
395         return;
396     }
397 
398     while ( bytes != 0 )
399     {
400         unsigned int size = 1;
401 
402         if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
403         {
404             pv_pit_handler(port, (uint8_t)data, 1);
405         }
406         else if ( port == RTC_PORT(0) || port == RTC_PORT(1) )
407         {
408             rtc_guest_write(port, data);
409         }
410         else if ( (port == 0xcf8) && (bytes == 4) )
411         {
412             size = 4;
413             currd->arch.pci_cf8 = data;
414         }
415         else if ( (port & 0xfffc) == 0xcfc )
416         {
417             size = min(bytes, 4 - (port & 3));
418             if ( size == 3 )
419                 size = 2;
420             if ( pci_cfg_ok(currd, port & 3, size, &data) )
421                 pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
422         }
423 
424         if ( size == 4 )
425             return;
426 
427         port += size;
428         bytes -= size;
429         data >>= size * 8;
430     }
431 }
432 
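/* x86_emulate() hook for OUT; mirrors read_io(), plus the AMD C1E quirk. */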
433 static int write_io(unsigned int port, unsigned int bytes,
434                     unsigned long val, struct x86_emulate_ctxt *ctxt)
435 {
436     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
437     struct vcpu *curr = current;
438     struct domain *currd = current->domain;
439 
440     /* OUTS must not come here. */
441     ASSERT((ctxt->opcode & ~9) == 0xe6);
442 
443     if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
444         return X86EMUL_UNHANDLEABLE;
445 
446     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
447 
448     if ( admin_io_okay(port, bytes, currd) )
449     {
450         io_emul_stub_t *io_emul =
451             io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
452 
453         io_emul(ctxt->regs);
454         if ( (bytes == 1) && amd_acpi_c1e_quirk )
455             amd_check_disable_c1e(port, val);
456         return X86EMUL_DONE;
457     }
458 
459     guest_io_write(port, bytes, val, currd);
460 
461     return X86EMUL_OKAY;
462 }
463 
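/*
 * x86_emulate() hook to obtain segment register state.  Flat 64-bit
 * segments are synthesised; otherwise the descriptor is read from the
 * guest's tables.  TR accesses from I/O instructions are deferred, as the
 * I/O bitmap check happens in the read_io()/write_io() hooks instead.
 */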
464 static int read_segment(enum x86_segment seg,
465                         struct segment_register *reg,
466                         struct x86_emulate_ctxt *ctxt)
467 {
468     /* Check if this is an attempt to access the I/O bitmap. */
469     if ( seg == x86_seg_tr )
470     {
471         switch ( ctxt->opcode )
472         {
473         case 0x6c ... 0x6f: /* ins / outs */
474         case 0xe4 ... 0xe7: /* in / out (immediate port) */
475         case 0xec ... 0xef: /* in / out (port in %dx) */
476             /* Defer the check to the read_io() / write_io() hooks. */
477             return X86EMUL_DONE;
478         }
479     }
480 
481     if ( ctxt->addr_size < 64 )
482     {
483         unsigned long limit;
484         unsigned int sel, ar;
485 
486         switch ( seg )
487         {
488         case x86_seg_cs: sel = ctxt->regs->cs; break;
489         case x86_seg_ds: sel = read_sreg(ds);  break;
490         case x86_seg_es: sel = read_sreg(es);  break;
491         case x86_seg_fs: sel = read_sreg(fs);  break;
492         case x86_seg_gs: sel = read_sreg(gs);  break;
493         case x86_seg_ss: sel = ctxt->regs->ss; break;
494         default: return X86EMUL_UNHANDLEABLE;
495         }
496 
497         if ( !pv_emul_read_descriptor(sel, current, &reg->base,
498                                       &limit, &ar, 0) )
499             return X86EMUL_UNHANDLEABLE;
500 
501         reg->limit = limit;
502         reg->attr = ar >> 8;
503     }
504     else
505     {
506         switch ( seg )
507         {
508         default:
509             if ( !is_x86_user_segment(seg) )
510                 return X86EMUL_UNHANDLEABLE;
511             reg->base = 0;
512             break;
513         case x86_seg_fs:
514             reg->base = rdfsbase();
515             break;
516         case x86_seg_gs:
517             reg->base = rdgsbase();
518             break;
519         }
520 
521         reg->limit = ~0U;
522 
523         reg->attr = 0;
524         reg->type = _SEGMENT_WR >> 8;
525         if ( seg == x86_seg_cs )
526         {
527             reg->type |= _SEGMENT_CODE >> 8;
528             reg->l = 1;
529         }
530         else
531             reg->db = 1;
532         reg->s   = 1;
533         reg->dpl = 3;
534         reg->p   = 1;
535         reg->g   = 1;
536     }
537 
538     /*
539      * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
540      * Also do this for consistency for non-conforming code segments.
541      */
542     if ( (seg == x86_seg_ss ||
543           (seg == x86_seg_cs &&
544            !(reg->type & (_SEGMENT_EC >> 8)))) &&
545          guest_kernel_mode(current, ctxt->regs) )
546         reg->dpl = 0;
547 
548     return X86EMUL_OKAY;
549 }
550 
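/*
 * Translate a segment:offset pair into a linear address, raising #GP (or
 * #SS for stack accesses) on a limit violation or an address outside the
 * guest-accessible range.
 */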
551 static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
552                                   unsigned int bytes, unsigned long limit,
553                                   enum x86_segment seg,
554                                   struct x86_emulate_ctxt *ctxt,
555                                   unsigned long *addr)
556 {
557     int rc = X86EMUL_OKAY;
558 
559     *addr = base + offset;
560 
561     if ( ctxt->addr_size < 64 )
562     {
563         if ( limit < bytes - 1 || offset > limit - bytes + 1 )
564             rc = X86EMUL_EXCEPTION;
565         *addr = (uint32_t)*addr;
566     }
567     else if ( !__addr_ok(*addr) )
568         rc = X86EMUL_EXCEPTION;
569 
570     if ( unlikely(rc == X86EMUL_EXCEPTION) )
571         x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
572                                                 : TRAP_stack_error,
573                               0, ctxt);
574 
575     return rc;
576 }
577 
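/*
 * x86_emulate() hook for INS: read the port and store to guest memory at
 * ES:offset, advancing per EFLAGS.DF, until the repetition count is
 * reached, an I/O breakpoint matches, or preemption is needed.
 */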
578 static int rep_ins(uint16_t port,
579                    enum x86_segment seg, unsigned long offset,
580                    unsigned int bytes_per_rep, unsigned long *reps,
581                    struct x86_emulate_ctxt *ctxt)
582 {
583     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
584     struct vcpu *curr = current;
585     struct domain *currd = current->domain;
586     unsigned long goal = *reps;
587     struct segment_register sreg;
588     int rc;
589 
590     ASSERT(seg == x86_seg_es);
591 
592     *reps = 0;
593 
594     if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
595         return X86EMUL_UNHANDLEABLE;
596 
597     rc = read_segment(x86_seg_es, &sreg, ctxt);
598     if ( rc != X86EMUL_OKAY )
599         return rc;
600 
601     if ( !sreg.p )
602         return X86EMUL_UNHANDLEABLE;
603     if ( !sreg.s ||
604          (sreg.type & (_SEGMENT_CODE >> 8)) ||
605          !(sreg.type & (_SEGMENT_WR >> 8)) )
606     {
607         x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
608         return X86EMUL_EXCEPTION;
609     }
610 
611     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
612 
613     while ( *reps < goal )
614     {
615         unsigned int data = guest_io_read(port, bytes_per_rep, currd);
616         unsigned long addr;
617 
618         rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
619                                     sreg.limit, x86_seg_es, ctxt, &addr);
620         if ( rc != X86EMUL_OKAY )
621             return rc;
622 
623         if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
624         {
625             x86_emul_pagefault(PFEC_write_access,
626                                addr + bytes_per_rep - rc, ctxt);
627             return X86EMUL_EXCEPTION;
628         }
629 
630         ++*reps;
631 
632         if ( poc->bpmatch || hypercall_preempt_check() )
633             break;
634 
635         /* x86_emulate() clips the repetition count to ensure we don't wrap. */
636         if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
637             offset -= bytes_per_rep;
638         else
639             offset += bytes_per_rep;
640     }
641 
642     return X86EMUL_OKAY;
643 }
644 
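/* x86_emulate() hook for OUTS; the mirror image of rep_ins(). */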
645 static int rep_outs(enum x86_segment seg, unsigned long offset,
646                     uint16_t port,
647                     unsigned int bytes_per_rep, unsigned long *reps,
648                     struct x86_emulate_ctxt *ctxt)
649 {
650     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
651     struct vcpu *curr = current;
652     struct domain *currd = current->domain;
653     unsigned long goal = *reps;
654     struct segment_register sreg;
655     int rc;
656 
657     *reps = 0;
658 
659     if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
660         return X86EMUL_UNHANDLEABLE;
661 
662     rc = read_segment(seg, &sreg, ctxt);
663     if ( rc != X86EMUL_OKAY )
664         return rc;
665 
666     if ( !sreg.p )
667         return X86EMUL_UNHANDLEABLE;
668     if ( !sreg.s ||
669          ((sreg.type & (_SEGMENT_CODE >> 8)) &&
670           !(sreg.type & (_SEGMENT_WR >> 8))) )
671     {
672         x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
673                                                 : TRAP_stack_error,
674                               0, ctxt);
675         return X86EMUL_EXCEPTION;
676     }
677 
678     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
679 
680     while ( *reps < goal )
681     {
682         unsigned int data = 0;
683         unsigned long addr;
684 
685         rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
686                                     sreg.limit, seg, ctxt, &addr);
687         if ( rc != X86EMUL_OKAY )
688             return rc;
689 
690         if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
691         {
692             x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
693             return X86EMUL_EXCEPTION;
694         }
695 
696         guest_io_write(port, bytes_per_rep, data, currd);
697 
698         ++*reps;
699 
700         if ( poc->bpmatch || hypercall_preempt_check() )
701             break;
702 
703         /* x86_emulate() clips the repetition count to ensure we don't wrap. */
704         if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
705             offset -= bytes_per_rep;
706         else
707             offset += bytes_per_rep;
708     }
709 
710     return X86EMUL_OKAY;
711 }
712 
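/*
 * MOV from CR emulation.  CR0/CR2/CR4 come from the vcpu's shadowed control
 * registers; CR3 is translated back into a guest-visible frame number.
 */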
713 static int read_cr(unsigned int reg, unsigned long *val,
714                    struct x86_emulate_ctxt *ctxt)
715 {
716     const struct vcpu *curr = current;
717 
718     switch ( reg )
719     {
720     case 0: /* Read CR0 */
721         *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv.ctrlreg[0];
722         return X86EMUL_OKAY;
723 
724     case 2: /* Read CR2 */
725     case 4: /* Read CR4 */
726         *val = curr->arch.pv.ctrlreg[reg];
727         return X86EMUL_OKAY;
728 
729     case 3: /* Read CR3 */
730     {
731         const struct domain *currd = curr->domain;
732         mfn_t mfn;
733 
734         if ( !is_pv_32bit_domain(currd) )
735         {
736             mfn = pagetable_get_mfn(curr->arch.guest_table);
737             *val = xen_pfn_to_cr3(gfn_x(mfn_to_gfn(currd, mfn)));
738         }
739         else
740         {
741             l4_pgentry_t *pl4e =
742                 map_domain_page(pagetable_get_mfn(curr->arch.guest_table));
743 
744             mfn = l4e_get_mfn(*pl4e);
745             unmap_domain_page(pl4e);
746             *val = compat_pfn_to_cr3(gfn_x(mfn_to_gfn(currd, mfn)));
747         }
748 
749         return X86EMUL_OKAY;
750     }
751     }
752 
753     return X86EMUL_UNHANDLEABLE;
754 }
755 
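/*
 * MOV to CR emulation.  Only CR0.TS may be toggled, CR2 writes update the
 * shadow value, CR3 writes switch to a new guest top-level page table, and
 * CR4 writes are filtered through pv_fixup_guest_cr4().
 */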
756 static int write_cr(unsigned int reg, unsigned long val,
757                     struct x86_emulate_ctxt *ctxt)
758 {
759     struct vcpu *curr = current;
760 
761     switch ( reg )
762     {
763     case 0: /* Write CR0 */
764         if ( (val ^ read_cr0()) & ~X86_CR0_TS )
765         {
766             gdprintk(XENLOG_WARNING,
767                      "Attempt to change unmodifiable CR0 flags\n");
768             break;
769         }
770         do_fpu_taskswitch(!!(val & X86_CR0_TS));
771         return X86EMUL_OKAY;
772 
773     case 2: /* Write CR2 */
774         curr->arch.pv.ctrlreg[2] = val;
775         arch_set_cr2(curr, val);
776         return X86EMUL_OKAY;
777 
778     case 3: /* Write CR3 */
779     {
780         struct domain *currd = curr->domain;
781         unsigned long gfn;
782         struct page_info *page;
783         int rc;
784 
785         gfn = !is_pv_32bit_domain(currd)
786               ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
787         page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
788         if ( !page )
789             break;
790         rc = new_guest_cr3(page_to_mfn(page));
791         put_page(page);
792 
793         switch ( rc )
794         {
795         case 0:
796             return X86EMUL_OKAY;
797         case -ERESTART: /* retry after preemption */
798             return X86EMUL_RETRY;
799         }
800         break;
801     }
802 
803     case 4: /* Write CR4 */
804         curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
805         write_cr4(pv_make_cr4(curr));
806         ctxt_switch_levelling(curr);
807         return X86EMUL_OKAY;
808     }
809 
810     return X86EMUL_UNHANDLEABLE;
811 }
812 
813 static inline uint64_t guest_misc_enable(uint64_t val)
814 {
815     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
816              MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
817     val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
818            MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
819            MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
820     return val;
821 }
822 
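/*
 * RDMSR emulation.  MSRs handled by the common guest_rdmsr() path take
 * precedence; a number of MSRs are virtualised here, and anything left
 * falls through to vmce_rdmsr() and finally a plain rdmsr_safe().
 */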
823 static int read_msr(unsigned int reg, uint64_t *val,
824                     struct x86_emulate_ctxt *ctxt)
825 {
826     struct vcpu *curr = current;
827     const struct domain *currd = curr->domain;
828     bool vpmu_msr = false;
829     int ret;
830 
831     if ( (ret = guest_rdmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
832     {
833         if ( ret == X86EMUL_EXCEPTION )
834             x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
835 
836         return ret;
837     }
838 
839     switch ( reg )
840     {
841         int rc;
842 
843     case MSR_FS_BASE:
844         if ( is_pv_32bit_domain(currd) )
845             break;
846         *val = rdfsbase();
847         return X86EMUL_OKAY;
848 
849     case MSR_GS_BASE:
850         if ( is_pv_32bit_domain(currd) )
851             break;
852         *val = rdgsbase();
853         return X86EMUL_OKAY;
854 
855     case MSR_SHADOW_GS_BASE:
856         if ( is_pv_32bit_domain(currd) )
857             break;
858         *val = curr->arch.pv.gs_base_user;
859         return X86EMUL_OKAY;
860 
861     case MSR_IA32_TSC:
862         *val = currd->arch.vtsc ? pv_soft_rdtsc(curr, ctxt->regs) : rdtsc();
863         return X86EMUL_OKAY;
864 
865     case MSR_EFER:
866         /* Hide unknown bits, and unconditionally hide SVME from guests. */
867         *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
868         /*
869          * Hide the 64-bit features from 32-bit guests.  SCE has
870          * vendor-dependent behaviour.
871          */
872         if ( is_pv_32bit_domain(currd) )
873             *val &= ~(EFER_LME | EFER_LMA |
874                       (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
875                        ? EFER_SCE : 0));
876         return X86EMUL_OKAY;
877 
878     case MSR_K7_FID_VID_CTL:
879     case MSR_K7_FID_VID_STATUS:
880     case MSR_K8_PSTATE_LIMIT:
881     case MSR_K8_PSTATE_CTRL:
882     case MSR_K8_PSTATE_STATUS:
883     case MSR_K8_PSTATE0:
884     case MSR_K8_PSTATE1:
885     case MSR_K8_PSTATE2:
886     case MSR_K8_PSTATE3:
887     case MSR_K8_PSTATE4:
888     case MSR_K8_PSTATE5:
889     case MSR_K8_PSTATE6:
890     case MSR_K8_PSTATE7:
891         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
892             break;
893         if ( unlikely(is_cpufreq_controller(currd)) )
894             goto normal;
895         *val = 0;
896         return X86EMUL_OKAY;
897 
898     case MSR_FAM10H_MMIO_CONF_BASE:
899         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
900              boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 >= 0x17 )
901             break;
902         /* fall through */
903     case MSR_AMD64_NB_CFG:
904         if ( is_hwdom_pinned_vcpu(curr) )
905             goto normal;
906         *val = 0;
907         return X86EMUL_OKAY;
908 
909     case MSR_IA32_MISC_ENABLE:
910         if ( rdmsr_safe(reg, *val) )
911             break;
912         *val = guest_misc_enable(*val);
913         return X86EMUL_OKAY;
914 
915     case MSR_IA32_PERF_CAPABILITIES:
916         /* No extra capabilities are supported. */
917         *val = 0;
918         return X86EMUL_OKAY;
919 
920     case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
921     case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
922     case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
923     case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
924         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
925         {
926             vpmu_msr = true;
927             /* fall through */
928     case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
929     case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
930             if ( vpmu_msr || (boot_cpu_data.x86_vendor &
931                               (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
932             {
933                 if ( vpmu_do_rdmsr(reg, val) )
934                     break;
935                 return X86EMUL_OKAY;
936             }
937         }
938         /* fall through */
939     default:
940         rc = vmce_rdmsr(reg, val);
941         if ( rc < 0 )
942             break;
943         if ( rc )
944             return X86EMUL_OKAY;
945         /* fall through */
946     normal:
947         /* Everyone can read the MSR space. */
948         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
949         if ( rdmsr_safe(reg, *val) )
950             break;
951         return X86EMUL_OKAY;
952     }
953 
954     return X86EMUL_UNHANDLEABLE;
955 }
956 
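/*
 * WRMSR emulation.  MSRs handled by the common guest_wrmsr() path take
 * precedence; a limited set may be written by suitably privileged domains,
 * while other writes are discarded (with a warning if the value differs
 * from the current hardware contents).
 */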
957 static int write_msr(unsigned int reg, uint64_t val,
958                      struct x86_emulate_ctxt *ctxt)
959 {
960     struct vcpu *curr = current;
961     const struct domain *currd = curr->domain;
962     bool vpmu_msr = false;
963     int ret;
964 
965     if ( (ret = guest_wrmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
966     {
967         if ( ret == X86EMUL_EXCEPTION )
968             x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
969 
970         return ret;
971     }
972 
973     switch ( reg )
974     {
975         uint64_t temp;
976         int rc;
977 
978     case MSR_FS_BASE:
979         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
980             break;
981         wrfsbase(val);
982         return X86EMUL_OKAY;
983 
984     case MSR_GS_BASE:
985         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
986             break;
987         wrgsbase(val);
988         return X86EMUL_OKAY;
989 
990     case MSR_SHADOW_GS_BASE:
991         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
992             break;
993         wrgsshadow(val);
994         curr->arch.pv.gs_base_user = val;
995         return X86EMUL_OKAY;
996 
997     case MSR_K7_FID_VID_STATUS:
998     case MSR_K7_FID_VID_CTL:
999     case MSR_K8_PSTATE_LIMIT:
1000     case MSR_K8_PSTATE_CTRL:
1001     case MSR_K8_PSTATE_STATUS:
1002     case MSR_K8_PSTATE0:
1003     case MSR_K8_PSTATE1:
1004     case MSR_K8_PSTATE2:
1005     case MSR_K8_PSTATE3:
1006     case MSR_K8_PSTATE4:
1007     case MSR_K8_PSTATE5:
1008     case MSR_K8_PSTATE6:
1009     case MSR_K8_PSTATE7:
1010     case MSR_K8_HWCR:
1011         if ( !(boot_cpu_data.x86_vendor &
1012                (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
1013             break;
1014         if ( likely(!is_cpufreq_controller(currd)) ||
1015              wrmsr_safe(reg, val) == 0 )
1016             return X86EMUL_OKAY;
1017         break;
1018 
1019     case MSR_AMD64_NB_CFG:
1020         if ( !is_hwdom_pinned_vcpu(curr) )
1021             return X86EMUL_OKAY;
1022         if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
1023              ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
1024             goto invalid;
1025         if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
1026             return X86EMUL_OKAY;
1027         break;
1028 
1029     case MSR_FAM10H_MMIO_CONF_BASE:
1030         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
1031              boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 >= 0x17 )
1032             break;
1033         if ( !is_hwdom_pinned_vcpu(curr) )
1034             return X86EMUL_OKAY;
1035         if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
1036             break;
1037         if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
1038              temp != val :
1039              ((temp ^ val) &
1040               ~(FAM10H_MMIO_CONF_ENABLE |
1041                 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
1042                  FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
1043                 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
1044                  FAM10H_MMIO_CONF_BASE_SHIFT))) )
1045             goto invalid;
1046         if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
1047             return X86EMUL_OKAY;
1048         break;
1049 
1050     case MSR_IA32_MISC_ENABLE:
1051         if ( rdmsr_safe(reg, temp) )
1052             break;
1053         if ( val != guest_misc_enable(temp) )
1054             goto invalid;
1055         return X86EMUL_OKAY;
1056 
1057     case MSR_IA32_MPERF:
1058     case MSR_IA32_APERF:
1059         if ( !(boot_cpu_data.x86_vendor &
1060                (X86_VENDOR_INTEL | X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
1061             break;
1062         if ( likely(!is_cpufreq_controller(currd)) ||
1063              wrmsr_safe(reg, val) == 0 )
1064             return X86EMUL_OKAY;
1065         break;
1066 
1067     case MSR_IA32_THERM_CONTROL:
1068     case MSR_IA32_ENERGY_PERF_BIAS:
1069         if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1070             break;
1071         if ( !is_hwdom_pinned_vcpu(curr) || wrmsr_safe(reg, val) == 0 )
1072             return X86EMUL_OKAY;
1073         break;
1074 
1075     case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
1076     case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
1077     case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
1078     case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
1079         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
1080         {
1081             vpmu_msr = true;
1082     case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
1083     case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
1084             if ( vpmu_msr || (boot_cpu_data.x86_vendor &
1085                               (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
1086             {
1087                 if ( (vpmu_mode & XENPMU_MODE_ALL) &&
1088                      !is_hardware_domain(currd) )
1089                     return X86EMUL_OKAY;
1090 
1091                 if ( vpmu_do_wrmsr(reg, val, 0) )
1092                     break;
1093                 return X86EMUL_OKAY;
1094             }
1095         }
1096         /* fall through */
1097     default:
1098         rc = vmce_wrmsr(reg, val);
1099         if ( rc < 0 )
1100             break;
1101         if ( rc )
1102             return X86EMUL_OKAY;
1103 
1104         if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
1105     invalid:
1106             gdprintk(XENLOG_WARNING,
1107                      "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
1108                      reg, temp, val);
1109         return X86EMUL_OKAY;
1110     }
1111 
1112     return X86EMUL_UNHANDLEABLE;
1113 }
1114 
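/*
 * WBINVD/WBNOINVD emulation: only domains permitted to flush the cache
 * (e.g. ones with physical device access) get a real flush; for everyone
 * else the instruction is ignored.
 */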
1115 static int cache_op(enum x86emul_cache_op op, enum x86_segment seg,
1116                     unsigned long offset, struct x86_emulate_ctxt *ctxt)
1117 {
1118     ASSERT(op == x86emul_wbinvd || op == x86emul_wbnoinvd);
1119 
1120     /* Ignore the instruction if unprivileged. */
1121     if ( !cache_flush_permitted(current->domain) )
1122         /*
1123          * Non-physdev domain attempted WBINVD; ignore for now since
1124          * newer linux uses this in some start-of-day timing loops.
1125          */
1126         ;
1127     else if ( op == x86emul_wbnoinvd /* && cpu_has_wbnoinvd */ )
1128         wbnoinvd();
1129     else
1130         wbinvd();
1131 
1132     return X86EMUL_OKAY;
1133 }
1134 
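/*
 * Tell x86_emulate() which instructions we intend to handle: I/O insns,
 * CLTS, WBINVD, CR/DR moves, RDMSR/WRMSR, RDTSC(P), CPUID, XSETBV, and
 * (only to consume them) CLI/STI.
 */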
1135 static int validate(const struct x86_emulate_state *state,
1136                     struct x86_emulate_ctxt *ctxt)
1137 {
1138     switch ( ctxt->opcode )
1139     {
1140     case 0x6c ... 0x6f: /* ins / outs */
1141     case 0xe4 ... 0xe7: /* in / out (immediate port) */
1142     case 0xec ... 0xef: /* in / out (port in %dx) */
1143     case X86EMUL_OPC(0x0f, 0x06): /* clts */
1144     case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
1145     case X86EMUL_OPC(0x0f, 0x20) ...
1146          X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
1147     case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
1148     case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
1149     case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
1150     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
1151         return X86EMUL_OKAY;
1152 
1153     case 0xfa: case 0xfb: /* cli / sti */
1154         if ( !iopl_ok(current, ctxt->regs) )
1155             break;
1156         /*
1157          * This is just too dangerous to allow, in my opinion. Consider if the
1158          * caller then tries to reenable interrupts using POPF: we can't trap
1159          * that and we'll end up with hard-to-debug lockups. Fast & loose will
1160          * do for us. :-)
1161         vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
1162          */
1163         return X86EMUL_DONE;
1164 
1165     case X86EMUL_OPC(0x0f, 0x01):
1166     {
1167         unsigned int modrm_rm, modrm_reg;
1168 
1169         if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
1170              (modrm_rm & 7) != 1 )
1171             break;
1172         switch ( modrm_reg & 7 )
1173         {
1174         case 2: /* xsetbv */
1175         case 7: /* rdtscp */
1176             return X86EMUL_OKAY;
1177         }
1178         break;
1179     }
1180     }
1181 
1182     return X86EMUL_UNHANDLEABLE;
1183 }
1184 
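/*
 * Instruction bytes are fetched using the CS base/limit captured at entry;
 * faults during the fetch are reflected back to the guest as #PF.
 */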
1185 static int insn_fetch(enum x86_segment seg,
1186                       unsigned long offset,
1187                       void *p_data,
1188                       unsigned int bytes,
1189                       struct x86_emulate_ctxt *ctxt)
1190 {
1191     const struct priv_op_ctxt *poc =
1192         container_of(ctxt, struct priv_op_ctxt, ctxt);
1193     unsigned int rc;
1194     unsigned long addr = poc->cs.base + offset;
1195 
1196     ASSERT(seg == x86_seg_cs);
1197 
1198     /* We don't mean to emulate any branches. */
1199     if ( !bytes )
1200         return X86EMUL_UNHANDLEABLE;
1201 
1202     rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
1203                                 x86_seg_cs, ctxt, &addr);
1204     if ( rc != X86EMUL_OKAY )
1205         return rc;
1206 
1207     if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
1208     {
1209         /*
1210          * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
1211          * cpu_has_nx, but we'd then need a "fetch" variant of
1212          * __copy_from_user() respecting NX, SMEP, and protection keys.
1213          */
1214         x86_emul_pagefault(0, addr + bytes - rc, ctxt);
1215         return X86EMUL_EXCEPTION;
1216     }
1217 
1218     return X86EMUL_OKAY;
1219 }
1220 
1221 
1222 static const struct x86_emulate_ops priv_op_ops = {
1223     .insn_fetch          = insn_fetch,
1224     .read                = x86emul_unhandleable_rw,
1225     .validate            = validate,
1226     .read_io             = read_io,
1227     .write_io            = write_io,
1228     .rep_ins             = rep_ins,
1229     .rep_outs            = rep_outs,
1230     .read_segment        = read_segment,
1231     .read_cr             = read_cr,
1232     .write_cr            = write_cr,
1233     .read_dr             = x86emul_read_dr,
1234     .write_dr            = x86emul_write_dr,
1235     .write_xcr           = x86emul_write_xcr,
1236     .read_msr            = read_msr,
1237     .write_msr           = write_msr,
1238     .cpuid               = x86emul_cpuid,
1239     .cache_op            = cache_op,
1240 };
1241 
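/*
 * Top-level entry point: decode and emulate a single privileged instruction
 * on behalf of a PV guest.  The virtualised interrupt flag and IOPL are
 * mirrored into the register image around the call to x86_emulate(), and
 * any resulting exception or breakpoint state is reflected back afterwards.
 */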
1242 int pv_emulate_privileged_op(struct cpu_user_regs *regs)
1243 {
1244     struct vcpu *curr = current;
1245     struct domain *currd = curr->domain;
1246     struct priv_op_ctxt ctxt = {
1247         .ctxt.regs = regs,
1248         .ctxt.cpuid = currd->arch.cpuid,
1249         .ctxt.lma = !is_pv_32bit_domain(currd),
1250     };
1251     int rc;
1252     unsigned int eflags, ar;
1253 
1254     if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base,
1255                                   &ctxt.cs.limit, &ar, 1) ||
1256          !(ar & _SEGMENT_S) ||
1257          !(ar & _SEGMENT_P) ||
1258          !(ar & _SEGMENT_CODE) )
1259         return 0;
1260 
1261     /* Mirror virtualized state into EFLAGS. */
1262     ASSERT(regs->eflags & X86_EFLAGS_IF);
1263     if ( vcpu_info(curr, evtchn_upcall_mask) )
1264         regs->eflags &= ~X86_EFLAGS_IF;
1265     else
1266         regs->eflags |= X86_EFLAGS_IF;
1267     ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
1268     regs->eflags |= curr->arch.pv.iopl;
1269     eflags = regs->eflags;
1270 
1271     ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
1272     /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
1273     rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
1274 
1275     if ( ctxt.io_emul_stub )
1276         unmap_domain_page(ctxt.io_emul_stub);
1277 
1278     /*
1279      * Un-mirror virtualized state from EFLAGS.
1280      * Nothing we allow to be emulated can change anything other than the
1281      * arithmetic bits, and the resume flag.
1282      */
1283     ASSERT(!((regs->eflags ^ eflags) &
1284              ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
1285     regs->eflags |= X86_EFLAGS_IF;
1286     regs->eflags &= ~X86_EFLAGS_IOPL;
1287 
1288     switch ( rc )
1289     {
1290     case X86EMUL_OKAY:
1291         if ( ctxt.ctxt.retire.singlestep )
1292             ctxt.bpmatch |= DR_STEP;
1293         if ( ctxt.bpmatch )
1294         {
1295             curr->arch.dr6 |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
1296             if ( !(curr->arch.pv.trap_bounce.flags & TBF_EXCEPTION) )
1297                 pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1298         }
1299         /* fall through */
1300     case X86EMUL_RETRY:
1301         return EXCRET_fault_fixed;
1302 
1303     case X86EMUL_EXCEPTION:
1304         pv_inject_event(&ctxt.ctxt.event);
1305         return EXCRET_fault_fixed;
1306     }
1307 
1308     return 0;
1309 }
1310 
1311 /*
1312  * Local variables:
1313  * mode: C
1314  * c-file-style: "BSD"
1315  * c-basic-offset: 4
1316  * tab-width: 4
1317  * indent-tabs-mode: nil
1318  * End:
1319  */
1320