1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; If not, see <http://www.gnu.org/licenses/>.
15  *
16  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
17  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
18  * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
19  */
20 
21 #include <xen/irq.h>
22 #include <xen/sched.h>
23 #include <xen/xmalloc.h>
24 #include <xen/domain_page.h>
25 #include <xen/iocap.h>
26 #include <xen/iommu.h>
27 #include <xen/numa.h>
28 #include <xen/softirq.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include <asm/nops.h>
35 #include <asm/irq.h>
36 #include <asm/hvm/vmx/vmx.h>
37 #include <asm/p2m.h>
38 #include <mach_apic.h>
39 #include "iommu.h"
40 #include "dmar.h"
41 #include "extern.h"
42 #include "vtd.h"
43 #include "../ats.h"
44 
45 struct mapped_rmrr {
46     struct list_head list;
47     u64 base, end;
48     unsigned int count;
49 };
50 
51 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
52 bool __read_mostly untrusted_msi;
53 
54 bool __read_mostly iommu_igfx = true;
55 bool __read_mostly iommu_qinval = true;
56 #ifndef iommu_snoop
57 bool __read_mostly iommu_snoop = true;
58 #endif
59 
60 int nr_iommus;
61 
62 static struct tasklet vtd_fault_tasklet;
63 
64 static int setup_hwdom_device(u8 devfn, struct pci_dev *);
65 static void setup_hwdom_rmrr(struct domain *d);
66 
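/*
 * Look up the IOMMU-local domain id already allocated for @d on @iommu by
 * scanning the allocated slots of domid_bitmap/domid_map.  Returns the
 * slot index, or -1 if the domain has no id on this IOMMU yet.
 */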
67 static int domain_iommu_domid(struct domain *d,
68                               struct vtd_iommu *iommu)
69 {
70     unsigned long nr_dom, i;
71 
72     nr_dom = cap_ndoms(iommu->cap);
73     i = find_first_bit(iommu->domid_bitmap, nr_dom);
74     while ( i < nr_dom )
75     {
76         if ( iommu->domid_map[i] == d->domain_id )
77             return i;
78 
79         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
80     }
81 
82     dprintk(XENLOG_ERR VTDPREFIX,
83             "Cannot get valid iommu domid: domid=%d iommu->index=%d\n",
84             d->domain_id, iommu->index);
85     return -1;
86 }
87 
88 #define DID_FIELD_WIDTH 16
89 #define DID_HIGH_OFFSET 8
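/*
 * Find (or allocate) the IOMMU-local domain id for @d and store it in the
 * DID field of the context entry, i.e. shifted by DID_HIGH_OFFSET into
 * context->hi (for example, slot 5 becomes 5 << DID_HIGH_OFFSET).  The
 * caller must hold iommu->lock.
 */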
90 static int context_set_domain_id(struct context_entry *context,
91                                  struct domain *d,
92                                  struct vtd_iommu *iommu)
93 {
94     unsigned long nr_dom, i;
95     int found = 0;
96 
97     ASSERT(spin_is_locked(&iommu->lock));
98 
99     nr_dom = cap_ndoms(iommu->cap);
100     i = find_first_bit(iommu->domid_bitmap, nr_dom);
101     while ( i < nr_dom )
102     {
103         if ( iommu->domid_map[i] == d->domain_id )
104         {
105             found = 1;
106             break;
107         }
108         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
109     }
110 
111     if ( found == 0 )
112     {
113         i = find_first_zero_bit(iommu->domid_bitmap, nr_dom);
114         if ( i >= nr_dom )
115         {
116             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
117             return -EFAULT;
118         }
119         iommu->domid_map[i] = d->domain_id;
120     }
121 
122     set_bit(i, iommu->domid_bitmap);
123     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
124     return 0;
125 }
126 
127 static int context_get_domain_id(struct context_entry *context,
128                                  struct vtd_iommu *iommu)
129 {
130     unsigned long dom_index, nr_dom;
131     int domid = -1;
132 
133     if (iommu && context)
134     {
135         nr_dom = cap_ndoms(iommu->cap);
136 
137         dom_index = context_domain_id(*context);
138 
139         if ( dom_index < nr_dom && iommu->domid_map )
140             domid = iommu->domid_map[dom_index];
141         else
142             dprintk(XENLOG_DEBUG VTDPREFIX,
143                     "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
144                     dom_index, nr_dom);
145     }
146     return domid;
147 }
148 
149 static int iommus_incoherent;
150 
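/*
 * Flush the cache lines covering [addr, addr + size) so that IOMMUs which
 * are not cache coherent (iommus_incoherent) observe up-to-date table
 * entries.  The alternatives below pick CLWB or CLFLUSHOPT when the CPU
 * supports them, falling back to plain CLFLUSH otherwise.
 */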
151 static void sync_cache(const void *addr, unsigned int size)
152 {
153     static unsigned long clflush_size = 0;
154     const void *end = addr + size;
155 
156     if ( !iommus_incoherent )
157         return;
158 
159     if ( clflush_size == 0 )
160         clflush_size = get_cache_line_size();
161 
162     addr -= (unsigned long)addr & (clflush_size - 1);
163     for ( ; addr < end; addr += clflush_size )
164 /*
165  * The arguments to a macro must not include preprocessor directives. Doing so
166  * results in undefined behavior, so we have to create some defines here in
167  * order to avoid it.
168  */
169 #if defined(HAVE_AS_CLWB)
170 # define CLWB_ENCODING "clwb %[p]"
171 #elif defined(HAVE_AS_XSAVEOPT)
172 # define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
173 #else
174 # define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
175 #endif
176 
177 #define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
178 #if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
179 # define INPUT BASE_INPUT
180 #else
181 # define INPUT(addr) "a" (addr), BASE_INPUT(addr)
182 #endif
183         /*
184          * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
185          * + prefix than a clflush + nop, and hence the prefix is added instead
186          * of letting the alternative framework fill the gap by appending nops.
187          */
188         alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
189                          "data16 clflush %[p]", /* clflushopt */
190                          X86_FEATURE_CLFLUSHOPT,
191                          CLWB_ENCODING,
192                          X86_FEATURE_CLWB, /* no outputs */,
193                          INPUT(addr));
194 #undef INPUT
195 #undef BASE_INPUT
196 #undef CLWB_ENCODING
197 
198     alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
199                       "sfence", X86_FEATURE_CLWB);
200 }
201 
202 /* Allocate page table, return its machine address */
203 uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
204 {
205     struct page_info *pg, *cur_pg;
206     u64 *vaddr;
207     unsigned int i;
208 
209     pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
210                              (node == NUMA_NO_NODE) ? 0 : MEMF_node(node));
211     if ( !pg )
212         return 0;
213 
214     cur_pg = pg;
215     for ( i = 0; i < npages; i++ )
216     {
217         vaddr = __map_domain_page(cur_pg);
218         memset(vaddr, 0, PAGE_SIZE);
219 
220         sync_cache(vaddr, PAGE_SIZE);
221         unmap_domain_page(vaddr);
222         cur_pg++;
223     }
224 
225     return page_to_maddr(pg);
226 }
227 
228 void free_pgtable_maddr(u64 maddr)
229 {
230     if ( maddr != 0 )
231         free_domheap_page(maddr_to_page(maddr));
232 }
233 
234 /* context entry handling */
235 static u64 bus_to_context_maddr(struct vtd_iommu *iommu, u8 bus)
236 {
237     struct root_entry *root, *root_entries;
238     u64 maddr;
239 
240     ASSERT(spin_is_locked(&iommu->lock));
241     root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
242     root = &root_entries[bus];
243     if ( !root_present(*root) )
244     {
245         maddr = alloc_pgtable_maddr(1, iommu->node);
246         if ( maddr == 0 )
247         {
248             unmap_vtd_domain_page(root_entries);
249             return 0;
250         }
251         set_root_value(*root, maddr);
252         set_root_present(*root);
253         iommu_sync_cache(root, sizeof(struct root_entry));
254     }
255     maddr = (u64) get_context_addr(*root);
256     unmap_vtd_domain_page(root_entries);
257     return maddr;
258 }
259 
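/*
 * Walk the domain's IOMMU page tables and return the machine address of
 * the leaf (level 1) table covering @addr, allocating any missing
 * intermediate tables when @alloc is set.  Returns 0 on failure.  The
 * caller must hold hd->arch.mapping_lock.
 */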
260 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
261 {
262     struct domain_iommu *hd = dom_iommu(domain);
263     int addr_width = agaw_to_width(hd->arch.agaw);
264     struct dma_pte *parent, *pte = NULL;
265     int level = agaw_to_level(hd->arch.agaw);
266     int offset;
267     u64 pte_maddr = 0;
268 
269     addr &= (((u64)1) << addr_width) - 1;
270     ASSERT(spin_is_locked(&hd->arch.mapping_lock));
271     if ( !hd->arch.pgd_maddr &&
272          (!alloc ||
273           ((hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node)) == 0)) )
274         goto out;
275 
276     parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.pgd_maddr);
277     while ( level > 1 )
278     {
279         offset = address_level_offset(addr, level);
280         pte = &parent[offset];
281 
282         pte_maddr = dma_pte_addr(*pte);
283         if ( !pte_maddr )
284         {
285             if ( !alloc )
286                 break;
287 
288             pte_maddr = alloc_pgtable_maddr(1, hd->node);
289             if ( !pte_maddr )
290                 break;
291 
292             dma_set_pte_addr(*pte, pte_maddr);
293 
294             /*
295              * Intermediate level entries always have r/w set; the last
296              * level page table entry controls the actual read/write.
297              */
298             dma_set_pte_readable(*pte);
299             dma_set_pte_writable(*pte);
300             iommu_sync_cache(pte, sizeof(struct dma_pte));
301         }
302 
303         if ( level == 2 )
304             break;
305 
306         unmap_vtd_domain_page(parent);
307         parent = map_vtd_domain_page(pte_maddr);
308         level--;
309     }
310 
311     unmap_vtd_domain_page(parent);
312  out:
313     return pte_maddr;
314 }
315 
316 static void iommu_flush_write_buffer(struct vtd_iommu *iommu)
317 {
318     u32 val;
319     unsigned long flags;
320 
321     if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
322         return;
323 
324     spin_lock_irqsave(&iommu->register_lock, flags);
325     val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
326     dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
327 
328     /* Make sure hardware completes it */
329     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
330                   !(val & DMA_GSTS_WBFS), val);
331 
332     spin_unlock_irqrestore(&iommu->register_lock, flags);
333 }
334 
335 /* return value determines if we need a write buffer flush */
336 static int __must_check flush_context_reg(struct vtd_iommu *iommu, u16 did,
337                                           u16 source_id, u8 function_mask,
338                                           u64 type,
339                                           bool flush_non_present_entry)
340 {
341     u64 val = 0;
342     unsigned long flags;
343 
344     /*
345      * In the non-present entry flush case: if the hardware doesn't cache
346      * non-present entries there is nothing to do; if it does, flush the
347      * entries of domain 0 (domain id 0 is used to cache any non-present
348      * entries).
349      */
350     if ( flush_non_present_entry )
351     {
352         if ( !cap_caching_mode(iommu->cap) )
353             return 1;
354         else
355             did = 0;
356     }
357 
358     /* use register invalidation */
359     switch ( type )
360     {
361     case DMA_CCMD_GLOBAL_INVL:
362         val = DMA_CCMD_GLOBAL_INVL;
363         break;
364     case DMA_CCMD_DOMAIN_INVL:
365         val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
366         break;
367     case DMA_CCMD_DEVICE_INVL:
368         val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
369             |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
370         break;
371     default:
372         BUG();
373     }
374     val |= DMA_CCMD_ICC;
375 
376     spin_lock_irqsave(&iommu->register_lock, flags);
377     dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
378 
379     /* Make sure hardware completes it */
380     IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
381                   !(val & DMA_CCMD_ICC), val);
382 
383     spin_unlock_irqrestore(&iommu->register_lock, flags);
384     /* flush context entry will implicitly flush write buffer */
385     return 0;
386 }
387 
388 static int __must_check iommu_flush_context_global(struct vtd_iommu *iommu,
389                                                    bool flush_non_present_entry)
390 {
391     return iommu->flush.context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
392                                 flush_non_present_entry);
393 }
394 
395 static int __must_check iommu_flush_context_device(struct vtd_iommu *iommu,
396                                                    u16 did, u16 source_id,
397                                                    u8 function_mask,
398                                                    bool flush_non_present_entry)
399 {
400     return iommu->flush.context(iommu, did, source_id, function_mask,
401                                 DMA_CCMD_DEVICE_INVL, flush_non_present_entry);
402 }
403 
404 /* return value determines if we need a write buffer flush */
405 static int __must_check flush_iotlb_reg(struct vtd_iommu *iommu, u16 did,
406                                         u64 addr,
407                                         unsigned int size_order, u64 type,
408                                         bool flush_non_present_entry,
409                                         bool flush_dev_iotlb)
410 {
411     int tlb_offset = ecap_iotlb_offset(iommu->ecap);
412     u64 val = 0;
413     unsigned long flags;
414 
415     /*
416      * In the non-present entry flush case: if the hardware doesn't cache
417      * non-present entries there is nothing to do; if it does, flush the
418      * entries of domain 0 (domain id 0 is used to cache any non-present
419      * entries).
420      */
421     if ( flush_non_present_entry )
422     {
423         if ( !cap_caching_mode(iommu->cap) )
424             return 1;
425         else
426             did = 0;
427     }
428 
429     /* use register invalidation */
430     switch ( type )
431     {
432     case DMA_TLB_GLOBAL_FLUSH:
433         val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
434         break;
435     case DMA_TLB_DSI_FLUSH:
436         val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
437         break;
438     case DMA_TLB_PSI_FLUSH:
439         val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
440         break;
441     default:
442         BUG();
443     }
444     /* Note: set drain read/write */
445     if ( cap_read_drain(iommu->cap) )
446         val |= DMA_TLB_READ_DRAIN;
447     if ( cap_write_drain(iommu->cap) )
448         val |= DMA_TLB_WRITE_DRAIN;
449 
450     spin_lock_irqsave(&iommu->register_lock, flags);
451     /* Note: Only uses first TLB reg currently */
452     if ( type == DMA_TLB_PSI_FLUSH )
453     {
454         /* Note: always flush non-leaf currently. */
455         dmar_writeq(iommu->reg, tlb_offset, size_order | addr);
456     }
457     dmar_writeq(iommu->reg, tlb_offset + 8, val);
458 
459     /* Make sure hardware completes it */
460     IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
461                   !(val & DMA_TLB_IVT), val);
462     spin_unlock_irqrestore(&iommu->register_lock, flags);
463 
464     /* check IOTLB invalidation granularity */
465     if ( DMA_TLB_IAIG(val) == 0 )
466         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
467 
468     /* flush iotlb entry will implicitly flush write buffer */
469     return 0;
470 }
471 
472 static int __must_check iommu_flush_iotlb_global(struct vtd_iommu *iommu,
473                                                  bool flush_non_present_entry,
474                                                  bool flush_dev_iotlb)
475 {
476     int status;
477 
478     /* apply platform specific errata workarounds */
479     vtd_ops_preamble_quirk(iommu);
480 
481     status = iommu->flush.iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
482                                 flush_non_present_entry, flush_dev_iotlb);
483 
484     /* undo platform specific errata workarounds */
485     vtd_ops_postamble_quirk(iommu);
486 
487     return status;
488 }
489 
490 static int __must_check iommu_flush_iotlb_dsi(struct vtd_iommu *iommu, u16 did,
491                                               bool_t flush_non_present_entry,
492                                               bool_t flush_dev_iotlb)
493 {
494     int status;
495 
496     /* apply platform specific errata workarounds */
497     vtd_ops_preamble_quirk(iommu);
498 
499     status = iommu->flush.iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
500                                 flush_non_present_entry, flush_dev_iotlb);
501 
502     /* undo platform specific errata workarounds */
503     vtd_ops_postamble_quirk(iommu);
504 
505     return status;
506 }
507 
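/*
 * Page-selective (PSI) IOTLB invalidation.  Falls back to a domain
 * selective flush when the IOMMU lacks PSI support or when @order exceeds
 * the maximum address mask the hardware advertises.
 */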
508 static int __must_check iommu_flush_iotlb_psi(struct vtd_iommu *iommu, u16 did,
509                                               u64 addr, unsigned int order,
510                                               bool_t flush_non_present_entry,
511                                               bool_t flush_dev_iotlb)
512 {
513     int status;
514 
515     ASSERT(!(addr & (~PAGE_MASK_4K)));
516 
517     /* Fallback to domain selective flush if no PSI support */
518     if ( !cap_pgsel_inv(iommu->cap) )
519         return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
520                                      flush_dev_iotlb);
521 
522     /* Fallback to domain selective flush if size is too big */
523     if ( order > cap_max_amask_val(iommu->cap) )
524         return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
525                                      flush_dev_iotlb);
526 
527     addr >>= PAGE_SHIFT_4K + order;
528     addr <<= PAGE_SHIFT_4K + order;
529 
530     /* apply platform specific errata workarounds */
531     vtd_ops_preamble_quirk(iommu);
532 
533     status = iommu->flush.iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH,
534                                 flush_non_present_entry, flush_dev_iotlb);
535 
536     /* undo platform specific errata workarounds */
537     vtd_ops_postamble_quirk(iommu);
538 
539     return status;
540 }
541 
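/*
 * Flush everything: the CPU cache (via flush_all_cache()), then the context
 * cache and IOTLB of every IOMMU.  A positive return from the per-IOMMU
 * flush helpers means a write buffer flush is performed instead.
 */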
542 static int __must_check iommu_flush_all(void)
543 {
544     struct acpi_drhd_unit *drhd;
545     struct vtd_iommu *iommu;
546     bool_t flush_dev_iotlb;
547     int rc = 0;
548 
549     flush_all_cache();
550     for_each_drhd_unit ( drhd )
551     {
552         int context_rc, iotlb_rc;
553 
554         iommu = drhd->iommu;
555         context_rc = iommu_flush_context_global(iommu, 0);
556         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
557         iotlb_rc = iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
558 
559         /*
560          * The current logic for returns:
561          *   - positive  invoke iommu_flush_write_buffer to flush cache.
562          *   - zero      on success.
563          *   - negative  on failure. Continue to flush IOMMU IOTLB on a
564          *               best effort basis.
565          */
566         if ( context_rc > 0 || iotlb_rc > 0 )
567             iommu_flush_write_buffer(iommu);
568         if ( rc >= 0 )
569             rc = context_rc;
570         if ( rc >= 0 )
571             rc = iotlb_rc;
572     }
573 
574     if ( rc > 0 )
575         rc = 0;
576 
577     return rc;
578 }
579 
580 static int __must_check iommu_flush_iotlb(struct domain *d, dfn_t dfn,
581                                           bool_t dma_old_pte_present,
582                                           unsigned int page_count)
583 {
584     struct domain_iommu *hd = dom_iommu(d);
585     struct acpi_drhd_unit *drhd;
586     struct vtd_iommu *iommu;
587     bool_t flush_dev_iotlb;
588     int iommu_domid;
589     int rc = 0;
590 
591     /*
592      * No need for the pcidevs_lock here because we flush
593      * when assigning/deassigning devices.
594      */
595     for_each_drhd_unit ( drhd )
596     {
597         iommu = drhd->iommu;
598 
599         if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
600             continue;
601 
602         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
603         iommu_domid = domain_iommu_domid(d, iommu);
604         if ( iommu_domid == -1 )
605             continue;
606 
607         if ( !page_count || (page_count & (page_count - 1)) ||
608              dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) )
609             rc = iommu_flush_iotlb_dsi(iommu, iommu_domid,
610                                        0, flush_dev_iotlb);
611         else
612             rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
613                                        dfn_to_daddr(dfn),
614                                        get_order_from_pages(page_count),
615                                        !dma_old_pte_present,
616                                        flush_dev_iotlb);
617 
618         if ( rc > 0 )
619         {
620             iommu_flush_write_buffer(iommu);
621             rc = 0;
622         }
623     }
624 
625     return rc;
626 }
627 
628 static int __must_check iommu_flush_iotlb_pages(struct domain *d,
629                                                 dfn_t dfn,
630                                                 unsigned int page_count,
631                                                 unsigned int flush_flags)
632 {
633     ASSERT(page_count && !dfn_eq(dfn, INVALID_DFN));
634     ASSERT(flush_flags);
635 
636     return iommu_flush_iotlb(d, dfn, flush_flags & IOMMU_FLUSHF_modified,
637                              page_count);
638 }
639 
640 static int __must_check iommu_flush_iotlb_all(struct domain *d)
641 {
642     return iommu_flush_iotlb(d, INVALID_DFN, 0, 0);
643 }
644 
645 /* clear one page's page table */
646 static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
647                               unsigned int *flush_flags)
648 {
649     struct domain_iommu *hd = dom_iommu(domain);
650     struct dma_pte *page = NULL, *pte = NULL;
651     u64 pg_maddr;
652 
653     spin_lock(&hd->arch.mapping_lock);
654     /* get last level pte */
655     pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
656     if ( pg_maddr == 0 )
657     {
658         spin_unlock(&hd->arch.mapping_lock);
659         return;
660     }
661 
662     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
663     pte = page + address_level_offset(addr, 1);
664 
665     if ( !dma_pte_present(*pte) )
666     {
667         spin_unlock(&hd->arch.mapping_lock);
668         unmap_vtd_domain_page(page);
669         return;
670     }
671 
672     dma_clear_pte(*pte);
673     *flush_flags |= IOMMU_FLUSHF_modified;
674 
675     spin_unlock(&hd->arch.mapping_lock);
676     iommu_sync_cache(pte, sizeof(struct dma_pte));
677 
678     unmap_vtd_domain_page(page);
679 }
680 
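/*
 * Queue a page table (sub)tree for freeing: the level is stashed in
 * PFN_ORDER() and the page is put on iommu_pt_cleanup_list, to be torn
 * down later (see iommu_free_page_table() below).
 */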
681 static void iommu_free_pagetable(u64 pt_maddr, int level)
682 {
683     struct page_info *pg = maddr_to_page(pt_maddr);
684 
685     if ( pt_maddr == 0 )
686         return;
687 
688     PFN_ORDER(pg) = level;
689     spin_lock(&iommu_pt_cleanup_lock);
690     page_list_add_tail(pg, &iommu_pt_cleanup_list);
691     spin_unlock(&iommu_pt_cleanup_lock);
692 }
693 
694 static void iommu_free_page_table(struct page_info *pg)
695 {
696     unsigned int i, next_level = PFN_ORDER(pg) - 1;
697     u64 pt_maddr = page_to_maddr(pg);
698     struct dma_pte *pt_vaddr, *pte;
699 
700     PFN_ORDER(pg) = 0;
701     pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
702 
703     for ( i = 0; i < PTE_NUM; i++ )
704     {
705         pte = &pt_vaddr[i];
706         if ( !dma_pte_present(*pte) )
707             continue;
708 
709         if ( next_level >= 1 )
710             iommu_free_pagetable(dma_pte_addr(*pte), next_level);
711 
712         dma_clear_pte(*pte);
713         iommu_sync_cache(pte, sizeof(struct dma_pte));
714     }
715 
716     unmap_vtd_domain_page(pt_vaddr);
717     free_pgtable_maddr(pt_maddr);
718 }
719 
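/*
 * Program the root table address: write root_maddr into DMAR_RTADDR_REG
 * and issue a Set Root Table Pointer (SRTP) command, waiting until the
 * hardware signals completion through the RTPS status bit.
 */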
720 static int iommu_set_root_entry(struct vtd_iommu *iommu)
721 {
722     u32 sts;
723     unsigned long flags;
724 
725     spin_lock_irqsave(&iommu->register_lock, flags);
726     dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
727 
728     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
729     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
730 
731     /* Make sure hardware completes it */
732     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
733                   (sts & DMA_GSTS_RTPS), sts);
734     spin_unlock_irqrestore(&iommu->register_lock, flags);
735 
736     return 0;
737 }
738 
739 static void iommu_enable_translation(struct acpi_drhd_unit *drhd)
740 {
741     u32 sts;
742     unsigned long flags;
743     struct vtd_iommu *iommu = drhd->iommu;
744 
745     if ( is_igd_drhd(drhd) )
746     {
747         if ( !iommu_igfx )
748         {
749             printk(XENLOG_INFO VTDPREFIX
750                    "Passed iommu=no-igfx option.  Disabling IGD VT-d engine.\n");
751             return;
752         }
753 
754         if ( !is_igd_vt_enabled_quirk() )
755         {
756             if ( force_iommu )
757                 panic("BIOS did not enable IGD for VT properly, crash Xen for security purpose\n");
758 
759             printk(XENLOG_WARNING VTDPREFIX
760                    "BIOS did not enable IGD for VT properly.  Disabling IGD VT-d engine.\n");
761             return;
762         }
763     }
764 
765     /* apply platform specific errata workarounds */
766     vtd_ops_preamble_quirk(iommu);
767 
768     if ( iommu_verbose )
769         printk(VTDPREFIX "iommu_enable_translation: iommu->reg = %p\n",
770                iommu->reg);
771     spin_lock_irqsave(&iommu->register_lock, flags);
772     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
773     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
774 
775     /* Make sure hardware completes it */
776     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
777                   (sts & DMA_GSTS_TES), sts);
778     spin_unlock_irqrestore(&iommu->register_lock, flags);
779 
780     /* undo platform specific errata workarounds */
781     vtd_ops_postamble_quirk(iommu);
782 
783     /* Disable PMRs when VT-d engine takes effect per spec definition */
784     disable_pmr(iommu);
785 }
786 
787 static void iommu_disable_translation(struct vtd_iommu *iommu)
788 {
789     u32 sts;
790     unsigned long flags;
791 
792     /* apply platform specific errata workarounds */
793     vtd_ops_preamble_quirk(iommu);
794 
795     spin_lock_irqsave(&iommu->register_lock, flags);
796     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
797     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
798 
799     /* Make sure hardware completes it */
800     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
801                   !(sts & DMA_GSTS_TES), sts);
802     spin_unlock_irqrestore(&iommu->register_lock, flags);
803 
804     /* undo platform specific errata workarounds */
805     vtd_ops_postamble_quirk(iommu);
806 }
807 
808 enum faulttype {
809     DMA_REMAP,
810     INTR_REMAP,
811     UNKNOWN,
812 };
813 
814 static const char *dma_remap_fault_reasons[] =
815 {
816     "Software",
817     "Present bit in root entry is clear",
818     "Present bit in context entry is clear",
819     "Invalid context entry",
820     "Access beyond MGAW",
821     "PTE Write access is not set",
822     "PTE Read access is not set",
823     "Next page table ptr is invalid",
824     "Root table address invalid",
825     "Context table ptr is invalid",
826     "non-zero reserved fields in RTP",
827     "non-zero reserved fields in CTP",
828     "non-zero reserved fields in PTE",
829     "Blocked a DMA translation request",
830 };
831 
832 static const char *intr_remap_fault_reasons[] =
833 {
834     "Detected reserved fields in the decoded interrupt-remapped request",
835     "Interrupt index exceeded the interrupt-remapping table size",
836     "Present field in the IRTE entry is clear",
837     "Error accessing interrupt-remapping table pointed by IRTA_REG",
838     "Detected reserved fields in the IRTE entry",
839     "Blocked a compatibility format interrupt request",
840     "Blocked an interrupt request due to source-id verification failure",
841 };
842 
843 static const char *iommu_get_fault_reason(u8 fault_reason,
844                                           enum faulttype *fault_type)
845 {
846     if ( fault_reason >= 0x20 && ( fault_reason < 0x20 +
847                 ARRAY_SIZE(intr_remap_fault_reasons)) )
848     {
849         *fault_type = INTR_REMAP;
850         return intr_remap_fault_reasons[fault_reason - 0x20];
851     }
852     else if ( fault_reason < ARRAY_SIZE(dma_remap_fault_reasons) )
853     {
854         *fault_type = DMA_REMAP;
855         return dma_remap_fault_reasons[fault_reason];
856     }
857     else
858     {
859         *fault_type = UNKNOWN;
860         return "Unknown";
861     }
862 }
863 
864 static int iommu_page_fault_do_one(struct vtd_iommu *iommu, int type,
865                                    u8 fault_reason, u16 source_id, u64 addr)
866 {
867     const char *reason, *kind;
868     enum faulttype fault_type;
869     u16 seg = iommu->drhd->segment;
870 
871     reason = iommu_get_fault_reason(fault_reason, &fault_type);
872     switch ( fault_type )
873     {
874     case DMA_REMAP:
875         printk(XENLOG_G_WARNING VTDPREFIX
876                "DMAR:[%s] Request device [%04x:%02x:%02x.%u] "
877                "fault addr %"PRIx64"\n",
878                (type ? "DMA Read" : "DMA Write"),
879                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
880                PCI_FUNC(source_id), addr);
881         kind = "DMAR";
882         break;
883     case INTR_REMAP:
884         printk(XENLOG_G_WARNING VTDPREFIX
885                "INTR-REMAP: Request device [%04x:%02x:%02x.%u] "
886                "fault index %"PRIx64"\n",
887                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
888                PCI_FUNC(source_id), addr >> 48);
889         kind = "INTR-REMAP";
890         break;
891     default:
892         printk(XENLOG_G_WARNING VTDPREFIX
893                "UNKNOWN: Request device [%04x:%02x:%02x.%u] "
894                "fault addr %"PRIx64"\n",
895                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
896                PCI_FUNC(source_id), addr);
897         kind = "UNKNOWN";
898         break;
899     }
900 
901     printk(XENLOG_G_WARNING VTDPREFIX "%s: reason %02x - %s\n",
902            kind, fault_reason, reason);
903 
904     if ( iommu_verbose && fault_type == DMA_REMAP )
905         print_vtd_entries(iommu, PCI_BUS(source_id), PCI_DEVFN2(source_id),
906                           addr >> PAGE_SHIFT);
907 
908     return 0;
909 }
910 
911 static void iommu_fault_status(u32 fault_status)
912 {
913     if ( fault_status & DMA_FSTS_PFO )
914         INTEL_IOMMU_DEBUG("iommu_fault_status: Fault Overflow\n");
915     if ( fault_status & DMA_FSTS_PPF )
916         INTEL_IOMMU_DEBUG("iommu_fault_status: Primary Pending Fault\n");
917     if ( fault_status & DMA_FSTS_AFO )
918         INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Fault Overflow\n");
919     if ( fault_status & DMA_FSTS_APF )
920         INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Pending Fault\n");
921     if ( fault_status & DMA_FSTS_IQE )
922         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Queue Error\n");
923     if ( fault_status & DMA_FSTS_ICE )
924         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Completion Error\n");
925     if ( fault_status & DMA_FSTS_ITE )
926         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Time-out Error\n");
927 }
928 
929 #define PRIMARY_FAULT_REG_LEN (16)
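/*
 * Drain the primary fault recording registers: log every pending fault,
 * clear its F bit, let pci_check_disable_device() react to the offending
 * device, and finally clear the fault overflow (PFO) bit if it was set.
 */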
930 static void __do_iommu_page_fault(struct vtd_iommu *iommu)
931 {
932     int reg, fault_index;
933     u32 fault_status;
934     unsigned long flags;
935 
936     fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
937 
938     iommu_fault_status(fault_status);
939 
940     /* FIXME: ignore advanced fault log */
941     if ( !(fault_status & DMA_FSTS_PPF) )
942         goto clear_overflow;
943 
944     fault_index = dma_fsts_fault_record_index(fault_status);
945     reg = cap_fault_reg_offset(iommu->cap);
946     while (1)
947     {
948         u8 fault_reason;
949         u16 source_id;
950         u32 data;
951         u64 guest_addr;
952         int type;
953 
954         /* highest 32 bits */
955         spin_lock_irqsave(&iommu->register_lock, flags);
956         data = dmar_readl(iommu->reg, reg +
957                           fault_index * PRIMARY_FAULT_REG_LEN + 12);
958         if ( !(data & DMA_FRCD_F) )
959         {
960             spin_unlock_irqrestore(&iommu->register_lock, flags);
961             break;
962         }
963 
964         fault_reason = dma_frcd_fault_reason(data);
965         type = dma_frcd_type(data);
966 
967         data = dmar_readl(iommu->reg, reg +
968                           fault_index * PRIMARY_FAULT_REG_LEN + 8);
969         source_id = dma_frcd_source_id(data);
970 
971         guest_addr = dmar_readq(iommu->reg, reg +
972                                 fault_index * PRIMARY_FAULT_REG_LEN);
973         guest_addr = dma_frcd_page_addr(guest_addr);
974         /* clear the fault */
975         dmar_writel(iommu->reg, reg +
976                     fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
977         spin_unlock_irqrestore(&iommu->register_lock, flags);
978 
979         iommu_page_fault_do_one(iommu, type, fault_reason,
980                                 source_id, guest_addr);
981 
982         pci_check_disable_device(iommu->drhd->segment,
983                                  PCI_BUS(source_id), PCI_DEVFN2(source_id));
984 
985         fault_index++;
986         if ( fault_index > cap_num_fault_regs(iommu->cap) )
987             fault_index = 0;
988     }
989 clear_overflow:
990     /* clear primary fault overflow */
991     fault_status = readl(iommu->reg + DMAR_FSTS_REG);
992     if ( fault_status & DMA_FSTS_PFO )
993     {
994         spin_lock_irqsave(&iommu->register_lock, flags);
995         dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
996         spin_unlock_irqrestore(&iommu->register_lock, flags);
997     }
998 }
999 
1000 static void do_iommu_page_fault(void *unused)
1001 {
1002     struct acpi_drhd_unit *drhd;
1003 
1004     if ( list_empty(&acpi_drhd_units) )
1005     {
1006        INTEL_IOMMU_DEBUG("no device found, something must be very wrong!\n");
1007        return;
1008     }
1009 
1010     /*
1011      * No matter which IOMMU the interrupt came from, check all the
1012      * IOMMUs present in the system. This allows for having just one
1013      * tasklet (instead of one per IOMMU) and should be more than
1014      * fine, considering how rare the event of a fault should be.
1015      */
1016     for_each_drhd_unit ( drhd )
1017         __do_iommu_page_fault(drhd->iommu);
1018 }
1019 
1020 static void iommu_page_fault(int irq, void *dev_id,
1021                              struct cpu_user_regs *regs)
1022 {
1023     /*
1024      * Just flag the tasklet as runnable. This is fine, according to VT-d
1025      * specs since a new interrupt won't be generated until we clear all
1026      * the faults that caused this one to happen.
1027      */
1028     tasklet_schedule(&vtd_fault_tasklet);
1029 }
1030 
1031 static void dma_msi_unmask(struct irq_desc *desc)
1032 {
1033     struct vtd_iommu *iommu = desc->action->dev_id;
1034     unsigned long flags;
1035     u32 sts;
1036 
1037     /* unmask it */
1038     spin_lock_irqsave(&iommu->register_lock, flags);
1039     sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1040     sts &= ~DMA_FECTL_IM;
1041     dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1042     spin_unlock_irqrestore(&iommu->register_lock, flags);
1043     iommu->msi.msi_attrib.host_masked = 0;
1044 }
1045 
1046 static void dma_msi_mask(struct irq_desc *desc)
1047 {
1048     unsigned long flags;
1049     struct vtd_iommu *iommu = desc->action->dev_id;
1050     u32 sts;
1051 
1052     /* mask it */
1053     spin_lock_irqsave(&iommu->register_lock, flags);
1054     sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1055     sts |= DMA_FECTL_IM;
1056     dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1057     spin_unlock_irqrestore(&iommu->register_lock, flags);
1058     iommu->msi.msi_attrib.host_masked = 1;
1059 }
1060 
1061 static unsigned int dma_msi_startup(struct irq_desc *desc)
1062 {
1063     dma_msi_unmask(desc);
1064     return 0;
1065 }
1066 
1067 static void dma_msi_ack(struct irq_desc *desc)
1068 {
1069     irq_complete_move(desc);
1070     dma_msi_mask(desc);
1071     move_masked_irq(desc);
1072 }
1073 
1074 static void dma_msi_end(struct irq_desc *desc, u8 vector)
1075 {
1076     dma_msi_unmask(desc);
1077     end_nonmaskable_irq(desc, vector);
1078 }
1079 
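/*
 * Retarget the fault event interrupt: compose a new MSI message for the
 * selected destination and program it into DMAR_FEDATA_REG/DMAR_FEADDR_REG
 * (plus DMAR_FEUADDR_REG when x2APIC is enabled).
 */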
1080 static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
1081 {
1082     struct msi_msg msg;
1083     unsigned int dest;
1084     unsigned long flags;
1085     struct vtd_iommu *iommu = desc->action->dev_id;
1086 
1087     dest = set_desc_affinity(desc, mask);
1088     if (dest == BAD_APICID){
1089         dprintk(XENLOG_ERR VTDPREFIX, "Set iommu interrupt affinity error!\n");
1090         return;
1091     }
1092 
1093     msi_compose_msg(desc->arch.vector, NULL, &msg);
1094     msg.dest32 = dest;
1095     if (x2apic_enabled)
1096         msg.address_hi = dest & 0xFFFFFF00;
1097     ASSERT(!(msg.address_lo & MSI_ADDR_DEST_ID_MASK));
1098     msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1099     iommu->msi.msg = msg;
1100 
1101     spin_lock_irqsave(&iommu->register_lock, flags);
1102     dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data);
1103     dmar_writel(iommu->reg, DMAR_FEADDR_REG, msg.address_lo);
1104     /*
1105      * When x2APIC is not enabled, DMAR_FEUADDR_REG is reserved and
1106      * it's not necessary to update it.
1107      */
1108     if ( x2apic_enabled )
1109         dmar_writel(iommu->reg, DMAR_FEUADDR_REG, msg.address_hi);
1110     spin_unlock_irqrestore(&iommu->register_lock, flags);
1111 }
1112 
1113 static hw_irq_controller dma_msi_type = {
1114     .typename = "DMA_MSI",
1115     .startup = dma_msi_startup,
1116     .shutdown = dma_msi_mask,
1117     .enable = dma_msi_unmask,
1118     .disable = dma_msi_mask,
1119     .ack = dma_msi_ack,
1120     .end = dma_msi_end,
1121     .set_affinity = dma_msi_set_affinity,
1122 };
1123 
1124 static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd)
1125 {
1126     int irq, ret;
1127     struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
1128     struct vtd_iommu *iommu = drhd->iommu;
1129     struct irq_desc *desc;
1130 
1131     irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain)
1132                           : NUMA_NO_NODE,
1133                      false);
1134     if ( irq <= 0 )
1135     {
1136         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n");
1137         return -EINVAL;
1138     }
1139 
1140     desc = irq_to_desc(irq);
1141     desc->handler = &dma_msi_type;
1142     ret = request_irq(irq, 0, iommu_page_fault, "dmar", iommu);
1143     if ( ret )
1144     {
1145         desc->handler = &no_irq_type;
1146         destroy_irq(irq);
1147         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
1148         return ret;
1149     }
1150 
1151     iommu->msi.irq = irq;
1152     iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU;
1153     iommu->msi.msi_attrib.maskbit = 1;
1154     iommu->msi.msi_attrib.is_64 = 1;
1155     desc->msi_desc = &iommu->msi;
1156 
1157     return 0;
1158 }
1159 
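/*
 * Allocate and minimally initialise a struct vtd_iommu for @drhd: map its
 * register block, read CAP/ECAP, derive the number of page table levels
 * from the supported AGAW (SAGAW) bits, and allocate the domain id
 * bitmap/map used by context_set_domain_id().
 */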
1160 int __init iommu_alloc(struct acpi_drhd_unit *drhd)
1161 {
1162     struct vtd_iommu *iommu;
1163     unsigned long sagaw, nr_dom;
1164     int agaw;
1165 
1166     if ( nr_iommus > MAX_IOMMUS )
1167     {
1168         dprintk(XENLOG_ERR VTDPREFIX,
1169                  "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
1170         return -ENOMEM;
1171     }
1172 
1173     iommu = xzalloc(struct vtd_iommu);
1174     if ( iommu == NULL )
1175         return -ENOMEM;
1176 
1177     iommu->msi.irq = -1; /* No irq assigned yet. */
1178     iommu->node = NUMA_NO_NODE;
1179     INIT_LIST_HEAD(&iommu->ats_devices);
1180     spin_lock_init(&iommu->lock);
1181     spin_lock_init(&iommu->register_lock);
1182     spin_lock_init(&iommu->intremap.lock);
1183 
1184     iommu->drhd = drhd;
1185     drhd->iommu = iommu;
1186 
1187     iommu->reg = ioremap(drhd->address, PAGE_SIZE);
1188     if ( !iommu->reg )
1189         return -ENOMEM;
1190     iommu->index = nr_iommus++;
1191 
1192     iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
1193     iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
1194 
1195     if ( iommu_verbose )
1196     {
1197         printk(VTDPREFIX "drhd->address = %"PRIx64" iommu->reg = %p\n",
1198                drhd->address, iommu->reg);
1199         printk(VTDPREFIX "cap = %"PRIx64" ecap = %"PRIx64"\n",
1200                iommu->cap, iommu->ecap);
1201     }
1202     if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
1203         return -ENODEV;
1204 
1205     quirk_iommu_caps(iommu);
1206 
1207     if ( cap_fault_reg_offset(iommu->cap) +
1208          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
1209          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
1210     {
1211         printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
1212         print_iommu_regs(drhd);
1213         return -ENODEV;
1214     }
1215 
1216     /* Calculate number of pagetable levels: between 2 and 4. */
1217     sagaw = cap_sagaw(iommu->cap);
1218     for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
1219         if ( test_bit(agaw, &sagaw) )
1220             break;
1221     if ( agaw < 0 )
1222     {
1223         printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported sagaw %lx\n", sagaw);
1224         print_iommu_regs(drhd);
1225         return -ENODEV;
1226     }
1227     iommu->nr_pt_levels = agaw_to_level(agaw);
1228 
1229     if ( !ecap_coherent(iommu->ecap) )
1230         iommus_incoherent = 1;
1231 
1232     /* allocate domain id bitmap */
1233     nr_dom = cap_ndoms(iommu->cap);
1234     iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
1235     if ( !iommu->domid_bitmap )
1236         return -ENOMEM;
1237 
1238     /*
1239      * If Caching Mode is set, then invalid translations are tagged with
1240      * domain id 0, hence reserve bit 0 for it.
1241      */
1242     if ( cap_caching_mode(iommu->cap) )
1243         __set_bit(0, iommu->domid_bitmap);
1244 
1245     iommu->domid_map = xzalloc_array(u16, nr_dom);
1246     if ( !iommu->domid_map )
1247         return -ENOMEM;
1248 
1249     return 0;
1250 }
1251 
1252 void __init iommu_free(struct acpi_drhd_unit *drhd)
1253 {
1254     struct vtd_iommu *iommu = drhd->iommu;
1255 
1256     if ( iommu == NULL )
1257         return;
1258 
1259     drhd->iommu = NULL;
1260 
1261     if ( iommu->root_maddr != 0 )
1262     {
1263         free_pgtable_maddr(iommu->root_maddr);
1264         iommu->root_maddr = 0;
1265     }
1266 
1267     if ( iommu->reg )
1268         iounmap(iommu->reg);
1269 
1270     xfree(iommu->domid_bitmap);
1271     xfree(iommu->domid_map);
1272 
1273     if ( iommu->msi.irq >= 0 )
1274         destroy_irq(iommu->msi.irq);
1275     xfree(iommu);
1276 }
1277 
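/*
 * Round a guest address width up to the next adjusted guest address width,
 * i.e. the next value of the form 12 + 9 * n, capped at 64.  For example
 * 39, 48 and 57 are returned unchanged, while 40 becomes 48.
 */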
1278 #define guestwidth_to_adjustwidth(gaw) ({       \
1279     int agaw, r = (gaw - 12) % 9;               \
1280     agaw = (r == 0) ? gaw : (gaw + 9 - r);      \
1281     if ( agaw > 64 )                            \
1282         agaw = 64;                              \
1283     agaw; })
1284 
1285 static int intel_iommu_domain_init(struct domain *d)
1286 {
1287     dom_iommu(d)->arch.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1288 
1289     return 0;
1290 }
1291 
1292 static void __hwdom_init intel_iommu_hwdom_init(struct domain *d)
1293 {
1294     struct acpi_drhd_unit *drhd;
1295 
1296     setup_hwdom_pci_devices(d, setup_hwdom_device);
1297     setup_hwdom_rmrr(d);
1298     /* Make sure workarounds are applied before enabling the IOMMU(s). */
1299     arch_iommu_hwdom_init(d);
1300 
1301     if ( iommu_flush_all() )
1302         printk(XENLOG_WARNING VTDPREFIX
1303                " IOMMU flush all failed for hardware domain\n");
1304 
1305     for_each_drhd_unit ( drhd )
1306     {
1307         if ( iomem_deny_access(d, PFN_DOWN(drhd->address),
1308                                PFN_DOWN(drhd->address)) )
1309             BUG();
1310         iommu_enable_translation(drhd);
1311     }
1312 }
1313 
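/*
 * Install the context entry for (bus, devfn) on @iommu so that it points
 * at @domain's page tables (or uses pass-through for the hardware domain
 * when iommu_hwdom_passthrough is set), then invalidate the context cache
 * and IOTLB for the previously non-present entry.
 */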
1314 int domain_context_mapping_one(
1315     struct domain *domain,
1316     struct vtd_iommu *iommu,
1317     u8 bus, u8 devfn, const struct pci_dev *pdev)
1318 {
1319     struct domain_iommu *hd = dom_iommu(domain);
1320     struct context_entry *context, *context_entries;
1321     u64 maddr, pgd_maddr;
1322     u16 seg = iommu->drhd->segment;
1323     int agaw, rc, ret;
1324     bool_t flush_dev_iotlb;
1325 
1326     ASSERT(pcidevs_locked());
1327     spin_lock(&iommu->lock);
1328     maddr = bus_to_context_maddr(iommu, bus);
1329     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1330     context = &context_entries[devfn];
1331 
1332     if ( context_present(*context) )
1333     {
1334         int res = 0;
1335 
1336         /* Try to get domain ownership from device structure.  If that's
1337          * not available, try to read it from the context itself. */
1338         if ( pdev )
1339         {
1340             if ( pdev->domain != domain )
1341             {
1342                 printk(XENLOG_G_INFO VTDPREFIX
1343                        "%pd: %04x:%02x:%02x.%u owned by %pd\n",
1344                        domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1345                        pdev->domain);
1346                 res = -EINVAL;
1347             }
1348         }
1349         else
1350         {
1351             int cdomain;
1352             cdomain = context_get_domain_id(context, iommu);
1353 
1354             if ( cdomain < 0 )
1355             {
1356                 printk(XENLOG_G_WARNING VTDPREFIX
1357                        "%pd: %04x:%02x:%02x.%u mapped, but can't find owner\n",
1358                        domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1359                 res = -EINVAL;
1360             }
1361             else if ( cdomain != domain->domain_id )
1362             {
1363                 printk(XENLOG_G_INFO VTDPREFIX
1364                        "%pd: %04x:%02x:%02x.%u already mapped to d%d\n",
1365                        domain,
1366                        seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1367                        cdomain);
1368                 res = -EINVAL;
1369             }
1370         }
1371 
1372         unmap_vtd_domain_page(context_entries);
1373         spin_unlock(&iommu->lock);
1374         return res;
1375     }
1376 
1377     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
1378     {
1379         context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1380         agaw = level_to_agaw(iommu->nr_pt_levels);
1381     }
1382     else
1383     {
1384         spin_lock(&hd->arch.mapping_lock);
1385 
1386         /* Ensure we have pagetables allocated down to leaf PTE. */
1387         if ( hd->arch.pgd_maddr == 0 )
1388         {
1389             addr_to_dma_page_maddr(domain, 0, 1);
1390             if ( hd->arch.pgd_maddr == 0 )
1391             {
1392             nomem:
1393                 spin_unlock(&hd->arch.mapping_lock);
1394                 spin_unlock(&iommu->lock);
1395                 unmap_vtd_domain_page(context_entries);
1396                 return -ENOMEM;
1397             }
1398         }
1399 
1400         /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1401         pgd_maddr = hd->arch.pgd_maddr;
1402         for ( agaw = level_to_agaw(4);
1403               agaw != level_to_agaw(iommu->nr_pt_levels);
1404               agaw-- )
1405         {
1406             struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1407             pgd_maddr = dma_pte_addr(*p);
1408             unmap_vtd_domain_page(p);
1409             if ( pgd_maddr == 0 )
1410                 goto nomem;
1411         }
1412 
1413         context_set_address_root(*context, pgd_maddr);
1414         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1415             context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1416         else
1417             context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1418 
1419         spin_unlock(&hd->arch.mapping_lock);
1420     }
1421 
1422     if ( context_set_domain_id(context, domain, iommu) )
1423     {
1424         spin_unlock(&iommu->lock);
1425         unmap_vtd_domain_page(context_entries);
1426         return -EFAULT;
1427     }
1428 
1429     context_set_address_width(*context, agaw);
1430     context_set_fault_enable(*context);
1431     context_set_present(*context);
1432     iommu_sync_cache(context, sizeof(struct context_entry));
1433     spin_unlock(&iommu->lock);
1434 
1435     /* Context entry was previously non-present (with domid 0). */
1436     rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
1437                                     DMA_CCMD_MASK_NOBIT, 1);
1438     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1439     ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1440 
1441     /*
1442      * The current logic for returns:
1443      *   - positive  invoke iommu_flush_write_buffer to flush cache.
1444      *   - zero      on success.
1445      *   - negative  on failure. Continue to flush IOMMU IOTLB on a
1446      *               best effort basis.
1447      */
1448     if ( rc > 0 || ret > 0 )
1449         iommu_flush_write_buffer(iommu);
1450     if ( rc >= 0 )
1451         rc = ret;
1452     if ( rc > 0 )
1453         rc = 0;
1454 
1455     set_bit(iommu->index, &hd->arch.iommu_bitmap);
1456 
1457     unmap_vtd_domain_page(context_entries);
1458 
1459     if ( !seg && !rc )
1460         rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
1461 
1462     return rc;
1463 }
1464 
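/*
 * Map @pdev into @domain's IOMMU context, dispatching on the device type.
 * For conventional PCI devices the upstream PCIe-to-PCI bridge (and, where
 * needed, devfn 0 on the secondary bus) is mapped as well, since requests
 * may carry the bridge's requester id.
 */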
1465 static int domain_context_mapping(struct domain *domain, u8 devfn,
1466                                   struct pci_dev *pdev)
1467 {
1468     struct acpi_drhd_unit *drhd;
1469     int ret = 0;
1470     u8 seg = pdev->seg, bus = pdev->bus, secbus;
1471 
1472     drhd = acpi_find_matched_drhd_unit(pdev);
1473     if ( !drhd )
1474         return -ENODEV;
1475 
1476     /*
1477      * Generally we assume only devices from one node to get assigned to a
1478      * given guest.  But even if not, by replacing the prior value here we
1479      * guarantee that at least some basic allocations for the device being
1480      * added will get done against its node.  Any further allocations for
1481      * this or other devices may be penalized then, but some would also be
1482      * if we left other than NUMA_NO_NODE untouched here.
1483      */
1484     if ( drhd->iommu->node != NUMA_NO_NODE )
1485         dom_iommu(domain)->node = drhd->iommu->node;
1486 
1487     ASSERT(pcidevs_locked());
1488 
1489     switch ( pdev->type )
1490     {
1491     case DEV_TYPE_PCI_HOST_BRIDGE:
1492         if ( iommu_debug )
1493             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
1494                    domain->domain_id, seg, bus,
1495                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1496         if ( !is_hardware_domain(domain) )
1497             return -EPERM;
1498         break;
1499 
1500     case DEV_TYPE_PCIe_BRIDGE:
1501     case DEV_TYPE_PCIe2PCI_BRIDGE:
1502     case DEV_TYPE_LEGACY_PCI_BRIDGE:
1503         break;
1504 
1505     case DEV_TYPE_PCIe_ENDPOINT:
1506         if ( iommu_debug )
1507             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
1508                    domain->domain_id, seg, bus,
1509                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1510         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1511                                          pdev);
1512         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1513             enable_ats_device(pdev, &drhd->iommu->ats_devices);
1514 
1515         break;
1516 
1517     case DEV_TYPE_PCI:
1518         if ( iommu_debug )
1519             printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
1520                    domain->domain_id, seg, bus,
1521                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1522 
1523         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1524                                          pdev);
1525         if ( ret )
1526             break;
1527 
1528         if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 )
1529             break;
1530 
1531         /*
1532          * Mapping a bridge should, if anything, pass the struct pci_dev of
1533          * that bridge. Since bridges don't normally get assigned to guests,
1534          * their owner would be the wrong one. Pass NULL instead.
1535          */
1536         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1537                                          NULL);
1538 
1539         /*
1540          * Devices behind a PCIe-to-PCI/PCI-X bridge may generate a different
1541          * requester ID, which may originate from devfn 0 on the secondary bus
1542          * behind the bridge. Map that ID as well if we didn't already.
1543          *
1544          * As with bridges, we don't want to pass a struct pci_dev here - one
1545          * may not even exist for this (secbus,0,0) tuple. If one does, then
1546          * without properly working device groups it may again not have the
1547          * correct owner.
1548          */
1549         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
1550              (secbus != pdev->bus || pdev->devfn != 0) )
1551             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
1552                                              NULL);
1553 
1554         break;
1555 
1556     default:
1557         dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1558                 domain->domain_id, pdev->type,
1559                 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560         ret = -EINVAL;
1561         break;
1562     }
1563 
1564     if ( !ret && devfn == pdev->devfn )
1565         pci_vtd_quirk(pdev);
1566 
1567     return ret;
1568 }
1569 
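/*
 * Clear the context entry for (@bus,@devfn) on @iommu, then flush the
 * context cache and the IOTLB (device IOTLBs too when ATS devices are
 * present).  Returns 0 if the entry was not present in the first place.
 */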
1570 int domain_context_unmap_one(
1571     struct domain *domain,
1572     struct vtd_iommu *iommu,
1573     u8 bus, u8 devfn)
1574 {
1575     struct context_entry *context, *context_entries;
1576     u64 maddr;
1577     int iommu_domid, rc, ret;
1578     bool_t flush_dev_iotlb;
1579 
1580     ASSERT(pcidevs_locked());
1581     spin_lock(&iommu->lock);
1582 
1583     maddr = bus_to_context_maddr(iommu, bus);
1584     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1585     context = &context_entries[devfn];
1586 
1587     if ( !context_present(*context) )
1588     {
1589         spin_unlock(&iommu->lock);
1590         unmap_vtd_domain_page(context_entries);
1591         return 0;
1592     }
1593 
1594     context_clear_present(*context);
1595     context_clear_entry(*context);
1596     iommu_sync_cache(context, sizeof(struct context_entry));
1597 
1598     iommu_domid = domain_iommu_domid(domain, iommu);
1599     if ( iommu_domid == -1 )
1600     {
1601         spin_unlock(&iommu->lock);
1602         unmap_vtd_domain_page(context_entries);
1603         return -EINVAL;
1604     }
1605 
1606     rc = iommu_flush_context_device(iommu, iommu_domid,
1607                                     PCI_BDF2(bus, devfn),
1608                                     DMA_CCMD_MASK_NOBIT, 0);
1609 
1610     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1611     ret = iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb);
1612 
1613     /*
1614      * The current logic for return values:
1615      *   - positive  invoke iommu_flush_write_buffer to flush the write buffer.
1616      *   - zero      success.
1617      *   - negative  failure.  Continue to flush the IOMMU IOTLB on a
1618      *               best-effort basis.
1619      */
1620     if ( rc > 0 || ret > 0 )
1621         iommu_flush_write_buffer(iommu);
1622     if ( rc >= 0 )
1623         rc = ret;
1624     if ( rc > 0 )
1625         rc = 0;
1626 
1627     spin_unlock(&iommu->lock);
1628     unmap_vtd_domain_page(context_entries);
1629 
1630     if ( !iommu->drhd->segment && !rc )
1631         rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
1632 
1633     return rc;
1634 }
1635 
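/*
 * Undo domain_context_mapping() for @pdev.  If this was the last device
 * of @domain behind this IOMMU, also release the domain id allocated on
 * it and clear the IOMMU from the domain's iommu_bitmap.
 */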
1636 static int domain_context_unmap(struct domain *domain, u8 devfn,
1637                                 struct pci_dev *pdev)
1638 {
1639     struct acpi_drhd_unit *drhd;
1640     struct vtd_iommu *iommu;
1641     int ret = 0;
1642     u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
1643     int found = 0;
1644 
1645     drhd = acpi_find_matched_drhd_unit(pdev);
1646     if ( !drhd )
1647         return -ENODEV;
1648     iommu = drhd->iommu;
1649 
1650     switch ( pdev->type )
1651     {
1652     case DEV_TYPE_PCI_HOST_BRIDGE:
1653         if ( iommu_debug )
1654             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u unmap\n",
1655                    domain->domain_id, seg, bus,
1656                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1657         if ( !is_hardware_domain(domain) )
1658             return -EPERM;
1659         goto out;
1660 
1661     case DEV_TYPE_PCIe_BRIDGE:
1662     case DEV_TYPE_PCIe2PCI_BRIDGE:
1663     case DEV_TYPE_LEGACY_PCI_BRIDGE:
1664         goto out;
1665 
1666     case DEV_TYPE_PCIe_ENDPOINT:
1667         if ( iommu_debug )
1668             printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
1669                    domain->domain_id, seg, bus,
1670                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1671         ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1672         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1673             disable_ats_device(pdev);
1674 
1675         break;
1676 
1677     case DEV_TYPE_PCI:
1678         if ( iommu_debug )
1679             printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
1680                    domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1681         ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1682         if ( ret )
1683             break;
1684 
1685         tmp_bus = bus;
1686         tmp_devfn = devfn;
1687         if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 )
1688             break;
1689 
1690         /* PCIe to PCI/PCIx bridge */
1691         if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1692         {
1693             ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1694             if ( ret )
1695                 return ret;
1696 
1697             ret = domain_context_unmap_one(domain, iommu, secbus, 0);
1698         }
1699         else /* Legacy PCI bridge */
1700             ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1701 
1702         break;
1703 
1704     default:
1705         dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1706                 domain->domain_id, pdev->type,
1707                 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1708         ret = -EINVAL;
1709         goto out;
1710     }
1711 
1712     /*
1713      * If no other device under the same IOMMU is owned by this domain,
1714      * clear the IOMMU from iommu_bitmap and the domain id from domid_bitmap.
1715      */
1716     for_each_pdev ( domain, pdev )
1717     {
1718         if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
1719             continue;
1720 
1721         drhd = acpi_find_matched_drhd_unit(pdev);
1722         if ( drhd && drhd->iommu == iommu )
1723         {
1724             found = 1;
1725             break;
1726         }
1727     }
1728 
1729     if ( found == 0 )
1730     {
1731         int iommu_domid;
1732 
1733         clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
1734 
1735         iommu_domid = domain_iommu_domid(domain, iommu);
1736         if ( iommu_domid == -1 )
1737         {
1738             ret = -EINVAL;
1739             goto out;
1740         }
1741 
1742         clear_bit(iommu_domid, iommu->domid_bitmap);
1743         iommu->domid_map[iommu_domid] = 0;
1744     }
1745 
1746 out:
1747     return ret;
1748 }
1749 
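/*
 * Release per-domain VT-d state: drop the mapped-RMRR tracking list and,
 * unless the page tables are shared with EPT, free the IOMMU page tables.
 */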
1750 static void iommu_domain_teardown(struct domain *d)
1751 {
1752     struct domain_iommu *hd = dom_iommu(d);
1753     struct mapped_rmrr *mrmrr, *tmp;
1754 
1755     if ( list_empty(&acpi_drhd_units) )
1756         return;
1757 
1758     list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.mapped_rmrrs, list )
1759     {
1760         list_del(&mrmrr->list);
1761         xfree(mrmrr);
1762     }
1763 
1764     ASSERT(is_iommu_enabled(d));
1765 
1766     if ( iommu_use_hap_pt(d) )
1767         return;
1768 
1769     spin_lock(&hd->arch.mapping_lock);
1770     iommu_free_pagetable(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw));
1771     hd->arch.pgd_maddr = 0;
1772     spin_unlock(&hd->arch.mapping_lock);
1773 }
1774 
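/*
 * Install a single 4k leaf PTE mapping @dfn to @mfn with the requested
 * permissions (plus the SNP bit when Snoop Control is in use).  This is a
 * no-op when page tables are shared with EPT or for a passthrough
 * hardware domain.  Required TLB flushing is reported via @flush_flags
 * rather than performed here.
 */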
1775 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
1776                                              mfn_t mfn, unsigned int flags,
1777                                              unsigned int *flush_flags)
1778 {
1779     struct domain_iommu *hd = dom_iommu(d);
1780     struct dma_pte *page, *pte, old, new = {};
1781     u64 pg_maddr;
1782     int rc = 0;
1783 
1784     /* Do nothing if VT-d shares EPT page table */
1785     if ( iommu_use_hap_pt(d) )
1786         return 0;
1787 
1788     /* Do nothing for the hardware domain when it is in passthrough mode. */
1789     if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
1790         return 0;
1791 
1792     spin_lock(&hd->arch.mapping_lock);
1793 
1794     pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1);
1795     if ( !pg_maddr )
1796     {
1797         spin_unlock(&hd->arch.mapping_lock);
1798         return -ENOMEM;
1799     }
1800 
1801     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1802     pte = &page[dfn_x(dfn) & LEVEL_MASK];
1803     old = *pte;
1804 
1805     dma_set_pte_addr(new, mfn_to_maddr(mfn));
1806     dma_set_pte_prot(new,
1807                      ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
1808                      ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
1809 
1810     /* Set the SNP bit in the leaf PTE if Snoop Control is available. */
1811     if ( iommu_snoop )
1812         dma_set_pte_snp(new);
1813 
1814     if ( old.val == new.val )
1815     {
1816         spin_unlock(&hd->arch.mapping_lock);
1817         unmap_vtd_domain_page(page);
1818         return 0;
1819     }
1820 
1821     *pte = new;
1822 
1823     iommu_sync_cache(pte, sizeof(struct dma_pte));
1824     spin_unlock(&hd->arch.mapping_lock);
1825     unmap_vtd_domain_page(page);
1826 
1827     *flush_flags |= IOMMU_FLUSHF_added;
1828     if ( dma_pte_present(old) )
1829         *flush_flags |= IOMMU_FLUSHF_modified;
1830 
1831     return rc;
1832 }
1833 
1834 static int __must_check intel_iommu_unmap_page(struct domain *d, dfn_t dfn,
1835                                                unsigned int *flush_flags)
1836 {
1837     /* Do nothing if VT-d shares EPT page table */
1838     if ( iommu_use_hap_pt(d) )
1839         return 0;
1840 
1841     /* Do nothing for the hardware domain when it is in passthrough mode. */
1842     if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
1843         return 0;
1844 
1845     dma_pte_clear_one(d, dfn_to_daddr(dfn), flush_flags);
1846 
1847     return 0;
1848 }
1849 
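/*
 * Walk the IOMMU page tables to translate @dfn back into an MFN and
 * access flags.  Not applicable (-EOPNOTSUPP) when the tables are shared
 * with EPT or when the hardware domain runs in passthrough mode.
 */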
1850 static int intel_iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn,
1851                                    unsigned int *flags)
1852 {
1853     struct domain_iommu *hd = dom_iommu(d);
1854     struct dma_pte *page, val;
1855     u64 pg_maddr;
1856 
1857     /*
1858      * If VT-d shares the EPT page tables, or if the domain is the hardware
1859      * domain with iommu_hwdom_passthrough set, there is no page table to consult.
1860      */
1861     if ( iommu_use_hap_pt(d) ||
1862          (iommu_hwdom_passthrough && is_hardware_domain(d)) )
1863         return -EOPNOTSUPP;
1864 
1865     spin_lock(&hd->arch.mapping_lock);
1866 
1867     pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0);
1868     if ( !pg_maddr )
1869     {
1870         spin_unlock(&hd->arch.mapping_lock);
1871         return -ENOENT;
1872     }
1873 
1874     page = map_vtd_domain_page(pg_maddr);
1875     val = page[dfn_x(dfn) & LEVEL_MASK];
1876 
1877     unmap_vtd_domain_page(page);
1878     spin_unlock(&hd->arch.mapping_lock);
1879 
1880     if ( !dma_pte_present(val) )
1881         return -ENOENT;
1882 
1883     *mfn = maddr_to_mfn(dma_pte_addr(val));
1884     *flags = dma_pte_read(val) ? IOMMUF_readable : 0;
1885     *flags |= dma_pte_write(val) ? IOMMUF_writable : 0;
1886 
1887     return 0;
1888 }
1889 
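/*
 * Page tables can only be shared with EPT if, for every superpage size
 * EPT may use (2M/1G, as permitted by opt_hap_*), the IOMMU supports the
 * same size.
 */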
1890 static int __init vtd_ept_page_compatible(struct vtd_iommu *iommu)
1891 {
1892     u64 ept_cap, vtd_cap = iommu->cap;
1893 
1894     /* EPT is not initialised yet, so we must check the capability in
1895      * the MSR explicitly rather than use cpu_has_vmx_ept_*() */
1896     if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 )
1897         return 0;
1898 
1899     return (ept_has_2mb(ept_cap) && opt_hap_2mb) <= cap_sps_2mb(vtd_cap) &&
1900            (ept_has_1gb(ept_cap) && opt_hap_1gb) <= cap_sps_1gb(vtd_cap);
1901 }
1902 
1903 /*
1904  * Set the VT-d page table root to the EPT table if allowed.
1905  */
1906 static void iommu_set_pgd(struct domain *d)
1907 {
1908     mfn_t pgd_mfn;
1909 
1910     pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
1911     dom_iommu(d)->arch.pgd_maddr =
1912         pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
1913 }
1914 
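/*
 * Reference-counted identity (1:1) mapping of an RMRR region for @d:
 * with @map set this establishes (or re-references) the mapping via
 * identity p2m entries, with @map clear it drops a reference and tears
 * the mapping down once the count reaches zero.  Tracking structures
 * live on hd->arch.mapped_rmrrs, protected by pcidevs_lock.
 */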
1915 static int rmrr_identity_mapping(struct domain *d, bool_t map,
1916                                  const struct acpi_rmrr_unit *rmrr,
1917                                  u32 flag)
1918 {
1919     unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K;
1920     unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K;
1921     struct mapped_rmrr *mrmrr;
1922     struct domain_iommu *hd = dom_iommu(d);
1923 
1924     ASSERT(pcidevs_locked());
1925     ASSERT(rmrr->base_address < rmrr->end_address);
1926 
1927     /*
1928      * No need to acquire hd->arch.mapping_lock: Both insertion and removal
1929      * get done while holding pcidevs_lock.
1930      */
1931     list_for_each_entry( mrmrr, &hd->arch.mapped_rmrrs, list )
1932     {
1933         if ( mrmrr->base == rmrr->base_address &&
1934              mrmrr->end == rmrr->end_address )
1935         {
1936             int ret = 0;
1937 
1938             if ( map )
1939             {
1940                 ++mrmrr->count;
1941                 return 0;
1942             }
1943 
1944             if ( --mrmrr->count )
1945                 return 0;
1946 
1947             while ( base_pfn < end_pfn )
1948             {
1949                 if ( clear_identity_p2m_entry(d, base_pfn) )
1950                     ret = -ENXIO;
1951                 base_pfn++;
1952             }
1953 
1954             list_del(&mrmrr->list);
1955             xfree(mrmrr);
1956             return ret;
1957         }
1958     }
1959 
1960     if ( !map )
1961         return -ENOENT;
1962 
1963     while ( base_pfn < end_pfn )
1964     {
1965         int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag);
1966 
1967         if ( err )
1968             return err;
1969         base_pfn++;
1970     }
1971 
1972     mrmrr = xmalloc(struct mapped_rmrr);
1973     if ( !mrmrr )
1974         return -ENOMEM;
1975     mrmrr->base = rmrr->base_address;
1976     mrmrr->end = rmrr->end_address;
1977     mrmrr->count = 1;
1978     list_add_tail(&mrmrr->list, &hd->arch.mapped_rmrrs);
1979 
1980     return 0;
1981 }
1982 
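/*
 * Hook invoked when a device is reported to Xen: establish its context
 * mapping and, if it is covered by an RMRR, the corresponding identity
 * mapping in its (hardware) domain.
 */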
1983 static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
1984 {
1985     struct acpi_rmrr_unit *rmrr;
1986     u16 bdf;
1987     int ret, i;
1988 
1989     ASSERT(pcidevs_locked());
1990 
1991     if ( !pdev->domain )
1992         return -EINVAL;
1993 
1994     ret = domain_context_mapping(pdev->domain, devfn, pdev);
1995     if ( ret )
1996     {
1997         dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
1998                 pdev->domain->domain_id);
1999         return ret;
2000     }
2001 
2002     for_each_rmrr_device ( rmrr, bdf, i )
2003     {
2004         if ( rmrr->segment == pdev->seg &&
2005              PCI_BUS(bdf) == pdev->bus &&
2006              PCI_DEVFN2(bdf) == devfn )
2007         {
2008             /*
2009              * iommu_add_device() is only called for the hardware
2010              * domain (see xen/drivers/passthrough/pci.c:pci_add_device()).
2011              * Since RMRRs are always reserved in the e820 map for the hardware
2012              * domain, there shouldn't be a conflict.
2013              */
2014             ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0);
2015             if ( ret )
2016                 dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
2017                         pdev->domain->domain_id);
2018         }
2019     }
2020 
2021     return 0;
2022 }
2023 
2024 static int intel_iommu_enable_device(struct pci_dev *pdev)
2025 {
2026     struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
2027     int ret = drhd ? ats_device(pdev, drhd) : -ENODEV;
2028 
2029     pci_vtd_quirk(pdev);
2030 
2031     if ( ret <= 0 )
2032         return ret;
2033 
2034     ret = enable_ats_device(pdev, &drhd->iommu->ats_devices);
2035 
2036     return ret >= 0 ? 0 : ret;
2037 }
2038 
2039 static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
2040 {
2041     struct acpi_rmrr_unit *rmrr;
2042     u16 bdf;
2043     int i;
2044 
2045     if ( !pdev->domain )
2046         return -EINVAL;
2047 
2048     for_each_rmrr_device ( rmrr, bdf, i )
2049     {
2050         if ( rmrr->segment != pdev->seg ||
2051              PCI_BUS(bdf) != pdev->bus ||
2052              PCI_DEVFN2(bdf) != devfn )
2053             continue;
2054 
2055         /*
2056          * The flag value is irrelevant when clearing these mappings,
2057          * but passing 0 here is always safe and strict.
2058          */
2059         rmrr_identity_mapping(pdev->domain, 0, rmrr, 0);
2060     }
2061 
2062     return domain_context_unmap(pdev->domain, devfn, pdev);
2063 }
2064 
2065 static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
2066 {
2067     return domain_context_mapping(pdev->domain, devfn, pdev);
2068 }
2069 
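/*
 * Clear the first fault recording register (its fault bit is
 * write-one-to-clear) and all fault status bits of @iommu.
 */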
2070 void clear_fault_bits(struct vtd_iommu *iommu)
2071 {
2072     u64 val;
2073     unsigned long flags;
2074 
2075     spin_lock_irqsave(&iommu->register_lock, flags);
2076     val = dmar_readq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8);
2077     dmar_writeq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8, val);
2078     dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
2079     spin_unlock_irqrestore(&iommu->register_lock, flags);
2080 }
2081 
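/*
 * Bind the fault-reporting interrupt of @drhd's IOMMU to CPUs on the
 * IOMMU's proximity domain, provided that node is online and has online
 * CPUs; otherwise a NULL mask (no node preference) is used.
 */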
2082 static void adjust_irq_affinity(struct acpi_drhd_unit *drhd)
2083 {
2084     const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
2085     unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
2086                              : NUMA_NO_NODE;
2087     const cpumask_t *cpumask = NULL;
2088     struct irq_desc *desc;
2089     unsigned long flags;
2090 
2091     if ( node < MAX_NUMNODES && node_online(node) &&
2092          cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
2093         cpumask = &node_to_cpumask(node);
2094 
2095     desc = irq_to_desc(drhd->iommu->msi.irq);
2096     spin_lock_irqsave(&desc->lock, flags);
2097     dma_msi_set_affinity(desc, cpumask);
2098     spin_unlock_irqrestore(&desc->lock, flags);
2099 }
2100 
2101 static int adjust_vtd_irq_affinities(void)
2102 {
2103     struct acpi_drhd_unit *drhd;
2104 
2105     if ( !iommu_enabled )
2106         return 0;
2107 
2108     for_each_drhd_unit ( drhd )
2109         adjust_irq_affinity(drhd);
2110 
2111     return 0;
2112 }
2113 __initcall(adjust_vtd_irq_affinities);
2114 
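/*
 * Program every VT-d engine for use: adjust fault interrupt affinity,
 * clear stale faults and unmask fault reporting, enable queued
 * invalidation (falling back to register-based flushing), enable
 * interrupt remapping where every IO-APIC is covered by a DRHD unit,
 * then set the root entries and flush all caches.
 */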
2115 static int __must_check init_vtd_hw(void)
2116 {
2117     struct acpi_drhd_unit *drhd;
2118     struct vtd_iommu *iommu;
2119     int ret;
2120     unsigned long flags;
2121     u32 sts;
2122 
2123     /*
2124      * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
2125      */
2126     for_each_drhd_unit ( drhd )
2127     {
2128         adjust_irq_affinity(drhd);
2129 
2130         iommu = drhd->iommu;
2131 
2132         clear_fault_bits(iommu);
2133 
2134         spin_lock_irqsave(&iommu->register_lock, flags);
2135         sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
2136         sts &= ~DMA_FECTL_IM;
2137         dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
2138         spin_unlock_irqrestore(&iommu->register_lock, flags);
2139     }
2140 
2141     /*
2142      * Enable queued invalidation
2143      */
2144     for_each_drhd_unit ( drhd )
2145     {
2146         iommu = drhd->iommu;
2147         /*
2148          * If queued invalidation could not be enabled, fall back to
2149          * register-based invalidation.
2150          */
2151         if ( enable_qinval(iommu) != 0 )
2152         {
2153             iommu->flush.context = flush_context_reg;
2154             iommu->flush.iotlb   = flush_iotlb_reg;
2155         }
2156     }
2157 
2158     /*
2159      * Enable interrupt remapping
2160      */
2161     if ( iommu_intremap )
2162     {
2163         int apic;
2164         for ( apic = 0; apic < nr_ioapics; apic++ )
2165         {
2166             if ( ioapic_to_iommu(IO_APIC_ID(apic)) == NULL )
2167             {
2168                 iommu_intremap = iommu_intremap_off;
2169                 dprintk(XENLOG_ERR VTDPREFIX,
2170                     "ioapic_to_iommu: ioapic %#x (id: %#x) is NULL! "
2171                     "Will not try to enable Interrupt Remapping.\n",
2172                     apic, IO_APIC_ID(apic));
2173                 break;
2174             }
2175         }
2176     }
2177     if ( iommu_intremap )
2178     {
2179         for_each_drhd_unit ( drhd )
2180         {
2181             iommu = drhd->iommu;
2182             if ( enable_intremap(iommu, 0) != 0 )
2183             {
2184                 iommu_intremap = iommu_intremap_off;
2185                 dprintk(XENLOG_WARNING VTDPREFIX,
2186                         "Interrupt Remapping not enabled\n");
2187 
2188                 break;
2189             }
2190         }
2191         if ( !iommu_intremap )
2192             for_each_drhd_unit ( drhd )
2193                 disable_intremap(drhd->iommu);
2194     }
2195 
2196     /*
2197      * Set the root entry for each VT-d engine.  After setting the root
2198      * entry, we must globally invalidate the context cache and then
2199      * globally invalidate the IOTLB.
2200      */
2201     for_each_drhd_unit ( drhd )
2202     {
2203         iommu = drhd->iommu;
2204         ret = iommu_set_root_entry(iommu);
2205         if ( ret )
2206         {
2207             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
2208             return -EIO;
2209         }
2210     }
2211 
2212     return iommu_flush_all();
2213 }
2214 
2215 static void __hwdom_init setup_hwdom_rmrr(struct domain *d)
2216 {
2217     struct acpi_rmrr_unit *rmrr;
2218     u16 bdf;
2219     int ret, i;
2220 
2221     pcidevs_lock();
2222     for_each_rmrr_device ( rmrr, bdf, i )
2223     {
2224         /*
2225          * Here we are adding a device to the hardware domain.
2226          * Since RMRRs are always reserved in the e820 map for the hardware
2227          * domain, there shouldn't be a conflict, so passing flag 0 is
2228          * always safe and strict.
2229          */
2230         ret = rmrr_identity_mapping(d, 1, rmrr, 0);
2231         if ( ret )
2232             dprintk(XENLOG_ERR VTDPREFIX,
2233                      "IOMMU: mapping reserved region failed\n");
2234     }
2235     pcidevs_unlock();
2236 }
2237 
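/*
 * One-time VT-d initialisation: apply platform quirks, downgrade the
 * global feature flags (snoop control, hwdom passthrough, queued
 * invalidation, interrupt remapping/posting, shared EPT) to what every
 * DRHD unit supports, set up the fault interrupt and tasklet, and
 * finally bring the hardware up via init_vtd_hw().
 */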
2238 static int __init vtd_setup(void)
2239 {
2240     struct acpi_drhd_unit *drhd;
2241     struct vtd_iommu *iommu;
2242     int ret;
2243 
2244     if ( list_empty(&acpi_drhd_units) )
2245     {
2246         ret = -ENODEV;
2247         goto error;
2248     }
2249 
2250     if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
2251     {
2252         ret = -EPERM;
2253         goto error;
2254     }
2255 
2256     platform_quirks_init();
2257     if ( !iommu_enable )
2258     {
2259         ret = -ENODEV;
2260         goto error;
2261     }
2262 
2263     /* We enable the following features only if they are supported by all VT-d
2264      * engines: Snoop Control, DMA passthrough, Queued Invalidation, Interrupt
2265      * Remapping, and Posted Interrupt
2266      */
2267     for_each_drhd_unit ( drhd )
2268     {
2269         iommu = drhd->iommu;
2270 
2271         printk("Intel VT-d iommu %u supported page sizes: 4kB%s%s\n",
2272                iommu->index,
2273                cap_sps_2mb(iommu->cap) ? ", 2MB" : "",
2274                cap_sps_1gb(iommu->cap) ? ", 1GB" : "");
2275 
2276 #ifndef iommu_snoop
2277         if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
2278             iommu_snoop = false;
2279 #endif
2280 
2281         if ( iommu_hwdom_passthrough && !ecap_pass_thru(iommu->ecap) )
2282             iommu_hwdom_passthrough = false;
2283 
2284         if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
2285             iommu_qinval = 0;
2286 
2287         if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
2288             iommu_intremap = iommu_intremap_off;
2289 
2290 #ifndef iommu_intpost
2291         /*
2292          * We cannot use posted interrupt if X86_FEATURE_CX16 is
2293          * not supported, since we count on this feature to
2294          * atomically update 16-byte IRTE in posted format.
2295          */
2296         if ( !cap_intr_post(iommu->cap) || !iommu_intremap || !cpu_has_cx16 )
2297             iommu_intpost = false;
2298 #endif
2299 
2300         if ( !vtd_ept_page_compatible(iommu) )
2301             clear_iommu_hap_pt_share();
2302 
2303         ret = iommu_set_interrupt(drhd);
2304         if ( ret )
2305         {
2306             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
2307             goto error;
2308         }
2309     }
2310 
2311     softirq_tasklet_init(&vtd_fault_tasklet, do_iommu_page_fault, NULL);
2312 
2313     if ( !iommu_qinval && iommu_intremap )
2314     {
2315         iommu_intremap = iommu_intremap_off;
2316         dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
2317             "since Queued Invalidation isn't supported or enabled.\n");
2318     }
2319 
2320 #define P(p,s) printk("Intel VT-d %s %senabled.\n", s, (p)? "" : "not ")
2321 #ifndef iommu_snoop
2322     P(iommu_snoop, "Snoop Control");
2323 #endif
2324     P(iommu_hwdom_passthrough, "Dom0 DMA Passthrough");
2325     P(iommu_qinval, "Queued Invalidation");
2326     P(iommu_intremap, "Interrupt Remapping");
2327 #ifndef iommu_intpost
2328     P(iommu_intpost, "Posted Interrupt");
2329 #endif
2330     P(iommu_hap_pt_share, "Shared EPT tables");
2331 #undef P
2332 
2333     ret = init_vtd_hw();
2334     if ( ret )
2335         goto error;
2336 
2337     register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1);
2338 
2339     return 0;
2340 
2341  error:
2342     iommu_enabled = 0;
2343 #ifndef iommu_snoop
2344     iommu_snoop = false;
2345 #endif
2346     iommu_hwdom_passthrough = false;
2347     iommu_qinval = 0;
2348     iommu_intremap = iommu_intremap_off;
2349 #ifndef iommu_intpost
2350     iommu_intpost = false;
2351 #endif
2352     return ret;
2353 }
2354 
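/*
 * Move @pdev from @source to @target: tear down the source's RMRR
 * identity mappings (except for the hardware domain) and context
 * entries, park the device in dom_io while re-mapping, and register or
 * deregister the VMX posted-interrupt hooks as the domains gain their
 * first / lose their last assigned device.  Assigning to a domain other
 * than the hardware domain without interrupt remapping also flags MSIs
 * as untrusted.
 */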
2355 static int reassign_device_ownership(
2356     struct domain *source,
2357     struct domain *target,
2358     u8 devfn, struct pci_dev *pdev)
2359 {
2360     int ret;
2361 
2362     /*
2363      * Devices assigned to untrusted domains (here assumed to be any domU)
2364      * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
2365      * by the root complex unless interrupt remapping is enabled.
2366      */
2367     if ( (target != hardware_domain) && !iommu_intremap )
2368         untrusted_msi = true;
2369 
2370     /*
2371      * If the device belongs to the hardware domain and has an RMRR, don't
2372      * remove the RMRR mapping from the hardware domain, because the BIOS
2373      * may use the RMRR at boot time.
2374      */
2375     if ( !is_hardware_domain(source) )
2376     {
2377         const struct acpi_rmrr_unit *rmrr;
2378         u16 bdf;
2379         unsigned int i;
2380 
2381         for_each_rmrr_device( rmrr, bdf, i )
2382             if ( rmrr->segment == pdev->seg &&
2383                  PCI_BUS(bdf) == pdev->bus &&
2384                  PCI_DEVFN2(bdf) == devfn )
2385             {
2386                 /*
2387                  * The RMRR flag is ignored when removing a device,
2388                  * but passing 0 is always safe and strict.
2389                  */
2390                 ret = rmrr_identity_mapping(source, 0, rmrr, 0);
2391                 if ( ret != -ENOENT )
2392                     return ret;
2393             }
2394     }
2395 
2396     ret = domain_context_unmap(source, devfn, pdev);
2397     if ( ret )
2398         return ret;
2399 
2400     if ( devfn == pdev->devfn && pdev->domain != dom_io )
2401     {
2402         list_move(&pdev->domain_list, &dom_io->pdev_list);
2403         pdev->domain = dom_io;
2404     }
2405 
2406     if ( !has_arch_pdevs(source) )
2407         vmx_pi_hooks_deassign(source);
2408 
2409     if ( !has_arch_pdevs(target) )
2410         vmx_pi_hooks_assign(target);
2411 
2412     ret = domain_context_mapping(target, devfn, pdev);
2413     if ( ret )
2414     {
2415         if ( !has_arch_pdevs(target) )
2416             vmx_pi_hooks_deassign(target);
2417 
2418         return ret;
2419     }
2420 
2421     if ( devfn == pdev->devfn && pdev->domain != target )
2422     {
2423         list_move(&pdev->domain_list, &target->pdev_list);
2424         pdev->domain = target;
2425     }
2426 
2427     return ret;
2428 }
2429 
2430 static int intel_iommu_assign_device(
2431     struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
2432 {
2433     struct domain *s = pdev->domain;
2434     struct acpi_rmrr_unit *rmrr;
2435     int ret = 0, i;
2436     u16 bdf, seg;
2437     u8 bus;
2438 
2439     if ( list_empty(&acpi_drhd_units) )
2440         return -ENODEV;
2441 
2442     seg = pdev->seg;
2443     bus = pdev->bus;
2444     /*
2445      * In rare cases a given RMRR is shared by multiple devices, which
2446      * obviously puts the security of the system at risk. We therefore
2447      * prevent this sort of device assignment by default, but it can be
2448      * permitted if the user sets
2449      *      "pci = [ 'sbdf, rdm_policy=relaxed' ]"
2450      *
2451      * TODO: in the future we could introduce a group device assignment
2452      * interface to make sure that devices sharing an RMRR are assigned
2453      * to the same domain together.
2454      */
2455     for_each_rmrr_device( rmrr, bdf, i )
2456     {
2457         if ( rmrr->segment == seg &&
2458              PCI_BUS(bdf) == bus &&
2459              PCI_DEVFN2(bdf) == devfn &&
2460              rmrr->scope.devices_cnt > 1 )
2461         {
2462             bool_t relaxed = !!(flag & XEN_DOMCTL_DEV_RDM_RELAXED);
2463 
2464             printk(XENLOG_GUEST "%s" VTDPREFIX
2465                    " It's %s to assign %04x:%02x:%02x.%u"
2466                    " with shared RMRR at %"PRIx64" for Dom%d.\n",
2467                    relaxed ? XENLOG_WARNING : XENLOG_ERR,
2468                    relaxed ? "risky" : "disallowed",
2469                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
2470                    rmrr->base_address, d->domain_id);
2471             if ( !relaxed )
2472                 return -EPERM;
2473         }
2474     }
2475 
2476     ret = reassign_device_ownership(s, d, devfn, pdev);
2477     if ( ret || d == dom_io )
2478         return ret;
2479 
2480     /* Setup rmrr identity mapping */
2481     for_each_rmrr_device( rmrr, bdf, i )
2482     {
2483         if ( rmrr->segment == seg &&
2484              PCI_BUS(bdf) == bus &&
2485              PCI_DEVFN2(bdf) == devfn )
2486         {
2487             ret = rmrr_identity_mapping(d, 1, rmrr, flag);
2488             if ( ret )
2489             {
2490                 int rc;
2491 
2492                 rc = reassign_device_ownership(d, s, devfn, pdev);
2493                 printk(XENLOG_G_ERR VTDPREFIX
2494                        " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
2495                        rmrr->base_address, rmrr->end_address,
2496                        d->domain_id, ret);
2497                 if ( rc )
2498                 {
2499                     printk(XENLOG_ERR VTDPREFIX
2500                            " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
2501                            seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
2502                     domain_crash(d);
2503                 }
2504                 break;
2505             }
2506         }
2507     }
2508 
2509     return ret;
2510 }
2511 
2512 static int intel_iommu_group_id(u16 seg, u8 bus, u8 devfn)
2513 {
2514     u8 secbus;
2515     if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 0 )
2516         return -1;
2517     else
2518         return PCI_BDF2(bus, devfn);
2519 }
2520 
2521 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
2522 
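/*
 * S3 preparation: flush all caches, save each IOMMU's fault-event
 * registers and - unless force_iommu is set - disable DMA translation
 * (and queued invalidation, where interrupt remapping won't take care
 * of that itself).
 */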
2523 static int __must_check vtd_suspend(void)
2524 {
2525     struct acpi_drhd_unit *drhd;
2526     struct vtd_iommu *iommu;
2527     u32    i;
2528     int rc;
2529 
2530     if ( !iommu_enabled )
2531         return 0;
2532 
2533     rc = iommu_flush_all();
2534     if ( unlikely(rc) )
2535     {
2536         printk(XENLOG_WARNING VTDPREFIX
2537                " suspend: IOMMU flush all failed: %d\n", rc);
2538 
2539         return rc;
2540     }
2541 
2542     for_each_drhd_unit ( drhd )
2543     {
2544         iommu = drhd->iommu;
2545         i = iommu->index;
2546 
2547         iommu_state[i][DMAR_FECTL_REG] =
2548             (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
2549         iommu_state[i][DMAR_FEDATA_REG] =
2550             (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
2551         iommu_state[i][DMAR_FEADDR_REG] =
2552             (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
2553         iommu_state[i][DMAR_FEUADDR_REG] =
2554             (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
2555 
2556         /* don't disable VT-d engine when force_iommu is set. */
2557         if ( force_iommu )
2558             continue;
2559 
2560         iommu_disable_translation(iommu);
2561 
2562         /* If interrupt remapping is enabled, queued invalidation
2563          * will be disabled when interrupt remapping is disabled
2564          * during local APIC suspend.
2565          */
2566         if ( !iommu_intremap && iommu_qinval )
2567             disable_qinval(iommu);
2568     }
2569 
2570     return 0;
2571 }
2572 
2573 static void vtd_crash_shutdown(void)
2574 {
2575     struct acpi_drhd_unit *drhd;
2576     struct vtd_iommu *iommu;
2577 
2578     if ( !iommu_enabled )
2579         return;
2580 
2581     if ( iommu_flush_all() )
2582         printk(XENLOG_WARNING VTDPREFIX
2583                " crash shutdown: IOMMU flush all failed\n");
2584 
2585     for_each_drhd_unit ( drhd )
2586     {
2587         iommu = drhd->iommu;
2588         iommu_disable_translation(iommu);
2589         disable_intremap(drhd->iommu);
2590         disable_qinval(drhd->iommu);
2591     }
2592 }
2593 
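/*
 * S3 resume: re-run init_vtd_hw(), restore the saved fault-event
 * registers and re-enable DMA translation on every IOMMU.
 */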
2594 static void vtd_resume(void)
2595 {
2596     struct acpi_drhd_unit *drhd;
2597     struct vtd_iommu *iommu;
2598     u32 i;
2599     unsigned long flags;
2600 
2601     if ( !iommu_enabled )
2602         return;
2603 
2604     if ( init_vtd_hw() != 0  && force_iommu )
2605          panic("IOMMU setup failed, crash Xen for security purposes\n");
2606 
2607     for_each_drhd_unit ( drhd )
2608     {
2609         iommu = drhd->iommu;
2610         i = iommu->index;
2611 
2612         spin_lock_irqsave(&iommu->register_lock, flags);
2613         dmar_writel(iommu->reg, DMAR_FECTL_REG,
2614                     (u32) iommu_state[i][DMAR_FECTL_REG]);
2615         dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2616                     (u32) iommu_state[i][DMAR_FEDATA_REG]);
2617         dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2618                     (u32) iommu_state[i][DMAR_FEADDR_REG]);
2619         dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2620                     (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2621         spin_unlock_irqrestore(&iommu->register_lock, flags);
2622 
2623         iommu_enable_translation(drhd);
2624     }
2625 }
2626 
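/*
 * Recursively dump the present mappings of one level of a VT-d page
 * table, printing dfn/mfn pairs at the leaf level.
 */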
2627 static void vtd_dump_p2m_table_level(paddr_t pt_maddr, int level, paddr_t gpa,
2628                                      int indent)
2629 {
2630     paddr_t address;
2631     int i;
2632     struct dma_pte *pt_vaddr, *pte;
2633     int next_level;
2634 
2635     if ( level < 1 )
2636         return;
2637 
2638     pt_vaddr = map_vtd_domain_page(pt_maddr);
2639     if ( pt_vaddr == NULL )
2640     {
2641         printk("Failed to map VT-D domain page %"PRIpaddr"\n", pt_maddr);
2642         return;
2643     }
2644 
2645     next_level = level - 1;
2646     for ( i = 0; i < PTE_NUM; i++ )
2647     {
2648         if ( !(i % 2) )
2649             process_pending_softirqs();
2650 
2651         pte = &pt_vaddr[i];
2652         if ( !dma_pte_present(*pte) )
2653             continue;
2654 
2655         address = gpa + offset_level_address(i, level);
2656         if ( next_level >= 1 )
2657             vtd_dump_p2m_table_level(dma_pte_addr(*pte), next_level,
2658                                      address, indent + 1);
2659         else
2660             printk("%*sdfn: %08lx mfn: %08lx\n",
2661                    indent, "",
2662                    (unsigned long)(address >> PAGE_SHIFT_4K),
2663                    (unsigned long)(dma_pte_addr(*pte) >> PAGE_SHIFT_4K));
2664     }
2665 
2666     unmap_vtd_domain_page(pt_vaddr);
2667 }
2668 
2669 static void vtd_dump_p2m_table(struct domain *d)
2670 {
2671     const struct domain_iommu *hd;
2672 
2673     if ( list_empty(&acpi_drhd_units) )
2674         return;
2675 
2676     hd = dom_iommu(d);
2677     printk("p2m table has %d levels\n", agaw_to_level(hd->arch.agaw));
2678     vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
2679 }
2680 
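/*
 * Build the quarantine domain's page tables: at each level all entries
 * point, read-only, to a single next-level page, so every DMA read from
 * a quarantined device ends up at one shared zeroed page and writes
 * fault.
 */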
2681 static int __init intel_iommu_quarantine_init(struct domain *d)
2682 {
2683     struct domain_iommu *hd = dom_iommu(d);
2684     struct dma_pte *parent;
2685     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
2686     unsigned int level = agaw_to_level(agaw);
2687     int rc;
2688 
2689     if ( hd->arch.pgd_maddr )
2690     {
2691         ASSERT_UNREACHABLE();
2692         return 0;
2693     }
2694 
2695     spin_lock(&hd->arch.mapping_lock);
2696 
2697     hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
2698     if ( !hd->arch.pgd_maddr )
2699         goto out;
2700 
2701     parent = map_vtd_domain_page(hd->arch.pgd_maddr);
2702     while ( level )
2703     {
2704         uint64_t maddr;
2705         unsigned int offset;
2706 
2707         /*
2708          * The pgtable allocator is fine for the leaf page, as well as
2709          * page table pages, and the resulting allocations are always
2710          * zeroed.
2711          */
2712         maddr = alloc_pgtable_maddr(1, hd->node);
2713         if ( !maddr )
2714             break;
2715 
2716         for ( offset = 0; offset < PTE_NUM; offset++ )
2717         {
2718             struct dma_pte *pte = &parent[offset];
2719 
2720             dma_set_pte_addr(*pte, maddr);
2721             dma_set_pte_readable(*pte);
2722         }
2723         iommu_sync_cache(parent, PAGE_SIZE);
2724 
2725         unmap_vtd_domain_page(parent);
2726         parent = map_vtd_domain_page(maddr);
2727         level--;
2728     }
2729     unmap_vtd_domain_page(parent);
2730 
2731  out:
2732     spin_unlock(&hd->arch.mapping_lock);
2733 
2734     rc = iommu_flush_iotlb_all(d);
2735 
2736     /* Pages leaked in failure case */
2737     return level ? -ENOMEM : rc;
2738 }
2739 
2740 const struct iommu_ops __initconstrel intel_iommu_ops = {
2741     .init = intel_iommu_domain_init,
2742     .hwdom_init = intel_iommu_hwdom_init,
2743     .quarantine_init = intel_iommu_quarantine_init,
2744     .add_device = intel_iommu_add_device,
2745     .enable_device = intel_iommu_enable_device,
2746     .remove_device = intel_iommu_remove_device,
2747     .assign_device  = intel_iommu_assign_device,
2748     .teardown = iommu_domain_teardown,
2749     .map_page = intel_iommu_map_page,
2750     .unmap_page = intel_iommu_unmap_page,
2751     .lookup_page = intel_iommu_lookup_page,
2752     .free_page_table = iommu_free_page_table,
2753     .reassign_device = reassign_device_ownership,
2754     .get_device_group_id = intel_iommu_group_id,
2755     .enable_x2apic = intel_iommu_enable_eim,
2756     .disable_x2apic = intel_iommu_disable_eim,
2757     .update_ire_from_apic = io_apic_write_remap_rte,
2758     .update_ire_from_msi = msi_msg_write_remap_rte,
2759     .read_apic_from_ire = io_apic_read_remap_rte,
2760     .read_msi_from_ire = msi_msg_read_remap_rte,
2761     .setup_hpet_msi = intel_setup_hpet_msi,
2762     .adjust_irq_affinities = adjust_vtd_irq_affinities,
2763     .suspend = vtd_suspend,
2764     .resume = vtd_resume,
2765     .share_p2m = iommu_set_pgd,
2766     .crash_shutdown = vtd_crash_shutdown,
2767     .iotlb_flush = iommu_flush_iotlb_pages,
2768     .iotlb_flush_all = iommu_flush_iotlb_all,
2769     .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
2770     .dump_p2m_table = vtd_dump_p2m_table,
2771     .sync_cache = sync_cache,
2772 };
2773 
2774 const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
2775     .ops = &intel_iommu_ops,
2776     .setup = vtd_setup,
2777     .supports_x2apic = intel_iommu_supports_eim,
2778 };
2779 
2780 /*
2781  * Local variables:
2782  * mode: C
2783  * c-file-style: "BSD"
2784  * c-basic-offset: 4
2785  * tab-width: 4
2786  * indent-tabs-mode: nil
2787  * End:
2788  */
2789