1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; If not, see <http://www.gnu.org/licenses/>.
15 *
16 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
17 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
18 * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
19 */
20
21 #include <xen/irq.h>
22 #include <xen/sched.h>
23 #include <xen/xmalloc.h>
24 #include <xen/domain_page.h>
25 #include <xen/iocap.h>
26 #include <xen/iommu.h>
27 #include <xen/numa.h>
28 #include <xen/softirq.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include <asm/nops.h>
35 #include <asm/irq.h>
36 #include <asm/hvm/vmx/vmx.h>
37 #include <asm/p2m.h>
38 #include <mach_apic.h>
39 #include "iommu.h"
40 #include "dmar.h"
41 #include "extern.h"
42 #include "vtd.h"
43 #include "../ats.h"
44
45 struct mapped_rmrr {
46 struct list_head list;
47 u64 base, end;
48 unsigned int count;
49 };
50
51 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
52 bool __read_mostly untrusted_msi;
53
54 bool __read_mostly iommu_igfx = true;
55 bool __read_mostly iommu_qinval = true;
56 #ifndef iommu_snoop
57 bool __read_mostly iommu_snoop = true;
58 #endif
59
60 int nr_iommus;
61
62 static struct tasklet vtd_fault_tasklet;
63
64 static int setup_hwdom_device(u8 devfn, struct pci_dev *);
65 static void setup_hwdom_rmrr(struct domain *d);
66
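/*
 * Return the index in this IOMMU's domid_map that is already assigned to
 * domain d, or -1 if d has no domain id allocated on this IOMMU yet.
 */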
67 static int domain_iommu_domid(struct domain *d,
68 struct vtd_iommu *iommu)
69 {
70 unsigned long nr_dom, i;
71
72 nr_dom = cap_ndoms(iommu->cap);
73 i = find_first_bit(iommu->domid_bitmap, nr_dom);
74 while ( i < nr_dom )
75 {
76 if ( iommu->domid_map[i] == d->domain_id )
77 return i;
78
79 i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
80 }
81
82 dprintk(XENLOG_ERR VTDPREFIX,
83 "Cannot get valid iommu domid: domid=%d iommu->index=%d\n",
84 d->domain_id, iommu->index);
85 return -1;
86 }
87
88 #define DID_FIELD_WIDTH 16
89 #define DID_HIGH_OFFSET 8
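/*
 * Look up (or allocate) a domain id for d on this IOMMU and encode it into
 * the DID field of the given context entry.
 */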
90 static int context_set_domain_id(struct context_entry *context,
91 struct domain *d,
92 struct vtd_iommu *iommu)
93 {
94 unsigned long nr_dom, i;
95 int found = 0;
96
97 ASSERT(spin_is_locked(&iommu->lock));
98
99 nr_dom = cap_ndoms(iommu->cap);
100 i = find_first_bit(iommu->domid_bitmap, nr_dom);
101 while ( i < nr_dom )
102 {
103 if ( iommu->domid_map[i] == d->domain_id )
104 {
105 found = 1;
106 break;
107 }
108 i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
109 }
110
111 if ( found == 0 )
112 {
113 i = find_first_zero_bit(iommu->domid_bitmap, nr_dom);
114 if ( i >= nr_dom )
115 {
116 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
117 return -EFAULT;
118 }
119 iommu->domid_map[i] = d->domain_id;
120 }
121
122 set_bit(i, iommu->domid_bitmap);
123 context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
124 return 0;
125 }
126
127 static int context_get_domain_id(struct context_entry *context,
128 struct vtd_iommu *iommu)
129 {
130 unsigned long dom_index, nr_dom;
131 int domid = -1;
132
133 if (iommu && context)
134 {
135 nr_dom = cap_ndoms(iommu->cap);
136
137 dom_index = context_domain_id(*context);
138
139 if ( dom_index < nr_dom && iommu->domid_map )
140 domid = iommu->domid_map[dom_index];
141 else
142 dprintk(XENLOG_DEBUG VTDPREFIX,
143 "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
144 dom_index, nr_dom);
145 }
146 return domid;
147 }
148
149 static int iommus_incoherent;
150
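/*
 * Write back the cache lines covering [addr, addr + size) so that an IOMMU
 * which is not coherent with the CPU caches sees up-to-date structures.
 */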
151 static void sync_cache(const void *addr, unsigned int size)
152 {
153 static unsigned long clflush_size = 0;
154 const void *end = addr + size;
155
156 if ( !iommus_incoherent )
157 return;
158
159 if ( clflush_size == 0 )
160 clflush_size = get_cache_line_size();
161
162 addr -= (unsigned long)addr & (clflush_size - 1);
163 for ( ; addr < end; addr += clflush_size )
164 /*
165 * The arguments to a macro must not include preprocessor directives. Doing so
166 * results in undefined behavior, so we have to create some defines here in
167 * order to avoid it.
168 */
169 #if defined(HAVE_AS_CLWB)
170 # define CLWB_ENCODING "clwb %[p]"
171 #elif defined(HAVE_AS_XSAVEOPT)
172 # define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
173 #else
174 # define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
175 #endif
176
177 #define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
178 #if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
179 # define INPUT BASE_INPUT
180 #else
181 # define INPUT(addr) "a" (addr), BASE_INPUT(addr)
182 #endif
183 /*
184 * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
185 * + prefix than a clflush + nop, and hence the prefix is added instead
186 * of letting the alternative framework fill the gap by appending nops.
187 */
188 alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
189 "data16 clflush %[p]", /* clflushopt */
190 X86_FEATURE_CLFLUSHOPT,
191 CLWB_ENCODING,
192 X86_FEATURE_CLWB, /* no outputs */,
193 INPUT(addr));
194 #undef INPUT
195 #undef BASE_INPUT
196 #undef CLWB_ENCODING
197
198 alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
199 "sfence", X86_FEATURE_CLWB);
200 }
201
202 /* Allocate page table, return its machine address */
203 uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
204 {
205 struct page_info *pg, *cur_pg;
206 u64 *vaddr;
207 unsigned int i;
208
209 pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
210 (node == NUMA_NO_NODE) ? 0 : MEMF_node(node));
211 if ( !pg )
212 return 0;
213
214 cur_pg = pg;
215 for ( i = 0; i < npages; i++ )
216 {
217 vaddr = __map_domain_page(cur_pg);
218 memset(vaddr, 0, PAGE_SIZE);
219
220 sync_cache(vaddr, PAGE_SIZE);
221 unmap_domain_page(vaddr);
222 cur_pg++;
223 }
224
225 return page_to_maddr(pg);
226 }
227
228 void free_pgtable_maddr(u64 maddr)
229 {
230 if ( maddr != 0 )
231 free_domheap_page(maddr_to_page(maddr));
232 }
233
234 /* context entry handling */
235 static u64 bus_to_context_maddr(struct vtd_iommu *iommu, u8 bus)
236 {
237 struct root_entry *root, *root_entries;
238 u64 maddr;
239
240 ASSERT(spin_is_locked(&iommu->lock));
241 root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
242 root = &root_entries[bus];
243 if ( !root_present(*root) )
244 {
245 maddr = alloc_pgtable_maddr(1, iommu->node);
246 if ( maddr == 0 )
247 {
248 unmap_vtd_domain_page(root_entries);
249 return 0;
250 }
251 set_root_value(*root, maddr);
252 set_root_present(*root);
253 iommu_sync_cache(root, sizeof(struct root_entry));
254 }
255 maddr = (u64) get_context_addr(*root);
256 unmap_vtd_domain_page(root_entries);
257 return maddr;
258 }
259
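/*
 * Walk the domain's VT-d page tables, optionally allocating missing levels,
 * and return the machine address of the leaf (level 1) page table covering
 * addr, or 0 on failure.
 */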
260 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
261 {
262 struct domain_iommu *hd = dom_iommu(domain);
263 int addr_width = agaw_to_width(hd->arch.agaw);
264 struct dma_pte *parent, *pte = NULL;
265 int level = agaw_to_level(hd->arch.agaw);
266 int offset;
267 u64 pte_maddr = 0;
268
269 addr &= (((u64)1) << addr_width) - 1;
270 ASSERT(spin_is_locked(&hd->arch.mapping_lock));
271 if ( !hd->arch.pgd_maddr &&
272 (!alloc ||
273 ((hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node)) == 0)) )
274 goto out;
275
276 parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.pgd_maddr);
277 while ( level > 1 )
278 {
279 offset = address_level_offset(addr, level);
280 pte = &parent[offset];
281
282 pte_maddr = dma_pte_addr(*pte);
283 if ( !pte_maddr )
284 {
285 if ( !alloc )
286 break;
287
288 pte_maddr = alloc_pgtable_maddr(1, hd->node);
289 if ( !pte_maddr )
290 break;
291
292 dma_set_pte_addr(*pte, pte_maddr);
293
294             /*
295              * Higher level tables always set r/w; the last level
296              * page table controls read/write.
297              */
298 dma_set_pte_readable(*pte);
299 dma_set_pte_writable(*pte);
300 iommu_sync_cache(pte, sizeof(struct dma_pte));
301 }
302
303 if ( level == 2 )
304 break;
305
306 unmap_vtd_domain_page(parent);
307 parent = map_vtd_domain_page(pte_maddr);
308 level--;
309 }
310
311 unmap_vtd_domain_page(parent);
312 out:
313 return pte_maddr;
314 }
315
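/*
 * Flush the IOMMU's internal write buffer; only needed when the hardware
 * advertises the RWBF capability (or the rwbf quirk is in effect).
 */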
316 static void iommu_flush_write_buffer(struct vtd_iommu *iommu)
317 {
318 u32 val;
319 unsigned long flags;
320
321 if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
322 return;
323
324 spin_lock_irqsave(&iommu->register_lock, flags);
325 val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
326 dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
327
328     /* Make sure hardware completes it */
329 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
330 !(val & DMA_GSTS_WBFS), val);
331
332 spin_unlock_irqrestore(&iommu->register_lock, flags);
333 }
334
335 /* The return value determines whether we need a write buffer flush. */
336 static int __must_check flush_context_reg(struct vtd_iommu *iommu, u16 did,
337 u16 source_id, u8 function_mask,
338 u64 type,
339 bool flush_non_present_entry)
340 {
341 u64 val = 0;
342 unsigned long flags;
343
344     /*
345      * In the non-present entry flush case: if the hardware doesn't cache
346      * non-present entries we do nothing; if it does, we flush the entries
347      * of domain 0 (that domain id is used to cache any non-present
348      * entries).
349      */
350 if ( flush_non_present_entry )
351 {
352 if ( !cap_caching_mode(iommu->cap) )
353 return 1;
354 else
355 did = 0;
356 }
357
358 /* use register invalidation */
359 switch ( type )
360 {
361 case DMA_CCMD_GLOBAL_INVL:
362 val = DMA_CCMD_GLOBAL_INVL;
363 break;
364 case DMA_CCMD_DOMAIN_INVL:
365 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
366 break;
367 case DMA_CCMD_DEVICE_INVL:
368 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
369 |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
370 break;
371 default:
372 BUG();
373 }
374 val |= DMA_CCMD_ICC;
375
376 spin_lock_irqsave(&iommu->register_lock, flags);
377 dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
378
379     /* Make sure hardware completes it */
380 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
381 !(val & DMA_CCMD_ICC), val);
382
383 spin_unlock_irqrestore(&iommu->register_lock, flags);
384 /* flush context entry will implicitly flush write buffer */
385 return 0;
386 }
387
388 static int __must_check iommu_flush_context_global(struct vtd_iommu *iommu,
389 bool flush_non_present_entry)
390 {
391 return iommu->flush.context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
392 flush_non_present_entry);
393 }
394
395 static int __must_check iommu_flush_context_device(struct vtd_iommu *iommu,
396 u16 did, u16 source_id,
397 u8 function_mask,
398 bool flush_non_present_entry)
399 {
400 return iommu->flush.context(iommu, did, source_id, function_mask,
401 DMA_CCMD_DEVICE_INVL, flush_non_present_entry);
402 }
403
404 /* The return value determines whether we need a write buffer flush. */
405 static int __must_check flush_iotlb_reg(struct vtd_iommu *iommu, u16 did,
406 u64 addr,
407 unsigned int size_order, u64 type,
408 bool flush_non_present_entry,
409 bool flush_dev_iotlb)
410 {
411 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
412 u64 val = 0;
413 unsigned long flags;
414
415     /*
416      * In the non-present entry flush case: if the hardware doesn't cache
417      * non-present entries we do nothing; if it does, we flush the entries
418      * of domain 0 (that domain id is used to cache any non-present
419      * entries).
420      */
421 if ( flush_non_present_entry )
422 {
423 if ( !cap_caching_mode(iommu->cap) )
424 return 1;
425 else
426 did = 0;
427 }
428
429 /* use register invalidation */
430 switch ( type )
431 {
432 case DMA_TLB_GLOBAL_FLUSH:
433 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
434 break;
435 case DMA_TLB_DSI_FLUSH:
436 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
437 break;
438 case DMA_TLB_PSI_FLUSH:
439 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
440 break;
441 default:
442 BUG();
443 }
444 /* Note: set drain read/write */
445 if ( cap_read_drain(iommu->cap) )
446 val |= DMA_TLB_READ_DRAIN;
447 if ( cap_write_drain(iommu->cap) )
448 val |= DMA_TLB_WRITE_DRAIN;
449
450 spin_lock_irqsave(&iommu->register_lock, flags);
451 /* Note: Only uses first TLB reg currently */
452 if ( type == DMA_TLB_PSI_FLUSH )
453 {
454 /* Note: always flush non-leaf currently. */
455 dmar_writeq(iommu->reg, tlb_offset, size_order | addr);
456 }
457 dmar_writeq(iommu->reg, tlb_offset + 8, val);
458
459     /* Make sure hardware completes it */
460 IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
461 !(val & DMA_TLB_IVT), val);
462 spin_unlock_irqrestore(&iommu->register_lock, flags);
463
464 /* check IOTLB invalidation granularity */
465 if ( DMA_TLB_IAIG(val) == 0 )
466 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
467
468 /* flush iotlb entry will implicitly flush write buffer */
469 return 0;
470 }
471
472 static int __must_check iommu_flush_iotlb_global(struct vtd_iommu *iommu,
473 bool flush_non_present_entry,
474 bool flush_dev_iotlb)
475 {
476 int status;
477
478 /* apply platform specific errata workarounds */
479 vtd_ops_preamble_quirk(iommu);
480
481 status = iommu->flush.iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
482 flush_non_present_entry, flush_dev_iotlb);
483
484 /* undo platform specific errata workarounds */
485 vtd_ops_postamble_quirk(iommu);
486
487 return status;
488 }
489
490 static int __must_check iommu_flush_iotlb_dsi(struct vtd_iommu *iommu, u16 did,
491 bool_t flush_non_present_entry,
492 bool_t flush_dev_iotlb)
493 {
494 int status;
495
496 /* apply platform specific errata workarounds */
497 vtd_ops_preamble_quirk(iommu);
498
499 status = iommu->flush.iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
500 flush_non_present_entry, flush_dev_iotlb);
501
502 /* undo platform specific errata workarounds */
503 vtd_ops_postamble_quirk(iommu);
504
505 return status;
506 }
507
508 static int __must_check iommu_flush_iotlb_psi(struct vtd_iommu *iommu, u16 did,
509 u64 addr, unsigned int order,
510 bool_t flush_non_present_entry,
511 bool_t flush_dev_iotlb)
512 {
513 int status;
514
515 ASSERT(!(addr & (~PAGE_MASK_4K)));
516
517 /* Fallback to domain selective flush if no PSI support */
518 if ( !cap_pgsel_inv(iommu->cap) )
519 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
520 flush_dev_iotlb);
521
522 /* Fallback to domain selective flush if size is too big */
523 if ( order > cap_max_amask_val(iommu->cap) )
524 return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
525 flush_dev_iotlb);
526
527 addr >>= PAGE_SHIFT_4K + order;
528 addr <<= PAGE_SHIFT_4K + order;
529
530 /* apply platform specific errata workarounds */
531 vtd_ops_preamble_quirk(iommu);
532
533 status = iommu->flush.iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH,
534 flush_non_present_entry, flush_dev_iotlb);
535
536 /* undo platform specific errata workarounds */
537 vtd_ops_postamble_quirk(iommu);
538
539 return status;
540 }
541
542 static int __must_check iommu_flush_all(void)
543 {
544 struct acpi_drhd_unit *drhd;
545 struct vtd_iommu *iommu;
546 bool_t flush_dev_iotlb;
547 int rc = 0;
548
549 flush_all_cache();
550 for_each_drhd_unit ( drhd )
551 {
552 int context_rc, iotlb_rc;
553
554 iommu = drhd->iommu;
555 context_rc = iommu_flush_context_global(iommu, 0);
556 flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
557 iotlb_rc = iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
558
559         /*
560          * The current logic for return values:
561          * - positive: invoke iommu_flush_write_buffer to flush the write buffer.
562          * - zero: success.
563          * - negative: failure. Continue to flush the IOMMU IOTLB on a
564          *   best effort basis.
565          */
566 if ( context_rc > 0 || iotlb_rc > 0 )
567 iommu_flush_write_buffer(iommu);
568 if ( rc >= 0 )
569 rc = context_rc;
570 if ( rc >= 0 )
571 rc = iotlb_rc;
572 }
573
574 if ( rc > 0 )
575 rc = 0;
576
577 return rc;
578 }
579
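/*
 * Flush the IOTLB of every IOMMU used by the domain, preferring a
 * page-selective invalidation where the range allows it and falling back to
 * a domain-selective one otherwise.
 */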
580 static int __must_check iommu_flush_iotlb(struct domain *d, dfn_t dfn,
581 bool_t dma_old_pte_present,
582 unsigned int page_count)
583 {
584 struct domain_iommu *hd = dom_iommu(d);
585 struct acpi_drhd_unit *drhd;
586 struct vtd_iommu *iommu;
587 bool_t flush_dev_iotlb;
588 int iommu_domid;
589 int rc = 0;
590
591     /*
592      * No need to hold pcidevs_lock here because we flush
593      * when assigning/deassigning a device.
594      */
595 for_each_drhd_unit ( drhd )
596 {
597 iommu = drhd->iommu;
598
599 if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
600 continue;
601
602 flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
603 iommu_domid= domain_iommu_domid(d, iommu);
604 if ( iommu_domid == -1 )
605 continue;
606
607 if ( !page_count || (page_count & (page_count - 1)) ||
608 dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) )
609 rc = iommu_flush_iotlb_dsi(iommu, iommu_domid,
610 0, flush_dev_iotlb);
611 else
612 rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
613 dfn_to_daddr(dfn),
614 get_order_from_pages(page_count),
615 !dma_old_pte_present,
616 flush_dev_iotlb);
617
618 if ( rc > 0 )
619 {
620 iommu_flush_write_buffer(iommu);
621 rc = 0;
622 }
623 }
624
625 return rc;
626 }
627
628 static int __must_check iommu_flush_iotlb_pages(struct domain *d,
629 dfn_t dfn,
630 unsigned int page_count,
631 unsigned int flush_flags)
632 {
633 ASSERT(page_count && !dfn_eq(dfn, INVALID_DFN));
634 ASSERT(flush_flags);
635
636 return iommu_flush_iotlb(d, dfn, flush_flags & IOMMU_FLUSHF_modified,
637 page_count);
638 }
639
640 static int __must_check iommu_flush_iotlb_all(struct domain *d)
641 {
642 return iommu_flush_iotlb(d, INVALID_DFN, 0, 0);
643 }
644
645 /* Clear the leaf PTE mapping one page. */
646 static void dma_pte_clear_one(struct domain *domain, uint64_t addr,
647 unsigned int *flush_flags)
648 {
649 struct domain_iommu *hd = dom_iommu(domain);
650 struct dma_pte *page = NULL, *pte = NULL;
651 u64 pg_maddr;
652
653 spin_lock(&hd->arch.mapping_lock);
654 /* get last level pte */
655 pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
656 if ( pg_maddr == 0 )
657 {
658 spin_unlock(&hd->arch.mapping_lock);
659 return;
660 }
661
662 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
663 pte = page + address_level_offset(addr, 1);
664
665 if ( !dma_pte_present(*pte) )
666 {
667 spin_unlock(&hd->arch.mapping_lock);
668 unmap_vtd_domain_page(page);
669 return;
670 }
671
672 dma_clear_pte(*pte);
673 *flush_flags |= IOMMU_FLUSHF_modified;
674
675 spin_unlock(&hd->arch.mapping_lock);
676 iommu_sync_cache(pte, sizeof(struct dma_pte));
677
678 unmap_vtd_domain_page(page);
679 }
680
681 static void iommu_free_pagetable(u64 pt_maddr, int level)
682 {
683 struct page_info *pg = maddr_to_page(pt_maddr);
684
685 if ( pt_maddr == 0 )
686 return;
687
688 PFN_ORDER(pg) = level;
689 spin_lock(&iommu_pt_cleanup_lock);
690 page_list_add_tail(pg, &iommu_pt_cleanup_list);
691 spin_unlock(&iommu_pt_cleanup_lock);
692 }
693
694 static void iommu_free_page_table(struct page_info *pg)
695 {
696 unsigned int i, next_level = PFN_ORDER(pg) - 1;
697 u64 pt_maddr = page_to_maddr(pg);
698 struct dma_pte *pt_vaddr, *pte;
699
700 PFN_ORDER(pg) = 0;
701 pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
702
703 for ( i = 0; i < PTE_NUM; i++ )
704 {
705 pte = &pt_vaddr[i];
706 if ( !dma_pte_present(*pte) )
707 continue;
708
709 if ( next_level >= 1 )
710 iommu_free_pagetable(dma_pte_addr(*pte), next_level);
711
712 dma_clear_pte(*pte);
713 iommu_sync_cache(pte, sizeof(struct dma_pte));
714 }
715
716 unmap_vtd_domain_page(pt_vaddr);
717 free_pgtable_maddr(pt_maddr);
718 }
719
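/* Program the root table address and latch it with a Set Root Table Pointer. */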
720 static int iommu_set_root_entry(struct vtd_iommu *iommu)
721 {
722 u32 sts;
723 unsigned long flags;
724
725 spin_lock_irqsave(&iommu->register_lock, flags);
726 dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
727
728 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
729 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
730
731     /* Make sure hardware completes it */
732 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
733 (sts & DMA_GSTS_RTPS), sts);
734 spin_unlock_irqrestore(&iommu->register_lock, flags);
735
736 return 0;
737 }
738
739 static void iommu_enable_translation(struct acpi_drhd_unit *drhd)
740 {
741 u32 sts;
742 unsigned long flags;
743 struct vtd_iommu *iommu = drhd->iommu;
744
745 if ( is_igd_drhd(drhd) )
746 {
747 if ( !iommu_igfx )
748 {
749 printk(XENLOG_INFO VTDPREFIX
750 "Passed iommu=no-igfx option. Disabling IGD VT-d engine.\n");
751 return;
752 }
753
754 if ( !is_igd_vt_enabled_quirk() )
755 {
756 if ( force_iommu )
757 panic("BIOS did not enable IGD for VT properly, crash Xen for security purpose\n");
758
759 printk(XENLOG_WARNING VTDPREFIX
760 "BIOS did not enable IGD for VT properly. Disabling IGD VT-d engine.\n");
761 return;
762 }
763 }
764
765 /* apply platform specific errata workarounds */
766 vtd_ops_preamble_quirk(iommu);
767
768 if ( iommu_verbose )
769 printk(VTDPREFIX "iommu_enable_translation: iommu->reg = %p\n",
770 iommu->reg);
771 spin_lock_irqsave(&iommu->register_lock, flags);
772 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
773 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
774
775     /* Make sure hardware completes it */
776 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
777 (sts & DMA_GSTS_TES), sts);
778 spin_unlock_irqrestore(&iommu->register_lock, flags);
779
780 /* undo platform specific errata workarounds */
781 vtd_ops_postamble_quirk(iommu);
782
783 /* Disable PMRs when VT-d engine takes effect per spec definition */
784 disable_pmr(iommu);
785 }
786
787 static void iommu_disable_translation(struct vtd_iommu *iommu)
788 {
789 u32 sts;
790 unsigned long flags;
791
792 /* apply platform specific errata workarounds */
793 vtd_ops_preamble_quirk(iommu);
794
795 spin_lock_irqsave(&iommu->register_lock, flags);
796 sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
797 dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
798
799     /* Make sure hardware completes it */
800 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
801 !(sts & DMA_GSTS_TES), sts);
802 spin_unlock_irqrestore(&iommu->register_lock, flags);
803
804 /* undo platform specific errata workarounds */
805 vtd_ops_postamble_quirk(iommu);
806 }
807
808 enum faulttype {
809 DMA_REMAP,
810 INTR_REMAP,
811 UNKNOWN,
812 };
813
814 static const char *dma_remap_fault_reasons[] =
815 {
816 "Software",
817 "Present bit in root entry is clear",
818 "Present bit in context entry is clear",
819 "Invalid context entry",
820 "Access beyond MGAW",
821 "PTE Write access is not set",
822 "PTE Read access is not set",
823 "Next page table ptr is invalid",
824 "Root table address invalid",
825 "Context table ptr is invalid",
826 "non-zero reserved fields in RTP",
827 "non-zero reserved fields in CTP",
828 "non-zero reserved fields in PTE",
829 "Blocked a DMA translation request",
830 };
831
832 static const char *intr_remap_fault_reasons[] =
833 {
834 "Detected reserved fields in the decoded interrupt-remapped request",
835 "Interrupt index exceeded the interrupt-remapping table size",
836 "Present field in the IRTE entry is clear",
837 "Error accessing interrupt-remapping table pointed by IRTA_REG",
838 "Detected reserved fields in the IRTE entry",
839 "Blocked a compatibility format interrupt request",
840 "Blocked an interrupt request due to source-id verification failure",
841 };
842
843 static const char *iommu_get_fault_reason(u8 fault_reason,
844 enum faulttype *fault_type)
845 {
846 if ( fault_reason >= 0x20 && ( fault_reason < 0x20 +
847 ARRAY_SIZE(intr_remap_fault_reasons)) )
848 {
849 *fault_type = INTR_REMAP;
850 return intr_remap_fault_reasons[fault_reason - 0x20];
851 }
852 else if ( fault_reason < ARRAY_SIZE(dma_remap_fault_reasons) )
853 {
854 *fault_type = DMA_REMAP;
855 return dma_remap_fault_reasons[fault_reason];
856 }
857 else
858 {
859 *fault_type = UNKNOWN;
860 return "Unknown";
861 }
862 }
863
864 static int iommu_page_fault_do_one(struct vtd_iommu *iommu, int type,
865 u8 fault_reason, u16 source_id, u64 addr)
866 {
867 const char *reason, *kind;
868 enum faulttype fault_type;
869 u16 seg = iommu->drhd->segment;
870
871 reason = iommu_get_fault_reason(fault_reason, &fault_type);
872 switch ( fault_type )
873 {
874 case DMA_REMAP:
875 printk(XENLOG_G_WARNING VTDPREFIX
876 "DMAR:[%s] Request device [%04x:%02x:%02x.%u] "
877 "fault addr %"PRIx64"\n",
878 (type ? "DMA Read" : "DMA Write"),
879 seg, PCI_BUS(source_id), PCI_SLOT(source_id),
880 PCI_FUNC(source_id), addr);
881 kind = "DMAR";
882 break;
883 case INTR_REMAP:
884 printk(XENLOG_G_WARNING VTDPREFIX
885 "INTR-REMAP: Request device [%04x:%02x:%02x.%u] "
886 "fault index %"PRIx64"\n",
887 seg, PCI_BUS(source_id), PCI_SLOT(source_id),
888 PCI_FUNC(source_id), addr >> 48);
889 kind = "INTR-REMAP";
890 break;
891 default:
892 printk(XENLOG_G_WARNING VTDPREFIX
893 "UNKNOWN: Request device [%04x:%02x:%02x.%u] "
894 "fault addr %"PRIx64"\n",
895 seg, PCI_BUS(source_id), PCI_SLOT(source_id),
896 PCI_FUNC(source_id), addr);
897 kind = "UNKNOWN";
898 break;
899 }
900
901 printk(XENLOG_G_WARNING VTDPREFIX "%s: reason %02x - %s\n",
902 kind, fault_reason, reason);
903
904 if ( iommu_verbose && fault_type == DMA_REMAP )
905 print_vtd_entries(iommu, PCI_BUS(source_id), PCI_DEVFN2(source_id),
906 addr >> PAGE_SHIFT);
907
908 return 0;
909 }
910
911 static void iommu_fault_status(u32 fault_status)
912 {
913 if ( fault_status & DMA_FSTS_PFO )
914 INTEL_IOMMU_DEBUG("iommu_fault_status: Fault Overflow\n");
915 if ( fault_status & DMA_FSTS_PPF )
916 INTEL_IOMMU_DEBUG("iommu_fault_status: Primary Pending Fault\n");
917 if ( fault_status & DMA_FSTS_AFO )
918 INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Fault Overflow\n");
919 if ( fault_status & DMA_FSTS_APF )
920 INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Pending Fault\n");
921 if ( fault_status & DMA_FSTS_IQE )
922 INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Queue Error\n");
923 if ( fault_status & DMA_FSTS_ICE )
924 INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Completion Error\n");
925 if ( fault_status & DMA_FSTS_ITE )
926 INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Time-out Error\n");
927 }
928
929 #define PRIMARY_FAULT_REG_LEN (16)
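/*
 * Drain the primary fault recording registers, logging each fault, and then
 * clear any pending fault overflow condition.
 */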
930 static void __do_iommu_page_fault(struct vtd_iommu *iommu)
931 {
932 int reg, fault_index;
933 u32 fault_status;
934 unsigned long flags;
935
936 fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
937
938 iommu_fault_status(fault_status);
939
940 /* FIXME: ignore advanced fault log */
941 if ( !(fault_status & DMA_FSTS_PPF) )
942 goto clear_overflow;
943
944 fault_index = dma_fsts_fault_record_index(fault_status);
945 reg = cap_fault_reg_offset(iommu->cap);
946 while (1)
947 {
948 u8 fault_reason;
949 u16 source_id;
950 u32 data;
951 u64 guest_addr;
952 int type;
953
954 /* highest 32 bits */
955 spin_lock_irqsave(&iommu->register_lock, flags);
956 data = dmar_readl(iommu->reg, reg +
957 fault_index * PRIMARY_FAULT_REG_LEN + 12);
958 if ( !(data & DMA_FRCD_F) )
959 {
960 spin_unlock_irqrestore(&iommu->register_lock, flags);
961 break;
962 }
963
964 fault_reason = dma_frcd_fault_reason(data);
965 type = dma_frcd_type(data);
966
967 data = dmar_readl(iommu->reg, reg +
968 fault_index * PRIMARY_FAULT_REG_LEN + 8);
969 source_id = dma_frcd_source_id(data);
970
971 guest_addr = dmar_readq(iommu->reg, reg +
972 fault_index * PRIMARY_FAULT_REG_LEN);
973 guest_addr = dma_frcd_page_addr(guest_addr);
974 /* clear the fault */
975 dmar_writel(iommu->reg, reg +
976 fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
977 spin_unlock_irqrestore(&iommu->register_lock, flags);
978
979 iommu_page_fault_do_one(iommu, type, fault_reason,
980 source_id, guest_addr);
981
982 pci_check_disable_device(iommu->drhd->segment,
983 PCI_BUS(source_id), PCI_DEVFN2(source_id));
984
985 fault_index++;
986 if ( fault_index > cap_num_fault_regs(iommu->cap) )
987 fault_index = 0;
988 }
989 clear_overflow:
990 /* clear primary fault overflow */
991 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
992 if ( fault_status & DMA_FSTS_PFO )
993 {
994 spin_lock_irqsave(&iommu->register_lock, flags);
995 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
996 spin_unlock_irqrestore(&iommu->register_lock, flags);
997 }
998 }
999
1000 static void do_iommu_page_fault(void *unused)
1001 {
1002 struct acpi_drhd_unit *drhd;
1003
1004 if ( list_empty(&acpi_drhd_units) )
1005 {
1006 INTEL_IOMMU_DEBUG("no device found, something must be very wrong!\n");
1007 return;
1008 }
1009
1010     /*
1011      * No matter which IOMMU the interrupt came from, check all the
1012      * IOMMUs present in the system. This allows for having just one
1013      * tasklet (instead of one per IOMMU) and should be more than
1014      * fine, considering how rare faults should be.
1015      */
1016 for_each_drhd_unit ( drhd )
1017 __do_iommu_page_fault(drhd->iommu);
1018 }
1019
1020 static void iommu_page_fault(int irq, void *dev_id,
1021 struct cpu_user_regs *regs)
1022 {
1023 /*
1024 * Just flag the tasklet as runnable. This is fine, according to VT-d
1025 * specs since a new interrupt won't be generated until we clear all
1026 * the faults that caused this one to happen.
1027 */
1028 tasklet_schedule(&vtd_fault_tasklet);
1029 }
1030
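/*
 * Helpers for the fault-event MSI: mask/unmask toggle the IM bit in the
 * Fault Event Control register.
 */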
1031 static void dma_msi_unmask(struct irq_desc *desc)
1032 {
1033 struct vtd_iommu *iommu = desc->action->dev_id;
1034 unsigned long flags;
1035 u32 sts;
1036
1037 /* unmask it */
1038 spin_lock_irqsave(&iommu->register_lock, flags);
1039 sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1040 sts &= ~DMA_FECTL_IM;
1041 dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1042 spin_unlock_irqrestore(&iommu->register_lock, flags);
1043 iommu->msi.msi_attrib.host_masked = 0;
1044 }
1045
1046 static void dma_msi_mask(struct irq_desc *desc)
1047 {
1048 unsigned long flags;
1049 struct vtd_iommu *iommu = desc->action->dev_id;
1050 u32 sts;
1051
1052 /* mask it */
1053 spin_lock_irqsave(&iommu->register_lock, flags);
1054 sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1055 sts |= DMA_FECTL_IM;
1056 dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1057 spin_unlock_irqrestore(&iommu->register_lock, flags);
1058 iommu->msi.msi_attrib.host_masked = 1;
1059 }
1060
1061 static unsigned int dma_msi_startup(struct irq_desc *desc)
1062 {
1063 dma_msi_unmask(desc);
1064 return 0;
1065 }
1066
1067 static void dma_msi_ack(struct irq_desc *desc)
1068 {
1069 irq_complete_move(desc);
1070 dma_msi_mask(desc);
1071 move_masked_irq(desc);
1072 }
1073
1074 static void dma_msi_end(struct irq_desc *desc, u8 vector)
1075 {
1076 dma_msi_unmask(desc);
1077 end_nonmaskable_irq(desc, vector);
1078 }
1079
1080 static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
1081 {
1082 struct msi_msg msg;
1083 unsigned int dest;
1084 unsigned long flags;
1085 struct vtd_iommu *iommu = desc->action->dev_id;
1086
1087 dest = set_desc_affinity(desc, mask);
1088 if (dest == BAD_APICID){
1089 dprintk(XENLOG_ERR VTDPREFIX, "Set iommu interrupt affinity error!\n");
1090 return;
1091 }
1092
1093 msi_compose_msg(desc->arch.vector, NULL, &msg);
1094 msg.dest32 = dest;
1095 if (x2apic_enabled)
1096 msg.address_hi = dest & 0xFFFFFF00;
1097 ASSERT(!(msg.address_lo & MSI_ADDR_DEST_ID_MASK));
1098 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1099 iommu->msi.msg = msg;
1100
1101 spin_lock_irqsave(&iommu->register_lock, flags);
1102 dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data);
1103 dmar_writel(iommu->reg, DMAR_FEADDR_REG, msg.address_lo);
1104 /*
1105 * When x2APIC is not enabled, DMAR_FEUADDR_REG is reserved and
1106 * it's not necessary to update it.
1107 */
1108 if ( x2apic_enabled )
1109 dmar_writel(iommu->reg, DMAR_FEUADDR_REG, msg.address_hi);
1110 spin_unlock_irqrestore(&iommu->register_lock, flags);
1111 }
1112
1113 static hw_irq_controller dma_msi_type = {
1114 .typename = "DMA_MSI",
1115 .startup = dma_msi_startup,
1116 .shutdown = dma_msi_mask,
1117 .enable = dma_msi_unmask,
1118 .disable = dma_msi_mask,
1119 .ack = dma_msi_ack,
1120 .end = dma_msi_end,
1121 .set_affinity = dma_msi_set_affinity,
1122 };
1123
1124 static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd)
1125 {
1126 int irq, ret;
1127 struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
1128 struct vtd_iommu *iommu = drhd->iommu;
1129 struct irq_desc *desc;
1130
1131 irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain)
1132 : NUMA_NO_NODE,
1133 false);
1134 if ( irq <= 0 )
1135 {
1136 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n");
1137 return -EINVAL;
1138 }
1139
1140 desc = irq_to_desc(irq);
1141 desc->handler = &dma_msi_type;
1142 ret = request_irq(irq, 0, iommu_page_fault, "dmar", iommu);
1143 if ( ret )
1144 {
1145 desc->handler = &no_irq_type;
1146 destroy_irq(irq);
1147 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
1148 return ret;
1149 }
1150
1151 iommu->msi.irq = irq;
1152 iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU;
1153 iommu->msi.msi_attrib.maskbit = 1;
1154 iommu->msi.msi_attrib.is_64 = 1;
1155 desc->msi_desc = &iommu->msi;
1156
1157 return 0;
1158 }
1159
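/*
 * Allocate and initialise the vtd_iommu structure for a DRHD unit: map its
 * register block, read CAP/ECAP, derive the number of page-table levels and
 * set up the domain id bitmap and map.
 */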
1160 int __init iommu_alloc(struct acpi_drhd_unit *drhd)
1161 {
1162 struct vtd_iommu *iommu;
1163 unsigned long sagaw, nr_dom;
1164 int agaw;
1165
1166 if ( nr_iommus > MAX_IOMMUS )
1167 {
1168 dprintk(XENLOG_ERR VTDPREFIX,
1169 "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
1170 return -ENOMEM;
1171 }
1172
1173 iommu = xzalloc(struct vtd_iommu);
1174 if ( iommu == NULL )
1175 return -ENOMEM;
1176
1177 iommu->msi.irq = -1; /* No irq assigned yet. */
1178 iommu->node = NUMA_NO_NODE;
1179 INIT_LIST_HEAD(&iommu->ats_devices);
1180 spin_lock_init(&iommu->lock);
1181 spin_lock_init(&iommu->register_lock);
1182 spin_lock_init(&iommu->intremap.lock);
1183
1184 iommu->drhd = drhd;
1185 drhd->iommu = iommu;
1186
1187 iommu->reg = ioremap(drhd->address, PAGE_SIZE);
1188 if ( !iommu->reg )
1189 return -ENOMEM;
1190 iommu->index = nr_iommus++;
1191
1192 iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
1193 iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
1194
1195 if ( iommu_verbose )
1196 {
1197 printk(VTDPREFIX "drhd->address = %"PRIx64" iommu->reg = %p\n",
1198 drhd->address, iommu->reg);
1199 printk(VTDPREFIX "cap = %"PRIx64" ecap = %"PRIx64"\n",
1200 iommu->cap, iommu->ecap);
1201 }
1202 if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
1203 return -ENODEV;
1204
1205 quirk_iommu_caps(iommu);
1206
1207 if ( cap_fault_reg_offset(iommu->cap) +
1208 cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
1209 ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
1210 {
1211 printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
1212 print_iommu_regs(drhd);
1213 return -ENODEV;
1214 }
1215
1216 /* Calculate number of pagetable levels: between 2 and 4. */
1217 sagaw = cap_sagaw(iommu->cap);
1218 for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
1219 if ( test_bit(agaw, &sagaw) )
1220 break;
1221 if ( agaw < 0 )
1222 {
1223 printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported sagaw %lx\n", sagaw);
1224 print_iommu_regs(drhd);
1225 return -ENODEV;
1226 }
1227 iommu->nr_pt_levels = agaw_to_level(agaw);
1228
1229 if ( !ecap_coherent(iommu->ecap) )
1230 iommus_incoherent = 1;
1231
1232 /* allocate domain id bitmap */
1233 nr_dom = cap_ndoms(iommu->cap);
1234 iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
1235 if ( !iommu->domid_bitmap )
1236 return -ENOMEM;
1237
1238     /*
1239      * If caching mode is set, invalid translations are tagged with
1240      * domain id 0; hence reserve bit 0 for it.
1241      */
1242 if ( cap_caching_mode(iommu->cap) )
1243 __set_bit(0, iommu->domid_bitmap);
1244
1245 iommu->domid_map = xzalloc_array(u16, nr_dom);
1246 if ( !iommu->domid_map )
1247 return -ENOMEM;
1248
1249 return 0;
1250 }
1251
1252 void __init iommu_free(struct acpi_drhd_unit *drhd)
1253 {
1254 struct vtd_iommu *iommu = drhd->iommu;
1255
1256 if ( iommu == NULL )
1257 return;
1258
1259 drhd->iommu = NULL;
1260
1261 if ( iommu->root_maddr != 0 )
1262 {
1263 free_pgtable_maddr(iommu->root_maddr);
1264 iommu->root_maddr = 0;
1265 }
1266
1267 if ( iommu->reg )
1268 iounmap(iommu->reg);
1269
1270 xfree(iommu->domid_bitmap);
1271 xfree(iommu->domid_map);
1272
1273 if ( iommu->msi.irq >= 0 )
1274 destroy_irq(iommu->msi.irq);
1275 xfree(iommu);
1276 }
1277
1278 #define guestwidth_to_adjustwidth(gaw) ({ \
1279 int agaw, r = (gaw - 12) % 9; \
1280 agaw = (r == 0) ? gaw : (gaw + 9 - r); \
1281 if ( agaw > 64 ) \
1282 agaw = 64; \
1283 agaw; })
1284
1285 static int intel_iommu_domain_init(struct domain *d)
1286 {
1287 dom_iommu(d)->arch.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1288
1289 return 0;
1290 }
1291
1292 static void __hwdom_init intel_iommu_hwdom_init(struct domain *d)
1293 {
1294 struct acpi_drhd_unit *drhd;
1295
1296 setup_hwdom_pci_devices(d, setup_hwdom_device);
1297 setup_hwdom_rmrr(d);
1298 /* Make sure workarounds are applied before enabling the IOMMU(s). */
1299 arch_iommu_hwdom_init(d);
1300
1301 if ( iommu_flush_all() )
1302 printk(XENLOG_WARNING VTDPREFIX
1303 " IOMMU flush all failed for hardware domain\n");
1304
1305 for_each_drhd_unit ( drhd )
1306 {
1307 if ( iomem_deny_access(d, PFN_DOWN(drhd->address),
1308 PFN_DOWN(drhd->address)) )
1309 BUG();
1310 iommu_enable_translation(drhd);
1311 }
1312 }
1313
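/*
 * Install the context entry for (bus, devfn) on this IOMMU so it points at
 * the domain's page tables (or pass-through), assign a domain id, and flush
 * the context cache and IOTLB.
 */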
1314 int domain_context_mapping_one(
1315 struct domain *domain,
1316 struct vtd_iommu *iommu,
1317 u8 bus, u8 devfn, const struct pci_dev *pdev)
1318 {
1319 struct domain_iommu *hd = dom_iommu(domain);
1320 struct context_entry *context, *context_entries;
1321 u64 maddr, pgd_maddr;
1322 u16 seg = iommu->drhd->segment;
1323 int agaw, rc, ret;
1324 bool_t flush_dev_iotlb;
1325
1326 ASSERT(pcidevs_locked());
1327 spin_lock(&iommu->lock);
1328 maddr = bus_to_context_maddr(iommu, bus);
1329 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1330 context = &context_entries[devfn];
1331
1332 if ( context_present(*context) )
1333 {
1334 int res = 0;
1335
1336 /* Try to get domain ownership from device structure. If that's
1337 * not available, try to read it from the context itself. */
1338 if ( pdev )
1339 {
1340 if ( pdev->domain != domain )
1341 {
1342 printk(XENLOG_G_INFO VTDPREFIX
1343 "%pd: %04x:%02x:%02x.%u owned by %pd\n",
1344 domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1345 pdev->domain);
1346 res = -EINVAL;
1347 }
1348 }
1349 else
1350 {
1351 int cdomain;
1352 cdomain = context_get_domain_id(context, iommu);
1353
1354 if ( cdomain < 0 )
1355 {
1356 printk(XENLOG_G_WARNING VTDPREFIX
1357 "%pd: %04x:%02x:%02x.%u mapped, but can't find owner\n",
1358 domain, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1359 res = -EINVAL;
1360 }
1361 else if ( cdomain != domain->domain_id )
1362 {
1363 printk(XENLOG_G_INFO VTDPREFIX
1364 "%pd: %04x:%02x:%02x.%u already mapped to d%d\n",
1365 domain,
1366 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1367 cdomain);
1368 res = -EINVAL;
1369 }
1370 }
1371
1372 unmap_vtd_domain_page(context_entries);
1373 spin_unlock(&iommu->lock);
1374 return res;
1375 }
1376
1377 if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
1378 {
1379 context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1380 agaw = level_to_agaw(iommu->nr_pt_levels);
1381 }
1382 else
1383 {
1384 spin_lock(&hd->arch.mapping_lock);
1385
1386 /* Ensure we have pagetables allocated down to leaf PTE. */
1387 if ( hd->arch.pgd_maddr == 0 )
1388 {
1389 addr_to_dma_page_maddr(domain, 0, 1);
1390 if ( hd->arch.pgd_maddr == 0 )
1391 {
1392 nomem:
1393 spin_unlock(&hd->arch.mapping_lock);
1394 spin_unlock(&iommu->lock);
1395 unmap_vtd_domain_page(context_entries);
1396 return -ENOMEM;
1397 }
1398 }
1399
1400 /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1401 pgd_maddr = hd->arch.pgd_maddr;
1402 for ( agaw = level_to_agaw(4);
1403 agaw != level_to_agaw(iommu->nr_pt_levels);
1404 agaw-- )
1405 {
1406 struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1407 pgd_maddr = dma_pte_addr(*p);
1408 unmap_vtd_domain_page(p);
1409 if ( pgd_maddr == 0 )
1410 goto nomem;
1411 }
1412
1413 context_set_address_root(*context, pgd_maddr);
1414 if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1415 context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1416 else
1417 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1418
1419 spin_unlock(&hd->arch.mapping_lock);
1420 }
1421
1422 if ( context_set_domain_id(context, domain, iommu) )
1423 {
1424 spin_unlock(&iommu->lock);
1425 unmap_vtd_domain_page(context_entries);
1426 return -EFAULT;
1427 }
1428
1429 context_set_address_width(*context, agaw);
1430 context_set_fault_enable(*context);
1431 context_set_present(*context);
1432 iommu_sync_cache(context, sizeof(struct context_entry));
1433 spin_unlock(&iommu->lock);
1434
1435 /* Context entry was previously non-present (with domid 0). */
1436 rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
1437 DMA_CCMD_MASK_NOBIT, 1);
1438 flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1439 ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1440
1441     /*
1442      * The current logic for return values:
1443      * - positive: invoke iommu_flush_write_buffer to flush the write buffer.
1444      * - zero: success.
1445      * - negative: failure. Continue to flush the IOMMU IOTLB on a
1446      *   best effort basis.
1447      */
1448 if ( rc > 0 || ret > 0 )
1449 iommu_flush_write_buffer(iommu);
1450 if ( rc >= 0 )
1451 rc = ret;
1452 if ( rc > 0 )
1453 rc = 0;
1454
1455 set_bit(iommu->index, &hd->arch.iommu_bitmap);
1456
1457 unmap_vtd_domain_page(context_entries);
1458
1459 if ( !seg && !rc )
1460 rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
1461
1462 return rc;
1463 }
1464
1465 static int domain_context_mapping(struct domain *domain, u8 devfn,
1466 struct pci_dev *pdev)
1467 {
1468 struct acpi_drhd_unit *drhd;
1469 int ret = 0;
1470 u8 seg = pdev->seg, bus = pdev->bus, secbus;
1471
1472 drhd = acpi_find_matched_drhd_unit(pdev);
1473 if ( !drhd )
1474 return -ENODEV;
1475
1476     /*
1477      * Generally we assume that only devices from one node get assigned to a
1478      * given guest.  But even if not, by replacing the prior value here we
1479      * guarantee that at least some basic allocations for the device being
1480      * added will be done against its node.  Any further allocations for
1481      * this or other devices may be penalized then, but some would also be
1482      * if we left a value other than NUMA_NO_NODE untouched here.
1483      */
1484 if ( drhd->iommu->node != NUMA_NO_NODE )
1485 dom_iommu(domain)->node = drhd->iommu->node;
1486
1487 ASSERT(pcidevs_locked());
1488
1489 switch ( pdev->type )
1490 {
1491 case DEV_TYPE_PCI_HOST_BRIDGE:
1492 if ( iommu_debug )
1493 printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
1494 domain->domain_id, seg, bus,
1495 PCI_SLOT(devfn), PCI_FUNC(devfn));
1496 if ( !is_hardware_domain(domain) )
1497 return -EPERM;
1498 break;
1499
1500 case DEV_TYPE_PCIe_BRIDGE:
1501 case DEV_TYPE_PCIe2PCI_BRIDGE:
1502 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1503 break;
1504
1505 case DEV_TYPE_PCIe_ENDPOINT:
1506 if ( iommu_debug )
1507 printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
1508 domain->domain_id, seg, bus,
1509 PCI_SLOT(devfn), PCI_FUNC(devfn));
1510 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1511 pdev);
1512 if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1513 enable_ats_device(pdev, &drhd->iommu->ats_devices);
1514
1515 break;
1516
1517 case DEV_TYPE_PCI:
1518 if ( iommu_debug )
1519 printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
1520 domain->domain_id, seg, bus,
1521 PCI_SLOT(devfn), PCI_FUNC(devfn));
1522
1523 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1524 pdev);
1525 if ( ret )
1526 break;
1527
1528 if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 )
1529 break;
1530
1531 /*
1532 * Mapping a bridge should, if anything, pass the struct pci_dev of
1533 * that bridge. Since bridges don't normally get assigned to guests,
1534 * their owner would be the wrong one. Pass NULL instead.
1535 */
1536 ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1537 NULL);
1538
1539 /*
1540 * Devices behind PCIe-to-PCI/PCIx bridge may generate different
1541 * requester-id. It may originate from devfn=0 on the secondary bus
1542 * behind the bridge. Map that id as well if we didn't already.
1543 *
1544 * Somewhat similar as for bridges, we don't want to pass a struct
1545 * pci_dev here - there may not even exist one for this (secbus,0,0)
1546 * tuple. If there is one, without properly working device groups it
1547 * may again not have the correct owner.
1548 */
1549 if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
1550 (secbus != pdev->bus || pdev->devfn != 0) )
1551 ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
1552 NULL);
1553
1554 break;
1555
1556 default:
1557 dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1558 domain->domain_id, pdev->type,
1559 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560 ret = -EINVAL;
1561 break;
1562 }
1563
1564 if ( !ret && devfn == pdev->devfn )
1565 pci_vtd_quirk(pdev);
1566
1567 return ret;
1568 }
1569
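/*
 * Clear the context entry for (bus, devfn) on this IOMMU and flush the
 * context cache and IOTLB for the domain's id.
 */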
1570 int domain_context_unmap_one(
1571 struct domain *domain,
1572 struct vtd_iommu *iommu,
1573 u8 bus, u8 devfn)
1574 {
1575 struct context_entry *context, *context_entries;
1576 u64 maddr;
1577 int iommu_domid, rc, ret;
1578 bool_t flush_dev_iotlb;
1579
1580 ASSERT(pcidevs_locked());
1581 spin_lock(&iommu->lock);
1582
1583 maddr = bus_to_context_maddr(iommu, bus);
1584 context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1585 context = &context_entries[devfn];
1586
1587 if ( !context_present(*context) )
1588 {
1589 spin_unlock(&iommu->lock);
1590 unmap_vtd_domain_page(context_entries);
1591 return 0;
1592 }
1593
1594 context_clear_present(*context);
1595 context_clear_entry(*context);
1596 iommu_sync_cache(context, sizeof(struct context_entry));
1597
1598 iommu_domid= domain_iommu_domid(domain, iommu);
1599 if ( iommu_domid == -1 )
1600 {
1601 spin_unlock(&iommu->lock);
1602 unmap_vtd_domain_page(context_entries);
1603 return -EINVAL;
1604 }
1605
1606 rc = iommu_flush_context_device(iommu, iommu_domid,
1607 PCI_BDF2(bus, devfn),
1608 DMA_CCMD_MASK_NOBIT, 0);
1609
1610 flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1611 ret = iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb);
1612
1613     /*
1614      * The current logic for return values:
1615      * - positive: invoke iommu_flush_write_buffer to flush the write buffer.
1616      * - zero: success.
1617      * - negative: failure. Continue to flush the IOMMU IOTLB on a
1618      *   best effort basis.
1619      */
1620 if ( rc > 0 || ret > 0 )
1621 iommu_flush_write_buffer(iommu);
1622 if ( rc >= 0 )
1623 rc = ret;
1624 if ( rc > 0 )
1625 rc = 0;
1626
1627 spin_unlock(&iommu->lock);
1628 unmap_vtd_domain_page(context_entries);
1629
1630 if ( !iommu->drhd->segment && !rc )
1631 rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
1632
1633 return rc;
1634 }
1635
1636 static int domain_context_unmap(struct domain *domain, u8 devfn,
1637 struct pci_dev *pdev)
1638 {
1639 struct acpi_drhd_unit *drhd;
1640 struct vtd_iommu *iommu;
1641 int ret = 0;
1642 u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
1643 int found = 0;
1644
1645 drhd = acpi_find_matched_drhd_unit(pdev);
1646 if ( !drhd )
1647 return -ENODEV;
1648 iommu = drhd->iommu;
1649
1650 switch ( pdev->type )
1651 {
1652 case DEV_TYPE_PCI_HOST_BRIDGE:
1653 if ( iommu_debug )
1654 printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u unmap\n",
1655 domain->domain_id, seg, bus,
1656 PCI_SLOT(devfn), PCI_FUNC(devfn));
1657 if ( !is_hardware_domain(domain) )
1658 return -EPERM;
1659 goto out;
1660
1661 case DEV_TYPE_PCIe_BRIDGE:
1662 case DEV_TYPE_PCIe2PCI_BRIDGE:
1663 case DEV_TYPE_LEGACY_PCI_BRIDGE:
1664 goto out;
1665
1666 case DEV_TYPE_PCIe_ENDPOINT:
1667 if ( iommu_debug )
1668 printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
1669 domain->domain_id, seg, bus,
1670 PCI_SLOT(devfn), PCI_FUNC(devfn));
1671 ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1672 if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1673 disable_ats_device(pdev);
1674
1675 break;
1676
1677 case DEV_TYPE_PCI:
1678 if ( iommu_debug )
1679 printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
1680 domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1681 ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1682 if ( ret )
1683 break;
1684
1685 tmp_bus = bus;
1686 tmp_devfn = devfn;
1687 if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 )
1688 break;
1689
1690 /* PCIe to PCI/PCIx bridge */
1691 if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1692 {
1693 ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1694 if ( ret )
1695 return ret;
1696
1697 ret = domain_context_unmap_one(domain, iommu, secbus, 0);
1698 }
1699 else /* Legacy PCI bridge */
1700 ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1701
1702 break;
1703
1704 default:
1705 dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1706 domain->domain_id, pdev->type,
1707 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1708 ret = -EINVAL;
1709 goto out;
1710 }
1711
1712     /*
1713      * If no other device under the same IOMMU is owned by this domain,
1714      * clear the IOMMU in iommu_bitmap and clear the domain id in domid_bitmap.
1715      */
1716 for_each_pdev ( domain, pdev )
1717 {
1718 if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
1719 continue;
1720
1721 drhd = acpi_find_matched_drhd_unit(pdev);
1722 if ( drhd && drhd->iommu == iommu )
1723 {
1724 found = 1;
1725 break;
1726 }
1727 }
1728
1729 if ( found == 0 )
1730 {
1731 int iommu_domid;
1732
1733 clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
1734
1735 iommu_domid = domain_iommu_domid(domain, iommu);
1736 if ( iommu_domid == -1 )
1737 {
1738 ret = -EINVAL;
1739 goto out;
1740 }
1741
1742 clear_bit(iommu_domid, iommu->domid_bitmap);
1743 iommu->domid_map[iommu_domid] = 0;
1744 }
1745
1746 out:
1747 return ret;
1748 }
1749
1750 static void iommu_domain_teardown(struct domain *d)
1751 {
1752 struct domain_iommu *hd = dom_iommu(d);
1753 struct mapped_rmrr *mrmrr, *tmp;
1754
1755 if ( list_empty(&acpi_drhd_units) )
1756 return;
1757
1758 list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.mapped_rmrrs, list )
1759 {
1760 list_del(&mrmrr->list);
1761 xfree(mrmrr);
1762 }
1763
1764 ASSERT(is_iommu_enabled(d));
1765
1766 if ( iommu_use_hap_pt(d) )
1767 return;
1768
1769 spin_lock(&hd->arch.mapping_lock);
1770 iommu_free_pagetable(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw));
1771 hd->arch.pgd_maddr = 0;
1772 spin_unlock(&hd->arch.mapping_lock);
1773 }
1774
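/*
 * Install or update the leaf PTE mapping dfn to mfn with the requested
 * permissions; a no-op when the page tables are shared with EPT or for a
 * pass-through hardware domain.
 */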
1775 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
1776 mfn_t mfn, unsigned int flags,
1777 unsigned int *flush_flags)
1778 {
1779 struct domain_iommu *hd = dom_iommu(d);
1780 struct dma_pte *page, *pte, old, new = {};
1781 u64 pg_maddr;
1782 int rc = 0;
1783
1784 /* Do nothing if VT-d shares EPT page table */
1785 if ( iommu_use_hap_pt(d) )
1786 return 0;
1787
1788 /* Do nothing if hardware domain and iommu supports pass thru. */
1789 if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
1790 return 0;
1791
1792 spin_lock(&hd->arch.mapping_lock);
1793
1794 pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1);
1795 if ( !pg_maddr )
1796 {
1797 spin_unlock(&hd->arch.mapping_lock);
1798 return -ENOMEM;
1799 }
1800
1801 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1802 pte = &page[dfn_x(dfn) & LEVEL_MASK];
1803 old = *pte;
1804
1805 dma_set_pte_addr(new, mfn_to_maddr(mfn));
1806 dma_set_pte_prot(new,
1807 ((flags & IOMMUF_readable) ? DMA_PTE_READ : 0) |
1808 ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
1809
1810 /* Set the SNP on leaf page table if Snoop Control available */
1811 if ( iommu_snoop )
1812 dma_set_pte_snp(new);
1813
1814 if ( old.val == new.val )
1815 {
1816 spin_unlock(&hd->arch.mapping_lock);
1817 unmap_vtd_domain_page(page);
1818 return 0;
1819 }
1820
1821 *pte = new;
1822
1823 iommu_sync_cache(pte, sizeof(struct dma_pte));
1824 spin_unlock(&hd->arch.mapping_lock);
1825 unmap_vtd_domain_page(page);
1826
1827 *flush_flags |= IOMMU_FLUSHF_added;
1828 if ( dma_pte_present(old) )
1829 *flush_flags |= IOMMU_FLUSHF_modified;
1830
1831 return rc;
1832 }
1833
1834 static int __must_check intel_iommu_unmap_page(struct domain *d, dfn_t dfn,
1835 unsigned int *flush_flags)
1836 {
1837 /* Do nothing if VT-d shares EPT page table */
1838 if ( iommu_use_hap_pt(d) )
1839 return 0;
1840
1841 /* Do nothing if hardware domain and iommu supports pass thru. */
1842 if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
1843 return 0;
1844
1845 dma_pte_clear_one(d, dfn_to_daddr(dfn), flush_flags);
1846
1847 return 0;
1848 }
1849
1850 static int intel_iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn,
1851 unsigned int *flags)
1852 {
1853 struct domain_iommu *hd = dom_iommu(d);
1854 struct dma_pte *page, val;
1855 u64 pg_maddr;
1856
1857 /*
1858 * If VT-d shares EPT page table or if the domain is the hardware
1859 * domain and iommu_passthrough is set then pass back the dfn.
1860 */
1861 if ( iommu_use_hap_pt(d) ||
1862 (iommu_hwdom_passthrough && is_hardware_domain(d)) )
1863 return -EOPNOTSUPP;
1864
1865 spin_lock(&hd->arch.mapping_lock);
1866
1867 pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0);
1868 if ( !pg_maddr )
1869 {
1870 spin_unlock(&hd->arch.mapping_lock);
1871 return -ENOENT;
1872 }
1873
1874 page = map_vtd_domain_page(pg_maddr);
1875 val = page[dfn_x(dfn) & LEVEL_MASK];
1876
1877 unmap_vtd_domain_page(page);
1878 spin_unlock(&hd->arch.mapping_lock);
1879
1880 if ( !dma_pte_present(val) )
1881 return -ENOENT;
1882
1883 *mfn = maddr_to_mfn(dma_pte_addr(val));
1884 *flags = dma_pte_read(val) ? IOMMUF_readable : 0;
1885 *flags |= dma_pte_write(val) ? IOMMUF_writable : 0;
1886
1887 return 0;
1888 }
1889
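/*
 * Sharing page tables with EPT is only safe if every superpage size EPT
 * may use (2MB/1GB) is also supported by this IOMMU, since a shared
 * table may otherwise contain entries the IOMMU cannot parse.
 */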
1890 static int __init vtd_ept_page_compatible(struct vtd_iommu *iommu)
1891 {
1892 u64 ept_cap, vtd_cap = iommu->cap;
1893
1894 /* EPT is not initialised yet, so we must check the capability in
1895 * the MSR explicitly rather than use cpu_has_vmx_ept_*() */
1896 if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 )
1897 return 0;
1898
1899 return (ept_has_2mb(ept_cap) && opt_hap_2mb) <= cap_sps_2mb(vtd_cap) &&
1900 (ept_has_1gb(ept_cap) && opt_hap_1gb) <= cap_sps_1gb(vtd_cap);
1901 }
1902
1903 /*
1904 * Point the VT-d page table root at the EPT table when sharing is allowed.
1905 */
1906 static void iommu_set_pgd(struct domain *d)
1907 {
1908 mfn_t pgd_mfn;
1909
1910 pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
1911 dom_iommu(d)->arch.pgd_maddr =
1912 pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
1913 }
1914
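/*
 * Reference-counted identity (1:1) p2m mapping of an RMRR region.
 * Repeated map requests for the same region only bump the count; the
 * p2m entries are removed again once the last user unmaps.
 */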
1915 static int rmrr_identity_mapping(struct domain *d, bool_t map,
1916 const struct acpi_rmrr_unit *rmrr,
1917 u32 flag)
1918 {
1919 unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K;
1920 unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K;
1921 struct mapped_rmrr *mrmrr;
1922 struct domain_iommu *hd = dom_iommu(d);
1923
1924 ASSERT(pcidevs_locked());
1925 ASSERT(rmrr->base_address < rmrr->end_address);
1926
1927 /*
1928 * No need to acquire hd->arch.mapping_lock: Both insertion and removal
1929 * get done while holding pcidevs_lock.
1930 */
1931 list_for_each_entry( mrmrr, &hd->arch.mapped_rmrrs, list )
1932 {
1933 if ( mrmrr->base == rmrr->base_address &&
1934 mrmrr->end == rmrr->end_address )
1935 {
1936 int ret = 0;
1937
1938 if ( map )
1939 {
1940 ++mrmrr->count;
1941 return 0;
1942 }
1943
1944 if ( --mrmrr->count )
1945 return 0;
1946
1947 while ( base_pfn < end_pfn )
1948 {
1949 if ( clear_identity_p2m_entry(d, base_pfn) )
1950 ret = -ENXIO;
1951 base_pfn++;
1952 }
1953
1954 list_del(&mrmrr->list);
1955 xfree(mrmrr);
1956 return ret;
1957 }
1958 }
1959
1960 if ( !map )
1961 return -ENOENT;
1962
1963 while ( base_pfn < end_pfn )
1964 {
1965 int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag);
1966
1967 if ( err )
1968 return err;
1969 base_pfn++;
1970 }
1971
1972 mrmrr = xmalloc(struct mapped_rmrr);
1973 if ( !mrmrr )
1974 return -ENOMEM;
1975 mrmrr->base = rmrr->base_address;
1976 mrmrr->end = rmrr->end_address;
1977 mrmrr->count = 1;
1978 list_add_tail(&mrmrr->list, &hd->arch.mapped_rmrrs);
1979
1980 return 0;
1981 }
1982
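/*
 * Device add path (hardware domain only): establish the context mapping
 * for the new device and instate any RMRR identity mappings targeting it.
 */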
1983 static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
1984 {
1985 struct acpi_rmrr_unit *rmrr;
1986 u16 bdf;
1987 int ret, i;
1988
1989 ASSERT(pcidevs_locked());
1990
1991 if ( !pdev->domain )
1992 return -EINVAL;
1993
1994 ret = domain_context_mapping(pdev->domain, devfn, pdev);
1995 if ( ret )
1996 {
1997 dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
1998 pdev->domain->domain_id);
1999 return ret;
2000 }
2001
2002 for_each_rmrr_device ( rmrr, bdf, i )
2003 {
2004 if ( rmrr->segment == pdev->seg &&
2005 PCI_BUS(bdf) == pdev->bus &&
2006 PCI_DEVFN2(bdf) == devfn )
2007 {
2008 /*
2009 * iommu_add_device() is only called for the hardware
2010 * domain (see xen/drivers/passthrough/pci.c:pci_add_device()).
2011 * Since RMRRs are always reserved in the e820 map for the hardware
2012 * domain, there shouldn't be a conflict.
2013 */
2014 ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0);
2015 if ( ret )
2016 dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
2017 pdev->domain->domain_id);
2018 }
2019 }
2020
2021 return 0;
2022 }
2023
2024 static int intel_iommu_enable_device(struct pci_dev *pdev)
2025 {
2026 struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
2027 int ret = drhd ? ats_device(pdev, drhd) : -ENODEV;
2028
2029 pci_vtd_quirk(pdev);
2030
2031 if ( ret <= 0 )
2032 return ret;
2033
2034 ret = enable_ats_device(pdev, &drhd->iommu->ats_devices);
2035
2036 return ret >= 0 ? 0 : ret;
2037 }
2038
2039 static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
2040 {
2041 struct acpi_rmrr_unit *rmrr;
2042 u16 bdf;
2043 int i;
2044
2045 if ( !pdev->domain )
2046 return -EINVAL;
2047
2048 for_each_rmrr_device ( rmrr, bdf, i )
2049 {
2050 if ( rmrr->segment != pdev->seg ||
2051 PCI_BUS(bdf) != pdev->bus ||
2052 PCI_DEVFN2(bdf) != devfn )
2053 continue;
2054
2055 /*
2056 * The flag argument is irrelevant when clearing these mappings;
2057 * passing 0 is always safe and strict.
2058 */
2059 rmrr_identity_mapping(pdev->domain, 0, rmrr, 0);
2060 }
2061
2062 return domain_context_unmap(pdev->domain, devfn, pdev);
2063 }
2064
2065 static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
2066 {
2067 return domain_context_mapping(pdev->domain, devfn, pdev);
2068 }
2069
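/*
 * Acknowledge any recorded fault and clear the fault status bits so
 * that further faults can be reported.
 */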
2070 void clear_fault_bits(struct vtd_iommu *iommu)
2071 {
2072 u64 val;
2073 unsigned long flags;
2074
2075 spin_lock_irqsave(&iommu->register_lock, flags);
2076 val = dmar_readq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8);
2077 dmar_writeq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8, val);
2078 dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
2079 spin_unlock_irqrestore(&iommu->register_lock, flags);
2080 }
2081
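/*
 * Bind the IOMMU's fault-event MSI to CPUs on the unit's NUMA node (as
 * reported by the ACPI RHSA entry), if that node has online CPUs.
 */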
2082 static void adjust_irq_affinity(struct acpi_drhd_unit *drhd)
2083 {
2084 const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
2085 unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
2086 : NUMA_NO_NODE;
2087 const cpumask_t *cpumask = NULL;
2088 struct irq_desc *desc;
2089 unsigned long flags;
2090
2091 if ( node < MAX_NUMNODES && node_online(node) &&
2092 cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
2093 cpumask = &node_to_cpumask(node);
2094
2095 desc = irq_to_desc(drhd->iommu->msi.irq);
2096 spin_lock_irqsave(&desc->lock, flags);
2097 dma_msi_set_affinity(desc, cpumask);
2098 spin_unlock_irqrestore(&desc->lock, flags);
2099 }
2100
2101 static int adjust_vtd_irq_affinities(void)
2102 {
2103 struct acpi_drhd_unit *drhd;
2104
2105 if ( !iommu_enabled )
2106 return 0;
2107
2108 for_each_drhd_unit ( drhd )
2109 adjust_irq_affinity(drhd);
2110
2111 return 0;
2112 }
2113 __initcall(adjust_vtd_irq_affinities);
2114
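/*
 * Programme every DRHD unit: fault-event interrupt, queued invalidation,
 * interrupt remapping and root entry, finishing with a global flush.
 * Used both at boot (vtd_setup) and on resume (vtd_resume).
 */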
2115 static int __must_check init_vtd_hw(void)
2116 {
2117 struct acpi_drhd_unit *drhd;
2118 struct vtd_iommu *iommu;
2119 int ret;
2120 unsigned long flags;
2121 u32 sts;
2122
2123 /*
2124 * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
2125 */
2126 for_each_drhd_unit ( drhd )
2127 {
2128 adjust_irq_affinity(drhd);
2129
2130 iommu = drhd->iommu;
2131
2132 clear_fault_bits(iommu);
2133
2134 spin_lock_irqsave(&iommu->register_lock, flags);
2135 sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
2136 sts &= ~DMA_FECTL_IM;
2137 dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
2138 spin_unlock_irqrestore(&iommu->register_lock, flags);
2139 }
2140
2141 /*
2142 * Enable queued invalidation.
2143 */
2144 for_each_drhd_unit ( drhd )
2145 {
2146 iommu = drhd->iommu;
2147 /*
2148 * If queued invalidation cannot be enabled, fall back to
2149 * register-based invalidation.
2150 */
2151 if ( enable_qinval(iommu) != 0 )
2152 {
2153 iommu->flush.context = flush_context_reg;
2154 iommu->flush.iotlb = flush_iotlb_reg;
2155 }
2156 }
2157
2158 /*
2159 * Enable interrupt remapping
2160 */
2161 if ( iommu_intremap )
2162 {
2163 int apic;
2164 for ( apic = 0; apic < nr_ioapics; apic++ )
2165 {
2166 if ( ioapic_to_iommu(IO_APIC_ID(apic)) == NULL )
2167 {
2168 iommu_intremap = iommu_intremap_off;
2169 dprintk(XENLOG_ERR VTDPREFIX,
2170 "ioapic_to_iommu: ioapic %#x (id: %#x) is NULL! "
2171 "Will not try to enable Interrupt Remapping.\n",
2172 apic, IO_APIC_ID(apic));
2173 break;
2174 }
2175 }
2176 }
2177 if ( iommu_intremap )
2178 {
2179 for_each_drhd_unit ( drhd )
2180 {
2181 iommu = drhd->iommu;
2182 if ( enable_intremap(iommu, 0) != 0 )
2183 {
2184 iommu_intremap = iommu_intremap_off;
2185 dprintk(XENLOG_WARNING VTDPREFIX,
2186 "Interrupt Remapping not enabled\n");
2187
2188 break;
2189 }
2190 }
2191 if ( !iommu_intremap )
2192 for_each_drhd_unit ( drhd )
2193 disable_intremap(drhd->iommu);
2194 }
2195
2196 /*
2197 * Set the root entry for each VT-d engine. After setting the root
2198 * entry, the context cache must be invalidated globally, followed
2199 * by a global IOTLB invalidation.
2200 */
2201 for_each_drhd_unit ( drhd )
2202 {
2203 iommu = drhd->iommu;
2204 ret = iommu_set_root_entry(iommu);
2205 if ( ret )
2206 {
2207 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
2208 return -EIO;
2209 }
2210 }
2211
2212 return iommu_flush_all();
2213 }
2214
2215 static void __hwdom_init setup_hwdom_rmrr(struct domain *d)
2216 {
2217 struct acpi_rmrr_unit *rmrr;
2218 u16 bdf;
2219 int ret, i;
2220
2221 pcidevs_lock();
2222 for_each_rmrr_device ( rmrr, bdf, i )
2223 {
2224 /*
2225 * We are adding a device to the hardware domain here. Since RMRRs
2226 * are always reserved in the e820 map for the hardware domain,
2227 * there shouldn't be a conflict, so passing flag 0 is always safe
2228 * and strict.
2229 */
2230 ret = rmrr_identity_mapping(d, 1, rmrr, 0);
2231 if ( ret )
2232 dprintk(XENLOG_ERR VTDPREFIX,
2233 "IOMMU: mapping reserved region failed\n");
2234 }
2235 pcidevs_unlock();
2236 }
2237
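/*
 * Global VT-d initialisation: check each DRHD unit's capabilities,
 * downgrade optional features not supported by all units, then set up
 * interrupts and programme the hardware.
 */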
2238 static int __init vtd_setup(void)
2239 {
2240 struct acpi_drhd_unit *drhd;
2241 struct vtd_iommu *iommu;
2242 int ret;
2243
2244 if ( list_empty(&acpi_drhd_units) )
2245 {
2246 ret = -ENODEV;
2247 goto error;
2248 }
2249
2250 if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
2251 {
2252 ret = -EPERM;
2253 goto error;
2254 }
2255
2256 platform_quirks_init();
2257 if ( !iommu_enable )
2258 {
2259 ret = -ENODEV;
2260 goto error;
2261 }
2262
2263 /* We enable the following features only if they are supported by all VT-d
2264 * engines: Snoop Control, DMA passthrough, Queued Invalidation, Interrupt
2265 * Remapping, and Posted Interrupt
2266 */
2267 for_each_drhd_unit ( drhd )
2268 {
2269 iommu = drhd->iommu;
2270
2271 printk("Intel VT-d iommu %u supported page sizes: 4kB%s%s\n",
2272 iommu->index,
2273 cap_sps_2mb(iommu->cap) ? ", 2MB" : "",
2274 cap_sps_1gb(iommu->cap) ? ", 1GB" : "");
2275
2276 #ifndef iommu_snoop
2277 if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
2278 iommu_snoop = false;
2279 #endif
2280
2281 if ( iommu_hwdom_passthrough && !ecap_pass_thru(iommu->ecap) )
2282 iommu_hwdom_passthrough = false;
2283
2284 if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
2285 iommu_qinval = 0;
2286
2287 if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
2288 iommu_intremap = iommu_intremap_off;
2289
2290 #ifndef iommu_intpost
2291 /*
2292 * We cannot use posted interrupt if X86_FEATURE_CX16 is
2293 * not supported, since we count on this feature to
2294 * atomically update 16-byte IRTE in posted format.
2295 */
2296 if ( !cap_intr_post(iommu->cap) || !iommu_intremap || !cpu_has_cx16 )
2297 iommu_intpost = false;
2298 #endif
2299
2300 if ( !vtd_ept_page_compatible(iommu) )
2301 clear_iommu_hap_pt_share();
2302
2303 ret = iommu_set_interrupt(drhd);
2304 if ( ret )
2305 {
2306 dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
2307 goto error;
2308 }
2309 }
2310
2311 softirq_tasklet_init(&vtd_fault_tasklet, do_iommu_page_fault, NULL);
2312
2313 if ( !iommu_qinval && iommu_intremap )
2314 {
2315 iommu_intremap = iommu_intremap_off;
2316 dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
2317 "since Queued Invalidation isn't supported or enabled.\n");
2318 }
2319
2320 #define P(p,s) printk("Intel VT-d %s %senabled.\n", s, (p)? "" : "not ")
2321 #ifndef iommu_snoop
2322 P(iommu_snoop, "Snoop Control");
2323 #endif
2324 P(iommu_hwdom_passthrough, "Dom0 DMA Passthrough");
2325 P(iommu_qinval, "Queued Invalidation");
2326 P(iommu_intremap, "Interrupt Remapping");
2327 #ifndef iommu_intpost
2328 P(iommu_intpost, "Posted Interrupt");
2329 #endif
2330 P(iommu_hap_pt_share, "Shared EPT tables");
2331 #undef P
2332
2333 ret = init_vtd_hw();
2334 if ( ret )
2335 goto error;
2336
2337 register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1);
2338
2339 return 0;
2340
2341 error:
2342 iommu_enabled = 0;
2343 #ifndef iommu_snoop
2344 iommu_snoop = false;
2345 #endif
2346 iommu_hwdom_passthrough = false;
2347 iommu_qinval = 0;
2348 iommu_intremap = iommu_intremap_off;
2349 #ifndef iommu_intpost
2350 iommu_intpost = false;
2351 #endif
2352 return ret;
2353 }
2354
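/*
 * Move @pdev from @source to @target: tear down the source context
 * mapping (and, for domUs, any RMRR identity mappings), park the device
 * on dom_io in between, then establish the mapping for the target.
 */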
2355 static int reassign_device_ownership(
2356 struct domain *source,
2357 struct domain *target,
2358 u8 devfn, struct pci_dev *pdev)
2359 {
2360 int ret;
2361
2362 /*
2363 * Devices assigned to untrusted domains (here assumed to be any domU)
2364 * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
2365 * by the root complex unless interrupt remapping is enabled.
2366 */
2367 if ( (target != hardware_domain) && !iommu_intremap )
2368 untrusted_msi = true;
2369
2370 /*
2371 * If the device belongs to the hardware domain and has an RMRR, don't
2372 * remove the RMRR mapping from the hardware domain, because the BIOS
2373 * may use the RMRR at boot time.
2374 */
2375 if ( !is_hardware_domain(source) )
2376 {
2377 const struct acpi_rmrr_unit *rmrr;
2378 u16 bdf;
2379 unsigned int i;
2380
2381 for_each_rmrr_device( rmrr, bdf, i )
2382 if ( rmrr->segment == pdev->seg &&
2383 PCI_BUS(bdf) == pdev->bus &&
2384 PCI_DEVFN2(bdf) == devfn )
2385 {
2386 /*
2387 * The RMRR flag is ignored when removing a device, so
2388 * passing 0 is always safe and strict.
2389 */
2390 ret = rmrr_identity_mapping(source, 0, rmrr, 0);
2391 if ( ret != -ENOENT )
2392 return ret;
2393 }
2394 }
2395
2396 ret = domain_context_unmap(source, devfn, pdev);
2397 if ( ret )
2398 return ret;
2399
2400 if ( devfn == pdev->devfn && pdev->domain != dom_io )
2401 {
2402 list_move(&pdev->domain_list, &dom_io->pdev_list);
2403 pdev->domain = dom_io;
2404 }
2405
2406 if ( !has_arch_pdevs(source) )
2407 vmx_pi_hooks_deassign(source);
2408
2409 if ( !has_arch_pdevs(target) )
2410 vmx_pi_hooks_assign(target);
2411
2412 ret = domain_context_mapping(target, devfn, pdev);
2413 if ( ret )
2414 {
2415 if ( !has_arch_pdevs(target) )
2416 vmx_pi_hooks_deassign(target);
2417
2418 return ret;
2419 }
2420
2421 if ( devfn == pdev->devfn && pdev->domain != target )
2422 {
2423 list_move(&pdev->domain_list, &target->pdev_list);
2424 pdev->domain = target;
2425 }
2426
2427 return ret;
2428 }
2429
2430 static int intel_iommu_assign_device(
2431 struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
2432 {
2433 struct domain *s = pdev->domain;
2434 struct acpi_rmrr_unit *rmrr;
2435 int ret = 0, i;
2436 u16 bdf, seg;
2437 u8 bus;
2438
2439 if ( list_empty(&acpi_drhd_units) )
2440 return -ENODEV;
2441
2442 seg = pdev->seg;
2443 bus = pdev->bus;
2444 /*
2445 * In rare cases a single RMRR is shared by multiple devices; this
2446 * obviously puts the security of the system at risk, so such device
2447 * assignment is refused by default. It can be permitted if the user
2448 * sets
2449 * "pci = [ 'sbdf, rdm_policy=relaxed' ]"
2450 *
2451 * TODO: in the future we could introduce a group device assignment
2452 * interface to make sure devices sharing an RMRR are assigned to the
2453 * same domain together.
2454 */
2455 for_each_rmrr_device( rmrr, bdf, i )
2456 {
2457 if ( rmrr->segment == seg &&
2458 PCI_BUS(bdf) == bus &&
2459 PCI_DEVFN2(bdf) == devfn &&
2460 rmrr->scope.devices_cnt > 1 )
2461 {
2462 bool_t relaxed = !!(flag & XEN_DOMCTL_DEV_RDM_RELAXED);
2463
2464 printk(XENLOG_GUEST "%s" VTDPREFIX
2465 " It's %s to assign %04x:%02x:%02x.%u"
2466 " with shared RMRR at %"PRIx64" for Dom%d.\n",
2467 relaxed ? XENLOG_WARNING : XENLOG_ERR,
2468 relaxed ? "risky" : "disallowed",
2469 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
2470 rmrr->base_address, d->domain_id);
2471 if ( !relaxed )
2472 return -EPERM;
2473 }
2474 }
2475
2476 ret = reassign_device_ownership(s, d, devfn, pdev);
2477 if ( ret || d == dom_io )
2478 return ret;
2479
2480 /* Setup rmrr identity mapping */
2481 for_each_rmrr_device( rmrr, bdf, i )
2482 {
2483 if ( rmrr->segment == seg &&
2484 PCI_BUS(bdf) == bus &&
2485 PCI_DEVFN2(bdf) == devfn )
2486 {
2487 ret = rmrr_identity_mapping(d, 1, rmrr, flag);
2488 if ( ret )
2489 {
2490 int rc;
2491
2492 rc = reassign_device_ownership(d, s, devfn, pdev);
2493 printk(XENLOG_G_ERR VTDPREFIX
2494 " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
2495 rmrr->base_address, rmrr->end_address,
2496 d->domain_id, ret);
2497 if ( rc )
2498 {
2499 printk(XENLOG_ERR VTDPREFIX
2500 " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
2501 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
2502 domain_crash(d);
2503 }
2504 break;
2505 }
2506 }
2507 }
2508
2509 return ret;
2510 }
2511
2512 static int intel_iommu_group_id(u16 seg, u8 bus, u8 devfn)
2513 {
2514 u8 secbus;
2515 if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 0 )
2516 return -1;
2517 else
2518 return PCI_BDF2(bus, devfn);
2519 }
2520
2521 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
2522
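/*
 * S3 suspend: flush all caches, save each unit's fault-event MSI
 * registers and, unless force_iommu is set, disable DMA translation.
 */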
2523 static int __must_check vtd_suspend(void)
2524 {
2525 struct acpi_drhd_unit *drhd;
2526 struct vtd_iommu *iommu;
2527 u32 i;
2528 int rc;
2529
2530 if ( !iommu_enabled )
2531 return 0;
2532
2533 rc = iommu_flush_all();
2534 if ( unlikely(rc) )
2535 {
2536 printk(XENLOG_WARNING VTDPREFIX
2537 " suspend: IOMMU flush all failed: %d\n", rc);
2538
2539 return rc;
2540 }
2541
2542 for_each_drhd_unit ( drhd )
2543 {
2544 iommu = drhd->iommu;
2545 i = iommu->index;
2546
2547 iommu_state[i][DMAR_FECTL_REG] =
2548 (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
2549 iommu_state[i][DMAR_FEDATA_REG] =
2550 (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
2551 iommu_state[i][DMAR_FEADDR_REG] =
2552 (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
2553 iommu_state[i][DMAR_FEUADDR_REG] =
2554 (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
2555
2556 /* don't disable VT-d engine when force_iommu is set. */
2557 if ( force_iommu )
2558 continue;
2559
2560 iommu_disable_translation(iommu);
2561
2562 /* If interrupt remapping is enabled, queued invalidation
2563 * will be disabled when interrupt remapping is disabled
2564 * during local APIC suspend.
2565 */
2566 if ( !iommu_intremap && iommu_qinval )
2567 disable_qinval(iommu);
2568 }
2569
2570 return 0;
2571 }
2572
2573 static void vtd_crash_shutdown(void)
2574 {
2575 struct acpi_drhd_unit *drhd;
2576 struct vtd_iommu *iommu;
2577
2578 if ( !iommu_enabled )
2579 return;
2580
2581 if ( iommu_flush_all() )
2582 printk(XENLOG_WARNING VTDPREFIX
2583 " crash shutdown: IOMMU flush all failed\n");
2584
2585 for_each_drhd_unit ( drhd )
2586 {
2587 iommu = drhd->iommu;
2588 iommu_disable_translation(iommu);
2589 disable_intremap(drhd->iommu);
2590 disable_qinval(drhd->iommu);
2591 }
2592 }
2593
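/*
 * S3 resume: re-run the hardware init sequence, restore the saved
 * fault-event MSI registers and re-enable DMA translation.
 */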
2594 static void vtd_resume(void)
2595 {
2596 struct acpi_drhd_unit *drhd;
2597 struct vtd_iommu *iommu;
2598 u32 i;
2599 unsigned long flags;
2600
2601 if ( !iommu_enabled )
2602 return;
2603
2604 if ( init_vtd_hw() != 0 && force_iommu )
2605 panic("IOMMU setup failed, crash Xen for security purpose\n");
2606
2607 for_each_drhd_unit ( drhd )
2608 {
2609 iommu = drhd->iommu;
2610 i = iommu->index;
2611
2612 spin_lock_irqsave(&iommu->register_lock, flags);
2613 dmar_writel(iommu->reg, DMAR_FECTL_REG,
2614 (u32) iommu_state[i][DMAR_FECTL_REG]);
2615 dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2616 (u32) iommu_state[i][DMAR_FEDATA_REG]);
2617 dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2618 (u32) iommu_state[i][DMAR_FEADDR_REG]);
2619 dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2620 (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2621 spin_unlock_irqrestore(&iommu->register_lock, flags);
2622
2623 iommu_enable_translation(drhd);
2624 }
2625 }
2626
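/*
 * Recursively print the present mappings at one level of a VT-d page
 * table, processing pending softirqs along the way.
 */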
2627 static void vtd_dump_p2m_table_level(paddr_t pt_maddr, int level, paddr_t gpa,
2628 int indent)
2629 {
2630 paddr_t address;
2631 int i;
2632 struct dma_pte *pt_vaddr, *pte;
2633 int next_level;
2634
2635 if ( level < 1 )
2636 return;
2637
2638 pt_vaddr = map_vtd_domain_page(pt_maddr);
2639 if ( pt_vaddr == NULL )
2640 {
2641 printk("Failed to map VT-D domain page %"PRIpaddr"\n", pt_maddr);
2642 return;
2643 }
2644
2645 next_level = level - 1;
2646 for ( i = 0; i < PTE_NUM; i++ )
2647 {
2648 if ( !(i % 2) )
2649 process_pending_softirqs();
2650
2651 pte = &pt_vaddr[i];
2652 if ( !dma_pte_present(*pte) )
2653 continue;
2654
2655 address = gpa + offset_level_address(i, level);
2656 if ( next_level >= 1 )
2657 vtd_dump_p2m_table_level(dma_pte_addr(*pte), next_level,
2658 address, indent + 1);
2659 else
2660 printk("%*sdfn: %08lx mfn: %08lx\n",
2661 indent, "",
2662 (unsigned long)(address >> PAGE_SHIFT_4K),
2663 (unsigned long)(dma_pte_addr(*pte) >> PAGE_SHIFT_4K));
2664 }
2665
2666 unmap_vtd_domain_page(pt_vaddr);
2667 }
2668
2669 static void vtd_dump_p2m_table(struct domain *d)
2670 {
2671 const struct domain_iommu *hd;
2672
2673 if ( list_empty(&acpi_drhd_units) )
2674 return;
2675
2676 hd = dom_iommu(d);
2677 printk("p2m table has %d levels\n", agaw_to_level(hd->arch.agaw));
2678 vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
2679 }
2680
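/*
 * Build the page table used for quarantined devices: each level is a
 * single page whose entries all point, read-only, at the next level,
 * ending in a zeroed scratch page.  Stray DMA reads thus hit harmless
 * memory and writes fault.
 */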
2681 static int __init intel_iommu_quarantine_init(struct domain *d)
2682 {
2683 struct domain_iommu *hd = dom_iommu(d);
2684 struct dma_pte *parent;
2685 unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
2686 unsigned int level = agaw_to_level(agaw);
2687 int rc;
2688
2689 if ( hd->arch.pgd_maddr )
2690 {
2691 ASSERT_UNREACHABLE();
2692 return 0;
2693 }
2694
2695 spin_lock(&hd->arch.mapping_lock);
2696
2697 hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
2698 if ( !hd->arch.pgd_maddr )
2699 goto out;
2700
2701 parent = map_vtd_domain_page(hd->arch.pgd_maddr);
2702 while ( level )
2703 {
2704 uint64_t maddr;
2705 unsigned int offset;
2706
2707 /*
2708 * The pgtable allocator is fine for the leaf page, as well as
2709 * page table pages, and the resulting allocations are always
2710 * zeroed.
2711 */
2712 maddr = alloc_pgtable_maddr(1, hd->node);
2713 if ( !maddr )
2714 break;
2715
2716 for ( offset = 0; offset < PTE_NUM; offset++ )
2717 {
2718 struct dma_pte *pte = &parent[offset];
2719
2720 dma_set_pte_addr(*pte, maddr);
2721 dma_set_pte_readable(*pte);
2722 }
2723 iommu_sync_cache(parent, PAGE_SIZE);
2724
2725 unmap_vtd_domain_page(parent);
2726 parent = map_vtd_domain_page(maddr);
2727 level--;
2728 }
2729 unmap_vtd_domain_page(parent);
2730
2731 out:
2732 spin_unlock(&hd->arch.mapping_lock);
2733
2734 rc = iommu_flush_iotlb_all(d);
2735
2736 /* Pages leaked in failure case */
2737 return level ? -ENOMEM : rc;
2738 }
2739
2740 const struct iommu_ops __initconstrel intel_iommu_ops = {
2741 .init = intel_iommu_domain_init,
2742 .hwdom_init = intel_iommu_hwdom_init,
2743 .quarantine_init = intel_iommu_quarantine_init,
2744 .add_device = intel_iommu_add_device,
2745 .enable_device = intel_iommu_enable_device,
2746 .remove_device = intel_iommu_remove_device,
2747 .assign_device = intel_iommu_assign_device,
2748 .teardown = iommu_domain_teardown,
2749 .map_page = intel_iommu_map_page,
2750 .unmap_page = intel_iommu_unmap_page,
2751 .lookup_page = intel_iommu_lookup_page,
2752 .free_page_table = iommu_free_page_table,
2753 .reassign_device = reassign_device_ownership,
2754 .get_device_group_id = intel_iommu_group_id,
2755 .enable_x2apic = intel_iommu_enable_eim,
2756 .disable_x2apic = intel_iommu_disable_eim,
2757 .update_ire_from_apic = io_apic_write_remap_rte,
2758 .update_ire_from_msi = msi_msg_write_remap_rte,
2759 .read_apic_from_ire = io_apic_read_remap_rte,
2760 .read_msi_from_ire = msi_msg_read_remap_rte,
2761 .setup_hpet_msi = intel_setup_hpet_msi,
2762 .adjust_irq_affinities = adjust_vtd_irq_affinities,
2763 .suspend = vtd_suspend,
2764 .resume = vtd_resume,
2765 .share_p2m = iommu_set_pgd,
2766 .crash_shutdown = vtd_crash_shutdown,
2767 .iotlb_flush = iommu_flush_iotlb_pages,
2768 .iotlb_flush_all = iommu_flush_iotlb_all,
2769 .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
2770 .dump_p2m_table = vtd_dump_p2m_table,
2771 .sync_cache = sync_cache,
2772 };
2773
2774 const struct iommu_init_ops __initconstrel intel_iommu_init_ops = {
2775 .ops = &intel_iommu_ops,
2776 .setup = vtd_setup,
2777 .supports_x2apic = intel_iommu_supports_eim,
2778 };
2779
2780 /*
2781 * Local variables:
2782 * mode: C
2783 * c-file-style: "BSD"
2784 * c-basic-offset: 4
2785 * tab-width: 4
2786 * indent-tabs-mode: nil
2787 * End:
2788 */
2789