/*
 * pci.c: HVM PCI setup.
 *
 * Leendert van Doorn, leendert@watson.ibm.com
 * Copyright (c) 2005, International Business Machines Corporation.
 *
 * Copyright (c) 2006, Keir Fraser, XenSource Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include "util.h"
#include "hypercall.h"
#include "config.h"
#include "pci_regs.h"

#include <xen/memory.h>
#include <xen/hvm/ioreq.h>
#include <xen/hvm/hvm_xs_strings.h>
#include <xen/hvm/e820.h>

unsigned long pci_mem_start = HVM_BELOW_4G_MMIO_START;
unsigned long pci_mem_end = PCI_MEM_END;
uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;

enum virtual_vga virtual_vga = VGA_none;
unsigned long igd_opregion_pgbase = 0;

/* Check if the specified range conflicts with any reserved device memory. */
static bool check_overlap_all(uint64_t start, uint64_t size)
{
    unsigned int i;

    for ( i = 0; i < memory_map.nr_map; i++ )
    {
        if ( memory_map.map[i].type == E820_RESERVED &&
             check_overlap(start, size,
                           memory_map.map[i].addr,
                           memory_map.map[i].size) )
            return true;
    }

    return false;
}

/* Find the lowest RMRR ending above base but below 4G. */
static int find_next_rmrr(uint32_t base)
{
    unsigned int i;
    int next_rmrr = -1;
    uint64_t end, min_end = GB(4);

    for ( i = 0; i < memory_map.nr_map ; i++ )
    {
        end = memory_map.map[i].addr + memory_map.map[i].size;

        if ( memory_map.map[i].type == E820_RESERVED &&
             end > base && end <= min_end )
        {
            next_rmrr = i;
            min_end = end;
        }
    }

    return next_rmrr;
}

void pci_setup(void)
{
    uint8_t is_64bar, using_64bar, bar64_relocate = 0;
    uint32_t devfn, bar_reg, cmd, bar_data, bar_data_upper;
    uint64_t base, bar_sz, bar_sz_upper, mmio_total = 0;
    uint32_t vga_devfn = 256;
    uint16_t class, vendor_id, device_id;
    unsigned int bar, pin, link, isa_irq;
    uint8_t pci_devfn_decode_type[256] = {};

    /* Resources assignable to PCI devices via BARs. */
    struct resource {
        uint64_t base, max;
    } *resource, mem_resource, high_mem_resource, io_resource;

    /* Create a list of device BARs in descending order of size. */
    struct bars {
        uint32_t is_64bar;
        uint32_t devfn;
        uint32_t bar_reg;
        uint64_t bar_sz;
    } *bars = (struct bars *)scratch_start;
    unsigned int i, nr_bars = 0;
    uint64_t mmio_hole_size = 0;

    const char *s;
    /*
     * Do we allow hvmloader to relocate guest memory in order to
     * increase the size of the lowmem MMIO hole? Defaulting to 1
     * here means that, for non-libxl toolstacks (including xend and
     * home-grown ones), those using qemu-xen will still experience
     * the memory relocation bug described below; but it also means
     * that those using qemu-traditional will *not* experience any
     * change; and there is a work-around for those using qemu-xen,
     * namely switching to qemu-traditional.
     *
     * If we defaulted to 0, and failing to resize the hole caused
     * problems for qemu-traditional users, there would be no
     * work-around.
     *
     * Since xend can only use qemu-traditional, I think this is the
     * option that will have the least impact.
     */
    bool allow_memory_relocate = 1;

    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_IO !=
                 PCI_COMMAND_IO);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MEMORY !=
                 PCI_COMMAND_MEMORY);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MASTER !=
                 PCI_COMMAND_MASTER);

    s = xenstore_read(HVM_XS_ALLOW_MEMORY_RELOCATE, NULL);
    if ( s )
        allow_memory_relocate = strtoll(s, NULL, 0);
    printf("Relocating guest memory for lowmem MMIO space %s\n",
           allow_memory_relocate?"enabled":"disabled");

    s = xenstore_read("platform/mmio_hole_size", NULL);
    if ( s )
        mmio_hole_size = strtoll(s, NULL, 0);

    /* Program PCI-ISA bridge with appropriate link routes. */
    isa_irq = 0;
    for ( link = 0; link < 4; link++ )
    {
        do { isa_irq = (isa_irq + 1) & 15;
        } while ( !(PCI_ISA_IRQ_MASK & (1U << isa_irq)) );
        pci_writeb(PCI_ISA_DEVFN, 0x60 + link, isa_irq);
        printf("PCI-ISA link %u routed to IRQ%u\n", link, isa_irq);
    }

    /* Program ELCR to match PCI-wired IRQs. */
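    /*
     * The ELCR lives at I/O ports 0x4d0/0x4d1 and selects, per ISA IRQ,
     * edge (0) or level (1) trigger mode. PCI interrupts routed through
     * the PIC must be level-triggered, so set exactly the IRQs named in
     * PCI_ISA_IRQ_MASK.
     */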
    outb(0x4d0, (uint8_t)(PCI_ISA_IRQ_MASK >> 0));
    outb(0x4d1, (uint8_t)(PCI_ISA_IRQ_MASK >> 8));

    /* Scan the PCI bus and map resources. */
    for ( devfn = 0; devfn < 256; devfn++ )
    {
        class     = pci_readw(devfn, PCI_CLASS_DEVICE);
        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
        device_id = pci_readw(devfn, PCI_DEVICE_ID);
        if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
            continue;

        ASSERT((devfn != PCI_ISA_DEVFN) ||
               ((vendor_id == 0x8086) && (device_id == 0x7000)));

        switch ( class )
        {
        case 0x0300:
            /* If emulated VGA is found, preserve it as primary VGA. */
            if ( (vendor_id == 0x1234) && (device_id == 0x1111) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_std;
            }
            else if ( (vendor_id == 0x1013) && (device_id == 0xb8) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_cirrus;
            }
            else if ( virtual_vga == VGA_none )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_pt;
                if ( vendor_id == 0x8086 )
                {
                    igd_opregion_pgbase = mem_hole_alloc(IGD_OPREGION_PAGES);
                    /*
                     * Write the OpRegion offset to give the OpRegion
                     * address to the device model. The device model will
                     * trap and map the OpRegion at the given address.
                     */
                    pci_writel(vga_devfn, PCI_INTEL_OPREGION,
                               igd_opregion_pgbase << PAGE_SHIFT);
                }
            }
            break;
        case 0x0680:
            /* PIIX4 ACPI PM. Special device with special PCI config space. */
            ASSERT((vendor_id == 0x8086) && (device_id == 0x7113));
            pci_writew(devfn, 0x20, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0xd2, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0x22, 0x0000);
            pci_writew(devfn, 0x3c, 0x0009); /* Hardcoded IRQ9 */
            pci_writew(devfn, 0x3d, 0x0001);
            pci_writel(devfn, 0x40, ACPI_PM1A_EVT_BLK_ADDRESS_V1 | 1);
            pci_writeb(devfn, 0x80, 0x01); /* enable PM io space */
            break;
        case 0x0101:
            if ( vendor_id == 0x8086 )
            {
                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
            }
            break;
        }

        /*
         * It is recommended that BAR programming be done whilst decode
         * bits are cleared to avoid incorrect mappings being created.
         * When a 64-bit memory BAR is programmed by writing the lower
         * half first and then the upper half, the intermediate value can
         * map to an address under 4G; as soon as the lower half is
         * written, it replaces any RAM mapped at that address, and that
         * RAM mapping is not restored once the upper half is written and
         * the BAR is correctly mapped to its intended high memory address.
         */
        cmd = pci_readw(devfn, PCI_COMMAND);
        cmd &= ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
        pci_writew(devfn, PCI_COMMAND, cmd);

        /* Map the I/O memory and port resources. */
        for ( bar = 0; bar < 7; bar++ )
        {
            bar_sz_upper = 0;
            bar_reg = PCI_BASE_ADDRESS_0 + 4*bar;
            if ( bar == 6 )
                bar_reg = PCI_ROM_ADDRESS;

            bar_data = pci_readl(devfn, bar_reg);
            if ( bar_reg != PCI_ROM_ADDRESS )
            {
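                /*
                 * Per the PCI spec, bit 0 of a BAR distinguishes I/O (1)
                 * from memory (0) BARs, and a memory BAR decodes 64 bits
                 * iff its type field (bits 2:1) reads 10b.
                 */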
                is_64bar = !!((bar_data & (PCI_BASE_ADDRESS_SPACE |
                             PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
                             (PCI_BASE_ADDRESS_SPACE_MEMORY |
                             PCI_BASE_ADDRESS_MEM_TYPE_64));
                pci_writel(devfn, bar_reg, ~0);
            }
            else
            {
                is_64bar = 0;
                pci_writel(devfn, bar_reg,
                           (bar_data | PCI_ROM_ADDRESS_MASK) &
                           ~PCI_ROM_ADDRESS_ENABLE);
            }
            bar_sz = pci_readl(devfn, bar_reg);
            pci_writel(devfn, bar_reg, bar_data);

            if ( bar_reg != PCI_ROM_ADDRESS )
                bar_sz &= (((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                            PCI_BASE_ADDRESS_SPACE_MEMORY) ?
                           PCI_BASE_ADDRESS_MEM_MASK :
                           (PCI_BASE_ADDRESS_IO_MASK & 0xffff));
            else
                bar_sz &= PCI_ROM_ADDRESS_MASK;
            if ( is_64bar )
            {
                bar_data_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, ~0);
                bar_sz_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, bar_data_upper);
                bar_sz = (bar_sz_upper << 32) | bar_sz;
            }
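            /*
             * bar_sz now holds the value read back after writing all-ones:
             * the size-alignment bits the device hard-wires to zero.
             * Isolating the lowest set bit below yields the BAR size, which
             * is always a power of two. E.g. a 16MiB memory BAR reads back
             * 0xff000000, and 0xff000000 & ~(0xff000000 - 1) == 0x01000000.
             */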
            bar_sz &= ~(bar_sz - 1);
            if ( bar_sz == 0 )
                continue;

            for ( i = 0; i < nr_bars; i++ )
                if ( bars[i].bar_sz < bar_sz )
                    break;

            if ( i != nr_bars )
                memmove(&bars[i+1], &bars[i], (nr_bars-i) * sizeof(*bars));

            bars[i].is_64bar = is_64bar;
            bars[i].devfn = devfn;
            bars[i].bar_reg = bar_reg;
            bars[i].bar_sz = bar_sz;

            if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                  PCI_BASE_ADDRESS_SPACE_MEMORY) ||
                 (bar_reg == PCI_ROM_ADDRESS) )
                mmio_total += bar_sz;

            nr_bars++;

            /* The upper half is already calculated; skip it! */
            if ( is_64bar )
                bar++;
        }

        /* Map the interrupt. */
        pin = pci_readb(devfn, PCI_INTERRUPT_PIN);
        if ( pin != 0 )
        {
            /* This is the barber's pole mapping used by Xen. */
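            /*
             * E.g. the device in slot 3 (devfn 0x18) asserting INTA#
             * (pin 1) is routed via link ((1 - 1) + 3) & 3 = 3.
             */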
            link = ((pin - 1) + (devfn >> 3)) & 3;
            isa_irq = pci_readb(PCI_ISA_DEVFN, 0x60 + link);
            pci_writeb(devfn, PCI_INTERRUPT_LINE, isa_irq);
            printf("pci dev %02x:%x INT%c->IRQ%u\n",
                   devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
        }

        /* Enable bus master for this function later */
        pci_devfn_decode_type[devfn] = PCI_COMMAND_MASTER;
    }

    if ( mmio_hole_size )
    {
        uint64_t max_ram_below_4g = GB(4) - mmio_hole_size;

        if ( max_ram_below_4g > HVM_BELOW_4G_MMIO_START )
        {
            printf("max_ram_below_4g=0x"PRIllx
                   " too big for mmio_hole_size=0x"PRIllx
                   "; request ignored.\n",
                   PRIllx_arg(max_ram_below_4g),
                   PRIllx_arg(mmio_hole_size));
        }
        else
        {
            pci_mem_start = max_ram_below_4g;
            printf("pci_mem_start=0x%lx (was 0x%x) for mmio_hole_size=%lu\n",
                   pci_mem_start, HVM_BELOW_4G_MMIO_START,
                   (long)mmio_hole_size);
        }
    }
    else
    {
        /*
         * At the moment qemu-xen can't deal with relocated memory regions.
         * It's too close to the release to make a proper fix; for now,
         * only allow the MMIO hole to grow large enough to move guest memory
         * if we're running qemu-traditional. Items that don't fit will be
         * relocated into the 64-bit address space.
         *
         * This loop now does the following:
         * - If allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB
         * - If !allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB, or until it overlaps guest
         *   memory
         */
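        /*
         * Note that hvmloader runs as 32-bit code, so "pci_mem_start <<= 1"
         * walks the hole start down through 0xf0000000, 0xe0000000,
         * 0xc0000000 and 0x80000000; one more shift wraps to zero, which
         * the "(pci_mem_start << 1) != 0" guard below catches.
         */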
        while ( (mmio_total > (pci_mem_end - pci_mem_start))
                && ((pci_mem_start << 1) != 0)
                && (allow_memory_relocate
                    || (((pci_mem_start << 1) >> PAGE_SHIFT)
                        >= hvm_info->low_mem_pgend)) )
            pci_mem_start <<= 1;

        /*
         * Try to accommodate RMRRs in our MMIO region on a best-effort basis.
         * If we have RMRRs in the range, then make pci_mem_start just after
         * hvm_info->low_mem_pgend.
         */
        if ( pci_mem_start > (hvm_info->low_mem_pgend << PAGE_SHIFT) &&
             check_overlap_all(pci_mem_start, pci_mem_end-pci_mem_start) )
            pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
    }

    if ( mmio_total > (pci_mem_end - pci_mem_start) )
    {
        printf("Low MMIO hole not large enough for all devices,"
               " relocating some BARs to 64-bit\n");
        bar64_relocate = 1;
    }

    /* Relocate RAM that overlaps PCI space (in 64k-page chunks). */
    while ( (pci_mem_start >> PAGE_SHIFT) < hvm_info->low_mem_pgend )
    {
        struct xen_add_to_physmap xatp;
        unsigned int nr_pages = min_t(
            unsigned int,
            hvm_info->low_mem_pgend - (pci_mem_start >> PAGE_SHIFT),
            (1u << 16) - 1);
        if ( hvm_info->high_mem_pgend == 0 )
            hvm_info->high_mem_pgend = 1ull << (32 - PAGE_SHIFT);
        hvm_info->low_mem_pgend -= nr_pages;
        printf("Relocating 0x%x pages from "PRIllx" to "PRIllx\
               " for lowmem MMIO hole\n",
               nr_pages,
               PRIllx_arg(((uint64_t)hvm_info->low_mem_pgend)<<PAGE_SHIFT),
               PRIllx_arg(((uint64_t)hvm_info->high_mem_pgend)<<PAGE_SHIFT));
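        /*
         * XENMAPSPACE_gmfn_range remaps xatp.size guest frames starting at
         * xatp.idx so that they appear starting at xatp.gpfn, moving this
         * chunk of RAM out of the low MMIO hole and into high memory.
         */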
        xatp.domid = DOMID_SELF;
        xatp.space = XENMAPSPACE_gmfn_range;
        xatp.idx = hvm_info->low_mem_pgend;
        xatp.gpfn = hvm_info->high_mem_pgend;
        xatp.size = nr_pages;
        if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
            BUG();
        hvm_info->high_mem_pgend += nr_pages;
    }

    /* Sync memory map[] if necessary. */
    adjust_memory_map();

    high_mem_resource.base = ((uint64_t)hvm_info->high_mem_pgend) << PAGE_SHIFT;
    if ( high_mem_resource.base < GB(4) )
    {
        if ( hvm_info->high_mem_pgend != 0 )
            printf("WARNING: hvm_info->high_mem_pgend %x"
                   " does not point into high memory!\n",
                   hvm_info->high_mem_pgend);
        high_mem_resource.base = GB(4);
    }
    printf("%sRAM in high memory; setting high_mem resource base to "PRIllx"\n",
           hvm_info->high_mem_pgend?"":"No ",
           PRIllx_arg(high_mem_resource.base));
    high_mem_resource.max = 1ull << cpu_phys_addr();
    mem_resource.base = pci_mem_start;
    mem_resource.max = pci_mem_end;
    io_resource.base = 0xc000;
    io_resource.max = 0x10000;

    /* Assign iomem and ioport resources in descending order of size. */
    for ( i = 0; i < nr_bars; i++ )
    {
        devfn   = bars[i].devfn;
        bar_reg = bars[i].bar_reg;
        bar_sz  = bars[i].bar_sz;

        /*
         * Relocate to high memory if the total amount of MMIO needed
         * is more than the low MMIO available. Because devices are
         * processed in order of bar_sz, this will preferentially
         * relocate larger devices to high memory first.
         *
         * NB: The code here is rather fragile, as the check here to see
         * whether bar_sz will fit in the low MMIO region doesn't match the
         * real check made below, which involves aligning the base offset of the
         * bar with the size of the bar itself. As it happens, this will always
         * be satisfied because:
         * - The first one will succeed because the MMIO hole can only start at
         *   0x{f,e,c,8}0000000. If it fits, it will be aligned properly.
         * - All subsequent ones will be aligned because the list is ordered
         *   large to small, and bar_sz is always a power of 2. (At least
         *   the code here assumes it to be.)
         * Should either of those two conditions change, this code will break.
         */
        using_64bar = bars[i].is_64bar && bar64_relocate
            && (mmio_total > (mem_resource.max - mem_resource.base));
        bar_data = pci_readl(devfn, bar_reg);

        if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
             PCI_BASE_ADDRESS_SPACE_MEMORY )
        {
            /* Map into high memory if the device has a relocatable 64-bit BAR. */
            if ( using_64bar )
            {
                if ( high_mem_resource.base & (bar_sz - 1) )
                    high_mem_resource.base = high_mem_resource.base -
                        (high_mem_resource.base & (bar_sz - 1)) + bar_sz;
                if ( !pci_hi_mem_start )
                    pci_hi_mem_start = high_mem_resource.base;
                resource = &high_mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            else
            {
                resource = &mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            mmio_total -= bar_sz;
        }
        else
        {
            resource = &io_resource;
            bar_data &= ~PCI_BASE_ADDRESS_IO_MASK;
        }

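        /*
         * Round the resource base up to the BAR's own size: a BAR's size
         * is a power of two and the BAR must be naturally aligned.
         */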
        base = (resource->base + bar_sz - 1) & ~(uint64_t)(bar_sz - 1);

        /* If we're using mem_resource, check for RMRR conflicts. */
        if ( resource == &mem_resource )
        {
            int next_rmrr = find_next_rmrr(base);

            while ( next_rmrr >= 0 &&
                    check_overlap(base, bar_sz,
                                  memory_map.map[next_rmrr].addr,
                                  memory_map.map[next_rmrr].size) )
            {
                base = memory_map.map[next_rmrr].addr +
                       memory_map.map[next_rmrr].size;
                base = (base + bar_sz - 1) & ~(bar_sz - 1);
                next_rmrr = find_next_rmrr(base);
            }
        }

        bar_data |= (uint32_t)base;
        bar_data_upper = (uint32_t)(base >> 32);
        base += bar_sz;

        if ( (base < resource->base) || (base > resource->max) )
        {
            printf("pci dev %02x:%x bar %02x size "PRIllx": no space for "
                   "resource!\n", devfn>>3, devfn&7, bar_reg,
                   PRIllx_arg(bar_sz));
            continue;
        }

        resource->base = base;

        pci_writel(devfn, bar_reg, bar_data);
        if ( using_64bar )
            pci_writel(devfn, bar_reg + 4, bar_data_upper);
        printf("pci dev %02x:%x bar %02x size "PRIllx": %x%08x\n",
               devfn>>3, devfn&7, bar_reg,
               PRIllx_arg(bar_sz),
               bar_data_upper, bar_data);

        if ( (bar_reg == PCI_ROM_ADDRESS) ||
             ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
              PCI_BASE_ADDRESS_SPACE_MEMORY) )
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_MEMORY;
        else
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_IO;
    }

    if ( pci_hi_mem_start )
    {
        /*
         * Make the end address's alignment match that of the start address,
         * so that fewer variable range MTRRs are needed to cover the range.
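         *
         * E.g. if pci_hi_mem_start is 0x140000000 (lowest set bit 1GiB)
         * and allocation finished at 0x190000000, the end rounds up to
         * 0x1c0000000.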
         */
        pci_hi_mem_end = ((high_mem_resource.base - 1) |
                          ((pci_hi_mem_start & -pci_hi_mem_start) - 1)) + 1;
    }

    if ( vga_devfn != 256 )
    {
        /*
         * VGA registers live in I/O space so ensure that primary VGA
         * has IO enabled, even if there is no I/O BAR on that
         * particular device.
         */
        pci_devfn_decode_type[vga_devfn] |= PCI_COMMAND_IO;
    }

    /* Enable bus master, memory and I/O decode for all valid functions. */
    for ( devfn = 0; devfn < 256; devfn++ )
        if ( pci_devfn_decode_type[devfn] )
        {
            cmd = pci_readw(devfn, PCI_COMMAND);
            cmd |= pci_devfn_decode_type[devfn];
            pci_writew(devfn, PCI_COMMAND, cmd);
        }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */