1 /*
2  * pci.c: HVM PCI setup.
3  *
4  * Leendert van Doorn, leendert@watson.ibm.com
5  * Copyright (c) 2005, International Business Machines Corporation.
6  *
7  * Copyright (c) 2006, Keir Fraser, XenSource Inc.
8  *
9  * This program is free software; you can redistribute it and/or modify it
10  * under the terms and conditions of the GNU General Public License,
11  * version 2, as published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope it will be useful, but WITHOUT
14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
16  * more details.
17  *
18  * You should have received a copy of the GNU General Public License along with
19  * this program; If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "util.h"
23 #include "hypercall.h"
24 #include "config.h"
25 #include "pci_regs.h"
26 
27 #include <xen/memory.h>
28 #include <xen/hvm/ioreq.h>
29 #include <xen/hvm/hvm_xs_strings.h>
30 #include <xen/hvm/e820.h>
31 
32 unsigned long pci_mem_start = HVM_BELOW_4G_MMIO_START;
33 unsigned long pci_mem_end = PCI_MEM_END;
34 uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;
35 
36 enum virtual_vga virtual_vga = VGA_none;
37 unsigned long igd_opregion_pgbase = 0;
38 
39 /* Check if the specified range conflicts with any reserved device memory. */
check_overlap_all(uint64_t start,uint64_t size)40 static bool check_overlap_all(uint64_t start, uint64_t size)
41 {
42     unsigned int i;
43 
44     for ( i = 0; i < memory_map.nr_map; i++ )
45     {
46         if ( memory_map.map[i].type == E820_RESERVED &&
47              check_overlap(start, size,
48                            memory_map.map[i].addr,
49                            memory_map.map[i].size) )
50             return true;
51     }
52 
53     return false;
54 }
55 
56 /* Find the lowest RMRR ending above base but below 4G. */
find_next_rmrr(uint32_t base)57 static int find_next_rmrr(uint32_t base)
58 {
59     unsigned int i;
60     int next_rmrr = -1;
61     uint64_t end, min_end = GB(4);
62 
63     for ( i = 0; i < memory_map.nr_map ; i++ )
64     {
65         end = memory_map.map[i].addr + memory_map.map[i].size;
66 
67         if ( memory_map.map[i].type == E820_RESERVED &&
68              end > base && end <= min_end )
69         {
70             next_rmrr = i;
71             min_end = end;
72         }
73     }
74 
75     return next_rmrr;
76 }
77 
/*
 * Scan the emulated PCI bus and set it up for the guest:
 *  - route the four PCI-ISA interrupt links and program the ELCR,
 *  - apply per-device quirks (VGA detection, PIIX4 ACPI, IDE legacy mode),
 *  - size every BAR, then assign MMIO and I/O port resources in
 *    descending order of BAR size (growing/relocating the low MMIO hole
 *    and spilling 64-bit BARs above 4G when needed),
 *  - finally enable bus mastering and the appropriate decode bits.
 */
void pci_setup(void)
{
    uint8_t is_64bar, using_64bar, bar64_relocate = 0;
    uint32_t devfn, bar_reg, cmd, bar_data, bar_data_upper;
    uint64_t base, bar_sz, bar_sz_upper, mmio_total = 0;
    /* 256 is out of range for a devfn, so it acts as "no VGA found". */
    uint32_t vga_devfn = 256;
    uint16_t class, vendor_id, device_id;
    unsigned int bar, pin, link, isa_irq;
    /* Per-devfn PCI_COMMAND bits to set at the end (0 == absent device). */
    uint8_t pci_devfn_decode_type[256] = {};

    /* Resources assignable to PCI devices via BARs. */
    struct resource {
        uint64_t base, max;
    } *resource, mem_resource, high_mem_resource, io_resource;

    /*
     * Create a list of device BARs in descending order of size.
     * NOTE(review): the list is built in the scratch heap; no explicit
     * bound check against the scratch area's size — presumed ample.
     */
    struct bars {
        uint32_t is_64bar;
        uint32_t devfn;
        uint32_t bar_reg;
        uint64_t bar_sz;
    } *bars = (struct bars *)scratch_start;
    unsigned int i, nr_bars = 0;
    uint64_t mmio_hole_size = 0;

    const char *s;
    /*
     * Do we allow hvmloader to relocate guest memory in order to
     * increase the size of the lowmem MMIO hole?  Defaulting to 1
     * here will mean that non-libxl toolstacks (including xend and
     * home-grown ones) means that those using qemu-xen will still
     * experience the memory relocation bug described below; but it
     * also means that those using qemu-traditional will *not*
     * experience any change; and it also means that there is a
     * work-around for those using qemu-xen, namely switching to
     * qemu-traditional.
     *
     * If we defaulted to 0, and failing to resize the hole caused any
     * problems with qemu-traditional, then there is no work-around.
     *
     * Since xend can only use qemu-traditional, I think this is the
     * option that will have the least impact.
     */
    bool allow_memory_relocate = 1;

    /* The decode-type array is uint8_t; the command bits must fit. */
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_IO !=
                 PCI_COMMAND_IO);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MEMORY !=
                 PCI_COMMAND_MEMORY);
    BUILD_BUG_ON((typeof(*pci_devfn_decode_type))PCI_COMMAND_MASTER !=
                 PCI_COMMAND_MASTER);

    /* Toolstack overrides via xenstore (absent keys keep the defaults). */
    s = xenstore_read(HVM_XS_ALLOW_MEMORY_RELOCATE, NULL);
    if ( s )
        allow_memory_relocate = strtoll(s, NULL, 0);
    printf("Relocating guest memory for lowmem MMIO space %s\n",
           allow_memory_relocate?"enabled":"disabled");

    s = xenstore_read("platform/mmio_hole_size", NULL);
    if ( s )
        mmio_hole_size = strtoll(s, NULL, 0);

    /* Program PCI-ISA bridge with appropriate link routes. */
    isa_irq = 0;
    for ( link = 0; link < 4; link++ )
    {
        /* Pick the next IRQ (mod 16) permitted by PCI_ISA_IRQ_MASK. */
        do { isa_irq = (isa_irq + 1) & 15;
        } while ( !(PCI_ISA_IRQ_MASK & (1U << isa_irq)) );
        pci_writeb(PCI_ISA_DEVFN, 0x60 + link, isa_irq);
        printf("PCI-ISA link %u routed to IRQ%u\n", link, isa_irq);
    }

    /* Program ELCR to match PCI-wired IRQs (level-triggered). */
    outb(0x4d0, (uint8_t)(PCI_ISA_IRQ_MASK >> 0));
    outb(0x4d1, (uint8_t)(PCI_ISA_IRQ_MASK >> 8));

    /* Scan the PCI bus and map resources. */
    for ( devfn = 0; devfn < 256; devfn++ )
    {
        class     = pci_readw(devfn, PCI_CLASS_DEVICE);
        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
        device_id = pci_readw(devfn, PCI_DEVICE_ID);
        /* All-ones vendor+device means no device at this devfn. */
        if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
            continue;

        /* The ISA bridge slot must hold the emulated PIIX3 (8086:7000). */
        ASSERT((devfn != PCI_ISA_DEVFN) ||
               ((vendor_id == 0x8086) && (device_id == 0x7000)));

        switch ( class )
        {
        case 0x0300:
            /* If emulated VGA is found, preserve it as primary VGA. */
            if ( (vendor_id == 0x1234) && (device_id == 0x1111) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_std;
            }
            else if ( (vendor_id == 0x1013) && (device_id == 0xb8) )
            {
                vga_devfn = devfn;
                virtual_vga = VGA_cirrus;
            }
            else if ( virtual_vga == VGA_none )
            {
                /* Passed-through VGA; Intel devices get an IGD OpRegion. */
                vga_devfn = devfn;
                virtual_vga = VGA_pt;
                if ( vendor_id == 0x8086 )
                {
                    igd_opregion_pgbase = mem_hole_alloc(IGD_OPREGION_PAGES);
                    /*
                     * Write the OpRegion offset to give the OpRegion
                     * address to the device model. The device model will trap
                     * and map the OpRegion at the given address.
                     */
                    pci_writel(vga_devfn, PCI_INTEL_OPREGION,
                               igd_opregion_pgbase << PAGE_SHIFT);
                }
            }
            break;
        case 0x0680:
            /* PIIX4 ACPI PM. Special device with special PCI config space. */
            ASSERT((vendor_id == 0x8086) && (device_id == 0x7113));
            pci_writew(devfn, 0x20, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0xd2, 0x0000); /* No smb bus IO enable */
            pci_writew(devfn, 0x22, 0x0000);
            pci_writew(devfn, 0x3c, 0x0009); /* Hardcoded IRQ9 */
            pci_writew(devfn, 0x3d, 0x0001);
            pci_writel(devfn, 0x40, ACPI_PM1A_EVT_BLK_ADDRESS_V1 | 1);
            pci_writeb(devfn, 0x80, 0x01); /* enable PM io space */
            break;
        case 0x0101:
            if ( vendor_id == 0x8086 )
            {
                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
            }
            break;
        }

        /*
         * It is recommended that BAR programming be done whilst decode
         * bits are cleared to avoid incorrect mappings being created.
         * When 64-bit memory BAR is programmed, first by writing the
         * lower half and then the upper half, which maps to an address
         * under 4G, as soon as lower half is written, replacing any RAM
         * mapped in that address, which is not restored back after the
         * upper half is written and PCI memory is correctly mapped to
         * its intended high mem address.
         */
        cmd = pci_readw(devfn, PCI_COMMAND);
        cmd &= ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
        pci_writew(devfn, PCI_COMMAND, cmd);

        /* Map the I/O memory and port resources. */
        for ( bar = 0; bar < 7; bar++ )
        {
            bar_sz_upper = 0;
            bar_reg = PCI_BASE_ADDRESS_0 + 4*bar;
            if ( bar == 6 )
                bar_reg = PCI_ROM_ADDRESS; /* 7th "BAR" is the expansion ROM */

            /*
             * Standard BAR sizing: save the value, write all-ones, read
             * back the size mask, then restore the saved value.
             */
            bar_data = pci_readl(devfn, bar_reg);
            if ( bar_reg != PCI_ROM_ADDRESS )
            {
                is_64bar = !!((bar_data & (PCI_BASE_ADDRESS_SPACE |
                             PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
                             (PCI_BASE_ADDRESS_SPACE_MEMORY |
                             PCI_BASE_ADDRESS_MEM_TYPE_64));
                pci_writel(devfn, bar_reg, ~0);
            }
            else
            {
                is_64bar = 0;
                pci_writel(devfn, bar_reg,
                           (bar_data | PCI_ROM_ADDRESS_MASK) &
                           ~PCI_ROM_ADDRESS_ENABLE);
            }
            bar_sz = pci_readl(devfn, bar_reg);
            pci_writel(devfn, bar_reg, bar_data);

            if ( bar_reg != PCI_ROM_ADDRESS )
                bar_sz &= (((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                            PCI_BASE_ADDRESS_SPACE_MEMORY) ?
                           PCI_BASE_ADDRESS_MEM_MASK :
                           (PCI_BASE_ADDRESS_IO_MASK & 0xffff));
            else
                bar_sz &= PCI_ROM_ADDRESS_MASK;
            if (is_64bar) {
                /* Size the upper half of a 64-bit BAR the same way. */
                bar_data_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, ~0);
                bar_sz_upper = pci_readl(devfn, bar_reg + 4);
                pci_writel(devfn, bar_reg + 4, bar_data_upper);
                bar_sz = (bar_sz_upper << 32) | bar_sz;
            }
            /* Isolate the lowest set bit: BAR size is a power of two. */
            bar_sz &= ~(bar_sz - 1);
            if ( bar_sz == 0 )
                continue;

            /* Insertion sort: keep bars[] in descending size order. */
            for ( i = 0; i < nr_bars; i++ )
                if ( bars[i].bar_sz < bar_sz )
                    break;

            if ( i != nr_bars )
                memmove(&bars[i+1], &bars[i], (nr_bars-i) * sizeof(*bars));

            bars[i].is_64bar = is_64bar;
            bars[i].devfn   = devfn;
            bars[i].bar_reg = bar_reg;
            bars[i].bar_sz  = bar_sz;

            /* Only memory (and ROM) BARs count towards the MMIO budget. */
            if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
                  PCI_BASE_ADDRESS_SPACE_MEMORY) ||
                 (bar_reg == PCI_ROM_ADDRESS) )
                mmio_total += bar_sz;

            nr_bars++;

            /* The upper half is already calculated, skip it! */
            if (is_64bar)
                bar++;
        }

        /* Map the interrupt. */
        pin = pci_readb(devfn, PCI_INTERRUPT_PIN);
        if ( pin != 0 )
        {
            /* This is the barber's pole mapping used by Xen. */
            link = ((pin - 1) + (devfn >> 3)) & 3;
            isa_irq = pci_readb(PCI_ISA_DEVFN, 0x60 + link);
            pci_writeb(devfn, PCI_INTERRUPT_LINE, isa_irq);
            printf("pci dev %02x:%x INT%c->IRQ%u\n",
                   devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
        }

        /* Enable bus master for this function later */
        pci_devfn_decode_type[devfn] = PCI_COMMAND_MASTER;
    }

    if ( mmio_hole_size )
    {
        /* Toolstack requested an explicit hole size; honour if sane. */
        uint64_t max_ram_below_4g = GB(4) - mmio_hole_size;

        if ( max_ram_below_4g > HVM_BELOW_4G_MMIO_START )
        {
            printf("max_ram_below_4g=0x"PRIllx
                   " too big for mmio_hole_size=0x"PRIllx
                   " has been ignored.\n",
                   PRIllx_arg(max_ram_below_4g),
                   PRIllx_arg(mmio_hole_size));
        }
        else
        {
            pci_mem_start = max_ram_below_4g;
            printf("pci_mem_start=0x%lx (was 0x%x) for mmio_hole_size=%lu\n",
                   pci_mem_start, HVM_BELOW_4G_MMIO_START,
                   (long)mmio_hole_size);
        }
    }
    else
    {
        /*
         * At the moment qemu-xen can't deal with relocated memory regions.
         * It's too close to the release to make a proper fix; for now,
         * only allow the MMIO hole to grow large enough to move guest memory
         * if we're running qemu-traditional.  Items that don't fit will be
         * relocated into the 64-bit address space.
         *
         * This loop now does the following:
         * - If allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB
         * - If !allow_memory_relocate, increase the MMIO hole until it's
         *   big enough, or until it's 2GiB, or until it overlaps guest
         *   memory
         */
        while ( (mmio_total > (pci_mem_end - pci_mem_start))
                && ((pci_mem_start << 1) != 0)
                && (allow_memory_relocate
                    || (((pci_mem_start << 1) >> PAGE_SHIFT)
                        >= hvm_info->low_mem_pgend)) )
            pci_mem_start <<= 1; /* halve the hole base => double the hole */

        /*
         * Try to accommodate RMRRs in our MMIO region on a best-effort basis.
         * If we have RMRRs in the range, then make pci_mem_start just after
         * hvm_info->low_mem_pgend.
         */
        if ( pci_mem_start > (hvm_info->low_mem_pgend << PAGE_SHIFT) &&
             check_overlap_all(pci_mem_start, pci_mem_end-pci_mem_start) )
            pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
    }

    if ( mmio_total > (pci_mem_end - pci_mem_start) )
    {
        printf("Low MMIO hole not large enough for all devices,"
               " relocating some BARs to 64-bit\n");
        bar64_relocate = 1;
    }

    /* Relocate RAM that overlaps PCI space (in 64k-page chunks). */
    while ( (pci_mem_start >> PAGE_SHIFT) < hvm_info->low_mem_pgend )
    {
        struct xen_add_to_physmap xatp;
        /* xatp.size is 16 bits wide, so move at most 2^16-1 pages at once. */
        unsigned int nr_pages = min_t(
            unsigned int,
            hvm_info->low_mem_pgend - (pci_mem_start >> PAGE_SHIFT),
            (1u << 16) - 1);
        if ( hvm_info->high_mem_pgend == 0 )
            hvm_info->high_mem_pgend = 1ull << (32 - PAGE_SHIFT);
        hvm_info->low_mem_pgend -= nr_pages;
        printf("Relocating 0x%x pages from "PRIllx" to "PRIllx\
               " for lowmem MMIO hole\n",
               nr_pages,
               PRIllx_arg(((uint64_t)hvm_info->low_mem_pgend)<<PAGE_SHIFT),
               PRIllx_arg(((uint64_t)hvm_info->high_mem_pgend)<<PAGE_SHIFT));
        xatp.domid = DOMID_SELF;
        xatp.space = XENMAPSPACE_gmfn_range;
        xatp.idx   = hvm_info->low_mem_pgend;
        xatp.gpfn  = hvm_info->high_mem_pgend;
        xatp.size  = nr_pages;
        if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
            BUG();
        hvm_info->high_mem_pgend += nr_pages;
    }

    /* Sync memory map[] if necessary. */
    adjust_memory_map();

    high_mem_resource.base = ((uint64_t)hvm_info->high_mem_pgend) << PAGE_SHIFT;
    if ( high_mem_resource.base < GB(4) )
    {
        if ( hvm_info->high_mem_pgend != 0 )
            printf("WARNING: hvm_info->high_mem_pgend %x"
                   " does not point into high memory!",
                   hvm_info->high_mem_pgend);
        high_mem_resource.base = GB(4);
    }
    printf("%sRAM in high memory; setting high_mem resource base to "PRIllx"\n",
           hvm_info->high_mem_pgend?"":"No ",
           PRIllx_arg(high_mem_resource.base));
    high_mem_resource.max = 1ull << cpu_phys_addr();
    mem_resource.base = pci_mem_start;
    mem_resource.max = pci_mem_end;
    io_resource.base = 0xc000;
    io_resource.max = 0x10000;

    /* Assign iomem and ioport resources in descending order of size. */
    for ( i = 0; i < nr_bars; i++ )
    {
        devfn   = bars[i].devfn;
        bar_reg = bars[i].bar_reg;
        bar_sz  = bars[i].bar_sz;

        /*
         * Relocate to high memory if the total amount of MMIO needed
         * is more than the low MMIO available.  Because devices are
         * processed in order of bar_sz, this will preferentially
         * relocate larger devices to high memory first.
         *
         * NB: The code here is rather fragile, as the check here to see
         * whether bar_sz will fit in the low MMIO region doesn't match the
         * real check made below, which involves aligning the base offset of the
         * bar with the size of the bar itself.  As it happens, this will always
         * be satisfied because:
         * - The first one will succeed because the MMIO hole can only start at
         *   0x{f,e,c,8}00000000.  If it fits, it will be aligned properly.
         * - All subsequent ones will be aligned because the list is ordered
         *   large to small, and bar_sz is always a power of 2. (At least
         *   the code here assumes it to be.)
         * Should either of those two conditions change, this code will break.
         */
        using_64bar = bars[i].is_64bar && bar64_relocate
            && (mmio_total > (mem_resource.max - mem_resource.base));
        bar_data = pci_readl(devfn, bar_reg);

        if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
             PCI_BASE_ADDRESS_SPACE_MEMORY )
        {
            /* Mapping high memory if PCI device is 64 bits bar */
            if ( using_64bar ) {
                /* Round the high-mem cursor up to bar_sz alignment. */
                if ( high_mem_resource.base & (bar_sz - 1) )
                    high_mem_resource.base = high_mem_resource.base -
                        (high_mem_resource.base & (bar_sz - 1)) + bar_sz;
                if ( !pci_hi_mem_start )
                    pci_hi_mem_start = high_mem_resource.base;
                resource = &high_mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            else {
                resource = &mem_resource;
                bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
            }
            mmio_total -= bar_sz;
        }
        else
        {
            resource = &io_resource;
            bar_data &= ~PCI_BASE_ADDRESS_IO_MASK;
        }

        /* Align the allocation base to the BAR's (power-of-two) size. */
        base = (resource->base  + bar_sz - 1) & ~(uint64_t)(bar_sz - 1);

        /* If we're using mem_resource, check for RMRR conflicts. */
        if ( resource == &mem_resource)
        {
            int next_rmrr = find_next_rmrr(base);

            /* Skip past each overlapping RMRR, re-aligning each time. */
            while ( next_rmrr >= 0 &&
                    check_overlap(base, bar_sz,
                              memory_map.map[next_rmrr].addr,
                              memory_map.map[next_rmrr].size) )
            {
                base = memory_map.map[next_rmrr].addr +
                       memory_map.map[next_rmrr].size;
                base = (base + bar_sz - 1) & ~(bar_sz - 1);
                next_rmrr = find_next_rmrr(base);
            }
        }

        bar_data |= (uint32_t)base;
        bar_data_upper = (uint32_t)(base >> 32);
        base += bar_sz;

        if ( (base < resource->base) || (base > resource->max) )
        {
            /* Out of space (or wrapped): leave this BAR unprogrammed. */
            printf("pci dev %02x:%x bar %02x size "PRIllx": no space for "
                   "resource!\n", devfn>>3, devfn&7, bar_reg,
                   PRIllx_arg(bar_sz));
            continue;
        }

        resource->base = base;

        pci_writel(devfn, bar_reg, bar_data);
        if (using_64bar)
            pci_writel(devfn, bar_reg + 4, bar_data_upper);
        printf("pci dev %02x:%x bar %02x size "PRIllx": %x%08x\n",
               devfn>>3, devfn&7, bar_reg,
               PRIllx_arg(bar_sz),
               bar_data_upper, bar_data);

        /* Remember which decode bit this function needs enabled. */
        if ( (bar_reg == PCI_ROM_ADDRESS) ||
             ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
              PCI_BASE_ADDRESS_SPACE_MEMORY) )
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_MEMORY;
        else
            pci_devfn_decode_type[devfn] |= PCI_COMMAND_IO;
    }

    if ( pci_hi_mem_start )
    {
        /*
         * Make end address alignment match the start address one's so that
         * fewer variable range MTRRs are needed to cover the range.
         */
        pci_hi_mem_end = ((high_mem_resource.base - 1) |
                          ((pci_hi_mem_start & -pci_hi_mem_start) - 1)) + 1;
    }

    if ( vga_devfn != 256 )
    {
        /*
         * VGA registers live in I/O space so ensure that primary VGA
         * has IO enabled, even if there is no I/O BAR on that
         * particular device.
         */
        pci_devfn_decode_type[vga_devfn] |= PCI_COMMAND_IO;
    }

    /* Enable bus master, memory and I/O decode for all valid functions. */
    for ( devfn = 0; devfn < 256; devfn++ )
        if ( pci_devfn_decode_type[devfn] )
        {
            cmd = pci_readw(devfn, PCI_COMMAND);
            cmd |= pci_devfn_decode_type[devfn];
            pci_writew(devfn, PCI_COMMAND, cmd);
        }
}
556 
557 /*
558  * Local variables:
559  * mode: C
560  * c-file-style: "BSD"
561  * c-basic-offset: 4
562  * tab-width: 4
563  * indent-tabs-mode: nil
564  * End:
565  */
566