1 /******************************************************************************
2  * memory.c
3  *
4  * Code to handle memory-related requests.
5  *
6  * Copyright (c) 2003-2004, B Dragovic
7  * Copyright (c) 2003-2005, K A Fraser
8  */
9 
10 #include <xen/domain_page.h>
11 #include <xen/types.h>
12 #include <xen/lib.h>
13 #include <xen/mm.h>
14 #include <xen/param.h>
15 #include <xen/perfc.h>
16 #include <xen/sched.h>
17 #include <xen/event.h>
18 #include <xen/paging.h>
19 #include <xen/iocap.h>
20 #include <xen/guest_access.h>
21 #include <xen/hypercall.h>
22 #include <xen/errno.h>
23 #include <xen/numa.h>
24 #include <xen/mem_access.h>
25 #include <xen/trace.h>
26 #include <xen/grant_table.h>
27 #include <asm/current.h>
28 #include <asm/hardirq.h>
29 #include <asm/p2m.h>
30 #include <public/memory.h>
31 #include <xsm/xsm.h>
32 
33 #ifdef CONFIG_X86
34 #include <asm/guest.h>
35 #endif
36 
37 struct memop_args {
38     /* INPUT */
39     struct domain *domain;     /* Domain to be affected. */
40     XEN_GUEST_HANDLE(xen_pfn_t) extent_list; /* List of extent base addrs. */
41     unsigned int nr_extents;   /* Number of extents to allocate or free. */
42     unsigned int extent_order; /* Size of each extent. */
43     unsigned int memflags;     /* Allocation flags. */
44 
45     /* INPUT/OUTPUT */
46     unsigned int nr_done;    /* Number of extents processed so far. */
47     int          preempted;  /* Was the hypercall preempted? */
48 };
49 
50 #ifndef CONFIG_CTLDOM_MAX_ORDER
51 #define CONFIG_CTLDOM_MAX_ORDER CONFIG_PAGEALLOC_MAX_ORDER
52 #endif
53 #ifndef CONFIG_PTDOM_MAX_ORDER
54 #define CONFIG_PTDOM_MAX_ORDER CONFIG_HWDOM_MAX_ORDER
55 #endif
56 
57 static unsigned int __read_mostly domu_max_order = CONFIG_DOMU_MAX_ORDER;
58 static unsigned int __read_mostly ctldom_max_order = CONFIG_CTLDOM_MAX_ORDER;
59 static unsigned int __read_mostly hwdom_max_order = CONFIG_HWDOM_MAX_ORDER;
60 #ifdef CONFIG_HAS_PASSTHROUGH
61 static unsigned int __read_mostly ptdom_max_order = CONFIG_PTDOM_MAX_ORDER;
62 #endif
63 
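/*
 * Parse the "memop-max-order" command line option: up to four
 * comma-separated orders covering ordinary domUs, the control domain, the
 * hardware domain and (with passthrough support) passthrough-capable
 * domains.  Empty fields leave the built-in default untouched; for example
 * (purely illustrative) "memop-max-order=9,,12" raises the domU limit to 9
 * and the hardware domain limit to 12 without touching the control domain.
 */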
64 static int __init parse_max_order(const char *s)
65 {
66     if ( *s != ',' )
67         domu_max_order = simple_strtoul(s, &s, 0);
68     if ( *s == ',' && *++s != ',' )
69         ctldom_max_order = simple_strtoul(s, &s, 0);
70     if ( *s == ',' && *++s != ',' )
71         hwdom_max_order = simple_strtoul(s, &s, 0);
72 #ifdef CONFIG_HAS_PASSTHROUGH
73     if ( *s == ',' && *++s != ',' )
74         ptdom_max_order = simple_strtoul(s, &s, 0);
75 #endif
76 
77     return *s ? -EINVAL : 0;
78 }
79 custom_param("memop-max-order", parse_max_order);
80 
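/*
 * Return the largest extent order domain d may use with memory operations:
 * the domU default, widened for passthrough-capable, control and hardware
 * domains, and capped at MAX_ORDER.
 */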
81 static unsigned int max_order(const struct domain *d)
82 {
83     unsigned int order = domu_max_order;
84 
85 #ifdef CONFIG_HAS_PASSTHROUGH
86     if ( cache_flush_permitted(d) && order < ptdom_max_order )
87         order = ptdom_max_order;
88 #endif
89 
90     if ( is_control_domain(d) && order < ctldom_max_order )
91         order = ctldom_max_order;
92 
93     if ( is_hardware_domain(d) && order < hwdom_max_order )
94         order = hwdom_max_order;
95 
96     return min(order, MAX_ORDER + 0U);
97 }
98 
99 /* Helper to copy a typesafe MFN to guest */
100 static inline
101 unsigned long __copy_mfn_to_guest_offset(XEN_GUEST_HANDLE(xen_pfn_t) hnd,
102                                          size_t off, mfn_t mfn)
103 {
104     xen_pfn_t mfn_ = mfn_x(mfn);
105 
106     return __copy_to_guest_offset(hnd, off, &mfn_, 1);
107 }
108 
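/*
 * XENMEM_increase_reservation: allocate a->nr_extents extents of order
 * a->extent_order for a->domain, resuming at a->nr_done.  For
 * non-translated guests the MFN of each new extent is copied back into
 * a->extent_list.  Progress is recorded in a->nr_done, and a->preempted is
 * set when the hypercall needs to be continued.
 */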
109 static void increase_reservation(struct memop_args *a)
110 {
111     struct page_info *page;
112     unsigned long i;
113     struct domain *d = a->domain;
114 
115     if ( !guest_handle_is_null(a->extent_list) &&
116          !guest_handle_subrange_okay(a->extent_list, a->nr_done,
117                                      a->nr_extents-1) )
118         return;
119 
120     if ( a->extent_order > max_order(current->domain) )
121         return;
122 
123     for ( i = a->nr_done; i < a->nr_extents; i++ )
124     {
125         if ( i != a->nr_done && hypercall_preempt_check() )
126         {
127             a->preempted = 1;
128             goto out;
129         }
130 
131         page = alloc_domheap_pages(d, a->extent_order, a->memflags);
132         if ( unlikely(page == NULL) )
133         {
134             gdprintk(XENLOG_INFO, "Could not allocate order=%u extent: "
135                     "id=%d memflags=%#x (%lu of %u)\n",
136                      a->extent_order, d->domain_id, a->memflags,
137                      i, a->nr_extents);
138             goto out;
139         }
140 
141         /* Inform the domain of the new page's machine address. */
142         if ( !paging_mode_translate(d) &&
143              !guest_handle_is_null(a->extent_list) )
144         {
145             mfn_t mfn = page_to_mfn(page);
146 
147             if ( unlikely(__copy_mfn_to_guest_offset(a->extent_list, i, mfn)) )
148                 goto out;
149         }
150     }
151 
152  out:
153     a->nr_done = i;
154 }
155 
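/*
 * XENMEM_populate_physmap: back the GFNs named in a->extent_list with
 * memory, or with populate-on-demand entries when MEMF_populate_on_demand
 * is set.  Direct-mapped domains must already own the matching MFNs;
 * everyone else gets freshly allocated pages inserted into the physmap.
 */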
156 static void populate_physmap(struct memop_args *a)
157 {
158     struct page_info *page;
159     unsigned int i, j;
160     xen_pfn_t gpfn;
161     struct domain *d = a->domain, *curr_d = current->domain;
162     bool need_tlbflush = false;
163     uint32_t tlbflush_timestamp = 0;
164 
165     if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done,
166                                      a->nr_extents-1) )
167         return;
168 
169     if ( a->extent_order > (a->memflags & MEMF_populate_on_demand ? MAX_ORDER :
170                             max_order(curr_d)) )
171         return;
172 
173     if ( unlikely(!d->creation_finished) )
174     {
175         /*
176          * With MEMF_no_tlbflush set, alloc_heap_pages() will ignore
177          * TLB-flushes. After VM creation, this is a security issue (it can
178          * make pages accessible to guest B, when guest A may still have a
179          * cached mapping to them). So we do this only during domain creation,
180          * when the domain itself has not yet been unpaused for the first
181          * time.
182          */
183         a->memflags |= MEMF_no_tlbflush;
184         /*
185          * With MEMF_no_icache_flush, alloc_heap_pages() will skip
186          * performing icache flushes. We do it only before domain
187          * creation as once the domain is running there is a danger of
188          * executing instructions from stale caches if icache flush is
189          * delayed.
190          */
191         a->memflags |= MEMF_no_icache_flush;
192     }
193 
194     for ( i = a->nr_done; i < a->nr_extents; i++ )
195     {
196         mfn_t mfn;
197 
198         if ( i != a->nr_done && hypercall_preempt_check() )
199         {
200             a->preempted = 1;
201             goto out;
202         }
203 
204         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
205             goto out;
206 
207         if ( a->memflags & MEMF_populate_on_demand )
208         {
209             /* Disallow populating PoD pages on oneself. */
210             if ( d == curr_d )
211                 goto out;
212 
213             if ( is_hvm_domain(d) &&
214                  guest_physmap_mark_populate_on_demand(d, gpfn,
215                                                        a->extent_order) < 0 )
216                 goto out;
217         }
218         else
219         {
220             if ( is_domain_direct_mapped(d) )
221             {
222                 mfn = _mfn(gpfn);
223 
224                 for ( j = 0; j < (1U << a->extent_order); j++,
225                       mfn = mfn_add(mfn, 1) )
226                 {
227                     if ( !mfn_valid(mfn) )
228                     {
229                         gdprintk(XENLOG_INFO, "Invalid mfn %#"PRI_mfn"\n",
230                                  mfn_x(mfn));
231                         goto out;
232                     }
233 
234                     page = mfn_to_page(mfn);
235                     if ( !get_page(page, d) )
236                     {
237                         gdprintk(XENLOG_INFO,
238                                  "mfn %#"PRI_mfn" doesn't belong to d%d\n",
239                                   mfn_x(mfn), d->domain_id);
240                         goto out;
241                     }
242                     put_page(page);
243                 }
244 
245                 mfn = _mfn(gpfn);
246             }
247             else
248             {
249                 page = alloc_domheap_pages(d, a->extent_order, a->memflags);
250 
251                 if ( unlikely(!page) )
252                 {
253                     gdprintk(XENLOG_INFO,
254                              "Could not allocate order=%u extent: id=%d memflags=%#x (%u of %u)\n",
255                              a->extent_order, d->domain_id, a->memflags,
256                              i, a->nr_extents);
257                     goto out;
258                 }
259 
260                 if ( unlikely(a->memflags & MEMF_no_tlbflush) )
261                 {
262                     for ( j = 0; j < (1U << a->extent_order); j++ )
263                         accumulate_tlbflush(&need_tlbflush, &page[j],
264                                             &tlbflush_timestamp);
265                 }
266 
267                 mfn = page_to_mfn(page);
268             }
269 
270             guest_physmap_add_page(d, _gfn(gpfn), mfn, a->extent_order);
271 
272             if ( !paging_mode_translate(d) &&
273                  /* Inform the domain of the new page's machine address. */
274                  unlikely(__copy_mfn_to_guest_offset(a->extent_list, i, mfn)) )
275                 goto out;
276         }
277     }
278 
279 out:
280     if ( need_tlbflush )
281         filtered_flush_tlb_mask(tlbflush_timestamp);
282 
283     if ( a->memflags & MEMF_no_icache_flush )
284         invalidate_icache();
285 
286     a->nr_done = i;
287 }
288 
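/*
 * Remove the page at GFN gmfn from d's physmap, coping with paged-out,
 * shared and directly mapped MMIO entries, and (unless d is direct mapped)
 * drop the allocation reference so the page can be freed.  Returns 0 on
 * success or a negative errno value.
 */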
289 int guest_remove_page(struct domain *d, unsigned long gmfn)
290 {
291     struct page_info *page;
292 #ifdef CONFIG_X86
293     p2m_type_t p2mt;
294 #endif
295     mfn_t mfn;
296     bool *dont_flush_p, dont_flush;
297     int rc;
298 
299 #ifdef CONFIG_X86
300     mfn = get_gfn_query(d, gmfn, &p2mt);
301     if ( unlikely(p2mt == p2m_invalid) || unlikely(p2mt == p2m_mmio_dm) )
302     {
303         put_gfn(d, gmfn);
304 
305         return -ENOENT;
306     }
307 
308     if ( unlikely(p2m_is_paging(p2mt)) )
309     {
310         /*
311          * If the page hasn't yet been paged out, there is an
312          * actual page that needs to be released.
313          */
314         if ( p2mt == p2m_ram_paging_out )
315         {
316             ASSERT(mfn_valid(mfn));
317             goto obtain_page;
318         }
319 
320         rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
321         if ( rc )
322             goto out_put_gfn;
323 
324         put_gfn(d, gmfn);
325 
326         p2m_mem_paging_drop_page(d, _gfn(gmfn), p2mt);
327 
328         return 0;
329     }
330     if ( p2mt == p2m_mmio_direct )
331     {
332         rc = clear_mmio_p2m_entry(d, gmfn, mfn, PAGE_ORDER_4K);
333         goto out_put_gfn;
334     }
335 #else
336     mfn = gfn_to_mfn(d, _gfn(gmfn));
337 #endif
338     if ( unlikely(!mfn_valid(mfn)) )
339     {
340 #ifdef CONFIG_X86
341         put_gfn(d, gmfn);
342 #endif
343         gdprintk(XENLOG_INFO, "Domain %u page number %lx invalid\n",
344                 d->domain_id, gmfn);
345 
346         return -EINVAL;
347     }
348 
349 #ifdef CONFIG_X86
350     if ( p2m_is_shared(p2mt) )
351     {
352         /*
353          * Unshare the page, bail out on error. We unshare because we
354          * might be the only one using this shared page, and we need to
355          * trigger proper cleanup. Once done, this is like any other page.
356          */
357         rc = mem_sharing_unshare_page(d, gmfn);
358         if ( rc )
359         {
360             mem_sharing_notify_enomem(d, gmfn, false);
361             goto out_put_gfn;
362         }
363         /* Maybe the mfn changed */
364         mfn = get_gfn_query_unlocked(d, gmfn, &p2mt);
365         ASSERT(!p2m_is_shared(p2mt));
366     }
367 #endif /* CONFIG_X86 */
368 
369  obtain_page: __maybe_unused;
370     page = mfn_to_page(mfn);
371     if ( unlikely(!get_page(page, d)) )
372     {
373 #ifdef CONFIG_X86
374         put_gfn(d, gmfn);
375         if ( !p2m_is_paging(p2mt) )
376 #endif
377             gdprintk(XENLOG_INFO, "Bad page free for Dom%u GFN %lx\n",
378                      d->domain_id, gmfn);
379 
380         return -ENXIO;
381     }
382 
383     /*
384      * Since we're likely to free the page below, we need to suspend
385      * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes.
386      */
387     dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
388     dont_flush = *dont_flush_p;
389     *dont_flush_p = false;
390 
391     rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
392 
393     *dont_flush_p = dont_flush;
394 
395     /*
396      * On platforms which lack an IOMMU, a domain with a DMA-capable
397      * device must get back the same pfn when the populate_physmap
398      * hypercall is later called.
399      *
400      * For this purpose (and to match populate_physmap() behavior), the page
401      * is kept allocated.
402      */
403     if ( !rc && !is_domain_direct_mapped(d) )
404         put_page_alloc_ref(page);
405 
406     put_page(page);
407 
408 #ifdef CONFIG_X86
409  out_put_gfn:
410     put_gfn(d, gmfn);
411 #endif
412 
413     /*
414      * Filter out -ENOENT return values that aren't a result of an empty p2m
415      * entry.
416      */
417     return rc != -ENOENT ? rc : -EINVAL;
418 }
419 
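/*
 * XENMEM_decrease_reservation: release the GFN ranges named in
 * a->extent_list, letting populate-on-demand reclaim whatever it can first
 * and removing the remaining pages with guest_remove_page().
 */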
420 static void decrease_reservation(struct memop_args *a)
421 {
422     unsigned long i, j;
423     xen_pfn_t gmfn;
424 
425     if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done,
426                                      a->nr_extents-1) ||
427          a->extent_order > max_order(current->domain) )
428         return;
429 
430     for ( i = a->nr_done; i < a->nr_extents; i++ )
431     {
432         unsigned long pod_done;
433 
434         if ( i != a->nr_done && hypercall_preempt_check() )
435         {
436             a->preempted = 1;
437             goto out;
438         }
439 
440         if ( unlikely(__copy_from_guest_offset(&gmfn, a->extent_list, i, 1)) )
441             goto out;
442 
443         if ( tb_init_done )
444         {
445             struct {
446                 u64 gfn;
447                 int d:16,order:16;
448             } t;
449 
450             t.gfn = gmfn;
451             t.d = a->domain->domain_id;
452             t.order = a->extent_order;
453 
454             __trace_var(TRC_MEM_DECREASE_RESERVATION, 0, sizeof(t), &t);
455         }
456 
457         /* See if populate-on-demand wants to handle this */
458         pod_done = is_hvm_domain(a->domain) ?
459                    p2m_pod_decrease_reservation(a->domain, _gfn(gmfn),
460                                                 a->extent_order) : 0;
461 
462         /*
463          * Look for pages not handled by p2m_pod_decrease_reservation().
464          *
465          * guest_remove_page() will return -ENOENT for pages which have already
466          * been removed by p2m_pod_decrease_reservation(); so expect to see
467          * exactly pod_done failures.  Any more means that there were invalid
468          * entries before p2m_pod_decrease_reservation() was called.
469          */
470         for ( j = 0; j + pod_done < (1UL << a->extent_order); j++ )
471         {
472             switch ( guest_remove_page(a->domain, gmfn + j) )
473             {
474             case 0:
475                 break;
476             case -ENOENT:
477                 if ( !pod_done )
478                     goto out;
479                 --pod_done;
480                 break;
481             default:
482                 goto out;
483             }
484         }
485     }
486 
487  out:
488     a->nr_done = i;
489 }
490 
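/*
 * Translate a XENMEMF_node() request in the guest-supplied flags xmf into
 * MEMF_node()/MEMF_exact_node allocation flags.  Only the hardware and
 * control domains may target a specific NUMA node; returns false if the
 * request is out of range or not permitted.
 */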
491 static bool propagate_node(unsigned int xmf, unsigned int *memflags)
492 {
493     const struct domain *currd = current->domain;
494 
495     BUILD_BUG_ON(XENMEMF_get_node(0) != NUMA_NO_NODE);
496     BUILD_BUG_ON(MEMF_get_node(0) != NUMA_NO_NODE);
497 
498     if ( XENMEMF_get_node(xmf) == NUMA_NO_NODE )
499         return true;
500 
501     if ( is_hardware_domain(currd) || is_control_domain(currd) )
502     {
503         if ( XENMEMF_get_node(xmf) >= MAX_NUMNODES )
504             return false;
505 
506         *memflags |= MEMF_node(XENMEMF_get_node(xmf));
507         if ( xmf & XENMEMF_exact_node_request )
508             *memflags |= MEMF_exact_node;
509     }
510     else if ( xmf & XENMEMF_exact_node_request )
511         return false;
512 
513     return true;
514 }
515 
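/*
 * XENMEM_exchange: trade existing guest pages for freshly allocated ones of
 * a different extent order and/or address width.  Each chunk's input pages
 * are stolen from the domain, replacement pages are allocated, assigned and
 * entered into the physmap, and progress is reported back through
 * exch.nr_exchanged so the operation can be continued after preemption.
 */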
516 static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg)
517 {
518     struct xen_memory_exchange exch;
519     PAGE_LIST_HEAD(in_chunk_list);
520     PAGE_LIST_HEAD(out_chunk_list);
521     unsigned long in_chunk_order, out_chunk_order;
522     xen_pfn_t     gpfn, gmfn;
523     mfn_t         mfn;
524     unsigned long i, j, k;
525     unsigned int  memflags = 0;
526     long          rc = 0;
527     struct domain *d;
528     struct page_info *page;
529 
530     if ( copy_from_guest(&exch, arg, 1) )
531         return -EFAULT;
532 
533     if ( max(exch.in.extent_order, exch.out.extent_order) >
534          max_order(current->domain) )
535     {
536         rc = -EPERM;
537         goto fail_early;
538     }
539 
540     /* Various sanity checks. */
541     if ( (exch.nr_exchanged > exch.in.nr_extents) ||
542          /* Input and output domain identifiers match? */
543          (exch.in.domid != exch.out.domid) ||
544          /* Sizes of input and output lists do not overflow a long? */
545          ((~0UL >> exch.in.extent_order) < exch.in.nr_extents) ||
546          ((~0UL >> exch.out.extent_order) < exch.out.nr_extents) ||
547          /* Sizes of input and output lists match? */
548          ((exch.in.nr_extents << exch.in.extent_order) !=
549           (exch.out.nr_extents << exch.out.extent_order)) )
550     {
551         rc = -EINVAL;
552         goto fail_early;
553     }
554 
555     if ( exch.nr_exchanged == exch.in.nr_extents )
556         return 0;
557 
558     if ( !guest_handle_subrange_okay(exch.in.extent_start, exch.nr_exchanged,
559                                      exch.in.nr_extents - 1) )
560     {
561         rc = -EFAULT;
562         goto fail_early;
563     }
564 
565     if ( exch.in.extent_order <= exch.out.extent_order )
566     {
567         in_chunk_order  = exch.out.extent_order - exch.in.extent_order;
568         out_chunk_order = 0;
569 
570         if ( !guest_handle_subrange_okay(exch.out.extent_start,
571                                          exch.nr_exchanged >> in_chunk_order,
572                                          exch.out.nr_extents - 1) )
573         {
574             rc = -EFAULT;
575             goto fail_early;
576         }
577     }
578     else
579     {
580         in_chunk_order  = 0;
581         out_chunk_order = exch.in.extent_order - exch.out.extent_order;
582 
583         if ( !guest_handle_subrange_okay(exch.out.extent_start,
584                                          exch.nr_exchanged << out_chunk_order,
585                                          exch.out.nr_extents - 1) )
586         {
587             rc = -EFAULT;
588             goto fail_early;
589         }
590     }
591 
592     if ( unlikely(!propagate_node(exch.out.mem_flags, &memflags)) )
593     {
594         rc = -EINVAL;
595         goto fail_early;
596     }
597 
598     d = rcu_lock_domain_by_any_id(exch.in.domid);
599     if ( d == NULL )
600     {
601         rc = -ESRCH;
602         goto fail_early;
603     }
604 
605     rc = xsm_memory_exchange(XSM_TARGET, d);
606     if ( rc )
607     {
608         rcu_unlock_domain(d);
609         goto fail_early;
610     }
611 
612     memflags |= MEMF_bits(domain_clamp_alloc_bitsize(
613         d,
614         XENMEMF_get_address_bits(exch.out.mem_flags) ? :
615         (BITS_PER_LONG+PAGE_SHIFT)));
616 
617     for ( i = (exch.nr_exchanged >> in_chunk_order);
618           i < (exch.in.nr_extents >> in_chunk_order);
619           i++ )
620     {
621         if ( i != (exch.nr_exchanged >> in_chunk_order) &&
622              hypercall_preempt_check() )
623         {
624             exch.nr_exchanged = i << in_chunk_order;
625             rcu_unlock_domain(d);
626             if ( __copy_field_to_guest(arg, &exch, nr_exchanged) )
627                 return -EFAULT;
628             return hypercall_create_continuation(
629                 __HYPERVISOR_memory_op, "lh", XENMEM_exchange, arg);
630         }
631 
632         /* Steal a chunk's worth of input pages from the domain. */
633         for ( j = 0; j < (1UL << in_chunk_order); j++ )
634         {
635             if ( unlikely(__copy_from_guest_offset(
636                 &gmfn, exch.in.extent_start, (i<<in_chunk_order)+j, 1)) )
637             {
638                 rc = -EFAULT;
639                 goto fail;
640             }
641 
642             for ( k = 0; k < (1UL << exch.in.extent_order); k++ )
643             {
644 #ifdef CONFIG_X86
645                 p2m_type_t p2mt;
646 
647                 /* Shared pages cannot be exchanged */
648                 mfn = get_gfn_unshare(d, gmfn + k, &p2mt);
649                 if ( p2m_is_shared(p2mt) )
650                 {
651                     put_gfn(d, gmfn + k);
652                     rc = -ENOMEM;
653                     goto fail;
654                 }
655 #else /* !CONFIG_X86 */
656                 mfn = gfn_to_mfn(d, _gfn(gmfn + k));
657 #endif
658                 if ( unlikely(!mfn_valid(mfn)) )
659                 {
660 #ifdef CONFIG_X86
661                     put_gfn(d, gmfn + k);
662 #endif
663                     rc = -EINVAL;
664                     goto fail;
665                 }
666 
667                 page = mfn_to_page(mfn);
668 
669                 rc = steal_page(d, page, MEMF_no_refcount);
670                 if ( unlikely(rc) )
671                 {
672 #ifdef CONFIG_X86
673                     put_gfn(d, gmfn + k);
674 #endif
675                     goto fail;
676                 }
677 
678                 page_list_add(page, &in_chunk_list);
679 #ifdef CONFIG_X86
680                 put_gfn(d, gmfn + k);
681 #endif
682             }
683         }
684 
685         /* Allocate a chunk's worth of anonymous output pages. */
686         for ( j = 0; j < (1UL << out_chunk_order); j++ )
687         {
688             page = alloc_domheap_pages(d, exch.out.extent_order,
689                                        MEMF_no_owner | memflags);
690             if ( unlikely(page == NULL) )
691             {
692                 rc = -ENOMEM;
693                 goto fail;
694             }
695 
696             page_list_add(page, &out_chunk_list);
697         }
698 
699         /*
700          * Success! Beyond this point we cannot fail for this chunk.
701          */
702 
703         /*
704          * These pages have already had owner and reference cleared.
705          * Do the final two steps: Remove from the physmap, and free
706          * them.
707          */
708         while ( (page = page_list_remove_head(&in_chunk_list)) )
709         {
710             unsigned long gfn;
711 
712             mfn = page_to_mfn(page);
713             gfn = mfn_to_gmfn(d, mfn_x(mfn));
714             /* Pages were unshared above */
715             BUG_ON(SHARED_M2P(gfn));
716             if ( guest_physmap_remove_page(d, _gfn(gfn), mfn, 0) )
717                 domain_crash(d);
718             free_domheap_page(page);
719         }
720 
721         /* Assign each output page to the domain. */
722         for ( j = 0; (page = page_list_remove_head(&out_chunk_list)); ++j )
723         {
724             if ( assign_pages(d, page, exch.out.extent_order,
725                               MEMF_no_refcount) )
726             {
727                 unsigned long dec_count;
728                 bool_t drop_dom_ref;
729 
730                 /*
731                  * Pages in in_chunk_list were stolen without
732                  * decreasing tot_pages. If the domain is dying when we
733                  * assign pages, we must decrease the count here. Pages
734                  * that have already been assigned are covered by
735                  * domain_relinquish_resources().
736                  */
737                 dec_count = (((1UL << exch.in.extent_order) *
738                               (1UL << in_chunk_order)) -
739                              (j * (1UL << exch.out.extent_order)));
740 
741                 spin_lock(&d->page_alloc_lock);
742                 drop_dom_ref = (dec_count &&
743                                 !domain_adjust_tot_pages(d, -dec_count));
744                 spin_unlock(&d->page_alloc_lock);
745 
746                 if ( drop_dom_ref )
747                     put_domain(d);
748 
749                 free_domheap_pages(page, exch.out.extent_order);
750                 goto dying;
751             }
752 
753             if ( __copy_from_guest_offset(&gpfn, exch.out.extent_start,
754                                           (i << out_chunk_order) + j, 1) )
755             {
756                 rc = -EFAULT;
757                 continue;
758             }
759 
760             mfn = page_to_mfn(page);
761             guest_physmap_add_page(d, _gfn(gpfn), mfn,
762                                    exch.out.extent_order);
763 
764             if ( !paging_mode_translate(d) &&
765                  __copy_mfn_to_guest_offset(exch.out.extent_start,
766                                             (i << out_chunk_order) + j,
767                                             mfn) )
768                 rc = -EFAULT;
769         }
770         BUG_ON( !(d->is_dying) && (j != (1UL << out_chunk_order)) );
771 
772         if ( rc )
773             goto fail;
774     }
775 
776     exch.nr_exchanged = exch.in.nr_extents;
777     if ( __copy_field_to_guest(arg, &exch, nr_exchanged) )
778         rc = -EFAULT;
779     rcu_unlock_domain(d);
780     return rc;
781 
782     /*
783      * Failed a chunk! Free any partial chunk work. Tell caller how many
784      * chunks succeeded.
785      */
786  fail:
787     /*
788      * Reassign any input pages we managed to steal.  NB that if the assign
789      * fails again, we're on the hook for freeing the page, since we've already
790      * cleared PGC_allocated.
791      */
792     while ( (page = page_list_remove_head(&in_chunk_list)) )
793         if ( assign_pages(d, page, 0, MEMF_no_refcount) )
794         {
795             BUG_ON(!d->is_dying);
796             free_domheap_page(page);
797         }
798 
799  dying:
800     rcu_unlock_domain(d);
801     /* Free any output pages we managed to allocate. */
802     while ( (page = page_list_remove_head(&out_chunk_list)) )
803         free_domheap_pages(page, exch.out.extent_order);
804 
805     exch.nr_exchanged = i << in_chunk_order;
806 
807  fail_early:
808     if ( __copy_field_to_guest(arg, &exch, nr_exchanged) )
809         rc = -EFAULT;
810     return rc;
811 }
812 
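/*
 * XENMEM_add_to_physmap handler for translated domains.  For the
 * XENMAPSPACE_gmfn_range space this loops over the range, batching IOMMU
 * TLB flushes and returning a positive count when a continuation is needed;
 * all other spaces are handed straight to xenmem_add_to_physmap_one().
 */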
813 int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
814                           unsigned int start)
815 {
816     unsigned int done = 0;
817     long rc = 0;
818     union add_to_physmap_extra extra = {};
819     struct page_info *pages[16];
820 
821     ASSERT(paging_mode_translate(d));
822 
823     if ( xatp->space == XENMAPSPACE_gmfn_foreign )
824         extra.foreign_domid = DOMID_INVALID;
825 
826     if ( xatp->space != XENMAPSPACE_gmfn_range )
827         return xenmem_add_to_physmap_one(d, xatp->space, extra,
828                                          xatp->idx, _gfn(xatp->gpfn));
829 
830     if ( xatp->size < start )
831         return -EILSEQ;
832 
833     xatp->idx += start;
834     xatp->gpfn += start;
835     xatp->size -= start;
836 
837     if ( is_iommu_enabled(d) )
838     {
839        this_cpu(iommu_dont_flush_iotlb) = 1;
840        extra.ppage = &pages[0];
841     }
842 
843     while ( xatp->size > done )
844     {
845         rc = xenmem_add_to_physmap_one(d, XENMAPSPACE_gmfn, extra,
846                                        xatp->idx, _gfn(xatp->gpfn));
847         if ( rc < 0 )
848             break;
849 
850         xatp->idx++;
851         xatp->gpfn++;
852 
853         if ( extra.ppage )
854             ++extra.ppage;
855 
856         /* Check for continuation if it's not the last iteration. */
857         if ( (++done >= ARRAY_SIZE(pages) && extra.ppage) ||
858              (xatp->size > done && hypercall_preempt_check()) )
859         {
860             rc = start + done;
861             break;
862         }
863     }
864 
865     if ( is_iommu_enabled(d) )
866     {
867         int ret;
868         unsigned int i;
869 
870         this_cpu(iommu_dont_flush_iotlb) = 0;
871 
872         ret = iommu_iotlb_flush(d, _dfn(xatp->idx - done), done,
873                                 IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified);
874         if ( unlikely(ret) && rc >= 0 )
875             rc = ret;
876 
877         /*
878          * Now that the IOMMU TLB flush was done for the original GFN, drop
879          * the page references. The 2nd flush below is fine to make later, as
880          * whoever removes the page again from its new GFN will have to do
881          * another flush anyway.
882          */
883         for ( i = 0; i < done; ++i )
884             put_page(pages[i]);
885 
886         ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done,
887                                 IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified);
888         if ( unlikely(ret) && rc >= 0 )
889             rc = ret;
890     }
891 
892     return rc;
893 }
894 
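/*
 * Batched form of XENMEM_add_to_physmap: map each (idx, gpfn) pair taken
 * from the guest-supplied arrays, storing the per-entry result in
 * xatpb->errs.  Returns 0 on completion, a positive extent index when a
 * continuation is needed, or a negative errno on a fatal error.
 */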
895 static int xenmem_add_to_physmap_batch(struct domain *d,
896                                        struct xen_add_to_physmap_batch *xatpb,
897                                        unsigned int extent)
898 {
899     union add_to_physmap_extra extra = {};
900 
901     if ( unlikely(xatpb->size < extent) )
902         return -EILSEQ;
903 
904     if ( unlikely(xatpb->size == extent) )
905         return extent ? -EILSEQ : 0;
906 
907     if ( !guest_handle_subrange_okay(xatpb->idxs, extent, xatpb->size - 1) ||
908          !guest_handle_subrange_okay(xatpb->gpfns, extent, xatpb->size - 1) ||
909          !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
910         return -EFAULT;
911 
912     switch ( xatpb->space )
913     {
914     case XENMAPSPACE_dev_mmio:
915         /* res0 is reserved for future use. */
916         if ( xatpb->u.res0 )
917             return -EOPNOTSUPP;
918         break;
919 
920     case XENMAPSPACE_gmfn_foreign:
921         extra.foreign_domid = xatpb->u.foreign_domid;
922         break;
923     }
924 
925     while ( xatpb->size > extent )
926     {
927         xen_ulong_t idx;
928         xen_pfn_t gpfn;
929         int rc;
930 
931         if ( unlikely(__copy_from_guest_offset(&idx, xatpb->idxs,
932                                                extent, 1)) ||
933              unlikely(__copy_from_guest_offset(&gpfn, xatpb->gpfns,
934                                                extent, 1)) )
935             return -EFAULT;
936 
937         rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
938                                        idx, _gfn(gpfn));
939 
940         if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
941             return -EFAULT;
942 
943         /* Check for continuation if it's not the last iteration. */
944         if ( xatpb->size > ++extent && hypercall_preempt_check() )
945             return extent;
946     }
947 
948     return 0;
949 }
950 
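/*
 * Fill in struct memop_args from a guest xen_memory_reservation, converting
 * the XENMEMF_* flags (address width, NUMA and vNUMA node requests) into
 * MEMF_* allocation flags.  a->domain must already have been set by the
 * caller.
 */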
951 static int construct_memop_from_reservation(
952                const struct xen_memory_reservation *r,
953                struct memop_args *a)
954 {
955     unsigned int address_bits;
956 
957     a->extent_list  = r->extent_start;
958     a->nr_extents   = r->nr_extents;
959     a->extent_order = r->extent_order;
960     a->memflags     = 0;
961 
962     address_bits = XENMEMF_get_address_bits(r->mem_flags);
963     if ( (address_bits != 0) &&
964          (address_bits < (get_order_from_pages(max_page) + PAGE_SHIFT)) )
965     {
966         if ( address_bits <= PAGE_SHIFT )
967             return -EINVAL;
968         a->memflags = MEMF_bits(address_bits);
969     }
970 
971     if ( r->mem_flags & XENMEMF_vnode )
972     {
973         nodeid_t vnode, pnode;
974         struct domain *d = a->domain;
975 
976         read_lock(&d->vnuma_rwlock);
977         if ( d->vnuma )
978         {
979             vnode = XENMEMF_get_node(r->mem_flags);
980             if ( vnode >= d->vnuma->nr_vnodes )
981             {
982                 read_unlock(&d->vnuma_rwlock);
983                 return -EINVAL;
984             }
985 
986             pnode = d->vnuma->vnode_to_pnode[vnode];
987             if ( pnode != NUMA_NO_NODE )
988             {
989                 a->memflags |= MEMF_node(pnode);
990                 if ( r->mem_flags & XENMEMF_exact_node_request )
991                     a->memflags |= MEMF_exact_node;
992             }
993         }
994         read_unlock(&d->vnuma_rwlock);
995     }
996     else if ( unlikely(!propagate_node(r->mem_flags, &a->memflags)) )
997         return -EINVAL;
998 
999     return 0;
1000 }
1001 
1002 #ifdef CONFIG_HAS_PASSTHROUGH
1003 struct get_reserved_device_memory {
1004     struct xen_reserved_device_memory_map map;
1005     unsigned int used_entries;
1006 };
1007 
1008 static int get_reserved_device_memory(xen_pfn_t start, xen_ulong_t nr,
1009                                       u32 id, void *ctxt)
1010 {
1011     struct get_reserved_device_memory *grdm = ctxt;
1012     uint32_t sbdf = PCI_SBDF3(grdm->map.dev.pci.seg, grdm->map.dev.pci.bus,
1013                               grdm->map.dev.pci.devfn).sbdf;
1014 
1015     if ( !(grdm->map.flags & XENMEM_RDM_ALL) && (sbdf != id) )
1016         return 0;
1017 
1018     if ( grdm->used_entries < grdm->map.nr_entries )
1019     {
1020         struct xen_reserved_device_memory rdm = {
1021             .start_pfn = start, .nr_pages = nr
1022         };
1023 
1024         if ( __copy_to_guest_offset(grdm->map.buffer, grdm->used_entries,
1025                                     &rdm, 1) )
1026             return -EFAULT;
1027     }
1028 
1029     ++grdm->used_entries;
1030 
1031     return 1;
1032 }
1033 #endif
1034 
1035 static long xatp_permission_check(struct domain *d, unsigned int space)
1036 {
1037     if ( !paging_mode_translate(d) )
1038         return -EACCES;
1039 
1040     /*
1041      * XENMAPSPACE_dev_mmio mappings are only supported for the hardware
1042      * domain, and only for mapping such space into its own physmap.
1043      */
1044     if ( (space == XENMAPSPACE_dev_mmio) &&
1045          (!is_hardware_domain(d) || (d != current->domain)) )
1046         return -EACCES;
1047 
1048     return xsm_add_to_physmap(XSM_TARGET, current->domain, d);
1049 }
1050 
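/*
 * XENMEM_resource_grant_table backend for acquire_resource(): look up
 * nr_frames shared or status grant table frames of domain d, starting at
 * frame, and record their MFNs in mfn_list[].
 */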
1051 static int acquire_grant_table(struct domain *d, unsigned int id,
1052                                unsigned long frame,
1053                                unsigned int nr_frames,
1054                                xen_pfn_t mfn_list[])
1055 {
1056     unsigned int i = nr_frames;
1057 
1058     /* Iterate backwards in case table needs to grow */
1059     while ( i-- != 0 )
1060     {
1061         mfn_t mfn = INVALID_MFN;
1062         int rc;
1063 
1064         switch ( id )
1065         {
1066         case XENMEM_resource_grant_table_id_shared:
1067             rc = gnttab_get_shared_frame(d, frame + i, &mfn);
1068             break;
1069 
1070         case XENMEM_resource_grant_table_id_status:
1071             rc = gnttab_get_status_frame(d, frame + i, &mfn);
1072             break;
1073 
1074         default:
1075             rc = -EINVAL;
1076             break;
1077         }
1078 
1079         if ( rc )
1080             return rc;
1081 
1082         ASSERT(!mfn_eq(mfn, INVALID_MFN));
1083         mfn_list[i] = mfn_x(mfn);
1084     }
1085 
1086     return 0;
1087 }
1088 
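/*
 * XENMEM_acquire_resource: turn a request for a domain resource (grant
 * table frames here, other types via arch_acquire_resource()) into a list
 * of MFNs, which are either copied back verbatim to non-translated callers
 * or mapped into the caller's physmap at the GFNs it supplied.
 */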
1089 static int acquire_resource(
1090     XEN_GUEST_HANDLE_PARAM(xen_mem_acquire_resource_t) arg)
1091 {
1092     struct domain *d, *currd = current->domain;
1093     xen_mem_acquire_resource_t xmar;
1094     /*
1095      * The mfn_list and gfn_list (below) arrays are ok on stack for the
1096      * moment since they are small, but if they need to grow in future
1097      * use-cases then per-CPU arrays or heap allocations may be required.
1098      */
1099     xen_pfn_t mfn_list[32];
1100     int rc;
1101 
1102     /*
1103      * FIXME: Until foreign pages inserted into the P2M are properly
1104      *        reference counted, it is unsafe to allow mapping of
1105      *        resource pages unless the caller is the hardware domain.
1106      */
1107     if ( paging_mode_translate(currd) && !is_hardware_domain(currd) )
1108         return -EACCES;
1109 
1110     if ( copy_from_guest(&xmar, arg, 1) )
1111         return -EFAULT;
1112 
1113     if ( xmar.pad != 0 )
1114         return -EINVAL;
1115 
1116     if ( guest_handle_is_null(xmar.frame_list) )
1117     {
1118         if ( xmar.nr_frames )
1119             return -EINVAL;
1120 
1121         xmar.nr_frames = ARRAY_SIZE(mfn_list);
1122 
1123         if ( __copy_field_to_guest(arg, &xmar, nr_frames) )
1124             return -EFAULT;
1125 
1126         return 0;
1127     }
1128 
1129     if ( xmar.nr_frames > ARRAY_SIZE(mfn_list) )
1130         return -E2BIG;
1131 
1132     rc = rcu_lock_remote_domain_by_id(xmar.domid, &d);
1133     if ( rc )
1134         return rc;
1135 
1136     rc = xsm_domain_resource_map(XSM_DM_PRIV, d);
1137     if ( rc )
1138         goto out;
1139 
1140     switch ( xmar.type )
1141     {
1142     case XENMEM_resource_grant_table:
1143         rc = acquire_grant_table(d, xmar.id, xmar.frame, xmar.nr_frames,
1144                                  mfn_list);
1145         break;
1146 
1147     default:
1148         rc = arch_acquire_resource(d, xmar.type, xmar.id, xmar.frame,
1149                                    xmar.nr_frames, mfn_list);
1150         break;
1151     }
1152 
1153     if ( rc )
1154         goto out;
1155 
1156     if ( !paging_mode_translate(currd) )
1157     {
1158         if ( copy_to_guest(xmar.frame_list, mfn_list, xmar.nr_frames) )
1159             rc = -EFAULT;
1160     }
1161     else
1162     {
1163         xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)];
1164         unsigned int i;
1165 
1166         if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) )
1167             rc = -EFAULT;
1168 
1169         for ( i = 0; !rc && i < xmar.nr_frames; i++ )
1170         {
1171             rc = set_foreign_p2m_entry(currd, gfn_list[i],
1172                                        _mfn(mfn_list[i]));
1173             /* Errors on any iteration other than the first are reported as -EIO. */
1174             if ( rc && i )
1175                 rc = -EIO;
1176         }
1177     }
1178 
1179  out:
1180     rcu_unlock_domain(d);
1181 
1182     return rc;
1183 }
1184 
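/*
 * Top-level XENMEM_* hypercall handler.  cmd carries the sub-operation in
 * its low MEMOP_CMD_MASK bits; the remaining bits hold the start extent
 * used when a preempted operation is continued.
 */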
1185 long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1186 {
1187     struct domain *d, *curr_d = current->domain;
1188     long rc;
1189     struct xen_memory_reservation reservation;
1190     struct memop_args args;
1191     domid_t domid;
1192     unsigned long start_extent = cmd >> MEMOP_EXTENT_SHIFT;
1193     int op = cmd & MEMOP_CMD_MASK;
1194 
1195     switch ( op )
1196     {
1197     case XENMEM_increase_reservation:
1198     case XENMEM_decrease_reservation:
1199     case XENMEM_populate_physmap:
1200         if ( copy_from_guest(&reservation, arg, 1) )
1201             return start_extent;
1202 
1203         /* Is size too large for us to encode a continuation? */
1204         if ( reservation.nr_extents > (UINT_MAX >> MEMOP_EXTENT_SHIFT) )
1205             return start_extent;
1206 
1207         if ( unlikely(start_extent >= reservation.nr_extents) )
1208             return start_extent;
1209 
1210         d = rcu_lock_domain_by_any_id(reservation.domid);
1211         if ( d == NULL )
1212             return start_extent;
1213         args.domain = d;
1214 
1215         if ( construct_memop_from_reservation(&reservation, &args) )
1216         {
1217             rcu_unlock_domain(d);
1218             return start_extent;
1219         }
1220 
1221         args.nr_done   = start_extent;
1222         args.preempted = 0;
1223 
1224         if ( op == XENMEM_populate_physmap
1225              && (reservation.mem_flags & XENMEMF_populate_on_demand) )
1226             args.memflags |= MEMF_populate_on_demand;
1227 
1228         if ( xsm_memory_adjust_reservation(XSM_TARGET, curr_d, d) )
1229         {
1230             rcu_unlock_domain(d);
1231             return start_extent;
1232         }
1233 
1234 #ifdef CONFIG_X86
1235         if ( pv_shim && op != XENMEM_decrease_reservation && !start_extent )
1236             /* Avoid calling pv_shim_online_memory when in a continuation. */
1237             pv_shim_online_memory(args.nr_extents, args.extent_order);
1238 #endif
1239 
1240         switch ( op )
1241         {
1242         case XENMEM_increase_reservation:
1243             increase_reservation(&args);
1244             break;
1245         case XENMEM_decrease_reservation:
1246             decrease_reservation(&args);
1247             break;
1248         default: /* XENMEM_populate_physmap */
1249             populate_physmap(&args);
1250             break;
1251         }
1252 
1253         rcu_unlock_domain(d);
1254 
1255         rc = args.nr_done;
1256 
1257         if ( args.preempted )
1258             return hypercall_create_continuation(
1259                 __HYPERVISOR_memory_op, "lh",
1260                 op | (rc << MEMOP_EXTENT_SHIFT), arg);
1261 
1262 #ifdef CONFIG_X86
1263         if ( pv_shim && op == XENMEM_decrease_reservation )
1264             /*
1265              * Only call pv_shim_offline_memory when the hypercall has
1266              * finished. Note that nr_done is used to cope in case the
1267              * hypercall has failed and only part of the extents were
1268              * processed.
1269              */
1270             pv_shim_offline_memory(args.nr_done, args.extent_order);
1271 #endif
1272 
1273         break;
1274 
1275     case XENMEM_exchange:
1276         if ( unlikely(start_extent) )
1277             return -EINVAL;
1278 
1279         rc = memory_exchange(guest_handle_cast(arg, xen_memory_exchange_t));
1280         break;
1281 
1282     case XENMEM_maximum_ram_page:
1283         if ( unlikely(start_extent) )
1284             return -EINVAL;
1285 
1286         rc = max_page;
1287         break;
1288 
1289     case XENMEM_current_reservation:
1290     case XENMEM_maximum_reservation:
1291     case XENMEM_maximum_gpfn:
1292         if ( unlikely(start_extent) )
1293             return -EINVAL;
1294 
1295         if ( copy_from_guest(&domid, arg, 1) )
1296             return -EFAULT;
1297 
1298         d = rcu_lock_domain_by_any_id(domid);
1299         if ( d == NULL )
1300             return -ESRCH;
1301 
1302         rc = xsm_memory_stat_reservation(XSM_TARGET, curr_d, d);
1303         if ( rc )
1304         {
1305             rcu_unlock_domain(d);
1306             return rc;
1307         }
1308 
1309         switch ( op )
1310         {
1311         case XENMEM_current_reservation:
1312             rc = domain_tot_pages(d);
1313             break;
1314         case XENMEM_maximum_reservation:
1315             rc = d->max_pages;
1316             break;
1317         default:
1318             ASSERT(op == XENMEM_maximum_gpfn);
1319             rc = domain_get_maximum_gpfn(d);
1320             break;
1321         }
1322 
1323         rcu_unlock_domain(d);
1324 
1325         break;
1326 
1327     case XENMEM_add_to_physmap:
1328     {
1329         struct xen_add_to_physmap xatp;
1330 
1331         BUILD_BUG_ON((typeof(xatp.size))-1 > (UINT_MAX >> MEMOP_EXTENT_SHIFT));
1332 
1333         /* Check for malicious or buggy input. */
1334         if ( start_extent != (typeof(xatp.size))start_extent )
1335             return -EDOM;
1336 
1337         if ( copy_from_guest(&xatp, arg, 1) )
1338             return -EFAULT;
1339 
1340         /* Foreign mapping is only possible via add_to_physmap_batch. */
1341         if ( xatp.space == XENMAPSPACE_gmfn_foreign )
1342             return -ENOSYS;
1343 
1344         d = rcu_lock_domain_by_any_id(xatp.domid);
1345         if ( d == NULL )
1346             return -ESRCH;
1347 
1348         rc = xatp_permission_check(d, xatp.space);
1349         if ( rc )
1350         {
1351             rcu_unlock_domain(d);
1352             return rc;
1353         }
1354 
1355         rc = xenmem_add_to_physmap(d, &xatp, start_extent);
1356 
1357         rcu_unlock_domain(d);
1358 
1359         if ( xatp.space == XENMAPSPACE_gmfn_range && rc > 0 )
1360             rc = hypercall_create_continuation(
1361                      __HYPERVISOR_memory_op, "lh",
1362                      op | (rc << MEMOP_EXTENT_SHIFT), arg);
1363 
1364         return rc;
1365     }
1366 
1367     case XENMEM_add_to_physmap_batch:
1368     {
1369         struct xen_add_to_physmap_batch xatpb;
1370 
1371         BUILD_BUG_ON((typeof(xatpb.size))-1 >
1372                      (UINT_MAX >> MEMOP_EXTENT_SHIFT));
1373 
1374         /* Check for malicious or buggy input. */
1375         if ( start_extent != (typeof(xatpb.size))start_extent )
1376             return -EDOM;
1377 
1378         if ( copy_from_guest(&xatpb, arg, 1) )
1379             return -EFAULT;
1380 
1381         /* This mapspace is unsupported for this hypercall. */
1382         if ( xatpb.space == XENMAPSPACE_gmfn_range )
1383             return -EOPNOTSUPP;
1384 
1385         d = rcu_lock_domain_by_any_id(xatpb.domid);
1386         if ( d == NULL )
1387             return -ESRCH;
1388 
1389         rc = xatp_permission_check(d, xatpb.space);
1390         if ( rc )
1391         {
1392             rcu_unlock_domain(d);
1393             return rc;
1394         }
1395 
1396         rc = xenmem_add_to_physmap_batch(d, &xatpb, start_extent);
1397 
1398         rcu_unlock_domain(d);
1399 
1400         if ( rc > 0 )
1401             rc = hypercall_create_continuation(
1402                     __HYPERVISOR_memory_op, "lh",
1403                     op | (rc << MEMOP_EXTENT_SHIFT), arg);
1404 
1405         return rc;
1406     }
1407 
1408     case XENMEM_remove_from_physmap:
1409     {
1410         struct xen_remove_from_physmap xrfp;
1411         struct page_info *page;
1412 
1413         if ( unlikely(start_extent) )
1414             return -EINVAL;
1415 
1416         if ( copy_from_guest(&xrfp, arg, 1) )
1417             return -EFAULT;
1418 
1419         d = rcu_lock_domain_by_any_id(xrfp.domid);
1420         if ( d == NULL )
1421             return -ESRCH;
1422 
1423         rc = paging_mode_translate(d)
1424              ? xsm_remove_from_physmap(XSM_TARGET, curr_d, d)
1425              : -EACCES;
1426         if ( rc )
1427         {
1428             rcu_unlock_domain(d);
1429             return rc;
1430         }
1431 
1432         page = get_page_from_gfn(d, xrfp.gpfn, NULL, P2M_ALLOC);
1433         if ( page )
1434         {
1435             rc = guest_physmap_remove_page(d, _gfn(xrfp.gpfn),
1436                                            page_to_mfn(page), 0);
1437             put_page(page);
1438         }
1439         else
1440             rc = -ENOENT;
1441 
1442         rcu_unlock_domain(d);
1443 
1444         break;
1445     }
1446 
1447     case XENMEM_access_op:
1448         rc = mem_access_memop(cmd, guest_handle_cast(arg, xen_mem_access_op_t));
1449         break;
1450 
1451     case XENMEM_claim_pages:
1452         if ( unlikely(start_extent) )
1453             return -EINVAL;
1454 
1455         if ( copy_from_guest(&reservation, arg, 1) )
1456             return -EFAULT;
1457 
1458         if ( !guest_handle_is_null(reservation.extent_start) )
1459             return -EINVAL;
1460 
1461         if ( reservation.extent_order != 0 )
1462             return -EINVAL;
1463 
1464         if ( reservation.mem_flags != 0 )
1465             return -EINVAL;
1466 
1467         d = rcu_lock_domain_by_id(reservation.domid);
1468         if ( d == NULL )
1469             return -EINVAL;
1470 
1471         rc = xsm_claim_pages(XSM_PRIV, d);
1472 
1473         if ( !rc )
1474             rc = domain_set_outstanding_pages(d, reservation.nr_extents);
1475 
1476         rcu_unlock_domain(d);
1477 
1478         break;
1479 
1480     case XENMEM_get_vnumainfo:
1481     {
1482         struct xen_vnuma_topology_info topology;
1483         unsigned int dom_vnodes, dom_vranges, dom_vcpus;
1484         struct vnuma_info tmp;
1485 
1486         if ( unlikely(start_extent) )
1487             return -EINVAL;
1488 
1489         /*
1490          * The guest passes nr_vnodes, the number of memory ranges and
1491          * nr_vcpus, so we know how much buffer space it has allocated.
1492          */
1493         if ( copy_from_guest(&topology, arg, 1 ))
1494             return -EFAULT;
1495 
1496         if ( topology.pad != 0 )
1497             return -EINVAL;
1498 
1499         if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
1500             return -ESRCH;
1501 
1502         rc = xsm_get_vnumainfo(XSM_TARGET, d);
1503         if ( rc )
1504         {
1505             rcu_unlock_domain(d);
1506             return rc;
1507         }
1508 
1509         read_lock(&d->vnuma_rwlock);
1510 
1511         if ( d->vnuma == NULL )
1512         {
1513             read_unlock(&d->vnuma_rwlock);
1514             rcu_unlock_domain(d);
1515             return -EOPNOTSUPP;
1516         }
1517 
1518         dom_vnodes = d->vnuma->nr_vnodes;
1519         dom_vranges = d->vnuma->nr_vmemranges;
1520         dom_vcpus = d->max_vcpus;
1521 
1522         /*
1523          * Values copied from the guest may differ from the domain's vNUMA
1524          * configuration. Check the guest-supplied parameters here to make
1525          * sure we don't overflow. Padding was already checked above.
1526          */
1527         if ( topology.nr_vnodes < dom_vnodes      ||
1528              topology.nr_vcpus < dom_vcpus        ||
1529              topology.nr_vmemranges < dom_vranges )
1530         {
1531             read_unlock(&d->vnuma_rwlock);
1532             rcu_unlock_domain(d);
1533 
1534             topology.nr_vnodes = dom_vnodes;
1535             topology.nr_vcpus = dom_vcpus;
1536             topology.nr_vmemranges = dom_vranges;
1537 
1538             /* Copy back needed values. */
1539             return __copy_to_guest(arg, &topology, 1) ? -EFAULT : -ENOBUFS;
1540         }
1541 
1542         read_unlock(&d->vnuma_rwlock);
1543 
1544         tmp.vdistance = xmalloc_array(unsigned int, dom_vnodes * dom_vnodes);
1545         tmp.vmemrange = xmalloc_array(xen_vmemrange_t, dom_vranges);
1546         tmp.vcpu_to_vnode = xmalloc_array(unsigned int, dom_vcpus);
1547 
1548         if ( tmp.vdistance == NULL ||
1549              tmp.vmemrange == NULL ||
1550              tmp.vcpu_to_vnode == NULL )
1551         {
1552             rc = -ENOMEM;
1553             goto vnumainfo_out;
1554         }
1555 
1556         /*
1557          * Check whether the vNUMA info has changed in the meantime and
1558          * whether the allocated arrays are still big enough.
1559          */
1560         read_lock(&d->vnuma_rwlock);
1561 
1562         if ( dom_vnodes < d->vnuma->nr_vnodes ||
1563              dom_vranges < d->vnuma->nr_vmemranges ||
1564              dom_vcpus < d->max_vcpus )
1565         {
1566             read_unlock(&d->vnuma_rwlock);
1567             rc = -EAGAIN;
1568             goto vnumainfo_out;
1569         }
1570 
1571         dom_vnodes = d->vnuma->nr_vnodes;
1572         dom_vranges = d->vnuma->nr_vmemranges;
1573         dom_vcpus = d->max_vcpus;
1574 
1575         memcpy(tmp.vmemrange, d->vnuma->vmemrange,
1576                sizeof(*d->vnuma->vmemrange) * dom_vranges);
1577         memcpy(tmp.vdistance, d->vnuma->vdistance,
1578                sizeof(*d->vnuma->vdistance) * dom_vnodes * dom_vnodes);
1579         memcpy(tmp.vcpu_to_vnode, d->vnuma->vcpu_to_vnode,
1580                sizeof(*d->vnuma->vcpu_to_vnode) * dom_vcpus);
1581 
1582         read_unlock(&d->vnuma_rwlock);
1583 
1584         rc = -EFAULT;
1585 
1586         if ( copy_to_guest(topology.vmemrange.h, tmp.vmemrange,
1587                            dom_vranges) != 0 )
1588             goto vnumainfo_out;
1589 
1590         if ( copy_to_guest(topology.vdistance.h, tmp.vdistance,
1591                            dom_vnodes * dom_vnodes) != 0 )
1592             goto vnumainfo_out;
1593 
1594         if ( copy_to_guest(topology.vcpu_to_vnode.h, tmp.vcpu_to_vnode,
1595                            dom_vcpus) != 0 )
1596             goto vnumainfo_out;
1597 
1598         topology.nr_vnodes = dom_vnodes;
1599         topology.nr_vcpus = dom_vcpus;
1600         topology.nr_vmemranges = dom_vranges;
1601 
1602         rc = __copy_to_guest(arg, &topology, 1) ? -EFAULT : 0;
1603 
1604  vnumainfo_out:
1605         rcu_unlock_domain(d);
1606 
1607         xfree(tmp.vdistance);
1608         xfree(tmp.vmemrange);
1609         xfree(tmp.vcpu_to_vnode);
1610         break;
1611     }
1612 
1613 #ifdef CONFIG_HAS_PASSTHROUGH
1614     case XENMEM_reserved_device_memory_map:
1615     {
1616         struct get_reserved_device_memory grdm;
1617 
1618         if ( unlikely(start_extent) )
1619             return -EINVAL;
1620 
1621         if ( copy_from_guest(&grdm.map, arg, 1) ||
1622              !guest_handle_okay(grdm.map.buffer, grdm.map.nr_entries) )
1623             return -EFAULT;
1624 
1625         if ( grdm.map.flags & ~XENMEM_RDM_ALL )
1626             return -EINVAL;
1627 
1628         grdm.used_entries = 0;
1629         rc = iommu_get_reserved_device_memory(get_reserved_device_memory,
1630                                               &grdm);
1631 
1632         if ( !rc && grdm.map.nr_entries < grdm.used_entries )
1633             rc = -ENOBUFS;
1634         grdm.map.nr_entries = grdm.used_entries;
1635         if ( __copy_to_guest(arg, &grdm.map, 1) )
1636             rc = -EFAULT;
1637 
1638         break;
1639     }
1640 #endif
1641 
1642     case XENMEM_acquire_resource:
1643         rc = acquire_resource(
1644             guest_handle_cast(arg, xen_mem_acquire_resource_t));
1645         break;
1646 
1647     default:
1648         rc = arch_memory_op(cmd, arg);
1649         break;
1650     }
1651 
1652     return rc;
1653 }
1654 
1655 void clear_domain_page(mfn_t mfn)
1656 {
1657     void *ptr = map_domain_page(mfn);
1658 
1659     clear_page(ptr);
1660     unmap_domain_page(ptr);
1661 }
1662 
1663 void copy_domain_page(mfn_t dest, mfn_t source)
1664 {
1665     const void *src = map_domain_page(source);
1666     void *dst = map_domain_page(dest);
1667 
1668     copy_page(dst, src);
1669     unmap_domain_page(dst);
1670     unmap_domain_page(src);
1671 }
1672 
1673 void destroy_ring_for_helper(
1674     void **_va, struct page_info *page)
1675 {
1676     void *va = *_va;
1677 
1678     if ( va != NULL )
1679     {
1680         unmap_domain_page_global(va);
1681         put_page_and_type(page);
1682         *_va = NULL;
1683     }
1684 }
1685 
1686 /*
1687  * Acquire a pointer to struct page_info for a specified domain and GFN,
1688  * checking whether the page has been paged out, or needs unsharing.
1689  * If the function succeeds then zero is returned, page_p is written
1690  * with a pointer to the struct page_info with a reference taken, and
1691  * p2mt_p is written with the P2M type of the page. The caller is
1692  * responsible for dropping the reference.
1693  * If the function fails then an appropriate errno is returned and the
1694  * values referenced by page_p and p2mt_p are undefined.
1695  */
1696 int check_get_page_from_gfn(struct domain *d, gfn_t gfn, bool readonly,
1697                             p2m_type_t *p2mt_p, struct page_info **page_p)
1698 {
1699     p2m_query_t q = readonly ? P2M_ALLOC : P2M_UNSHARE;
1700     p2m_type_t p2mt;
1701     struct page_info *page;
1702 
1703     page = get_page_from_gfn(d, gfn_x(gfn), &p2mt, q);
1704 
1705 #ifdef CONFIG_HAS_MEM_PAGING
1706     if ( p2m_is_paging(p2mt) )
1707     {
1708         if ( page )
1709             put_page(page);
1710 
1711         p2m_mem_paging_populate(d, gfn);
1712         return -EAGAIN;
1713     }
1714 #endif
1715 #ifdef CONFIG_MEM_SHARING
1716     if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
1717     {
1718         if ( page )
1719             put_page(page);
1720 
1721         return -EAGAIN;
1722     }
1723 #endif
1724 
1725     if ( !page )
1726         return -EINVAL;
1727 
1728     *p2mt_p = p2mt;
1729     *page_p = page;
1730     return 0;
1731 }
1732 
1733 int prepare_ring_for_helper(
1734     struct domain *d, unsigned long gmfn, struct page_info **_page,
1735     void **_va)
1736 {
1737     p2m_type_t p2mt;
1738     struct page_info *page;
1739     void *va;
1740     int rc;
1741 
1742     rc = check_get_page_from_gfn(d, _gfn(gmfn), false, &p2mt, &page);
1743     if ( rc )
1744         return (rc == -EAGAIN) ? -ENOENT : rc;
1745 
1746     if ( !get_page_type(page, PGT_writable_page) )
1747     {
1748         put_page(page);
1749         return -EINVAL;
1750     }
1751 
1752     va = __map_domain_page_global(page);
1753     if ( va == NULL )
1754     {
1755         put_page_and_type(page);
1756         return -ENOMEM;
1757     }
1758 
1759     *_va = va;
1760     *_page = page;
1761 
1762     return 0;
1763 }
1764 
1765 /*
1766  * Local variables:
1767  * mode: C
1768  * c-file-style: "BSD"
1769  * c-basic-offset: 4
1770  * tab-width: 4
1771  * indent-tabs-mode: nil
1772  * End:
1773  */
1774