1 /******************************************************************************
2  * arch/x86/mm/mem_sharing.c
3  *
4  * Memory sharing support.
5  *
6  * Copyright (c) 2011 GridCentric, Inc. (Adin Scannell & Andres Lagar-Cavilla)
7  * Copyright (c) 2009 Citrix Systems, Inc. (Grzegorz Milos)
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; If not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include <xen/types.h>
24 #include <xen/domain_page.h>
25 #include <xen/event.h>
26 #include <xen/spinlock.h>
27 #include <xen/rwlock.h>
28 #include <xen/mm.h>
29 #include <xen/grant_table.h>
30 #include <xen/sched.h>
31 #include <xen/rcupdate.h>
32 #include <xen/guest_access.h>
33 #include <xen/vm_event.h>
34 #include <asm/page.h>
35 #include <asm/string.h>
36 #include <asm/p2m.h>
37 #include <asm/altp2m.h>
38 #include <asm/atomic.h>
39 #include <asm/event.h>
40 #include <asm/hap.h>
41 #include <asm/hvm/hvm.h>
42 #include <xsm/xsm.h>
43 
44 #include <public/hvm/params.h>
45 
46 #include "mm-locks.h"
47 
48 static shr_handle_t next_handle = 1;
49 
50 typedef struct pg_lock_data {
51     int mm_unlock_level;
52     unsigned short recurse_count;
53 } pg_lock_data_t;
54 
55 static DEFINE_PER_CPU(pg_lock_data_t, __pld);
56 
57 /* Reverse map defines */
58 #define RMAP_HASHTAB_ORDER  0
59 #define RMAP_HASHTAB_SIZE   \
60         ((PAGE_SIZE << RMAP_HASHTAB_ORDER) / sizeof(struct list_head))
61 #define RMAP_USES_HASHTAB(page) \
62         ((page)->sharing->hash_table.flag == NULL)
63 #define RMAP_HEAVY_SHARED_PAGE   RMAP_HASHTAB_SIZE
64 /*
65  * A bit of hysteresis. We don't want to be constantly converting between
66  * the list and the hash table.
67  */
68 #define RMAP_LIGHT_SHARED_PAGE   (RMAP_HEAVY_SHARED_PAGE >> 2)
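
/*
 * For illustration (assuming 4KiB pages and a 16-byte struct list_head):
 * RMAP_HASHTAB_SIZE works out to 4096 / 16 = 256 buckets, so an rmap is
 * converted to a hash table once it reaches RMAP_HEAVY_SHARED_PAGE = 256
 * entries, and only collapsed back to a plain list once it shrinks to
 * RMAP_LIGHT_SHARED_PAGE = 256 >> 2 = 64 entries.
 */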
69 
70 #if MEM_SHARING_AUDIT
71 
72 static LIST_HEAD(shr_audit_list);
73 static DEFINE_SPINLOCK(shr_audit_lock);
74 static DEFINE_RCU_READ_LOCK(shr_audit_read_lock);
75 
76 /* RCU delayed free of audit list entry */
77 static void _free_pg_shared_info(struct rcu_head *head)
78 {
79     xfree(container_of(head, struct page_sharing_info, rcu_head));
80 }
81 
82 static void audit_add_list(struct page_info *page)
83 {
84     INIT_LIST_HEAD(&page->sharing->entry);
85     spin_lock(&shr_audit_lock);
86     list_add_rcu(&page->sharing->entry, &shr_audit_list);
87     spin_unlock(&shr_audit_lock);
88 }
89 
90 /* Removes from the audit list and cleans up the page sharing metadata. */
91 static void page_sharing_dispose(struct page_info *page)
92 {
93     /* Unlikely given our thresholds, but we should be careful. */
94     if ( unlikely(RMAP_USES_HASHTAB(page)) )
95         free_xenheap_pages(page->sharing->hash_table.bucket,
96                            RMAP_HASHTAB_ORDER);
97 
98     spin_lock(&shr_audit_lock);
99     list_del_rcu(&page->sharing->entry);
100     spin_unlock(&shr_audit_lock);
101     INIT_RCU_HEAD(&page->sharing->rcu_head);
102     call_rcu(&page->sharing->rcu_head, _free_pg_shared_info);
103 }
104 
105 #else
106 
107 #define audit_add_list(p)  ((void)0)
108 static void page_sharing_dispose(struct page_info *page)
109 {
110     /* Unlikely given our thresholds, but we should be careful. */
111     if ( unlikely(RMAP_USES_HASHTAB(page)) )
112         free_xenheap_pages(page->sharing->hash_table.bucket,
113                            RMAP_HASHTAB_ORDER);
114     xfree(page->sharing);
115 }
116 
117 #endif /* MEM_SHARING_AUDIT */
118 
119 /*
120  * Private implementations of page_lock/unlock to bypass PV-only
121  * sanity checks not applicable to mem-sharing.
122  *
123  * _page_lock is used in memory sharing to protect addition (share) and removal
124  * (unshare) of (gfn,domain) tuples to a list of gfns that the shared page is
125  * currently backing.
126  * Nesting may happen when sharing (and locking) two pages.
127  * Deadlock is avoided by locking pages in increasing order.
128  * All memory sharing code paths take the p2m lock of the affected gfn before
129  * taking the lock for the underlying page. We enforce ordering between
130  * page_lock and p2m_lock using an mm-locks.h construct.
131  *
132  * TODO: Investigate if PGT_validated is necessary.
133  */
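/*
 * Concretely, the "increasing order" rule above can be seen in
 * share_pages() below, which always grabs and locks the page with the
 * numerically lower MFN first when operating on two pages.
 */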
134 static bool _page_lock(struct page_info *page)
135 {
136     unsigned long x, nx;
137 
138     do {
139         while ( (x = page->u.inuse.type_info) & PGT_locked )
140             cpu_relax();
141         nx = x + (1 | PGT_locked);
142         if ( !(x & PGT_validated) ||
143              !(x & PGT_count_mask) ||
144              !(nx & PGT_count_mask) )
145             return false;
146     } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
147 
148     return true;
149 }
150 
151 static void _page_unlock(struct page_info *page)
152 {
153     unsigned long x, nx, y = page->u.inuse.type_info;
154 
155     do {
156         x = y;
157         ASSERT((x & PGT_count_mask) && (x & PGT_locked));
158 
159         nx = x - (1 | PGT_locked);
160         /* We must not drop the last reference here. */
161         ASSERT(nx & PGT_count_mask);
162     } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
163 }
164 
165 static bool mem_sharing_page_lock(struct page_info *pg)
166 {
167     bool rc;
168     pg_lock_data_t *pld = &(this_cpu(__pld));
169 
170     page_sharing_mm_pre_lock();
171     rc = _page_lock(pg);
172     if ( rc )
173     {
174         preempt_disable();
175         page_sharing_mm_post_lock(&pld->mm_unlock_level,
176                                   &pld->recurse_count);
177     }
178     return rc;
179 }
180 
181 static void mem_sharing_page_unlock(struct page_info *pg)
182 {
183     pg_lock_data_t *pld = &(this_cpu(__pld));
184 
185     page_sharing_mm_unlock(pld->mm_unlock_level,
186                            &pld->recurse_count);
187     preempt_enable();
188     _page_unlock(pg);
189 }
190 
191 static shr_handle_t get_next_handle(void)
192 {
193     /* Get the next handle get_page style */
194     uint64_t x, y = next_handle;
195     do {
196         x = y;
197     }
198     while ( (y = cmpxchg(&next_handle, x, x + 1)) != x );
199     return x + 1;
200 }
201 
202 static atomic_t nr_saved_mfns   = ATOMIC_INIT(0);
203 static atomic_t nr_shared_mfns  = ATOMIC_INIT(0);
204 
205 /*
206  * Reverse map
207  *
208  * Every shared frame keeps a reverse map (rmap) of <domain, gfn> tuples that
209  * this shared frame backs. For pages with a low degree of sharing, an
210  * O(n) search through a linked list is good enough. For pages with a
211  * higher degree of sharing, we use a hash table instead.
212  */
213 
214 typedef struct gfn_info
215 {
216     unsigned long gfn;
217     domid_t domain;
218     struct list_head list;
219 } gfn_info_t;
220 
221 static void rmap_init(struct page_info *page)
222 {
223     /* We always start off as a doubly linked list. */
224     INIT_LIST_HEAD(&page->sharing->gfns);
225 }
226 
227 /* Exceedingly simple "hash function" */
228 #define HASH(domain, gfn)       \
229     (((gfn) + (domain)) % RMAP_HASHTAB_SIZE)
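
/*
 * Worked example (assuming the 256-bucket table sketched above):
 * HASH(5, 0x1234) = (0x1234 + 5) % 256 = 0x39. Collisions are common by
 * design; each bucket is simply a short list searched linearly.
 */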
230 
231 /*
232  * Conversions. Tuned by the thresholds. Should only happen twice
233  * (once each) during the lifetime of a shared page.
234  */
235 static inline int rmap_list_to_hash_table(struct page_info *page)
236 {
237     unsigned int i;
238     struct list_head *pos, *tmp, *b =
239         alloc_xenheap_pages(RMAP_HASHTAB_ORDER, 0);
240 
241     if ( b == NULL )
242         return -ENOMEM;
243 
244     for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ )
245         INIT_LIST_HEAD(b + i);
246 
247     list_for_each_safe ( pos, tmp, &page->sharing->gfns )
248     {
249         gfn_info_t *gfn_info = list_entry(pos, gfn_info_t, list);
250         struct list_head *bucket = b + HASH(gfn_info->domain, gfn_info->gfn);
251 
252         list_del(pos);
253         list_add(pos, bucket);
254     }
255 
256     page->sharing->hash_table.bucket = b;
257     page->sharing->hash_table.flag   = NULL;
258 
259     return 0;
260 }
261 
262 static void rmap_hash_table_to_list(struct page_info *page)
263 {
264     unsigned int i;
265     struct list_head *bucket = page->sharing->hash_table.bucket;
266 
267     INIT_LIST_HEAD(&page->sharing->gfns);
268 
269     for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ )
270     {
271         struct list_head *pos, *tmp, *head = bucket + i;
272 
273         list_for_each_safe ( pos, tmp, head )
274         {
275             list_del(pos);
276             list_add(pos, &page->sharing->gfns);
277         }
278     }
279 
280     free_xenheap_pages(bucket, RMAP_HASHTAB_ORDER);
281 }
282 
283 /* Generic accessors to the rmap */
284 static unsigned long rmap_count(const struct page_info *pg)
285 {
286     unsigned long count;
287     unsigned long t = read_atomic(&pg->u.inuse.type_info);
288 
289     count = t & PGT_count_mask;
290     if ( t & PGT_locked )
291         count--;
292     return count;
293 }
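
/*
 * For example, a type_info count of 3 on a page that is currently locked
 * yields a count of 2 here, since holding the lock itself accounts for one
 * type reference (see the matching check in audit() below).
 */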
294 
295 /*
296  * The page type count is always decreased after removing from the rmap.
297  * Use a convert flag to avoid mutating the rmap if in the middle of an
298  * iterator, or if the page will soon be destroyed anyway.
299  */
300 static void rmap_del(gfn_info_t *gfn_info, struct page_info *page, int convert)
301 {
302     if ( RMAP_USES_HASHTAB(page) && convert &&
303          (rmap_count(page) <= RMAP_LIGHT_SHARED_PAGE) )
304         rmap_hash_table_to_list(page);
305 
306     /* Regardless of rmap type, same removal operation */
307     list_del(&gfn_info->list);
308 }
309 
310 /* The page type count is always increased before adding to the rmap. */
311 static void rmap_add(gfn_info_t *gfn_info, struct page_info *page)
312 {
313     struct list_head *head;
314 
315     if ( !RMAP_USES_HASHTAB(page) &&
316          (rmap_count(page) >= RMAP_HEAVY_SHARED_PAGE) )
317         /*
318          * The conversion may fail with ENOMEM. We'll be less efficient,
319          * but no reason to panic.
320          */
321         (void)rmap_list_to_hash_table(page);
322 
323     head = (RMAP_USES_HASHTAB(page)
324             ? page->sharing->hash_table.bucket + HASH(gfn_info->domain,
325                                                       gfn_info->gfn)
326             : &page->sharing->gfns);
327 
328     INIT_LIST_HEAD(&gfn_info->list);
329     list_add(&gfn_info->list, head);
330 }
331 
332 static gfn_info_t *rmap_retrieve(uint16_t domain_id, unsigned long gfn,
333                                  struct page_info *page)
334 {
335     gfn_info_t *gfn_info;
336     struct list_head *le, *head;
337 
338     head = (RMAP_USES_HASHTAB(page)
339             ? page->sharing->hash_table.bucket + HASH(domain_id, gfn)
340             : &page->sharing->gfns);
341 
342     list_for_each ( le, head )
343     {
344         gfn_info = list_entry(le, gfn_info_t, list);
345         if ( (gfn_info->gfn == gfn) && (gfn_info->domain == domain_id) )
346             return gfn_info;
347     }
348 
349     /* Nothing was found */
350     return NULL;
351 }
352 
353 /*
354  * The iterator hides the details of how the rmap is implemented. This
355  * involves splitting the list_for_each_safe macro into two steps.
356  */
357 struct rmap_iterator {
358     struct list_head *curr;
359     struct list_head *next;
360     unsigned int bucket;
361 };
362 
363 static void rmap_seed_iterator(struct page_info *page, struct rmap_iterator *ri)
364 {
365     ri->curr = (RMAP_USES_HASHTAB(page)
366                 ? page->sharing->hash_table.bucket
367                 : &page->sharing->gfns);
368     ri->next = ri->curr->next;
369     ri->bucket = 0;
370 }
371 
372 static gfn_info_t *rmap_iterate(struct page_info *page,
373                                 struct rmap_iterator *ri)
374 {
375     struct list_head *head = (RMAP_USES_HASHTAB(page)
376                               ? page->sharing->hash_table.bucket + ri->bucket
377                               : &page->sharing->gfns);
378 
379  retry:
380     if ( ri->next == head )
381     {
382         if ( RMAP_USES_HASHTAB(page) )
383         {
384             ri->bucket++;
385             if ( ri->bucket >= RMAP_HASHTAB_SIZE )
386                 /* No more hash table buckets */
387                 return NULL;
388             head = page->sharing->hash_table.bucket + ri->bucket;
389             ri->curr = head;
390             ri->next = ri->curr->next;
391             goto retry;
392         }
393         else
394             /* List exhausted */
395             return NULL;
396     }
397 
398     ri->curr = ri->next;
399     ri->next = ri->curr->next;
400 
401     return list_entry(ri->curr, gfn_info_t, list);
402 }
403 
404 static gfn_info_t *mem_sharing_gfn_alloc(struct page_info *page,
405                                          struct domain *d, unsigned long gfn)
406 {
407     gfn_info_t *gfn_info = xmalloc(gfn_info_t);
408 
409     if ( gfn_info == NULL )
410         return NULL;
411 
412     gfn_info->gfn = gfn;
413     gfn_info->domain = d->domain_id;
414 
415     rmap_add(gfn_info, page);
416 
417     /* Increment our number of shared pages. */
418     atomic_inc(&d->shr_pages);
419 
420     return gfn_info;
421 }
422 
423 static void mem_sharing_gfn_destroy(struct page_info *page, struct domain *d,
424                                     gfn_info_t *gfn_info)
425 {
426     /* Decrement the number of shared pages. */
427     atomic_dec(&d->shr_pages);
428 
429     /* Free the gfn_info structure. */
430     rmap_del(gfn_info, page, 1);
431     xfree(gfn_info);
432 }
433 
434 static struct page_info *mem_sharing_lookup(unsigned long mfn)
435 {
436     struct page_info *page;
437     unsigned long t;
438 
439     if ( !mfn_valid(_mfn(mfn)) )
440         return NULL;
441 
442     page = mfn_to_page(_mfn(mfn));
443     if ( page_get_owner(page) != dom_cow )
444         return NULL;
445 
446     /*
447      * Count has to be at least two, because we're called
448      * with the mfn locked (1) and this is supposed to be
449      * a shared page (1).
450      */
451     t = read_atomic(&page->u.inuse.type_info);
452     ASSERT((t & PGT_type_mask) == PGT_shared_page);
453     ASSERT((t & PGT_count_mask) >= 2);
454     ASSERT(SHARED_M2P(get_gpfn_from_mfn(mfn)));
455 
456     return page;
457 }
458 
459 static int audit(void)
460 {
461 #if MEM_SHARING_AUDIT
462     int errors = 0;
463     unsigned long count_expected;
464     unsigned long count_found = 0;
465     struct list_head *ae;
466 
467     count_expected = atomic_read(&nr_shared_mfns);
468 
469     rcu_read_lock(&shr_audit_read_lock);
470 
471     list_for_each_rcu ( ae, &shr_audit_list )
472     {
473         struct page_sharing_info *pg_shared_info;
474         unsigned long nr_gfns = 0;
475         struct page_info *pg;
476         mfn_t mfn;
477         gfn_info_t *g;
478         struct rmap_iterator ri;
479 
480         pg_shared_info = list_entry(ae, struct page_sharing_info, entry);
481         pg = pg_shared_info->pg;
482         mfn = page_to_mfn(pg);
483 
484         /* If we can't lock it, it's definitely not a shared page */
485         if ( !mem_sharing_page_lock(pg) )
486         {
487             gdprintk(XENLOG_ERR,
488                      "mfn %lx in audit list, but cannot be locked (%lx)!\n",
489                      mfn_x(mfn), pg->u.inuse.type_info);
490             errors++;
491             continue;
492         }
493 
494         /* Check if the MFN has correct type, owner and handle. */
495         if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_shared_page )
496         {
497             gdprintk(XENLOG_ERR,
498                      "mfn %lx in audit list, but not PGT_shared_page (%lx)!\n",
499                      mfn_x(mfn), pg->u.inuse.type_info & PGT_type_mask);
500             errors++;
501             continue;
502         }
503 
504         /* Check the page owner. */
505         if ( page_get_owner(pg) != dom_cow )
506         {
507             gdprintk(XENLOG_ERR, "mfn %lx shared, but wrong owner (%pd)!\n",
508                      mfn_x(mfn), page_get_owner(pg));
509             errors++;
510         }
511 
512         /* Check the m2p entry */
513         if ( !SHARED_M2P(get_gpfn_from_mfn(mfn_x(mfn))) )
514         {
515             gdprintk(XENLOG_ERR, "mfn %lx shared, but wrong m2p entry (%lx)!\n",
516                      mfn_x(mfn), get_gpfn_from_mfn(mfn_x(mfn)));
517             errors++;
518         }
519 
520         /* Check we have a list */
521         if ( (!pg->sharing) || rmap_count(pg) == 0 )
522         {
523             gdprintk(XENLOG_ERR, "mfn %lx shared, but empty gfn list!\n",
524                      mfn_x(mfn));
525             errors++;
526             continue;
527         }
528 
529         /* We've found a page that is shared */
530         count_found++;
531 
532         /* Check if all GFNs map to the MFN, and the p2m types */
533         rmap_seed_iterator(pg, &ri);
534         while ( (g = rmap_iterate(pg, &ri)) != NULL )
535         {
536             struct domain *d;
537             p2m_type_t t;
538             mfn_t o_mfn;
539 
540             d = get_domain_by_id(g->domain);
541             if ( d == NULL )
542             {
543                 gdprintk(XENLOG_ERR,
544                          "Unknown dom: %d, for PFN=%lx, MFN=%lx\n",
545                          g->domain, g->gfn, mfn_x(mfn));
546                 errors++;
547                 continue;
548             }
549             o_mfn = get_gfn_query_unlocked(d, g->gfn, &t);
550             if ( !mfn_eq(o_mfn, mfn) )
551             {
552                 gdprintk(XENLOG_ERR, "Incorrect P2M for %pd, PFN=%lx. "
553                          "Expecting MFN=%lx, got %lx\n",
554                          d, g->gfn, mfn_x(mfn), mfn_x(o_mfn));
555                 errors++;
556             }
557             if ( t != p2m_ram_shared )
558             {
559                 gdprintk(XENLOG_ERR,
560                          "Incorrect P2M type for %pd, PFN=%lx MFN=%lx. "
561                          "Expecting t=%d, got %d\n",
562                          d, g->gfn, mfn_x(mfn), p2m_ram_shared, t);
563                 errors++;
564             }
565             put_domain(d);
566             nr_gfns++;
567         }
568         /* The type count has an extra ref because we have locked the page */
569         if ( (nr_gfns + 1) != (pg->u.inuse.type_info & PGT_count_mask) )
570         {
571             gdprintk(XENLOG_ERR, "Mismatched counts for MFN=%lx. "
572                      "nr_gfns in list %lu, in type_info %lx\n",
573                      mfn_x(mfn), nr_gfns,
574                      (pg->u.inuse.type_info & PGT_count_mask));
575             errors++;
576         }
577 
578         mem_sharing_page_unlock(pg);
579     }
580 
581     rcu_read_unlock(&shr_audit_read_lock);
582 
583     if ( count_found != count_expected )
584     {
585         gdprintk(XENLOG_ERR, "Expected %lu shared mfns, found %lu.\n",
586                  count_expected, count_found);
587         errors++;
588     }
589 
590     return errors;
591 #else
592     return -EOPNOTSUPP;
593 #endif
594 }
595 
596 int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
597                               bool allow_sleep)
598 {
599     struct vcpu *v = current;
600     int rc;
601     vm_event_request_t req = {
602         .reason = VM_EVENT_REASON_MEM_SHARING,
603         .vcpu_id = v->vcpu_id,
604         .u.mem_sharing.gfn = gfn,
605         .u.mem_sharing.p2mt = p2m_ram_shared,
606     };
607 
608     if ( (rc = __vm_event_claim_slot(
609               d, d->vm_event_share, allow_sleep)) < 0 )
610         return rc;
611 
612     if ( v->domain == d )
613     {
614         req.flags = VM_EVENT_FLAG_VCPU_PAUSED;
615         vm_event_vcpu_pause(v);
616     }
617 
618     vm_event_put_request(d, d->vm_event_share, &req);
619 
620     return 0;
621 }
622 
623 unsigned int mem_sharing_get_nr_saved_mfns(void)
624 {
625     return atomic_read(&nr_saved_mfns);
626 }
627 
628 unsigned int mem_sharing_get_nr_shared_mfns(void)
629 {
630     return atomic_read(&nr_shared_mfns);
631 }
632 
633 /* Functions that change a page's type and ownership */
634 static int page_make_sharable(struct domain *d,
635                               struct page_info *page,
636                               unsigned int expected_refcnt,
637                               bool validate_only)
638 {
639     int rc = 0;
640     bool drop_dom_ref = false;
641 
642     spin_lock_recursive(&d->page_alloc_lock);
643 
644     if ( d->is_dying )
645     {
646         rc = -EBUSY;
647         goto out;
648     }
649 
650     /* Change page type and count atomically */
651     if ( !get_page_and_type(page, d, PGT_shared_page) )
652     {
653         rc = -EINVAL;
654         goto out;
655     }
656 
657     /* Check it wasn't already sharable and undo if it was */
658     if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
659     {
660         put_page_and_type(page);
661         rc = -EEXIST;
662         goto out;
663     }
664 
665     /*
666      * Check the ref count is 2 + expected_refcnt: one for PGC_allocated,
667      * one from get_page_and_type above, plus the caller's expected refs.
668      */
669     if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
670     {
671         /* Return type count back to zero */
672         put_page_and_type(page);
673         rc = -E2BIG;
674         goto out;
675     }
676 
677     if ( !validate_only )
678     {
679         page_set_owner(page, dom_cow);
680         drop_dom_ref = !domain_adjust_tot_pages(d, -1);
681         page_list_del(page, &d->page_list);
682     }
683 
684 out:
685     spin_unlock_recursive(&d->page_alloc_lock);
686 
687     if ( drop_dom_ref )
688         put_domain(d);
689 
690     return rc;
691 }
692 
693 static int page_make_private(struct domain *d, struct page_info *page)
694 {
695     unsigned long expected_type;
696 
697     if ( !get_page(page, dom_cow) )
698         return -EINVAL;
699 
700     spin_lock(&d->page_alloc_lock);
701 
702     if ( d->is_dying )
703     {
704         spin_unlock(&d->page_alloc_lock);
705         put_page(page);
706         return -EBUSY;
707     }
708 
709     expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
710     if ( page->u.inuse.type_info != expected_type )
711     {
712         spin_unlock(&d->page_alloc_lock);
713         put_page(page);
714         return -EEXIST;
715     }
716 
717     mem_sharing_page_unlock(page);
718 
719     /* Drop the final typecount */
720     put_page_and_type(page);
721 
722     /* Change the owner */
723     ASSERT(page_get_owner(page) == dom_cow);
724     page_set_owner(page, d);
725 
726     if ( domain_adjust_tot_pages(d, 1) == 1 )
727         get_knownalive_domain(d);
728     page_list_add_tail(page, &d->page_list);
729     spin_unlock(&d->page_alloc_lock);
730 
731     put_page(page);
732 
733     return 0;
734 }
735 
736 static struct page_info *__grab_shared_page(mfn_t mfn)
737 {
738     struct page_info *pg = NULL;
739 
740     if ( !mfn_valid(mfn) )
741         return NULL;
742 
743     pg = mfn_to_page(mfn);
744 
745     /*
746      * If the page is not validated we can't lock it, and if it's
747      * not validated it's obviously not shared.
748      */
749     if ( !mem_sharing_page_lock(pg) )
750         return NULL;
751 
752     if ( mem_sharing_lookup(mfn_x(mfn)) == NULL )
753     {
754         mem_sharing_page_unlock(pg);
755         return NULL;
756     }
757 
758     return pg;
759 }
760 
761 static int debug_mfn(mfn_t mfn)
762 {
763     struct page_info *page;
764     int num_refs;
765 
766     if ( (page = __grab_shared_page(mfn)) == NULL )
767     {
768         gdprintk(XENLOG_ERR, "Invalid MFN=%lx\n", mfn_x(mfn));
769         return -EINVAL;
770     }
771 
772     gdprintk(XENLOG_ERR,
773              "Debug page: MFN=%lx is ci=%lx, ti=%lx, owner_id=%pd\n",
774              mfn_x(page_to_mfn(page)), page->count_info,
775              page->u.inuse.type_info, page_get_owner(page));
776 
777     /* -1 because the page is locked and that's an additional type ref */
778     num_refs = ((int) (page->u.inuse.type_info & PGT_count_mask)) - 1;
779     mem_sharing_page_unlock(page);
780     return num_refs;
781 }
782 
783 static int debug_gfn(struct domain *d, gfn_t gfn)
784 {
785     p2m_type_t p2mt;
786     mfn_t mfn;
787     int num_refs;
788 
789     mfn = get_gfn_query(d, gfn_x(gfn), &p2mt);
790 
791     gdprintk(XENLOG_ERR, "Debug for %pd, gfn=%" PRI_gfn "\n",
792              d, gfn_x(gfn));
793 
794     num_refs = debug_mfn(mfn);
795     put_gfn(d, gfn_x(gfn));
796 
797     return num_refs;
798 }
799 
800 static int debug_gref(struct domain *d, grant_ref_t ref)
801 {
802     int rc;
803     uint16_t status;
804     gfn_t gfn;
805 
806     rc = mem_sharing_gref_to_gfn(d->grant_table, ref, &gfn, &status);
807     if ( rc )
808     {
809         gdprintk(XENLOG_ERR, "Asked to debug [%pd,gref=%u]: error %d.\n",
810                  d, ref, rc);
811         return rc;
812     }
813 
814     gdprintk(XENLOG_ERR, "==> Grant [%pd,ref=%d], status=%x. ",
815              d, ref, status);
816 
817     return debug_gfn(d, gfn);
818 }
819 
820 static int nominate_page(struct domain *d, gfn_t gfn,
821                          unsigned int expected_refcnt, bool validate_only,
822                          shr_handle_t *phandle)
823 {
824     struct p2m_domain *hp2m = p2m_get_hostp2m(d);
825     p2m_type_t p2mt;
826     p2m_access_t p2ma;
827     mfn_t mfn;
828     struct page_info *page = NULL; /* gcc... */
829     int ret;
830 
831     *phandle = 0UL;
832 
833     mfn = get_gfn_type_access(hp2m, gfn_x(gfn), &p2mt, &p2ma, 0, NULL);
834 
835     /* Check if mfn is valid */
836     ret = -EINVAL;
837     if ( !mfn_valid(mfn) )
838         goto out;
839 
840     /* Return the handle if the page is already shared */
841     if ( p2m_is_shared(p2mt) )
842     {
843         struct page_info *pg = __grab_shared_page(mfn);
844         if ( !pg )
845             BUG();
846 
847         *phandle = pg->sharing->handle;
848         ret = 0;
849         mem_sharing_page_unlock(pg);
850         goto out;
851     }
852 
853     /* Check p2m type */
854     if ( !p2m_is_sharable(p2mt) )
855         goto out;
856 
857     page = mfn_to_page(mfn);
858     if ( !page || is_special_page(page) )
859         goto out;
860 
861     /* Check if there are mem_access/remapped altp2m entries for this page */
862     if ( altp2m_active(d) )
863     {
864         unsigned int i;
865         struct p2m_domain *ap2m;
866         mfn_t amfn;
867         p2m_type_t ap2mt;
868         p2m_access_t ap2ma;
869 
870         altp2m_list_lock(d);
871 
872         for ( i = 0; i < MAX_ALTP2M; i++ )
873         {
874             ap2m = d->arch.altp2m_p2m[i];
875             if ( !ap2m )
876                 continue;
877 
878             amfn = __get_gfn_type_access(ap2m, gfn_x(gfn), &ap2mt, &ap2ma,
879                                          0, NULL, false);
880             if ( mfn_valid(amfn) && (!mfn_eq(amfn, mfn) || ap2ma != p2ma) )
881             {
882                 altp2m_list_unlock(d);
883                 goto out;
884             }
885         }
886 
887         altp2m_list_unlock(d);
888     }
889 
890     /* Try to convert the mfn to the sharable type */
891     ret = page_make_sharable(d, page, expected_refcnt, validate_only);
892     if ( ret || validate_only )
893         goto out;
894 
895     /*
896      * Now that the page is validated, we can lock it. There is no
897      * race because we're holding the p2m entry, so no one else
898      * could be nominating this gfn.
899      */
900     ret = -ENOENT;
901     if ( !mem_sharing_page_lock(page) )
902         goto out;
903 
904     /* Initialize the shared state */
905     ret = -ENOMEM;
906     if ( !(page->sharing = xmalloc(struct page_sharing_info)) )
907     {
908         /* Making a page private atomically unlocks it */
909         BUG_ON(page_make_private(d, page));
910         goto out;
911     }
912     page->sharing->pg = page;
913     rmap_init(page);
914 
915     /* Create the handle */
916     page->sharing->handle = get_next_handle();
917 
918     /* Create the local gfn info */
919     if ( !mem_sharing_gfn_alloc(page, d, gfn_x(gfn)) )
920     {
921         xfree(page->sharing);
922         page->sharing = NULL;
923         BUG_ON(page_make_private(d, page));
924         goto out;
925     }
926 
927     /* Change the p2m type, should never fail with p2m locked. */
928     BUG_ON(p2m_change_type_one(d, gfn_x(gfn), p2mt, p2m_ram_shared));
929 
930     /* Account for this page. */
931     atomic_inc(&nr_shared_mfns);
932 
933     /* Update m2p entry to SHARED_M2P_ENTRY */
934     set_gpfn_from_mfn(mfn_x(mfn), SHARED_M2P_ENTRY);
935 
936     *phandle = page->sharing->handle;
937     audit_add_list(page);
938     mem_sharing_page_unlock(page);
939     ret = 0;
940 
941 out:
942     put_gfn(d, gfn_x(gfn));
943     return ret;
944 }
945 
946 static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
947                        struct domain *cd, gfn_t cgfn, shr_handle_t ch)
948 {
949     struct page_info *spage, *cpage, *firstpg, *secondpg;
950     gfn_info_t *gfn;
951     struct domain *d;
952     int ret = -EINVAL;
953     mfn_t smfn, cmfn;
954     p2m_type_t smfn_type, cmfn_type;
955     struct two_gfns tg;
956     struct rmap_iterator ri;
957     unsigned long put_count = 0;
958 
959     get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
960                  cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg, true);
961 
962     /*
963      * This tricky business is to avoid two callers deadlocking if
964      * grabbing pages in opposite client/source order.
965      */
966     if ( mfn_eq(smfn, cmfn) )
967     {
968         /*
969          * The pages are already the same.  We could return some
970          * kind of error here, but no matter how you look at it,
971          * the pages are already 'shared'.  It possibly represents
972          * a big problem somewhere else, but as far as sharing is
973          * concerned: great success!
974          */
975         ret = 0;
976         goto err_out;
977     }
978 
979     if ( mfn_x(smfn) < mfn_x(cmfn) )
980     {
981         ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
982         spage = firstpg = __grab_shared_page(smfn);
983         if ( spage == NULL )
984             goto err_out;
985 
986         ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
987         cpage = secondpg = __grab_shared_page(cmfn);
988         if ( cpage == NULL )
989         {
990             mem_sharing_page_unlock(spage);
991             goto err_out;
992         }
993     }
994     else
995     {
996         ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
997         cpage = firstpg = __grab_shared_page(cmfn);
998         if ( cpage == NULL )
999             goto err_out;
1000 
1001         ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
1002         spage = secondpg = __grab_shared_page(smfn);
1003         if ( spage == NULL )
1004         {
1005             mem_sharing_page_unlock(cpage);
1006             goto err_out;
1007         }
1008     }
1009 
1010     ASSERT(smfn_type == p2m_ram_shared);
1011     ASSERT(cmfn_type == p2m_ram_shared);
1012 
1013     /* Check that the handles match */
1014     if ( spage->sharing->handle != sh )
1015     {
1016         ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
1017         mem_sharing_page_unlock(secondpg);
1018         mem_sharing_page_unlock(firstpg);
1019         goto err_out;
1020     }
1021 
1022     if ( cpage->sharing->handle != ch )
1023     {
1024         ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
1025         mem_sharing_page_unlock(secondpg);
1026         mem_sharing_page_unlock(firstpg);
1027         goto err_out;
1028     }
1029 
1030     /* Merge the lists together */
1031     rmap_seed_iterator(cpage, &ri);
1032     while ( (gfn = rmap_iterate(cpage, &ri)) != NULL )
1033     {
1034         /*
1035          * Get the source page and type, this should never fail:
1036          * we are under shr lock, and got a successful lookup.
1037          */
1038         BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page));
1039         /*
1040          * Move the gfn_info from client list to source list.
1041          * Don't change the type of rmap for the client page.
1042          */
1043         rmap_del(gfn, cpage, 0);
1044         rmap_add(gfn, spage);
1045         put_count++;
1046         d = get_domain_by_id(gfn->domain);
1047         BUG_ON(!d);
1048         BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
1049         put_domain(d);
1050     }
1051     ASSERT(list_empty(&cpage->sharing->gfns));
1052     BUG_ON(!put_count);
1053 
1054     /* Clear the rest of the shared state */
1055     page_sharing_dispose(cpage);
1056     cpage->sharing = NULL;
1057 
1058     mem_sharing_page_unlock(secondpg);
1059     mem_sharing_page_unlock(firstpg);
1060 
1061     /* Free the client page */
1062     put_page_alloc_ref(cpage);
1063 
1064     while ( put_count-- )
1065         put_page_and_type(cpage);
1066 
1067     /* We managed to free a domain page. */
1068     atomic_dec(&nr_shared_mfns);
1069     atomic_inc(&nr_saved_mfns);
1070     ret = 0;
1071 
1072 err_out:
1073     put_two_gfns(&tg);
1074     return ret;
1075 }
1076 
1077 /*
1078  * This function is intended to be used for plugging a "hole" in the client's
1079  * physmap with a shared memory entry. Unfortunately the definition of a "hole"
1080  * is currently ambiguous. There are two ways one can run into a "hole":
1081  *  1) there is no pagetable entry at all
1082  *  2) there is a pagetable entry with a type that passes p2m_is_hole
1083  *
1084  * The intended use-case for this function is case 1.
1085  *
1086  * In case 1) the mem_access returned is p2m_access_n, which would be
1087  * incorrect to apply to the new entry being added to the client physmap,
1088  * so we use the p2m->default_access instead.
1089  * When 2) is true it is possible that the existing pagetable entry also has
1090  * a mem_access permission set, which could be p2m_access_n. Since we can't
1091  * differentiate whether we are in case 1) or 2), we default to using the
1092  * access permission defined as default for the p2m, thus in
1093  * case 2) overwriting any custom mem_access permission the user may have set
1094  * on a hole page. Custom mem_access permissions being set on a hole are
1095  * unheard of but technically possible.
1096  *
1097  * TODO: to properly resolve this issue implement differentiation between the
1098  * two "hole" types.
1099  */
1100 static
1101 int add_to_physmap(struct domain *sd, unsigned long sgfn, shr_handle_t sh,
1102                    struct domain *cd, unsigned long cgfn, bool lock)
1103 {
1104     struct page_info *spage;
1105     int ret = -EINVAL;
1106     mfn_t smfn, cmfn;
1107     p2m_type_t smfn_type, cmfn_type;
1108     struct gfn_info *gfn_info;
1109     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
1110     struct two_gfns tg;
1111 
1112     get_two_gfns(sd, _gfn(sgfn), &smfn_type, NULL, &smfn,
1113                  cd, _gfn(cgfn), &cmfn_type, NULL, &cmfn, 0, &tg, lock);
1114 
1115     /* Get the source shared page, check and lock */
1116     ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
1117     spage = __grab_shared_page(smfn);
1118     if ( spage == NULL )
1119         goto err_out;
1120 
1121     ASSERT(smfn_type == p2m_ram_shared);
1122 
1123     /* Check that the handles match */
1124     if ( spage->sharing->handle != sh )
1125         goto err_unlock;
1126 
1127     /*
1128      * Make sure the target page is a hole in the physmap. These are typically
1129      * p2m_mmio_dm, but also accept p2m_invalid and paged out pages. See the
1130      * definition of p2m_is_hole in p2m.h.
1131      */
1132     if ( !p2m_is_hole(cmfn_type) )
1133     {
1134         ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
1135         goto err_unlock;
1136     }
1137 
1138     /* This is simpler than regular sharing */
1139     BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page));
1140     if ( !(gfn_info = mem_sharing_gfn_alloc(spage, cd, cgfn)) )
1141     {
1142         put_page_and_type(spage);
1143         ret = -ENOMEM;
1144         goto err_unlock;
1145     }
1146 
1147     ret = p2m_set_entry(p2m, _gfn(cgfn), smfn, PAGE_ORDER_4K,
1148                         p2m_ram_shared, p2m->default_access);
1149 
1150     /* Tempted to turn this into an assert */
1151     if ( ret )
1152     {
1153         mem_sharing_gfn_destroy(spage, cd, gfn_info);
1154         put_page_and_type(spage);
1155     }
1156     else
1157     {
1158         /*
1159          * There is a chance we're plugging a hole where a paged out
1160          * page was.
1161          */
1162         if ( p2m_is_paging(cmfn_type) && (cmfn_type != p2m_ram_paging_out) )
1163         {
1164             atomic_dec(&cd->paged_pages);
1165             /*
1166              * Further, there is a chance this was a valid page.
1167              * Don't leak it.
1168              */
1169             if ( mfn_valid(cmfn) )
1170             {
1171                 struct page_info *cpage = mfn_to_page(cmfn);
1172 
1173                 if ( !get_page(cpage, cd) )
1174                 {
1175                     domain_crash(cd);
1176                     ret = -EOVERFLOW;
1177                     goto err_unlock;
1178                 }
1179                 put_page_alloc_ref(cpage);
1180                 put_page(cpage);
1181             }
1182         }
1183     }
1184 
1185     atomic_inc(&nr_saved_mfns);
1186 
1187 err_unlock:
1188     mem_sharing_page_unlock(spage);
1189 err_out:
1190     if ( lock )
1191         put_two_gfns(&tg);
1192     return ret;
1193 }
1194 
1195 
1196 /*
1197  * A note on the rationale for unshare error handling:
1198  *  1. Unshare can only fail with ENOMEM. Any other error conditions BUG_ON()'s
1199  *  2. We notify a potential dom0 helper through a vm_event ring. But we
1200  *     allow the notification to not go to sleep. If the event ring is full
1201  *     of ENOMEM warnings, then it's on the ball.
1202  *  3. We cannot go to sleep until the unshare is resolved, because we might
1203  *     be buried deep into locks (e.g. something -> copy_to_user -> __hvm_copy)
1204  *  4. So, we make sure we:
1205  *     4.1. return an error
1206  *     4.2. do not corrupt shared memory
1207  *     4.3. do not corrupt guest memory
1208  *     4.4. let the guest deal with it if the error propagation will reach it
1209  */
1210 int __mem_sharing_unshare_page(struct domain *d,
1211                                unsigned long gfn,
1212                                bool destroy)
1213 {
1214     p2m_type_t p2mt;
1215     mfn_t mfn;
1216     struct page_info *page, *old_page;
1217     int last_gfn;
1218     gfn_info_t *gfn_info = NULL;
1219 
1220     mfn = get_gfn(d, gfn, &p2mt);
1221 
1222     /* Has someone already unshared it? */
1223     if ( !p2m_is_shared(p2mt) )
1224     {
1225         put_gfn(d, gfn);
1226         return 0;
1227     }
1228 
1229     page = __grab_shared_page(mfn);
1230     if ( page == NULL )
1231     {
1232         gdprintk(XENLOG_ERR, "Domain p2m is shared, but page is not: %lx\n",
1233                  gfn);
1234         BUG();
1235     }
1236 
1237     gfn_info = rmap_retrieve(d->domain_id, gfn, page);
1238     if ( unlikely(gfn_info == NULL) )
1239     {
1240         gdprintk(XENLOG_ERR, "Could not find gfn_info for shared gfn: %lx\n",
1241                  gfn);
1242         BUG();
1243     }
1244 
1245     /*
1246      * Do the accounting first. If anything fails below, we have bigger
1247      * fish to fry. First, remove the gfn from the list.
1248      */
1249     last_gfn = rmap_count(page) == 1;
1250     if ( last_gfn )
1251     {
1252         /*
1253          * Clean up shared state. Get rid of the <domid, gfn> tuple
1254          * before destroying the rmap.
1255          */
1256         mem_sharing_gfn_destroy(page, d, gfn_info);
1257         page_sharing_dispose(page);
1258         page->sharing = NULL;
1259         atomic_dec(&nr_shared_mfns);
1260     }
1261     else
1262         atomic_dec(&nr_saved_mfns);
1263 
1264     /*
1265      * If the GFN is getting destroyed drop the references to MFN
1266      * (possibly freeing the page), and exit early.
1267      */
1268     if ( destroy )
1269     {
1270         if ( !last_gfn )
1271             mem_sharing_gfn_destroy(page, d, gfn_info);
1272 
1273         mem_sharing_page_unlock(page);
1274 
1275         if ( last_gfn )
1276             put_page_alloc_ref(page);
1277 
1278         put_page_and_type(page);
1279         put_gfn(d, gfn);
1280 
1281         return 0;
1282     }
1283 
1284     if ( last_gfn )
1285     {
1286         /* Making a page private atomically unlocks it */
1287         BUG_ON(page_make_private(d, page) != 0);
1288         goto private_page_found;
1289     }
1290 
1291     old_page = page;
1292     page = alloc_domheap_page(d, 0);
1293     if ( !page )
1294     {
1295         /* Undo dec of nr_saved_mfns, as the retry will decrease again. */
1296         atomic_inc(&nr_saved_mfns);
1297         mem_sharing_page_unlock(old_page);
1298         put_gfn(d, gfn);
1299         /*
1300          * Caller is responsible for placing an event
1301          * in the ring.
1302          */
1303         return -ENOMEM;
1304     }
1305 
1306     copy_domain_page(page_to_mfn(page), page_to_mfn(old_page));
1307 
1308     BUG_ON(set_shared_p2m_entry(d, gfn, page_to_mfn(page)));
1309     mem_sharing_gfn_destroy(old_page, d, gfn_info);
1310     mem_sharing_page_unlock(old_page);
1311     put_page_and_type(old_page);
1312 
1313  private_page_found:
1314     if ( p2m_change_type_one(d, gfn, p2m_ram_shared, p2m_ram_rw) )
1315     {
1316         gdprintk(XENLOG_ERR, "Could not change p2m type d %pd gfn %lx.\n",
1317                  d, gfn);
1318         BUG();
1319     }
1320 
1321     /* Update m2p entry */
1322     set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), gfn);
1323 
1324     /*
1325      * Now that the gfn<->mfn map is properly established,
1326      * marking dirty is feasible
1327      */
1328     paging_mark_dirty(d, page_to_mfn(page));
1329     /* We do not need to unlock a private page */
1330     put_gfn(d, gfn);
1331     return 0;
1332 }
1333 
1334 int relinquish_shared_pages(struct domain *d)
1335 {
1336     int rc = 0;
1337     struct mem_sharing_domain *msd = &d->arch.hvm.mem_sharing;
1338     struct p2m_domain *p2m = p2m_get_hostp2m(d);
1339     unsigned long gfn, count = 0;
1340 
1341     if ( p2m == NULL )
1342         return 0;
1343 
1344     p2m_lock(p2m);
1345     for ( gfn = msd->next_shared_gfn_to_relinquish;
1346           gfn <= p2m->max_mapped_pfn; gfn++ )
1347     {
1348         p2m_access_t a;
1349         p2m_type_t t;
1350         mfn_t mfn;
1351         int set_rc;
1352 
1353         if ( !atomic_read(&d->shr_pages) )
1354             break;
1355 
1356         mfn = p2m->get_entry(p2m, _gfn(gfn), &t, &a, 0, NULL, NULL);
1357         if ( mfn_valid(mfn) && p2m_is_shared(t) )
1358         {
1359             /* Does not fail with ENOMEM given "destroy" is set to true */
1360             BUG_ON(__mem_sharing_unshare_page(d, gfn, true));
1361             /*
1362              * Clear out the p2m entry so no one else may try to
1363              * unshare.  Must succeed: we just read the old entry and
1364              * we hold the p2m lock.
1365              */
1366             set_rc = p2m->set_entry(p2m, _gfn(gfn), INVALID_MFN, PAGE_ORDER_4K,
1367                                     p2m_invalid, p2m_access_rwx, -1);
1368             ASSERT(!set_rc);
1369             count += 0x10;
1370         }
1371         else
1372             ++count;
1373 
1374         /* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. */
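        /*
         * That is, assuming 4KiB pages: preemption is checked after
         * 0x2000 / 0x10 = 512 shared pages (~2MiB) or 0x2000 = 8192
         * unshared ones (~32MiB) have been processed.
         */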
1375         if ( count >= 0x2000 )
1376         {
1377             if ( hypercall_preempt_check() )
1378             {
1379                 msd->next_shared_gfn_to_relinquish = gfn + 1;
1380                 rc = -ERESTART;
1381                 break;
1382             }
1383             count = 0;
1384         }
1385     }
1386 
1387     p2m_unlock(p2m);
1388     return rc;
1389 }
1390 
1391 static int range_share(struct domain *d, struct domain *cd,
1392                        struct mem_sharing_op_range *range)
1393 {
1394     int rc = 0;
1395     shr_handle_t sh, ch;
1396     unsigned long start = range->opaque ?: range->first_gfn;
1397 
1398     while ( range->last_gfn >= start )
1399     {
1400         /*
1401          * We only break out if we run out of memory as individual pages may
1402          * legitimately be unsharable and we just want to skip over those.
1403          */
1404         rc = nominate_page(d, _gfn(start), 0, false, &sh);
1405         if ( rc == -ENOMEM )
1406             break;
1407 
1408         if ( !rc )
1409         {
1410             rc = nominate_page(cd, _gfn(start), 0, false, &ch);
1411             if ( rc == -ENOMEM )
1412                 break;
1413 
1414             if ( !rc )
1415             {
1416                 /* If we get here this should be guaranteed to succeed. */
1417                 rc = share_pages(d, _gfn(start), sh, cd, _gfn(start), ch);
1418                 ASSERT(!rc);
1419             }
1420         }
1421 
1422         /* Check for continuation if it's not the last iteration. */
1423         if ( range->last_gfn >= ++start && hypercall_preempt_check() )
1424         {
1425             rc = 1;
1426             break;
1427         }
1428     }
1429 
1430     range->opaque = start;
1431 
1432     /*
1433      * The last page may fail with -EINVAL, and for range sharing we don't
1434      * care about that.
1435      */
1436     if ( range->last_gfn < start && rc == -EINVAL )
1437         rc = 0;
1438 
1439     return rc;
1440 }
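
/*
 * Illustrative continuation flow: if a caller shares the range
 * [0x1000, 0x8000] and the loop above is preempted after finishing gfn
 * 0x4fff, range->opaque is left at 0x5000 and 1 is returned, so the next
 * invocation of range_share() resumes at gfn 0x5000.
 */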
1441 
1442 static inline int mem_sharing_control(struct domain *d, bool enable,
1443                                       uint16_t flags)
1444 {
1445     if ( enable )
1446     {
1447         if ( unlikely(!is_hvm_domain(d) || !cpu_has_vmx) )
1448             return -EOPNOTSUPP;
1449 
1450         if ( unlikely(!hap_enabled(d)) )
1451             return -ENODEV;
1452 
1453         if ( unlikely(is_iommu_enabled(d) &&
1454                       !(flags & XENMEM_FORK_WITH_IOMMU_ALLOWED)) )
1455             return -EXDEV;
1456     }
1457 
1458     d->arch.hvm.mem_sharing.enabled = enable;
1459     return 0;
1460 }
1461 
1462 /*
1463  * Forking a page is only triggered when the VM faults due to there being
1464  * no entry in the EPT for the access. Depending on the type of access we
1465  * either populate the physmap with a shared entry for read-only access or
1466  * fork the page if it's a write access.
1467  *
1468  * The client p2m is already locked so we only need to lock
1469  * the parent's here.
1470  */
1471 int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
1472 {
1473     int rc = -ENOENT;
1474     shr_handle_t handle;
1475     struct domain *parent = d->parent;
1476     struct p2m_domain *p2m;
1477     unsigned long gfn_l = gfn_x(gfn);
1478     mfn_t mfn, new_mfn;
1479     p2m_type_t p2mt;
1480     struct page_info *page;
1481 
1482     if ( !mem_sharing_is_fork(d) )
1483         return -ENOENT;
1484 
1485     if ( !unsharing )
1486     {
1487         /* For read-only accesses we just add a shared entry to the physmap */
1488         while ( parent )
1489         {
1490             if ( !(rc = nominate_page(parent, gfn, 0, false, &handle)) )
1491                 break;
1492 
1493             parent = parent->parent;
1494         }
1495 
1496         if ( !rc )
1497         {
1498             /* The client's p2m is already locked */
1499             p2m = p2m_get_hostp2m(parent);
1500 
1501             p2m_lock(p2m);
1502             rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
1503             p2m_unlock(p2m);
1504 
1505             if ( !rc )
1506                 return 0;
1507         }
1508     }
1509 
1510     /*
1511      * If it's a write access (ie. unsharing) or if adding a shared entry to
1512      * the physmap failed we'll fork the page directly.
1513      */
1514     p2m = p2m_get_hostp2m(d);
1515     parent = d->parent;
1516 
1517     while ( parent )
1518     {
1519         mfn = get_gfn_query(parent, gfn_l, &p2mt);
1520 
1521         /* We can't fork grant memory from the parent, only regular ram */
1522         if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
1523             break;
1524 
1525         put_gfn(parent, gfn_l);
1526         parent = parent->parent;
1527     }
1528 
1529     if ( !parent )
1530         return -ENOENT;
1531 
1532     if ( !(page = alloc_domheap_page(d, 0)) )
1533     {
1534         put_gfn(parent, gfn_l);
1535         return -ENOMEM;
1536     }
1537 
1538     new_mfn = page_to_mfn(page);
1539     copy_domain_page(new_mfn, mfn);
1540     set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
1541 
1542     put_gfn(parent, gfn_l);
1543 
1544     return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
1545                           p2m->default_access, -1);
1546 }
1547 
1548 static int bring_up_vcpus(struct domain *cd, struct domain *d)
1549 {
1550     unsigned int i;
1551     int ret = -EINVAL;
1552 
1553     if ( d->max_vcpus != cd->max_vcpus ||
1554         (ret = cpupool_move_domain(cd, d->cpupool)) )
1555         return ret;
1556 
1557     for ( i = 0; i < cd->max_vcpus; i++ )
1558     {
1559         if ( !d->vcpu[i] || cd->vcpu[i] )
1560             continue;
1561 
1562         if ( !vcpu_create(cd, i) )
1563             return -EINVAL;
1564     }
1565 
1566     domain_update_node_affinity(cd);
1567     return 0;
1568 }
1569 
1570 static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
1571 {
1572     unsigned int i;
1573     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
1574     int ret = -EINVAL;
1575 
1576     for ( i = 0; i < cd->max_vcpus; i++ )
1577     {
1578         const struct vcpu *d_vcpu = d->vcpu[i];
1579         struct vcpu *cd_vcpu = cd->vcpu[i];
1580         mfn_t vcpu_info_mfn;
1581 
1582         if ( !d_vcpu || !cd_vcpu )
1583             continue;
1584 
1585         /* Copy & map in the vcpu_info page if the guest uses one */
1586         vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
1587         if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
1588         {
1589             mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
1590 
1591             /* Allocate & map the page for it if it hasn't been already */
1592             if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
1593             {
1594                 gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
1595                 unsigned long gfn_l = gfn_x(gfn);
1596                 struct page_info *page;
1597 
1598                 if ( !(page = alloc_domheap_page(cd, 0)) )
1599                     return -ENOMEM;
1600 
1601                 new_vcpu_info_mfn = page_to_mfn(page);
1602                 set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
1603 
1604                 ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn,
1605                                      PAGE_ORDER_4K, p2m_ram_rw,
1606                                      p2m->default_access, -1);
1607                 if ( ret )
1608                     return ret;
1609 
1610                 ret = map_vcpu_info(cd_vcpu, gfn_l,
1611                                     PAGE_OFFSET(d_vcpu->vcpu_info));
1612                 if ( ret )
1613                     return ret;
1614             }
1615 
1616             copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
1617         }
1618 
1619         /*
1620          * TODO: to support VMs with PV interfaces copy additional
1621          * settings here, such as PV timers.
1622          */
1623     }
1624 
1625     return 0;
1626 }
1627 
1628 static int fork_hap_allocation(struct domain *cd, struct domain *d)
1629 {
1630     int rc;
1631     bool preempted;
1632     unsigned long mb = hap_get_allocation(d);
1633 
1634     if ( mb == hap_get_allocation(cd) )
1635         return 0;
1636 
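    /*
     * The mb value is in MiB; with 4KiB pages the shift below turns e.g.
     * 256MiB into 256 << (20 - 12) = 65536 pages for hap_set_allocation().
     */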
1637     paging_lock(cd);
1638     rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
1639     paging_unlock(cd);
1640 
1641     return preempted ? -ERESTART : rc;
1642 }
1643 
1644 static void copy_tsc(struct domain *cd, struct domain *d)
1645 {
1646     uint32_t tsc_mode;
1647     uint32_t gtsc_khz;
1648     uint32_t incarnation;
1649     uint64_t elapsed_nsec;
1650 
1651     tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
1652     /* Don't bump incarnation on set */
1653     tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
1654 }
1655 
1656 static int copy_special_pages(struct domain *cd, struct domain *d)
1657 {
1658     mfn_t new_mfn, old_mfn;
1659     gfn_t new_gfn, old_gfn;
1660     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
1661     static const unsigned int params[] =
1662     {
1663         HVM_PARAM_STORE_PFN,
1664         HVM_PARAM_IOREQ_PFN,
1665         HVM_PARAM_BUFIOREQ_PFN,
1666         HVM_PARAM_CONSOLE_PFN
1667     };
1668     unsigned int i;
1669     int rc;
1670 
1671     for ( i = 0; i < ARRAY_SIZE(params); i++ )
1672     {
1673         p2m_type_t t;
1674         uint64_t value = 0;
1675         struct page_info *page;
1676 
1677         if ( hvm_get_param(d, params[i], &value) || !value )
1678             continue;
1679 
1680         old_mfn = get_gfn_query_unlocked(d, value, &t);
1681         new_mfn = get_gfn_query_unlocked(cd, value, &t);
1682 
1683         /* Allocate the page and map it in if it's not present */
1684         if ( mfn_eq(new_mfn, INVALID_MFN) )
1685         {
1686             if ( !(page = alloc_domheap_page(cd, 0)) )
1687                 return -ENOMEM;
1688 
1689             new_mfn = page_to_mfn(page);
1690             set_gpfn_from_mfn(mfn_x(new_mfn), value);
1691 
1692             rc = p2m->set_entry(p2m, _gfn(value), new_mfn, PAGE_ORDER_4K,
1693                                 p2m_ram_rw, p2m->default_access, -1);
1694             if ( rc )
1695                 return rc;
1696         }
1697 
1698         copy_domain_page(new_mfn, old_mfn);
1699     }
1700 
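    /*
     * Copy the parent's shared_info frame.  If the fork currently maps
     * shared_info at a different gfn than the parent, drop that mapping and
     * re-map the page at the parent's gfn so the layouts match.
     */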
1701     old_mfn = _mfn(virt_to_mfn(d->shared_info));
1702     new_mfn = _mfn(virt_to_mfn(cd->shared_info));
1703     copy_domain_page(new_mfn, old_mfn);
1704 
1705     old_gfn = _gfn(get_gpfn_from_mfn(mfn_x(old_mfn)));
1706     new_gfn = _gfn(get_gpfn_from_mfn(mfn_x(new_mfn)));
1707 
1708     if ( !gfn_eq(old_gfn, new_gfn) )
1709     {
1710         if ( !gfn_eq(new_gfn, INVALID_GFN) )
1711         {
1712             /* If shared_info is mapped at a different gfn, just remove that mapping. */
1713             rc = p2m->set_entry(p2m, new_gfn, INVALID_MFN, PAGE_ORDER_4K,
1714                                 p2m_invalid, p2m->default_access, -1);
1715             if ( rc )
1716                 return rc;
1717         }
1718 
1719         if ( !gfn_eq(old_gfn, INVALID_GFN) )
1720         {
1721             /* Now map it at the same gfn as in the parent. */
1722             rc = p2m->set_entry(p2m, old_gfn, new_mfn, PAGE_ORDER_4K,
1723                                 p2m_ram_rw, p2m->default_access, -1);
1724             if ( rc )
1725                 return rc;
1726         }
1727     }
1728 
1729     return 0;
1730 }
1731 
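/*
 * Copy the parent's runtime state into the fork: vCPU settings, HVM context
 * and parameters, the special pages, and the TSC configuration.
 */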
1732 static int copy_settings(struct domain *cd, struct domain *d)
1733 {
1734     int rc;
1735 
1736     if ( (rc = copy_vcpu_settings(cd, d)) )
1737         return rc;
1738 
1739     if ( (rc = hvm_copy_context_and_params(cd, d)) )
1740         return rc;
1741 
1742     if ( (rc = copy_special_pages(cd, d)) )
1743         return rc;
1744 
1745     copy_tsc(cd, d);
1746 
1747     return rc;
1748 }
1749 
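/*
 * Set up cd as a fork of d.  The fork domain must have been paused by the
 * toolstack (controller_pause_count) before this is called.  On the first
 * invocation the parent is paused and a reference to it is taken.  The HAP
 * allocation step is preemptible, so the operation may need to be restarted
 * on -ERESTART.
 */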
1750 static int fork(struct domain *cd, struct domain *d)
1751 {
1752     int rc = -EBUSY;
1753 
1754     if ( !cd->controller_pause_count )
1755         return rc;
1756 
1757     if ( !cd->parent )
1758     {
1759         if ( !get_domain(d) )
1760         {
1761             ASSERT_UNREACHABLE();
1762             return -EBUSY;
1763         }
1764 
1765         domain_pause(d);
1766         cd->max_pages = d->max_pages;
1767         cd->parent = d;
1768     }
1769 
1770     /* This step is preemptible, so it is done first. */
1771     if ( (rc = fork_hap_allocation(cd, d)) )
1772         goto done;
1773 
1774     if ( (rc = bring_up_vcpus(cd, d)) )
1775         goto done;
1776 
1777     rc = copy_settings(cd, d);
1778 
1779  done:
1780     if ( rc && rc != -ERESTART )
1781     {
1782         domain_unpause(d);
1783         put_domain(d);
1784         cd->parent = NULL;
1785     }
1786 
1787     return rc;
1788 }
1789 
1790 /*
1791  * The fork reset operation is intended for short-lived forks only, which is
1792  * why no hypercall continuation is implemented for it.  For forks that have
1793  * accumulated a larger memory footprint it is likely cheaper to create a new
1794  * fork than to reset an existing one.
1795  *
1796  * TODO: implement a hypercall continuation if this operation turns out to be
1797  * useful on forks with larger memory footprints, or if this feature needs to
1798  * become "stable".
1799  */
1800 static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
1801 {
1802     int rc;
1803     struct p2m_domain *p2m = p2m_get_hostp2m(d);
1804     struct page_info *page, *tmp;
1805 
1806     domain_pause(d);
1807 
1808     /* We need the recursive lock because we will be freeing pages. */
1809     spin_lock_recursive(&d->page_alloc_lock);
1810     page_list_for_each_safe(page, tmp, &d->page_list)
1811     {
1812         shr_handle_t sh;
1813         mfn_t mfn = page_to_mfn(page);
1814         gfn_t gfn = mfn_to_gfn(d, mfn);
1815 
1816         /*
1817          * We only want to remove pages from the fork that were copied from
1818          * the parent and can be re-populated via memory sharing after the
1819          * reset.  Such pages must be regular pages with no extra references
1820          * held, i.e. it must be possible to make them sharable.  The
1821          * p2m_is_sharable check alone is not sufficient for this, as it
1822          * doesn't look at the page's reference count.  We therefore check
1823          * whether the page is convertible to the shared type using
1824          * nominate_page.  If the page is already shared (i.e. a share handle
1825          * is returned), we leave it in place.
1826          */
1827         if ( (rc = nominate_page(d, gfn, 0, true, &sh)) || sh )
1828             continue;
1829 
1830         /* Forked memory is 4k; we don't split large pages, so this must work. */
1831         rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
1832                             p2m_invalid, p2m_access_rwx, -1);
1833         ASSERT(!rc);
1834 
1835         put_page_alloc_ref(page);
1836         put_page_and_type(page);
1837     }
1838     spin_unlock_recursive(&d->page_alloc_lock);
1839 
1840     rc = copy_settings(d, pd);
1841 
1842     domain_unpause(d);
1843 
1844     return rc;
1845 }
1846 
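/*
 * Top-level handler for XENMEM_sharing_op.  As a rough illustration, a
 * minimal sketch of the fork sub-op from the caller's side (hypothetical
 * wrapper/variable names; the exact toolstack interface is not shown here):
 *
 *     xen_mem_sharing_op_t mso = {
 *         .op = XENMEM_sharing_op_fork,
 *         .domain = fork_domid,                 // target (child) domain
 *         .u.fork.parent_domain = parent_domid,
 *         .u.fork.flags = XENMEM_FORK_WITH_IOMMU_ALLOWED,
 *     };
 *     rc = do_memory_op(XENMEM_sharing_op, &mso); // hypothetical wrapper
 *
 * The fork domain must be created with the same max_vcpus as the parent and
 * kept paused by the toolstack while the operation (and any continuation)
 * completes.
 */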
1847 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
1848 {
1849     int rc;
1850     xen_mem_sharing_op_t mso;
1851     struct domain *d;
1852 
1853     rc = -EFAULT;
1854     if ( copy_from_guest(&mso, arg, 1) )
1855         return rc;
1856 
1857     if ( mso.op == XENMEM_sharing_op_audit )
1858         return audit();
1859 
1860     rc = rcu_lock_live_remote_domain_by_id(mso.domain, &d);
1861     if ( rc )
1862         return rc;
1863 
1864     rc = xsm_mem_sharing(XSM_DM_PRIV, d);
1865     if ( rc )
1866         goto out;
1867 
1868     if ( !mem_sharing_enabled(d) &&
1869          (rc = mem_sharing_control(d, true, 0)) )
1870         goto out;
1871 
1872     switch ( mso.op )
1873     {
1874     case XENMEM_sharing_op_nominate_gfn:
1875     {
1876         shr_handle_t handle;
1877 
1878         rc = nominate_page(d, _gfn(mso.u.nominate.u.gfn), 0, false, &handle);
1879         mso.u.nominate.handle = handle;
1880     }
1881     break;
1882 
1883     case XENMEM_sharing_op_nominate_gref:
1884     {
1885         grant_ref_t gref = mso.u.nominate.u.grant_ref;
1886         gfn_t gfn;
1887         shr_handle_t handle;
1888 
1889         rc = mem_sharing_gref_to_gfn(d->grant_table, gref, &gfn, NULL);
1890         if ( rc < 0 )
1891             goto out;
1892 
1893         rc = nominate_page(d, gfn, 3, false, &handle);
1894         mso.u.nominate.handle = handle;
1895     }
1896     break;
1897 
1898     case XENMEM_sharing_op_share:
1899     {
1900         gfn_t sgfn, cgfn;
1901         struct domain *cd;
1902         shr_handle_t sh, ch;
1903 
1904         rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
1905                                                &cd);
1906         if ( rc )
1907             goto out;
1908 
1909         rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
1910         if ( rc )
1911         {
1912             rcu_unlock_domain(cd);
1913             goto out;
1914         }
1915 
1916         if ( !mem_sharing_enabled(cd) )
1917         {
1918             rcu_unlock_domain(cd);
1919             rc = -EINVAL;
1920             goto out;
1921         }
1922 
1923         if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
1924         {
1925             grant_ref_t gref =
1926                 XENMEM_SHARING_OP_FIELD_GET_GREF(mso.u.share.source_gfn);
1927 
1928             rc = mem_sharing_gref_to_gfn(d->grant_table, gref, &sgfn,
1929                                          NULL);
1930             if ( rc < 0 )
1931             {
1932                 rcu_unlock_domain(cd);
1933                 goto out;
1934             }
1935         }
1936         else
1937             sgfn = _gfn(mso.u.share.source_gfn);
1938 
1939         if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.client_gfn) )
1940         {
1941             grant_ref_t gref =
1942                 XENMEM_SHARING_OP_FIELD_GET_GREF(mso.u.share.client_gfn);
1943 
1944             rc = mem_sharing_gref_to_gfn(cd->grant_table, gref, &cgfn,
1945                                          NULL);
1946             if ( rc < 0 )
1947             {
1948                 rcu_unlock_domain(cd);
1949                 goto out;
1950             }
1951         }
1952         else
1953             cgfn = _gfn(mso.u.share.client_gfn);
1954 
1955         sh = mso.u.share.source_handle;
1956         ch = mso.u.share.client_handle;
1957 
1958         rc = share_pages(d, sgfn, sh, cd, cgfn, ch);
1959 
1960         rcu_unlock_domain(cd);
1961     }
1962     break;
1963 
1964     case XENMEM_sharing_op_add_physmap:
1965     {
1966         unsigned long sgfn, cgfn;
1967         struct domain *cd;
1968         shr_handle_t sh;
1969 
1970         rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
1971                                                &cd);
1972         if ( rc )
1973             goto out;
1974 
1975         rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
1976         if ( rc )
1977         {
1978             rcu_unlock_domain(cd);
1979             goto out;
1980         }
1981 
1982         if ( !mem_sharing_enabled(cd) )
1983         {
1984             rcu_unlock_domain(cd);
1985             rc = -EINVAL;
1986             goto out;
1987         }
1988 
1989         if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
1990         {
1991             /* Cannot add a gref to the physmap */
1992             rcu_unlock_domain(cd);
1993             rc = -EINVAL;
1994             goto out;
1995         }
1996 
1997         sgfn    = mso.u.share.source_gfn;
1998         sh      = mso.u.share.source_handle;
1999         cgfn    = mso.u.share.client_gfn;
2000 
2001         rc = add_to_physmap(d, sgfn, sh, cd, cgfn, true);
2002 
2003         rcu_unlock_domain(cd);
2004     }
2005     break;
2006 
2007     case XENMEM_sharing_op_range_share:
2008     {
2009         unsigned long max_sgfn, max_cgfn;
2010         struct domain *cd;
2011 
2012         rc = -EINVAL;
2013         if ( mso.u.range._pad[0] || mso.u.range._pad[1] ||
2014              mso.u.range._pad[2] )
2015             goto out;
2016 
2017         /*
2018          * We use the opaque field for the hypercall continuation value.
2019          * Ideally the caller sets it to 0 at the start, but there is no
2020          * good way of enforcing that here, so we just check that it is at
2021          * least within range.
2022          */
2023         if ( mso.u.range.opaque &&
2024              (mso.u.range.opaque < mso.u.range.first_gfn ||
2025               mso.u.range.opaque > mso.u.range.last_gfn) )
2026             goto out;
2027 
2028         rc = rcu_lock_live_remote_domain_by_id(mso.u.range.client_domain,
2029                                                &cd);
2030         if ( rc )
2031             goto out;
2032 
2033         /*
2034          * We reuse the XENMEM_sharing_op_share XSM check here, as this is
2035          * essentially the same operation repeated over multiple pages.
2036          */
2037         rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd,
2038                                 XENMEM_sharing_op_share);
2039         if ( rc )
2040         {
2041             rcu_unlock_domain(cd);
2042             goto out;
2043         }
2044 
2045         if ( !mem_sharing_enabled(cd) )
2046         {
2047             rcu_unlock_domain(cd);
2048             rc = -EINVAL;
2049             goto out;
2050         }
2051 
2052         /*
2053          * Sanity check only; the caller should keep both domains paused
2054          * for the duration of this op.
2055          */
2056         if ( !atomic_read(&d->pause_count) ||
2057              !atomic_read(&cd->pause_count) )
2058         {
2059             rcu_unlock_domain(cd);
2060             rc = -EINVAL;
2061             goto out;
2062         }
2063 
2064         max_sgfn = domain_get_maximum_gpfn(d);
2065         max_cgfn = domain_get_maximum_gpfn(cd);
2066 
2067         if ( max_sgfn < mso.u.range.first_gfn ||
2068              max_sgfn < mso.u.range.last_gfn ||
2069              max_cgfn < mso.u.range.first_gfn ||
2070              max_cgfn < mso.u.range.last_gfn )
2071         {
2072             rcu_unlock_domain(cd);
2073             rc = -EINVAL;
2074             goto out;
2075         }
2076 
2077         rc = range_share(d, cd, &mso.u.range);
2078         rcu_unlock_domain(cd);
2079 
2080         if ( rc > 0 )
2081         {
2082             if ( __copy_to_guest(arg, &mso, 1) )
2083                 rc = -EFAULT;
2084             else
2085                 rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
2086                                                    "lh", XENMEM_sharing_op,
2087                                                    arg);
2088         }
2089         else
2090             mso.u.range.opaque = 0;
2091     }
2092     break;
2093 
2094     case XENMEM_sharing_op_debug_gfn:
2095         rc = debug_gfn(d, _gfn(mso.u.debug.u.gfn));
2096         break;
2097 
2098     case XENMEM_sharing_op_debug_gref:
2099         rc = debug_gref(d, mso.u.debug.u.gref);
2100         break;
2101 
2102     case XENMEM_sharing_op_fork:
2103     {
2104         struct domain *pd;
2105 
2106         rc = -EINVAL;
2107         if ( mso.u.fork.pad )
2108             goto out;
2109         if ( mso.u.fork.flags &
2110              ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
2111             goto out;
2112 
2113         rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
2114                                                &pd);
2115         if ( rc )
2116             goto out;
2117 
2118         rc = -EINVAL;
2119         if ( pd->max_vcpus != d->max_vcpus )
2120         {
2121             rcu_unlock_domain(pd);
2122             goto out;
2123         }
2124 
2125         if ( !mem_sharing_enabled(pd) &&
2126              (rc = mem_sharing_control(pd, true, mso.u.fork.flags)) )
2127         {
2128             rcu_unlock_domain(pd);
2129             goto out;
2130         }
2131 
2132         rc = fork(d, pd);
2133 
2134         if ( rc == -ERESTART )
2135             rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
2136                                                "lh", XENMEM_sharing_op,
2137                                                arg);
2138         else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
2139             d->arch.hvm.mem_sharing.block_interrupts = true;
2140 
2141         rcu_unlock_domain(pd);
2142         break;
2143     }
2144 
2145     case XENMEM_sharing_op_fork_reset:
2146     {
2147         struct domain *pd;
2148 
2149         rc = -EINVAL;
2150         if ( mso.u.fork.pad || mso.u.fork.flags )
2151             goto out;
2152 
2153         rc = -ENOSYS;
2154         if ( !d->parent )
2155             goto out;
2156 
2157         rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
2158         if ( rc )
2159             goto out;
2160 
2161         rc = mem_sharing_fork_reset(d, pd);
2162 
2163         rcu_unlock_domain(pd);
2164         break;
2165     }
2166 
2167     default:
2168         rc = -ENOSYS;
2169         break;
2170     }
2171 
2172     if ( !rc && __copy_to_guest(arg, &mso, 1) )
2173         rc = -EFAULT;
2174 
2175 out:
2176     rcu_unlock_domain(d);
2177     return rc;
2178 }
2179 
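/*
 * Domctl interface for memory sharing; currently only
 * XEN_DOMCTL_MEM_SHARING_CONTROL (enabling or disabling sharing for the
 * domain) is handled here.
 */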
2180 int mem_sharing_domctl(struct domain *d, struct xen_domctl_mem_sharing_op *mec)
2181 {
2182     int rc;
2183 
2184     switch ( mec->op )
2185     {
2186     case XEN_DOMCTL_MEM_SHARING_CONTROL:
2187         rc = mem_sharing_control(d, mec->u.enable, 0);
2188         break;
2189 
2190     default:
2191         rc = -ENOSYS;
2192         break;
2193     }
2194 
2195     return rc;
2196 }
2197