1 /*
2     Domain communications for Xen Store Daemon.
3     Copyright (C) 2005 Rusty Russell IBM Corporation
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <stdio.h>
20 #include <sys/mman.h>
21 #include <unistd.h>
22 #include <stdlib.h>
23 #include <stdarg.h>
24 #include <time.h>
25 #include <syslog.h>
26 
27 #include "utils.h"
28 #include "talloc.h"
29 #include "xenstored_core.h"
30 #include "xenstored_domain.h"
31 #include "xenstored_transaction.h"
32 #include "xenstored_watch.h"
33 
34 #include <xenevtchn.h>
35 #include <xenctrl.h>
36 #include <xen/grant_table.h>
37 
/* libxenctrl handle; allocated and opened in domain_init(). */
static xc_interface **xc_handle;
/* Grant table handle used to map guest rings; opened in domain_init(). */
xengnttab_handle **xgt_handle;
/* Local port bound to VIRQ_DOM_EXC in domain_init(). */
static evtchn_port_t virq_port;

/* Event channel handle; opened in domain_init(). */
xenevtchn_handle *xce_handle = NULL;

/* Permissions of the special "@releaseDomain" / "@introduceDomain" nodes. */
static struct node_perms dom_release_perms;
static struct node_perms dom_introduce_perms;
46 
/*
 * Per-domain state.  Structures are created by alloc_domain(), linked
 * into the global "domains" list and torn down by the talloc destructor
 * destroy_domain().
 */
struct domain
{
	/* Linkage in the global "domains" list. */
	struct list_head list;

	/* The id of this domain */
	unsigned int domid;

	/* Local event channel port (0 while no port is bound). */
	evtchn_port_t port;

	/* The remote end of the event channel, used only to validate
	   repeated domain introductions. */
	evtchn_port_t remote_port;

	/* Domain path in store ("/local/domain/<domid>"). */
	char *path;

	/* Shared page holding the request and response rings. */
	struct xenstore_domain_interface *interface;

	/* The connection associated with this domain. */
	struct connection *conn;

	/* Global generation count at the time this structure was allocated. */
	uint64_t generation;

	/* Have we noticed that this domain is shutdown? */
	bool shutdown;

	/* Has domain been officially introduced? */
	bool introduced;

	/* Number of store entries accounted to this domain. */
	int nbentry;

	/* Number of watches accounted to this domain. */
	int nbwatch;

	/* write rate limit */
	wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
	/* Time of the last credit update for this domain. */
	struct wrl_timestampt wrl_timestamp;
	/* Set once the "domain ... is affected" warning has been logged. */
	bool wrl_delay_logged;
};
90 
/* All allocated domain structures, whether introduced or not. */
static LIST_HEAD(domains);
92 
/*
 * Validate a consumer/producer index pair: the distance between the two
 * must never be larger than the ring itself.
 */
static bool check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
{
	if ((prod - cons) > XENSTORE_RING_SIZE)
		return false;

	return true;
}
97 
/*
 * Locate the next contiguous writable region of a ring.
 *
 * Stores in *len the smaller of "bytes up to the end of the ring buffer"
 * and "bytes still unused in the ring", and returns the address inside
 * buf at which the producer has to continue writing.
 */
static void *get_output_chunk(XENSTORE_RING_IDX cons,
			      XENSTORE_RING_IDX prod,
			      char *buf, uint32_t *len)
{
	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);

	/* Never hand out more than the free space of the ring. */
	if ((XENSTORE_RING_SIZE - (prod - cons)) < chunk)
		chunk = XENSTORE_RING_SIZE - (prod - cons);

	*len = chunk;

	return buf + MASK_XENSTORE_IDX(prod);
}
107 
/*
 * Locate the next contiguous readable region of a ring.
 *
 * Stores in *len the smaller of "bytes up to the end of the ring buffer"
 * and "bytes produced but not yet consumed", and returns the address
 * inside buf at which the consumer has to continue reading.
 */
static const void *get_input_chunk(XENSTORE_RING_IDX cons,
				   XENSTORE_RING_IDX prod,
				   const char *buf, uint32_t *len)
{
	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);

	/* Never hand out more than what has actually been produced. */
	if ((prod - cons) < chunk)
		chunk = prod - cons;

	*len = chunk;

	return buf + MASK_XENSTORE_IDX(cons);
}
117 
/*
 * Copy data into the response ring of the connection's domain.
 *
 * At most one contiguous chunk of the ring is filled per call, so fewer
 * than len bytes (possibly 0) may be written.  The domain is notified
 * via its event channel afterwards.
 *
 * Returns the number of bytes written, or -1 with errno set to EIO if
 * the ring indexes found in the shared page are inconsistent.
 */
static int writechn(struct connection *conn,
		    const void *data, unsigned int len)
{
	uint32_t avail;
	void *dest;
	struct xenstore_domain_interface *intf = conn->domain->interface;
	XENSTORE_RING_IDX cons, prod;

	/* Must read indexes once, and before anything else, and verified. */
	cons = intf->rsp_cons;
	prod = intf->rsp_prod;
	xen_mb();

	/* The indexes live in the shared page, so don't trust them blindly. */
	if (!check_indexes(cons, prod)) {
		errno = EIO;
		return -1;
	}

	/* Clamp the request to the contiguous free space of the ring. */
	dest = get_output_chunk(cons, prod, intf->rsp, &avail);
	if (avail < len)
		len = avail;

	memcpy(dest, data, len);
	/* Barrier between copying the payload and advancing the producer. */
	xen_mb();
	intf->rsp_prod += len;

	xenevtchn_notify(xce_handle, conn->domain->port);

	return len;
}
148 
/*
 * Copy data out of the request ring of the connection's domain.
 *
 * At most one contiguous chunk of the ring is consumed per call, so fewer
 * than len bytes (possibly 0) may be read.  The domain is notified via
 * its event channel afterwards.
 *
 * Returns the number of bytes read, or -1 with errno set to EIO if the
 * ring indexes found in the shared page are inconsistent.
 */
static int readchn(struct connection *conn, void *data, unsigned int len)
{
	uint32_t avail;
	const void *src;
	struct xenstore_domain_interface *intf = conn->domain->interface;
	XENSTORE_RING_IDX cons, prod;

	/* Must read indexes once, and before anything else, and verified. */
	cons = intf->req_cons;
	prod = intf->req_prod;
	xen_mb();

	/* The indexes live in the shared page, so don't trust them blindly. */
	if (!check_indexes(cons, prod)) {
		errno = EIO;
		return -1;
	}

	/* Clamp the request to the contiguous data available in the ring. */
	src = get_input_chunk(cons, prod, intf->req, &avail);
	if (avail < len)
		len = avail;

	memcpy(data, src, len);
	/* Barrier between copying the payload and advancing the consumer. */
	xen_mb();
	intf->req_cons += len;

	xenevtchn_notify(xce_handle, conn->domain->port);

	return len;
}
178 
/*
 * Map the xenstore page of a domain through its reserved grant
 * reference, readable and writable.  Returns whatever
 * xengnttab_map_grant_ref() returns.
 */
static void *map_interface(domid_t domid)
{
	void *page;

	page = xengnttab_map_grant_ref(*xgt_handle, domid,
				       GNTTAB_RESERVED_XENSTORE,
				       PROT_READ | PROT_WRITE);

	return page;
}
185 
unmap_interface(void * interface)186 static void unmap_interface(void *interface)
187 {
188 	xengnttab_unmap(*xgt_handle, interface, 1);
189 }
190 
destroy_domain(void * _domain)191 static int destroy_domain(void *_domain)
192 {
193 	struct domain *domain = _domain;
194 
195 	list_del(&domain->list);
196 
197 	if (!domain->introduced)
198 		return 0;
199 
200 	if (domain->port) {
201 		if (xenevtchn_unbind(xce_handle, domain->port) == -1)
202 			eprintf("> Unbinding port %i failed!\n", domain->port);
203 	}
204 
205 	if (domain->interface) {
206 		/* Domain 0 was mapped by dom0_init, so it must be unmapped
207 		   using munmap() and not the grant unmap call. */
208 		if (domain->domid == 0)
209 			unmap_xenbus(domain->interface);
210 		else
211 			unmap_interface(domain->interface);
212 	}
213 
214 	fire_watches(NULL, domain, "@releaseDomain", NULL, false, NULL);
215 
216 	wrl_domain_destroy(domain);
217 
218 	return 0;
219 }
220 
/*
 * Query information about domid and store it in *dominfo.
 * Returns true only if exactly one entry was returned and that entry
 * really describes domid.
 */
static bool get_domain_info(unsigned int domid, xc_dominfo_t *dominfo)
{
	if (xc_domain_getinfo(*xc_handle, domid, 1, dominfo) != 1)
		return false;

	return dominfo->domid == domid;
}
226 
/*
 * Walk all known domains and drop those which no longer exist or are
 * dying.  Domains newly found to be shut down or crashed are marked, and
 * "@releaseDomain" is fired once at the end for them.
 *
 * Freeing a domain removes it from the list (see destroy_domain()), so
 * the scan is restarted from the beginning after every removal.
 */
static void domain_cleanup(void)
{
	xc_dominfo_t dominfo;
	struct domain *domain;
	struct connection *conn;
	int notify = 0;
	bool dom_valid;

 again:
	list_for_each_entry(domain, &domains, list) {
		dom_valid = get_domain_info(domain->domid, &dominfo);
		if (!domain->introduced) {
			/* Not introduced: only free it once it is gone. */
			if (!dom_valid) {
				talloc_free(domain);
				goto again;
			}
			continue;
		}
		if (dom_valid) {
			/* Remember a state change we haven't seen before. */
			if ((dominfo.crashed || dominfo.shutdown)
			    && !domain->shutdown) {
				domain->shutdown = true;
				notify = 1;
			}
			/* Domain is still around: keep it. */
			if (!dominfo.dying)
				continue;
		}
		if (domain->conn) {
			/* domain is a talloc child of domain->conn. */
			conn = domain->conn;
			domain->conn = NULL;
			talloc_unlink(talloc_autofree_context(), conn);
			notify = 0; /* destroy_domain() fires the watch */
			goto again;
		}
	}

	if (notify)
		fire_watches(NULL, NULL, "@releaseDomain", NULL, false, NULL);
}
267 
268 /* We scan all domains rather than use the information given here. */
handle_event(void)269 void handle_event(void)
270 {
271 	evtchn_port_t port;
272 
273 	if ((port = xenevtchn_pending(xce_handle)) == -1)
274 		barf_perror("Failed to read from event fd");
275 
276 	if (port == virq_port)
277 		domain_cleanup();
278 
279 	if (xenevtchn_unmask(xce_handle, port) == -1)
280 		barf_perror("Failed to write to event fd");
281 }
282 
domain_can_read(struct connection * conn)283 bool domain_can_read(struct connection *conn)
284 {
285 	struct xenstore_domain_interface *intf = conn->domain->interface;
286 
287 	if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
288 		return false;
289 
290 	if (conn->is_ignored)
291 		return false;
292 
293 	return (intf->req_cons != intf->req_prod);
294 }
295 
domid_is_unprivileged(unsigned int domid)296 static bool domid_is_unprivileged(unsigned int domid)
297 {
298 	return domid != 0 && domid != priv_domid;
299 }
300 
domain_is_unprivileged(struct connection * conn)301 bool domain_is_unprivileged(struct connection *conn)
302 {
303 	return conn && conn->domain &&
304 	       domid_is_unprivileged(conn->domain->domid);
305 }
306 
domain_can_write(struct connection * conn)307 bool domain_can_write(struct connection *conn)
308 {
309 	struct xenstore_domain_interface *intf = conn->domain->interface;
310 
311 	if (conn->is_ignored)
312 		return false;
313 
314 	return ((intf->rsp_prod - intf->rsp_cons) != XENSTORE_RING_SIZE);
315 }
316 
/*
 * Build the store path of a domain as a talloc child of context.
 * Returns the result of talloc_asprintf().
 */
static char *talloc_domain_path(void *context, unsigned int domid)
{
	char *path = talloc_asprintf(context, "/local/domain/%u", domid);

	return path;
}
321 
find_domain_struct(unsigned int domid)322 static struct domain *find_domain_struct(unsigned int domid)
323 {
324 	struct domain *i;
325 
326 	list_for_each_entry(i, &domains, list) {
327 		if (i->domid == domid)
328 			return i;
329 	}
330 	return NULL;
331 }
332 
alloc_domain(void * context,unsigned int domid)333 static struct domain *alloc_domain(void *context, unsigned int domid)
334 {
335 	struct domain *domain;
336 
337 	domain = talloc(context, struct domain);
338 	if (!domain) {
339 		errno = ENOMEM;
340 		return NULL;
341 	}
342 
343 	domain->domid = domid;
344 	domain->generation = generation;
345 	domain->introduced = false;
346 
347 	talloc_set_destructor(domain, destroy_domain);
348 
349 	list_add(&domain->list, &domains);
350 
351 	return domain;
352 }
353 
new_domain(struct domain * domain,int port)354 static int new_domain(struct domain *domain, int port)
355 {
356 	int rc;
357 
358 	domain->port = 0;
359 	domain->shutdown = false;
360 	domain->path = talloc_domain_path(domain, domain->domid);
361 	if (!domain->path) {
362 		errno = ENOMEM;
363 		return errno;
364 	}
365 
366 	wrl_domain_new(domain);
367 
368 	/* Tell kernel we're interested in this event. */
369 	rc = xenevtchn_bind_interdomain(xce_handle, domain->domid, port);
370 	if (rc == -1)
371 		return errno;
372 	domain->port = rc;
373 
374 	domain->introduced = true;
375 
376 	domain->conn = new_connection(writechn, readchn);
377 	if (!domain->conn)  {
378 		errno = ENOMEM;
379 		return errno;
380 	}
381 
382 	domain->conn->domain = domain;
383 	domain->conn->id = domain->domid;
384 
385 	domain->remote_port = port;
386 	domain->nbentry = 0;
387 	domain->nbwatch = 0;
388 
389 	return 0;
390 }
391 
392 
find_domain_by_domid(unsigned int domid)393 static struct domain *find_domain_by_domid(unsigned int domid)
394 {
395 	struct domain *d;
396 
397 	d = find_domain_struct(domid);
398 
399 	return (d && d->introduced) ? d : NULL;
400 }
401 
domain_conn_reset(struct domain * domain)402 static void domain_conn_reset(struct domain *domain)
403 {
404 	struct connection *conn = domain->conn;
405 	struct buffered_data *out;
406 
407 	conn_delete_all_watches(conn);
408 	conn_delete_all_transactions(conn);
409 
410 	while ((out = list_top(&conn->out_list, struct buffered_data, list))) {
411 		list_del(&out->list);
412 		talloc_free(out);
413 	}
414 
415 	talloc_free(conn->in);
416 
417 	domain->interface->req_cons = domain->interface->req_prod = 0;
418 	domain->interface->rsp_cons = domain->interface->rsp_prod = 0;
419 }
420 
421 /* domid, gfn, evtchn, path */
do_introduce(struct connection * conn,struct buffered_data * in)422 int do_introduce(struct connection *conn, struct buffered_data *in)
423 {
424 	struct domain *domain;
425 	char *vec[3];
426 	unsigned int domid;
427 	evtchn_port_t port;
428 	int rc;
429 	struct xenstore_domain_interface *interface;
430 
431 	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
432 		return EINVAL;
433 
434 	if (!conn->can_write)
435 		return EACCES;
436 
437 	domid = atoi(vec[0]);
438 	/* Ignore the gfn, we don't need it. */
439 	port = atoi(vec[2]);
440 
441 	/* Sanity check args. */
442 	if (port <= 0)
443 		return EINVAL;
444 
445 	domain = find_domain_struct(domid);
446 
447 	if (domain == NULL) {
448 		/* Hang domain off "in" until we're finished. */
449 		domain = alloc_domain(in, domid);
450 		if (domain == NULL)
451 			return ENOMEM;
452 	}
453 
454 	if (!domain->introduced) {
455 		interface = map_interface(domid);
456 		if (!interface)
457 			return errno;
458 		/* Hang domain off "in" until we're finished. */
459 		if (new_domain(domain, port)) {
460 			rc = errno;
461 			unmap_interface(interface);
462 			return rc;
463 		}
464 		domain->interface = interface;
465 
466 		/* Now domain belongs to its connection. */
467 		talloc_steal(domain->conn, domain);
468 
469 		fire_watches(NULL, in, "@introduceDomain", NULL, false, NULL);
470 	} else {
471 		/* Use XS_INTRODUCE for recreating the xenbus event-channel. */
472 		if (domain->port)
473 			xenevtchn_unbind(xce_handle, domain->port);
474 		rc = xenevtchn_bind_interdomain(xce_handle, domid, port);
475 		domain->port = (rc == -1) ? 0 : rc;
476 		domain->remote_port = port;
477 	}
478 
479 	domain_conn_reset(domain);
480 
481 	send_ack(conn, XS_INTRODUCE);
482 
483 	return 0;
484 }
485 
find_connected_domain(unsigned int domid)486 static struct domain *find_connected_domain(unsigned int domid)
487 {
488 	struct domain *domain;
489 
490 	domain = find_domain_by_domid(domid);
491 	if (!domain)
492 		return ERR_PTR(-ENOENT);
493 	if (!domain->conn)
494 		return ERR_PTR(-EINVAL);
495 	return domain;
496 }
497 
do_set_target(struct connection * conn,struct buffered_data * in)498 int do_set_target(struct connection *conn, struct buffered_data *in)
499 {
500 	char *vec[2];
501 	unsigned int domid, tdomid;
502         struct domain *domain, *tdomain;
503 	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
504 		return EINVAL;
505 
506 	if (!conn->can_write)
507 		return EACCES;
508 
509 	domid = atoi(vec[0]);
510 	tdomid = atoi(vec[1]);
511 
512         domain = find_connected_domain(domid);
513 	if (IS_ERR(domain))
514 		return -PTR_ERR(domain);
515 
516         tdomain = find_connected_domain(tdomid);
517 	if (IS_ERR(tdomain))
518 		return -PTR_ERR(tdomain);
519 
520         talloc_reference(domain->conn, tdomain->conn);
521         domain->conn->target = tdomain->conn;
522 
523 	send_ack(conn, XS_SET_TARGET);
524 
525 	return 0;
526 }
527 
/*
 * Parse the single domid argument of a request and look up the
 * matching connected domain.  A missing argument or a domid evaluating
 * to 0 yields ERR_PTR(-EINVAL); otherwise the result of
 * find_connected_domain() is returned.
 */
static struct domain *onearg_domain(struct connection *conn,
				    struct buffered_data *in)
{
	const char *arg = onearg(in);
	unsigned int domid;

	if (arg == NULL)
		return ERR_PTR(-EINVAL);

	domid = atoi(arg);
	if (domid == 0)
		return ERR_PTR(-EINVAL);

	return find_connected_domain(domid);
}
543 
544 /* domid */
do_release(struct connection * conn,struct buffered_data * in)545 int do_release(struct connection *conn, struct buffered_data *in)
546 {
547 	struct domain *domain;
548 
549 	domain = onearg_domain(conn, in);
550 	if (IS_ERR(domain))
551 		return -PTR_ERR(domain);
552 
553 	talloc_free(domain->conn);
554 
555 	send_ack(conn, XS_RELEASE);
556 
557 	return 0;
558 }
559 
do_resume(struct connection * conn,struct buffered_data * in)560 int do_resume(struct connection *conn, struct buffered_data *in)
561 {
562 	struct domain *domain;
563 
564 	domain = onearg_domain(conn, in);
565 	if (IS_ERR(domain))
566 		return -PTR_ERR(domain);
567 
568 	domain->shutdown = false;
569 
570 	send_ack(conn, XS_RESUME);
571 
572 	return 0;
573 }
574 
do_get_domain_path(struct connection * conn,struct buffered_data * in)575 int do_get_domain_path(struct connection *conn, struct buffered_data *in)
576 {
577 	char *path;
578 	const char *domid_str = onearg(in);
579 
580 	if (!domid_str)
581 		return EINVAL;
582 
583 	path = talloc_domain_path(conn, atoi(domid_str));
584 	if (!path)
585 		return errno;
586 
587 	send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1);
588 
589 	talloc_free(path);
590 
591 	return 0;
592 }
593 
do_is_domain_introduced(struct connection * conn,struct buffered_data * in)594 int do_is_domain_introduced(struct connection *conn, struct buffered_data *in)
595 {
596 	int result;
597 	unsigned int domid;
598 	const char *domid_str = onearg(in);
599 
600 	if (!domid_str)
601 		return EINVAL;
602 
603 	domid = atoi(domid_str);
604 	if (domid == DOMID_SELF)
605 		result = 1;
606 	else
607 		result = (find_domain_by_domid(domid) != NULL);
608 
609 	send_reply(conn, XS_IS_DOMAIN_INTRODUCED, result ? "T" : "F", 2);
610 
611 	return 0;
612 }
613 
614 /* Allow guest to reset all watches */
do_reset_watches(struct connection * conn,struct buffered_data * in)615 int do_reset_watches(struct connection *conn, struct buffered_data *in)
616 {
617 	conn_delete_all_watches(conn);
618 	conn_delete_all_transactions(conn);
619 
620 	send_ack(conn, XS_RESET_WATCHES);
621 
622 	return 0;
623 }
624 
close_xc_handle(void * _handle)625 static int close_xc_handle(void *_handle)
626 {
627 	xc_interface_close(*(xc_interface**)_handle);
628 	return 0;
629 }
630 
close_xgt_handle(void * _handle)631 static int close_xgt_handle(void *_handle)
632 {
633 	xengnttab_close(*(xengnttab_handle **)_handle);
634 	return 0;
635 }
636 
637 /* Returns the implicit path of a connection (only domains have this) */
get_implicit_path(const struct connection * conn)638 const char *get_implicit_path(const struct connection *conn)
639 {
640 	if (!conn->domain)
641 		return "/local/domain/0";
642 	return conn->domain->path;
643 }
644 
/* Restore existing connections. */
void restore_existing_connections(void)
{
	/* Intentionally empty: nothing is restored here. */
}
649 
set_dom_perms_default(struct node_perms * perms)650 static int set_dom_perms_default(struct node_perms *perms)
651 {
652 	perms->num = 1;
653 	perms->p = talloc_array(NULL, struct xs_permissions, perms->num);
654 	if (!perms->p)
655 		return -1;
656 	perms->p->id = 0;
657 	perms->p->perms = XS_PERM_NONE;
658 
659 	return 0;
660 }
661 
get_perms_special(const char * name)662 static struct node_perms *get_perms_special(const char *name)
663 {
664 	if (!strcmp(name, "@releaseDomain"))
665 		return &dom_release_perms;
666 	if (!strcmp(name, "@introduceDomain"))
667 		return &dom_introduce_perms;
668 	return NULL;
669 }
670 
set_perms_special(struct connection * conn,const char * name,struct node_perms * perms)671 int set_perms_special(struct connection *conn, const char *name,
672 		      struct node_perms *perms)
673 {
674 	struct node_perms *p;
675 
676 	p = get_perms_special(name);
677 	if (!p)
678 		return EINVAL;
679 
680 	if ((perm_for_conn(conn, p) & (XS_PERM_WRITE | XS_PERM_OWNER)) !=
681 	    (XS_PERM_WRITE | XS_PERM_OWNER))
682 		return EACCES;
683 
684 	p->num = perms->num;
685 	talloc_free(p->p);
686 	p->p = perms->p;
687 	talloc_steal(NULL, perms->p);
688 
689 	return 0;
690 }
691 
check_perms_special(const char * name,struct connection * conn)692 bool check_perms_special(const char *name, struct connection *conn)
693 {
694 	struct node_perms *p;
695 
696 	p = get_perms_special(name);
697 	if (!p)
698 		return false;
699 
700 	return perm_for_conn(conn, p) & XS_PERM_READ;
701 }
702 
dom0_init(void)703 static int dom0_init(void)
704 {
705 	evtchn_port_t port;
706 	struct domain *dom0;
707 
708 	port = xenbus_evtchn();
709 	if (port == -1)
710 		return -1;
711 
712 	dom0 = alloc_domain(NULL, xenbus_master_domid());
713 	if (!dom0)
714 		return -1;
715 	if (new_domain(dom0, port))
716 		return -1;
717 
718 	dom0->interface = xenbus_map();
719 	if (dom0->interface == NULL)
720 		return -1;
721 
722 	talloc_steal(dom0->conn, dom0);
723 
724 	xenevtchn_notify(xce_handle, dom0->port);
725 
726 	if (set_dom_perms_default(&dom_release_perms) ||
727 	    set_dom_perms_default(&dom_introduce_perms))
728 		return -1;
729 
730 	return 0;
731 }
732 
domain_init(void)733 void domain_init(void)
734 {
735 	int rc;
736 
737 	xc_handle = talloc(talloc_autofree_context(), xc_interface*);
738 	if (!xc_handle)
739 		barf_perror("Failed to allocate domain handle");
740 
741 	*xc_handle = xc_interface_open(0,0,0);
742 	if (!*xc_handle)
743 		barf_perror("Failed to open connection to hypervisor");
744 
745 	talloc_set_destructor(xc_handle, close_xc_handle);
746 
747 	xgt_handle = talloc(talloc_autofree_context(), xengnttab_handle*);
748 	if (!xgt_handle)
749 		barf_perror("Failed to allocate domain gnttab handle");
750 
751 	*xgt_handle = xengnttab_open(NULL, 0);
752 	if (*xgt_handle == NULL)
753 		barf_perror("Failed to open connection to gnttab");
754 
755 	talloc_set_destructor(xgt_handle, close_xgt_handle);
756 
757 	xce_handle = xenevtchn_open(NULL, 0);
758 
759 	if (xce_handle == NULL)
760 		barf_perror("Failed to open evtchn device");
761 
762 	if (dom0_init() != 0)
763 		barf_perror("Failed to initialize dom0 state");
764 
765 	if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_DOM_EXC)) == -1)
766 		barf_perror("Failed to bind to domain exception virq port");
767 	virq_port = rc;
768 }
769 
domain_entry_inc(struct connection * conn,struct node * node)770 void domain_entry_inc(struct connection *conn, struct node *node)
771 {
772 	struct domain *d;
773 
774 	if (!conn)
775 		return;
776 
777 	if (node->perms.p && node->perms.p[0].id != conn->id) {
778 		if (conn->transaction) {
779 			transaction_entry_inc(conn->transaction,
780 				node->perms.p[0].id);
781 		} else {
782 			d = find_domain_by_domid(node->perms.p[0].id);
783 			if (d)
784 				d->nbentry++;
785 		}
786 	} else if (conn->domain) {
787 		if (conn->transaction) {
788 			transaction_entry_inc(conn->transaction,
789 				conn->domain->domid);
790  		} else {
791  			conn->domain->nbentry++;
792 		}
793 	}
794 }
795 
796 /*
797  * Check whether a domain was created before or after a specific generation
798  * count (used for testing whether a node permission is older than a domain).
799  *
800  * Return values:
801  * -1: error
802  *  0: domain has higher generation count (it is younger than a node with the
803  *     given count), or domain isn't existing any longer
804  *  1: domain is older than the node
805  */
chk_domain_generation(unsigned int domid,uint64_t gen)806 static int chk_domain_generation(unsigned int domid, uint64_t gen)
807 {
808 	struct domain *d;
809 	xc_dominfo_t dominfo;
810 
811 	if (!xc_handle && domid == 0)
812 		return 1;
813 
814 	d = find_domain_struct(domid);
815 	if (d)
816 		return (d->generation <= gen) ? 1 : 0;
817 
818 	if (!get_domain_info(domid, &dominfo))
819 		return 0;
820 
821 	d = alloc_domain(NULL, domid);
822 	return d ? 1 : -1;
823 }
824 
825 /*
826  * Remove permissions for no longer existing domains in order to avoid a new
827  * domain with the same domid inheriting the permissions.
828  */
domain_adjust_node_perms(struct node * node)829 int domain_adjust_node_perms(struct node *node)
830 {
831 	unsigned int i;
832 	int ret;
833 
834 	ret = chk_domain_generation(node->perms.p[0].id, node->generation);
835 	if (ret < 0)
836 		return errno;
837 
838 	/* If the owner doesn't exist any longer give it to priv domain. */
839 	if (!ret)
840 		node->perms.p[0].id = priv_domid;
841 
842 	for (i = 1; i < node->perms.num; i++) {
843 		if (node->perms.p[i].perms & XS_PERM_IGNORE)
844 			continue;
845 		ret = chk_domain_generation(node->perms.p[i].id,
846 					    node->generation);
847 		if (ret < 0)
848 			return errno;
849 		if (!ret)
850 			node->perms.p[i].perms |= XS_PERM_IGNORE;
851 	}
852 
853 	return 0;
854 }
855 
domain_entry_dec(struct connection * conn,struct node * node)856 void domain_entry_dec(struct connection *conn, struct node *node)
857 {
858 	struct domain *d;
859 
860 	if (!conn)
861 		return;
862 
863 	if (node->perms.p && node->perms.p[0].id != conn->id) {
864 		if (conn->transaction) {
865 			transaction_entry_dec(conn->transaction,
866 				node->perms.p[0].id);
867 		} else {
868 			d = find_domain_by_domid(node->perms.p[0].id);
869 			if (d && d->nbentry)
870 				d->nbentry--;
871 		}
872 	} else if (conn->domain && conn->domain->nbentry) {
873 		if (conn->transaction) {
874 			transaction_entry_dec(conn->transaction,
875 				conn->domain->domid);
876 		} else {
877 			conn->domain->nbentry--;
878 		}
879 	}
880 }
881 
domain_entry_fix(unsigned int domid,int num,bool update)882 int domain_entry_fix(unsigned int domid, int num, bool update)
883 {
884 	struct domain *d;
885 	int cnt;
886 
887 	d = find_domain_by_domid(domid);
888 	if (!d)
889 		return 0;
890 
891 	cnt = d->nbentry + num;
892 	if (cnt < 0)
893 		cnt = 0;
894 
895 	if (update)
896 		d->nbentry = cnt;
897 
898 	return domid_is_unprivileged(domid) ? cnt : 0;
899 }
900 
domain_entry(struct connection * conn)901 int domain_entry(struct connection *conn)
902 {
903 	return (domain_is_unprivileged(conn))
904 		? conn->domain->nbentry
905 		: 0;
906 }
907 
domain_watch_inc(struct connection * conn)908 void domain_watch_inc(struct connection *conn)
909 {
910 	if (!conn || !conn->domain)
911 		return;
912 	conn->domain->nbwatch++;
913 }
914 
domain_watch_dec(struct connection * conn)915 void domain_watch_dec(struct connection *conn)
916 {
917 	if (!conn || !conn->domain)
918 		return;
919 	if (conn->domain->nbwatch)
920 		conn->domain->nbwatch--;
921 }
922 
domain_watch(struct connection * conn)923 int domain_watch(struct connection *conn)
924 {
925 	return (domain_is_unprivileged(conn))
926 		? conn->domain->nbwatch
927 		: 0;
928 }
929 
/*
 * Write rate limit configuration, all in units of credit.
 * writecost is debited per accounted write, rate is the credit handed
 * out per second (shared between all domains), dburst and gburst cap
 * the credit of a single domain and of the reserve respectively, and
 * newdoms_dburst bounds how far new domains may overdraw the reserve.
 */
static wrl_creditt wrl_config_writecost      = WRL_FACTOR;
static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR;
static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR;
static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR;
static wrl_creditt wrl_config_newdoms_dburst =
	                         WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR;

/* Transaction counter; only read, never modified, in this file. */
long wrl_ntransactions;

/* Number of domains currently taking part in the write rate limiting. */
static long wrl_ndomains;
static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
static time_t wrl_log_last_warning; /* 0: no previous warning */
942 
wrl_gettime_now(struct wrl_timestampt * now_wt)943 void wrl_gettime_now(struct wrl_timestampt *now_wt)
944 {
945 	struct timespec now_ts;
946 	int r;
947 
948 	r = clock_gettime(CLOCK_MONOTONIC, &now_ts);
949 	if (r)
950 		barf_perror("Could not find time (clock_gettime failed)");
951 
952 	now_wt->sec = now_ts.tv_sec;
953 	now_wt->msec = now_ts.tv_nsec / 1000000;
954 }
955 
/*
 * Transfers zero or more credit from "debit" to "credit".
 * Transfers as much as possible while maintaining
 * debit >= debit_floor and credit <= credit_ceil.
 * (If that's violated already, does nothing.)
 *
 * Sufficient conditions to avoid overflow, either of:
 *  |every argument| <= 0x3fffffff
 *  |every argument| <= 1E9
 *  |every argument| <= WRL_CREDIT_MAX
 * (And this condition is preserved.)
 */
static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
			    wrl_creditt *credit, wrl_creditt credit_ceil)
{
	/* What the source can give and what the destination can take. */
	wrl_creditt can_give = *debit - debit_floor;
	wrl_creditt can_take = credit_ceil - *credit;
	wrl_creditt amount = (can_give < can_take) ? can_give : can_take;

	if (amount <= 0)
		return;

	*debit -= amount;
	*credit += amount;
}
978 
wrl_domain_new(struct domain * domain)979 void wrl_domain_new(struct domain *domain)
980 {
981 	domain->wrl_credit = 0;
982 	wrl_gettime_now(&domain->wrl_timestamp);
983 	wrl_ndomains++;
984 	/* Steal up to DBURST from the reserve */
985 	wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
986 			&domain->wrl_credit, wrl_config_dburst);
987 }
988 
wrl_domain_destroy(struct domain * domain)989 void wrl_domain_destroy(struct domain *domain)
990 {
991 	wrl_ndomains--;
992 	/*
993 	 * Don't bother recalculating domain's credit - this just
994 	 * means we don't give the reserve the ending domain's credit
995 	 * for time elapsed since last update.
996 	 */
997 	wrl_xfer_credit(&domain->wrl_credit, 0,
998 			&wrl_reserve, wrl_config_dburst);
999 }
1000 
wrl_credit_update(struct domain * domain,struct wrl_timestampt now)1001 void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
1002 {
1003 	/*
1004 	 * We want to calculate
1005 	 *    credit += (now - timestamp) * RATE / ndoms;
1006 	 * But we want it to saturate, and to avoid floating point.
1007 	 * To avoid rounding errors from constantly adding small
1008 	 * amounts of credit, we only add credit for whole milliseconds.
1009 	 */
1010 	long seconds      = now.sec -  domain->wrl_timestamp.sec;
1011 	long milliseconds = now.msec - domain->wrl_timestamp.msec;
1012 	long msec;
1013 	int64_t denom, num;
1014 	wrl_creditt surplus;
1015 
1016 	seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */
1017 	msec = seconds * 1000 + milliseconds;
1018 
1019 	if (msec < 0)
1020                 /* shouldn't happen with CLOCK_MONOTONIC */
1021 		msec = 0;
1022 
1023 	/* 32x32 -> 64 cannot overflow */
1024 	denom = (int64_t)msec * wrl_config_rate;
1025 	num  =  (int64_t)wrl_ndomains * 1000;
1026 	/* denom / num <= 1E6 * wrl_config_rate, so with
1027 	   reasonable wrl_config_rate, denom / num << 2^64 */
1028 
1029 	/* at last! */
1030 	domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num,
1031 				  WRL_CREDIT_MAX );
1032 	/* (maybe briefly violating the DBURST cap on wrl_credit) */
1033 
1034 	/* maybe take from the reserve to make us nonnegative */
1035 	wrl_xfer_credit(&wrl_reserve,        0,
1036 			&domain->wrl_credit, 0);
1037 
1038 	/* return any surplus (over DBURST) to the reserve */
1039 	surplus = 0;
1040 	wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
1041 			&surplus,            WRL_CREDIT_MAX);
1042 	wrl_xfer_credit(&surplus,     0,
1043 			&wrl_reserve, wrl_config_gburst);
1044 	/* surplus is now implicitly discarded */
1045 
1046 	domain->wrl_timestamp = now;
1047 
1048 	trace("wrl: dom %4d %6ld  msec  %9ld credit   %9ld reserve"
1049 	      "  %9ld discard\n",
1050 	      domain->domid,
1051 	      msec,
1052 	      (long)domain->wrl_credit, (long)wrl_reserve,
1053 	      (long)surplus);
1054 }
1055 
wrl_check_timeout(struct domain * domain,struct wrl_timestampt now,int * ptimeout)1056 void wrl_check_timeout(struct domain *domain,
1057 		       struct wrl_timestampt now,
1058 		       int *ptimeout)
1059 {
1060 	uint64_t num, denom;
1061 	int wakeup;
1062 
1063 	wrl_credit_update(domain, now);
1064 
1065 	if (domain->wrl_credit >= 0)
1066 		/* not blocked */
1067 		return;
1068 
1069 	if (!*ptimeout)
1070 		/* already decided on immediate wakeup,
1071 		   so no need to calculate our timeout */
1072 		return;
1073 
1074 	/* calculate  wakeup = now + -credit / (RATE / ndoms); */
1075 
1076 	/* credit cannot go more -ve than one transaction,
1077 	 * so the first multiplication cannot overflow even 32-bit */
1078 	num   = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains;
1079 	denom = wrl_config_rate;
1080 
1081 	wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
1082 	if (*ptimeout==-1 || wakeup < *ptimeout)
1083 		*ptimeout = wakeup;
1084 
1085 	trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
1086 	      domain->domid,
1087 	      (long)domain->wrl_credit, (long)wrl_reserve,
1088 	      wakeup);
1089 }
1090 
/*
 * Log a write rate limit warning through syslog(), prefixed with
 * "write rate limit: ".  The "now" argument is accepted but not used by
 * the expansion; the format string must be a string literal.
 */
#define WRL_LOG(now, ...) \
	(syslog(LOG_WARNING, "write rate limit: " __VA_ARGS__))
1093 
wrl_apply_debit_actual(struct domain * domain)1094 void wrl_apply_debit_actual(struct domain *domain)
1095 {
1096 	struct wrl_timestampt now;
1097 
1098 	if (!domain || !domid_is_unprivileged(domain->domid))
1099 		/* sockets and privileged domain escape the write rate limit */
1100 		return;
1101 
1102 	wrl_gettime_now(&now);
1103 	wrl_credit_update(domain, now);
1104 
1105 	domain->wrl_credit -= wrl_config_writecost;
1106 	trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
1107 	      domain->domid,
1108 	      (long)domain->wrl_credit, (long)wrl_reserve);
1109 
1110 	if (domain->wrl_credit < 0) {
1111 		if (!domain->wrl_delay_logged) {
1112 			domain->wrl_delay_logged = true;
1113 			WRL_LOG(now, "domain %ld is affected",
1114 				(long)domain->domid);
1115 		} else if (!wrl_log_last_warning) {
1116 			WRL_LOG(now, "rate limiting restarts");
1117 		}
1118 		wrl_log_last_warning = now.sec;
1119 	}
1120 }
1121 
wrl_log_periodic(struct wrl_timestampt now)1122 void wrl_log_periodic(struct wrl_timestampt now)
1123 {
1124 	if (wrl_log_last_warning &&
1125 	    (now.sec - wrl_log_last_warning) > WRL_LOGEVERY) {
1126 		WRL_LOG(now, "not in force recently");
1127 		wrl_log_last_warning = 0;
1128 	}
1129 }
1130 
wrl_apply_debit_direct(struct connection * conn)1131 void wrl_apply_debit_direct(struct connection *conn)
1132 {
1133 	if (!conn)
1134 		/* some writes are generated internally */
1135 		return;
1136 
1137 	if (conn->transaction)
1138 		/* these are accounted for when the transaction ends */
1139 		return;
1140 
1141 	if (!wrl_ntransactions)
1142 		/* we don't conflict with anyone */
1143 		return;
1144 
1145 	wrl_apply_debit_actual(conn->domain);
1146 }
1147 
wrl_apply_debit_trans_commit(struct connection * conn)1148 void wrl_apply_debit_trans_commit(struct connection *conn)
1149 {
1150 	if (wrl_ntransactions <= 1)
1151 		/* our own transaction appears in the counter */
1152 		return;
1153 
1154 	wrl_apply_debit_actual(conn->domain);
1155 }
1156 
1157 /*
1158  * Local variables:
1159  *  mode: C
1160  *  c-file-style: "linux"
1161  *  indent-tabs-mode: t
1162  *  c-basic-offset: 8
1163  *  tab-width: 8
1164  * End:
1165  */
1166