1 /*
2     Transaction code for Xen Store Daemon.
3     Copyright (C) 2005 Rusty Russell IBM Corporation
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <inttypes.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <sys/wait.h>
24 #include <sys/time.h>
25 #include <time.h>
26 #include <assert.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include "talloc.h"
32 #include "list.h"
33 #include "xenstored_transaction.h"
34 #include "xenstored_watch.h"
35 #include "xenstored_domain.h"
36 #include "xenstore_lib.h"
37 #include "utils.h"
38 
39 /*
40  * Some notes regarding detection and handling of transaction conflicts:
41  *
42  * Basic source of reference is the 'generation' count. Each writing access
43  * (either normal write or in a transaction) to the tdb data base will set
44  * the node specific generation count to the global generation count.
45  * For being able to identify a transaction the transaction specific generation
46  * count is initialized with the global generation count when starting the
47  * transaction.
48  * Each time the global generation count is copied to either a node or a
49  * transaction it is incremented. This ensures all nodes and/or transactions
50  * are having a unique generation count. The increment is done _before_ the
51  * copy as that is needed for checking whether a domain was created before
52  * or after a node has been written (the domain's generation is set with the
53  * actual generation count without incrementing it, in order to support
54  * writing a node for a domain before the domain has been officially
55  * introduced).
56  *
57  * Transaction conflicts are detected by checking the generation count of all
58  * nodes read in the transaction to match with the generation count in the
59  * global data base at the end of the transaction. Nodes which have been
60  * modified in the transaction don't have to be checked to match even if they
61  * have been read, as the modified node will be globally visible after the
62  * succeeded transaction possibly overwriting another modification which may
63  * have occurred concurrent to the transaction.
64  *
65  * Examples:
66  * ---------
67  * The following notation is used:
68  * I:      initial state
69  * G       global generation count
70  * g(X)    generation count of node X
71  * G(1)    generation count of transaction 1
72  * g(1:Y)  saved generation count of node Y in transaction 1
73  * TA1:    operation in transaction 1
74  * X=1:X   replace global node X with transaction 1 specific value of X
75  *
76  * 1. Simple transaction doing: read node A, write node B
77  *    I: g(A) = 1, g(B) = 2, G = 3
78  *    Start transaction 1: G(1) = 3, G = 4
79  *    TA1: read node A:    g(1:A) = 1
80  *    TA1: write node B:   g(1:B) = 4, G = 5
81  *    End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 5, G = 6
82  *
83  * 2. Transaction with conflicting write
84  *    I: g(A) = 1, g(B) = 2, G = 3
85  *    Start transaction 1: G(1) = 3, G = 4
86  *    TA1: read node A:    g(1:A) = 1
87  *    write node A:        g(A) = 4, G = 5
88  *    TA1: write node B:   g(1:B) = 5, G = 6
89  *    End TA1: g(1:A) != g(A) => EAGAIN
90  *
91  * 3. Transaction with conflicting delete
92  *    I: g(A) = 1, g(B) = 2, G = 3
93  *    Start transaction 1: G(1) = 3, G = 4
94  *    TA1: read node A:    g(1:A) = 1
95  *    delete node A:       g(A) = ~0
96  *    TA1: write node B:   g(1:B) = 4, G = 5
97  *    End TA1: g(1:A) != g(A) => EAGAIN
98  *
99  * 4. Two interfering transactions
100  *    I: g(A) = 1, g(B) = 2, G = 3
101  *    Start transaction 1: G(1) = 3, G = 4
102  *    Start transaction 2: G(2) = 4, G = 5
103  *    TA1: read node A:    g(1:A) = 1
104  *    TA2: read node B:    g(2:B) = 2
105  *    TA1: write node B:   g(1:B) = 5, G = 6
106  *    TA2: write node A:   g(2:A) = 6, G = 7
107  *    End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 7, G = 8
108  *    End TA2: g(2:B) != g(B) => EAGAIN
109  */
110 
111 struct accessed_node
112 {
113 	/* List of all changed nodes in the context of this transaction. */
114 	struct list_head list;
115 
116 	/* The name of the node. */
117 	char *node;
118 
119 	/* Generation count (or NO_GENERATION) for conflict checking. */
120 	uint64_t generation;
121 
122 	/* Original node permissions. */
123 	struct node_perms perms;
124 
125 	/* Generation count checking required? */
126 	bool check_gen;
127 
128 	/* Modified? */
129 	bool modified;
130 
131 	/* Transaction node in data base? */
132 	bool ta_node;
133 };
134 
135 struct changed_domain
136 {
137 	/* List of all changed domains in the context of this transaction. */
138 	struct list_head list;
139 
140 	/* Identifier of the changed domain. */
141 	unsigned int domid;
142 
143 	/* Amount by which this domain's nbentry field has changed. */
144 	int nbentry;
145 };
146 
147 struct transaction
148 {
149 	/* List of all transactions active on this connection. */
150 	struct list_head list;
151 
152 	/* Connection-local identifier for this transaction. */
153 	uint32_t id;
154 
155 	/* Generation when transaction started. */
156 	uint64_t generation;
157 
158 	/* List of accessed nodes. */
159 	struct list_head accessed;
160 
161 	/* List of changed domains - to record the changed domain entry number */
162 	struct list_head changed_domains;
163 
164 	/* Flag for letting transaction fail. */
165 	bool fail;
166 };
167 
168 extern int quota_max_transaction;
169 uint64_t generation;
170 
set_tdb_key(const char * name,TDB_DATA * key)171 static void set_tdb_key(const char *name, TDB_DATA *key)
172 {
173 	key->dptr = (char *)name;
174 	key->dsize = strlen(name);
175 }
176 
find_accessed_node(struct transaction * trans,const char * name)177 static struct accessed_node *find_accessed_node(struct transaction *trans,
178 						const char *name)
179 {
180 	struct accessed_node *i;
181 
182 	list_for_each_entry(i, &trans->accessed, list)
183 		if (streq(i->node, name))
184 			return i;
185 
186 	return NULL;
187 }
188 
transaction_get_node_name(void * ctx,struct transaction * trans,const char * name)189 static char *transaction_get_node_name(void *ctx, struct transaction *trans,
190 				       const char *name)
191 {
192 	return talloc_asprintf(ctx, "%"PRIu64"/%s", trans->generation, name);
193 }
194 
195 /*
196  * Prepend the transaction to name if node has been modified in the current
197  * transaction.
198  */
transaction_prepend(struct connection * conn,const char * name,TDB_DATA * key)199 int transaction_prepend(struct connection *conn, const char *name,
200 			TDB_DATA *key)
201 {
202 	char *tdb_name;
203 
204 	if (!conn || !conn->transaction ||
205 	    !find_accessed_node(conn->transaction, name)) {
206 		set_tdb_key(name, key);
207 		return 0;
208 	}
209 
210 	tdb_name = transaction_get_node_name(conn->transaction,
211 					     conn->transaction, name);
212 	if (!tdb_name)
213 		return errno;
214 
215 	set_tdb_key(tdb_name, key);
216 
217 	return 0;
218 }
219 
220 /*
221  * A node has been accessed.
222  *
223  * Modifying accesses (write, delete) always update the generation (global and
224  * node->generation).
225  *
226  * Accesses in a transaction will be added to the list of accessed nodes
227  * if not already done. Read type accesses will copy the node to the
228  * transaction specific data base part, write type accesses go there
229  * anyway.
230  *
231  * If not NULL, key will be supplied with name and length of name of the node
232  * to be accessed in the data base.
233  */
access_node(struct connection * conn,struct node * node,enum node_access_type type,TDB_DATA * key)234 int access_node(struct connection *conn, struct node *node,
235 		enum node_access_type type, TDB_DATA *key)
236 {
237 	struct accessed_node *i = NULL;
238 	struct transaction *trans;
239 	TDB_DATA local_key;
240 	const char *trans_name = NULL;
241 	int ret;
242 	bool introduce = false;
243 
244 	if (type != NODE_ACCESS_READ) {
245 		node->generation = ++generation;
246 		if (conn && !conn->transaction)
247 			wrl_apply_debit_direct(conn);
248 	}
249 
250 	if (!conn || !conn->transaction) {
251 		/* They're changing the global database. */
252 		if (key)
253 			set_tdb_key(node->name, key);
254 		return 0;
255 	}
256 
257 	trans = conn->transaction;
258 
259 	trans_name = transaction_get_node_name(node, trans, node->name);
260 	if (!trans_name)
261 		goto nomem;
262 
263 	i = find_accessed_node(trans, node->name);
264 	if (!i) {
265 		i = talloc_zero(trans, struct accessed_node);
266 		if (!i)
267 			goto nomem;
268 		i->node = talloc_strdup(i, node->name);
269 		if (!i->node)
270 			goto nomem;
271 		if (node->generation != NO_GENERATION && node->perms.num) {
272 			i->perms.p = talloc_array(i, struct xs_permissions,
273 						  node->perms.num);
274 			if (!i->perms.p)
275 				goto nomem;
276 			i->perms.num = node->perms.num;
277 			memcpy(i->perms.p, node->perms.p,
278 			       i->perms.num * sizeof(*i->perms.p));
279 		}
280 
281 		introduce = true;
282 		i->ta_node = false;
283 
284 		/*
285 		 * Additional transaction-specific node for read type. We only
286 		 * have to verify read nodes if we didn't write them.
287 		 *
288 		 * The node is created and written to DB here to distinguish
289 		 * from the write types.
290 		 */
291 		if (type == NODE_ACCESS_READ) {
292 			i->generation = node->generation;
293 			i->check_gen = true;
294 			if (node->generation != NO_GENERATION) {
295 				set_tdb_key(trans_name, &local_key);
296 				ret = write_node_raw(conn, &local_key, node, true);
297 				if (ret)
298 					goto err;
299 				i->ta_node = true;
300 			}
301 		}
302 		list_add_tail(&i->list, &trans->accessed);
303 	}
304 
305 	if (type != NODE_ACCESS_READ)
306 		i->modified = true;
307 
308 	if (introduce && type == NODE_ACCESS_DELETE)
309 		/* Nothing to delete. */
310 		return -1;
311 
312 	if (key) {
313 		set_tdb_key(trans_name, key);
314 		if (type == NODE_ACCESS_WRITE)
315 			i->ta_node = true;
316 		if (type == NODE_ACCESS_DELETE)
317 			i->ta_node = false;
318 	}
319 
320 	return 0;
321 
322 nomem:
323 	ret = ENOMEM;
324 err:
325 	talloc_free((void *)trans_name);
326 	talloc_free(i);
327 	trans->fail = true;
328 	errno = ret;
329 	return ret;
330 }
331 
332 /*
333  * Finalize transaction:
334  * Walk through accessed nodes and check generation against global data.
335  * If all entries match, read the transaction entries and write them without
336  * transaction prepended. Delete all transaction specific nodes in the data
337  * base.
338  */
finalize_transaction(struct connection * conn,struct transaction * trans)339 static int finalize_transaction(struct connection *conn,
340 				struct transaction *trans)
341 {
342 	struct accessed_node *i;
343 	TDB_DATA key, ta_key, data;
344 	struct xs_tdb_record_hdr *hdr;
345 	uint64_t gen;
346 	char *trans_name;
347 	int ret;
348 
349 	list_for_each_entry(i, &trans->accessed, list) {
350 		if (!i->check_gen)
351 			continue;
352 
353 		set_tdb_key(i->node, &key);
354 		data = tdb_fetch(tdb_ctx, key);
355 		hdr = (void *)data.dptr;
356 		if (!data.dptr) {
357 			if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST)
358 				return EIO;
359 			gen = NO_GENERATION;
360 		} else
361 			gen = hdr->generation;
362 		talloc_free(data.dptr);
363 		if (i->generation != gen)
364 			return EAGAIN;
365 	}
366 
367 	while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
368 		trans_name = transaction_get_node_name(i, trans, i->node);
369 		if (!trans_name)
370 			/* We are doomed: the transaction is only partial. */
371 			goto err;
372 
373 		set_tdb_key(trans_name, &ta_key);
374 
375 		if (i->modified) {
376 			set_tdb_key(i->node, &key);
377 			if (i->ta_node) {
378 				data = tdb_fetch(tdb_ctx, ta_key);
379 				if (!data.dptr)
380 					goto err;
381 				hdr = (void *)data.dptr;
382 				hdr->generation = ++generation;
383 				ret = tdb_store(tdb_ctx, key, data,
384 						TDB_REPLACE);
385 				talloc_free(data.dptr);
386 				if (ret)
387 					goto err;
388 				fire_watches(conn, trans, i->node, NULL, false,
389 					     i->perms.p ? &i->perms : NULL);
390 			} else {
391 				fire_watches(conn, trans, i->node, NULL, false,
392 					     i->perms.p ? &i->perms : NULL);
393 				if (tdb_delete(tdb_ctx, key))
394 					goto err;
395 			}
396 		}
397 
398 		if (i->ta_node && tdb_delete(tdb_ctx, ta_key))
399 			goto err;
400 		list_del(&i->list);
401 		talloc_free(i);
402 	}
403 
404 	return 0;
405 
406 err:
407 	corrupt(conn, "Partial transaction");
408 	return EIO;
409 }
410 
destroy_transaction(void * _transaction)411 static int destroy_transaction(void *_transaction)
412 {
413 	struct transaction *trans = _transaction;
414 	struct accessed_node *i;
415 	char *trans_name;
416 	TDB_DATA key;
417 
418 	wrl_ntransactions--;
419 	trace_destroy(trans, "transaction");
420 	while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
421 		if (i->ta_node) {
422 			trans_name = transaction_get_node_name(i, trans,
423 							       i->node);
424 			if (trans_name) {
425 				set_tdb_key(trans_name, &key);
426 				tdb_delete(tdb_ctx, key);
427 			}
428 		}
429 		list_del(&i->list);
430 		talloc_free(i);
431 	}
432 
433 	return 0;
434 }
435 
transaction_lookup(struct connection * conn,uint32_t id)436 struct transaction *transaction_lookup(struct connection *conn, uint32_t id)
437 {
438 	struct transaction *trans;
439 
440 	if (id == 0)
441 		return NULL;
442 
443 	list_for_each_entry(trans, &conn->transaction_list, list)
444 		if (trans->id == id)
445 			return trans;
446 
447 	return ERR_PTR(-ENOENT);
448 }
449 
do_transaction_start(struct connection * conn,struct buffered_data * in)450 int do_transaction_start(struct connection *conn, struct buffered_data *in)
451 {
452 	struct transaction *trans, *exists;
453 	char id_str[20];
454 
455 	/* We don't support nested transactions. */
456 	if (conn->transaction)
457 		return EBUSY;
458 
459 	if (conn->id && conn->transaction_started > quota_max_transaction)
460 		return ENOSPC;
461 
462 	/* Attach transaction to input for autofree until it's complete */
463 	trans = talloc_zero(in, struct transaction);
464 	if (!trans)
465 		return ENOMEM;
466 
467 	INIT_LIST_HEAD(&trans->accessed);
468 	INIT_LIST_HEAD(&trans->changed_domains);
469 	trans->fail = false;
470 	trans->generation = ++generation;
471 
472 	/* Pick an unused transaction identifier. */
473 	do {
474 		trans->id = conn->next_transaction_id;
475 		exists = transaction_lookup(conn, conn->next_transaction_id++);
476 	} while (!IS_ERR(exists));
477 
478 	/* Now we own it. */
479 	list_add_tail(&trans->list, &conn->transaction_list);
480 	talloc_steal(conn, trans);
481 	talloc_set_destructor(trans, destroy_transaction);
482 	conn->transaction_started++;
483 	wrl_ntransactions++;
484 
485 	snprintf(id_str, sizeof(id_str), "%u", trans->id);
486 	send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
487 
488 	return 0;
489 }
490 
transaction_fix_domains(struct transaction * trans,bool update)491 static int transaction_fix_domains(struct transaction *trans, bool update)
492 {
493 	struct changed_domain *d;
494 	int cnt;
495 
496 	list_for_each_entry(d, &trans->changed_domains, list) {
497 		cnt = domain_entry_fix(d->domid, d->nbentry, update);
498 		if (!update && cnt >= quota_nb_entry_per_domain)
499 			return ENOSPC;
500 	}
501 
502 	return 0;
503 }
504 
do_transaction_end(struct connection * conn,struct buffered_data * in)505 int do_transaction_end(struct connection *conn, struct buffered_data *in)
506 {
507 	const char *arg = onearg(in);
508 	struct transaction *trans;
509 	int ret;
510 
511 	if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
512 		return EINVAL;
513 
514 	if ((trans = conn->transaction) == NULL)
515 		return ENOENT;
516 
517 	conn->transaction = NULL;
518 	list_del(&trans->list);
519 	conn->transaction_started--;
520 
521 	/* Attach transaction to in for auto-cleanup */
522 	talloc_steal(in, trans);
523 
524 	if (streq(arg, "T")) {
525 		if (trans->fail)
526 			return ENOMEM;
527 		ret = transaction_fix_domains(trans, false);
528 		if (ret)
529 			return ret;
530 		if (finalize_transaction(conn, trans))
531 			return EAGAIN;
532 
533 		wrl_apply_debit_trans_commit(conn);
534 
535 		/* fix domain entry for each changed domain */
536 		transaction_fix_domains(trans, true);
537 	}
538 	send_ack(conn, XS_TRANSACTION_END);
539 
540 	return 0;
541 }
542 
transaction_entry_inc(struct transaction * trans,unsigned int domid)543 void transaction_entry_inc(struct transaction *trans, unsigned int domid)
544 {
545 	struct changed_domain *d;
546 
547 	list_for_each_entry(d, &trans->changed_domains, list)
548 		if (d->domid == domid) {
549 			d->nbentry++;
550 			return;
551 		}
552 
553 	d = talloc(trans, struct changed_domain);
554 	if (!d) {
555 		/* Let the transaction fail. */
556 		trans->fail = true;
557 		return;
558 	}
559 	d->domid = domid;
560 	d->nbentry = 1;
561 	list_add_tail(&d->list, &trans->changed_domains);
562 }
563 
transaction_entry_dec(struct transaction * trans,unsigned int domid)564 void transaction_entry_dec(struct transaction *trans, unsigned int domid)
565 {
566 	struct changed_domain *d;
567 
568 	list_for_each_entry(d, &trans->changed_domains, list)
569 		if (d->domid == domid) {
570 			d->nbentry--;
571 			return;
572 		}
573 
574 	d = talloc(trans, struct changed_domain);
575 	if (!d) {
576 		/* Let the transaction fail. */
577 		trans->fail = true;
578 		return;
579 	}
580 	d->domid = domid;
581 	d->nbentry = -1;
582 	list_add_tail(&d->list, &trans->changed_domains);
583 }
584 
conn_delete_all_transactions(struct connection * conn)585 void conn_delete_all_transactions(struct connection *conn)
586 {
587 	struct transaction *trans;
588 
589 	while ((trans = list_top(&conn->transaction_list,
590 				 struct transaction, list))) {
591 		list_del(&trans->list);
592 		talloc_free(trans);
593 	}
594 
595 	assert(conn->transaction == NULL);
596 
597 	conn->transaction_started = 0;
598 }
599 
check_transactions(struct hashtable * hash)600 int check_transactions(struct hashtable *hash)
601 {
602 	struct connection *conn;
603 	struct transaction *trans;
604 	struct accessed_node *i;
605 	char *tname, *tnode;
606 
607 	list_for_each_entry(conn, &connections, list) {
608 		list_for_each_entry(trans, &conn->transaction_list, list) {
609 			tname = talloc_asprintf(trans, "%"PRIu64,
610 						trans->generation);
611 			if (!tname || !remember_string(hash, tname))
612 				goto nomem;
613 
614 			list_for_each_entry(i, &trans->accessed, list) {
615 				if (!i->ta_node)
616 					continue;
617 				tnode = transaction_get_node_name(tname, trans,
618 								  i->node);
619 				if (!tnode || !remember_string(hash, tnode))
620 					goto nomem;
621 				talloc_free(tnode);
622 			}
623 
624 			talloc_free(tname);
625 		}
626 	}
627 
628 	return 0;
629 
630 nomem:
631 	talloc_free(tname);
632 	return ENOMEM;
633 }
634 
635 /*
636  * Local variables:
637  *  mode: C
638  *  c-file-style: "linux"
639  *  indent-tabs-mode: t
640  *  c-basic-offset: 8
641  *  tab-width: 8
642  * End:
643  */
644