1 /*
2 Transaction code for Xen Store Daemon.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <inttypes.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <sys/wait.h>
24 #include <sys/time.h>
25 #include <time.h>
26 #include <assert.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include "talloc.h"
32 #include "list.h"
33 #include "xenstored_transaction.h"
34 #include "xenstored_watch.h"
35 #include "xenstored_domain.h"
36 #include "xenstore_lib.h"
37 #include "utils.h"
38
39 /*
40 * Some notes regarding detection and handling of transaction conflicts:
41 *
42 * Basic source of reference is the 'generation' count. Each writing access
43 * (either normal write or in a transaction) to the tdb data base will set
44 * the node specific generation count to the global generation count.
45 * For being able to identify a transaction the transaction specific generation
46 * count is initialized with the global generation count when starting the
47 * transaction.
48 * Each time the global generation count is copied to either a node or a
49 * transaction it is incremented. This ensures all nodes and/or transactions
50 * are having a unique generation count. The increment is done _before_ the
51 * copy as that is needed for checking whether a domain was created before
52 * or after a node has been written (the domain's generation is set with the
53 * actual generation count without incrementing it, in order to support
54 * writing a node for a domain before the domain has been officially
55 * introduced).
56 *
57 * Transaction conflicts are detected by checking the generation count of all
58 * nodes read in the transaction to match with the generation count in the
59 * global data base at the end of the transaction. Nodes which have been
60 * modified in the transaction don't have to be checked to match even if they
61 * have been read, as the modified node will be globally visible after the
62 * succeeded transaction possibly overwriting another modification which may
63 * have occurred concurrent to the transaction.
64 *
65 * Examples:
66 * ---------
67 * The following notation is used:
68 * I: initial state
69 * G global generation count
70 * g(X) generation count of node X
71 * G(1) generation count of transaction 1
72 * g(1:Y) saved generation count of node Y in transaction 1
73 * TA1: operation in transaction 1
74 * X=1:X replace global node X with transaction 1 specific value of X
75 *
76 * 1. Simple transaction doing: read node A, write node B
77 * I: g(A) = 1, g(B) = 2, G = 3
78 * Start transaction 1: G(1) = 3, G = 4
79 * TA1: read node A: g(1:A) = 1
80 * TA1: write node B: g(1:B) = 4, G = 5
81 * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 5, G = 6
82 *
83 * 2. Transaction with conflicting write
84 * I: g(A) = 1, g(B) = 2, G = 3
85 * Start transaction 1: G(1) = 3, G = 4
86 * TA1: read node A: g(1:A) = 1
87 * write node A: g(A) = 4, G = 5
88 * TA1: write node B: g(1:B) = 5, G = 6
89 * End TA1: g(1:A) != g(A) => EAGAIN
90 *
91 * 3. Transaction with conflicting delete
92 * I: g(A) = 1, g(B) = 2, G = 3
93 * Start transaction 1: G(1) = 3, G = 4
94 * TA1: read node A: g(1:A) = 1
95 * delete node A: g(A) = ~0
96 * TA1: write node B: g(1:B) = 4, G = 5
97 * End TA1: g(1:A) != g(A) => EAGAIN
98 *
99 * 4. Two interfering transactions
100 * I: g(A) = 1, g(B) = 2, G = 3
101 * Start transaction 1: G(1) = 3, G = 4
102 * Start transaction 2: G(2) = 4, G = 5
103 * TA1: read node A: g(1:A) = 1
104 * TA2: read node B: g(2:B) = 2
105 * TA1: write node B: g(1:B) = 5, G = 6
106 * TA2: write node A: g(2:A) = 6, G = 7
107 * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 7, G = 8
108 * End TA2: g(2:B) != g(B) => EAGAIN
109 */
110
111 struct accessed_node
112 {
113 /* List of all changed nodes in the context of this transaction. */
114 struct list_head list;
115
116 /* The name of the node. */
117 char *node;
118
119 /* Generation count (or NO_GENERATION) for conflict checking. */
120 uint64_t generation;
121
122 /* Original node permissions. */
123 struct node_perms perms;
124
125 /* Generation count checking required? */
126 bool check_gen;
127
128 /* Modified? */
129 bool modified;
130
131 /* Transaction node in data base? */
132 bool ta_node;
133 };
134
135 struct changed_domain
136 {
137 /* List of all changed domains in the context of this transaction. */
138 struct list_head list;
139
140 /* Identifier of the changed domain. */
141 unsigned int domid;
142
143 /* Amount by which this domain's nbentry field has changed. */
144 int nbentry;
145 };
146
147 struct transaction
148 {
149 /* List of all transactions active on this connection. */
150 struct list_head list;
151
152 /* Connection-local identifier for this transaction. */
153 uint32_t id;
154
155 /* Generation when transaction started. */
156 uint64_t generation;
157
158 /* List of accessed nodes. */
159 struct list_head accessed;
160
161 /* List of changed domains - to record the changed domain entry number */
162 struct list_head changed_domains;
163
164 /* Flag for letting transaction fail. */
165 bool fail;
166 };
167
168 extern int quota_max_transaction;
169 uint64_t generation;
170
set_tdb_key(const char * name,TDB_DATA * key)171 static void set_tdb_key(const char *name, TDB_DATA *key)
172 {
173 key->dptr = (char *)name;
174 key->dsize = strlen(name);
175 }
176
find_accessed_node(struct transaction * trans,const char * name)177 static struct accessed_node *find_accessed_node(struct transaction *trans,
178 const char *name)
179 {
180 struct accessed_node *i;
181
182 list_for_each_entry(i, &trans->accessed, list)
183 if (streq(i->node, name))
184 return i;
185
186 return NULL;
187 }
188
transaction_get_node_name(void * ctx,struct transaction * trans,const char * name)189 static char *transaction_get_node_name(void *ctx, struct transaction *trans,
190 const char *name)
191 {
192 return talloc_asprintf(ctx, "%"PRIu64"/%s", trans->generation, name);
193 }
194
195 /*
196 * Prepend the transaction to name if node has been modified in the current
197 * transaction.
198 */
transaction_prepend(struct connection * conn,const char * name,TDB_DATA * key)199 int transaction_prepend(struct connection *conn, const char *name,
200 TDB_DATA *key)
201 {
202 char *tdb_name;
203
204 if (!conn || !conn->transaction ||
205 !find_accessed_node(conn->transaction, name)) {
206 set_tdb_key(name, key);
207 return 0;
208 }
209
210 tdb_name = transaction_get_node_name(conn->transaction,
211 conn->transaction, name);
212 if (!tdb_name)
213 return errno;
214
215 set_tdb_key(tdb_name, key);
216
217 return 0;
218 }
219
220 /*
221 * A node has been accessed.
222 *
223 * Modifying accesses (write, delete) always update the generation (global and
224 * node->generation).
225 *
226 * Accesses in a transaction will be added to the list of accessed nodes
227 * if not already done. Read type accesses will copy the node to the
228 * transaction specific data base part, write type accesses go there
229 * anyway.
230 *
231 * If not NULL, key will be supplied with name and length of name of the node
232 * to be accessed in the data base.
233 */
access_node(struct connection * conn,struct node * node,enum node_access_type type,TDB_DATA * key)234 int access_node(struct connection *conn, struct node *node,
235 enum node_access_type type, TDB_DATA *key)
236 {
237 struct accessed_node *i = NULL;
238 struct transaction *trans;
239 TDB_DATA local_key;
240 const char *trans_name = NULL;
241 int ret;
242 bool introduce = false;
243
244 if (type != NODE_ACCESS_READ) {
245 node->generation = ++generation;
246 if (conn && !conn->transaction)
247 wrl_apply_debit_direct(conn);
248 }
249
250 if (!conn || !conn->transaction) {
251 /* They're changing the global database. */
252 if (key)
253 set_tdb_key(node->name, key);
254 return 0;
255 }
256
257 trans = conn->transaction;
258
259 trans_name = transaction_get_node_name(node, trans, node->name);
260 if (!trans_name)
261 goto nomem;
262
263 i = find_accessed_node(trans, node->name);
264 if (!i) {
265 i = talloc_zero(trans, struct accessed_node);
266 if (!i)
267 goto nomem;
268 i->node = talloc_strdup(i, node->name);
269 if (!i->node)
270 goto nomem;
271 if (node->generation != NO_GENERATION && node->perms.num) {
272 i->perms.p = talloc_array(i, struct xs_permissions,
273 node->perms.num);
274 if (!i->perms.p)
275 goto nomem;
276 i->perms.num = node->perms.num;
277 memcpy(i->perms.p, node->perms.p,
278 i->perms.num * sizeof(*i->perms.p));
279 }
280
281 introduce = true;
282 i->ta_node = false;
283
284 /*
285 * Additional transaction-specific node for read type. We only
286 * have to verify read nodes if we didn't write them.
287 *
288 * The node is created and written to DB here to distinguish
289 * from the write types.
290 */
291 if (type == NODE_ACCESS_READ) {
292 i->generation = node->generation;
293 i->check_gen = true;
294 if (node->generation != NO_GENERATION) {
295 set_tdb_key(trans_name, &local_key);
296 ret = write_node_raw(conn, &local_key, node, true);
297 if (ret)
298 goto err;
299 i->ta_node = true;
300 }
301 }
302 list_add_tail(&i->list, &trans->accessed);
303 }
304
305 if (type != NODE_ACCESS_READ)
306 i->modified = true;
307
308 if (introduce && type == NODE_ACCESS_DELETE)
309 /* Nothing to delete. */
310 return -1;
311
312 if (key) {
313 set_tdb_key(trans_name, key);
314 if (type == NODE_ACCESS_WRITE)
315 i->ta_node = true;
316 if (type == NODE_ACCESS_DELETE)
317 i->ta_node = false;
318 }
319
320 return 0;
321
322 nomem:
323 ret = ENOMEM;
324 err:
325 talloc_free((void *)trans_name);
326 talloc_free(i);
327 trans->fail = true;
328 errno = ret;
329 return ret;
330 }
331
332 /*
333 * Finalize transaction:
334 * Walk through accessed nodes and check generation against global data.
335 * If all entries match, read the transaction entries and write them without
336 * transaction prepended. Delete all transaction specific nodes in the data
337 * base.
338 */
finalize_transaction(struct connection * conn,struct transaction * trans)339 static int finalize_transaction(struct connection *conn,
340 struct transaction *trans)
341 {
342 struct accessed_node *i;
343 TDB_DATA key, ta_key, data;
344 struct xs_tdb_record_hdr *hdr;
345 uint64_t gen;
346 char *trans_name;
347 int ret;
348
349 list_for_each_entry(i, &trans->accessed, list) {
350 if (!i->check_gen)
351 continue;
352
353 set_tdb_key(i->node, &key);
354 data = tdb_fetch(tdb_ctx, key);
355 hdr = (void *)data.dptr;
356 if (!data.dptr) {
357 if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST)
358 return EIO;
359 gen = NO_GENERATION;
360 } else
361 gen = hdr->generation;
362 talloc_free(data.dptr);
363 if (i->generation != gen)
364 return EAGAIN;
365 }
366
367 while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
368 trans_name = transaction_get_node_name(i, trans, i->node);
369 if (!trans_name)
370 /* We are doomed: the transaction is only partial. */
371 goto err;
372
373 set_tdb_key(trans_name, &ta_key);
374
375 if (i->modified) {
376 set_tdb_key(i->node, &key);
377 if (i->ta_node) {
378 data = tdb_fetch(tdb_ctx, ta_key);
379 if (!data.dptr)
380 goto err;
381 hdr = (void *)data.dptr;
382 hdr->generation = ++generation;
383 ret = tdb_store(tdb_ctx, key, data,
384 TDB_REPLACE);
385 talloc_free(data.dptr);
386 if (ret)
387 goto err;
388 fire_watches(conn, trans, i->node, NULL, false,
389 i->perms.p ? &i->perms : NULL);
390 } else {
391 fire_watches(conn, trans, i->node, NULL, false,
392 i->perms.p ? &i->perms : NULL);
393 if (tdb_delete(tdb_ctx, key))
394 goto err;
395 }
396 }
397
398 if (i->ta_node && tdb_delete(tdb_ctx, ta_key))
399 goto err;
400 list_del(&i->list);
401 talloc_free(i);
402 }
403
404 return 0;
405
406 err:
407 corrupt(conn, "Partial transaction");
408 return EIO;
409 }
410
destroy_transaction(void * _transaction)411 static int destroy_transaction(void *_transaction)
412 {
413 struct transaction *trans = _transaction;
414 struct accessed_node *i;
415 char *trans_name;
416 TDB_DATA key;
417
418 wrl_ntransactions--;
419 trace_destroy(trans, "transaction");
420 while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
421 if (i->ta_node) {
422 trans_name = transaction_get_node_name(i, trans,
423 i->node);
424 if (trans_name) {
425 set_tdb_key(trans_name, &key);
426 tdb_delete(tdb_ctx, key);
427 }
428 }
429 list_del(&i->list);
430 talloc_free(i);
431 }
432
433 return 0;
434 }
435
transaction_lookup(struct connection * conn,uint32_t id)436 struct transaction *transaction_lookup(struct connection *conn, uint32_t id)
437 {
438 struct transaction *trans;
439
440 if (id == 0)
441 return NULL;
442
443 list_for_each_entry(trans, &conn->transaction_list, list)
444 if (trans->id == id)
445 return trans;
446
447 return ERR_PTR(-ENOENT);
448 }
449
do_transaction_start(struct connection * conn,struct buffered_data * in)450 int do_transaction_start(struct connection *conn, struct buffered_data *in)
451 {
452 struct transaction *trans, *exists;
453 char id_str[20];
454
455 /* We don't support nested transactions. */
456 if (conn->transaction)
457 return EBUSY;
458
459 if (conn->id && conn->transaction_started > quota_max_transaction)
460 return ENOSPC;
461
462 /* Attach transaction to input for autofree until it's complete */
463 trans = talloc_zero(in, struct transaction);
464 if (!trans)
465 return ENOMEM;
466
467 INIT_LIST_HEAD(&trans->accessed);
468 INIT_LIST_HEAD(&trans->changed_domains);
469 trans->fail = false;
470 trans->generation = ++generation;
471
472 /* Pick an unused transaction identifier. */
473 do {
474 trans->id = conn->next_transaction_id;
475 exists = transaction_lookup(conn, conn->next_transaction_id++);
476 } while (!IS_ERR(exists));
477
478 /* Now we own it. */
479 list_add_tail(&trans->list, &conn->transaction_list);
480 talloc_steal(conn, trans);
481 talloc_set_destructor(trans, destroy_transaction);
482 conn->transaction_started++;
483 wrl_ntransactions++;
484
485 snprintf(id_str, sizeof(id_str), "%u", trans->id);
486 send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
487
488 return 0;
489 }
490
transaction_fix_domains(struct transaction * trans,bool update)491 static int transaction_fix_domains(struct transaction *trans, bool update)
492 {
493 struct changed_domain *d;
494 int cnt;
495
496 list_for_each_entry(d, &trans->changed_domains, list) {
497 cnt = domain_entry_fix(d->domid, d->nbentry, update);
498 if (!update && cnt >= quota_nb_entry_per_domain)
499 return ENOSPC;
500 }
501
502 return 0;
503 }
504
do_transaction_end(struct connection * conn,struct buffered_data * in)505 int do_transaction_end(struct connection *conn, struct buffered_data *in)
506 {
507 const char *arg = onearg(in);
508 struct transaction *trans;
509 int ret;
510
511 if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
512 return EINVAL;
513
514 if ((trans = conn->transaction) == NULL)
515 return ENOENT;
516
517 conn->transaction = NULL;
518 list_del(&trans->list);
519 conn->transaction_started--;
520
521 /* Attach transaction to in for auto-cleanup */
522 talloc_steal(in, trans);
523
524 if (streq(arg, "T")) {
525 if (trans->fail)
526 return ENOMEM;
527 ret = transaction_fix_domains(trans, false);
528 if (ret)
529 return ret;
530 if (finalize_transaction(conn, trans))
531 return EAGAIN;
532
533 wrl_apply_debit_trans_commit(conn);
534
535 /* fix domain entry for each changed domain */
536 transaction_fix_domains(trans, true);
537 }
538 send_ack(conn, XS_TRANSACTION_END);
539
540 return 0;
541 }
542
transaction_entry_inc(struct transaction * trans,unsigned int domid)543 void transaction_entry_inc(struct transaction *trans, unsigned int domid)
544 {
545 struct changed_domain *d;
546
547 list_for_each_entry(d, &trans->changed_domains, list)
548 if (d->domid == domid) {
549 d->nbentry++;
550 return;
551 }
552
553 d = talloc(trans, struct changed_domain);
554 if (!d) {
555 /* Let the transaction fail. */
556 trans->fail = true;
557 return;
558 }
559 d->domid = domid;
560 d->nbentry = 1;
561 list_add_tail(&d->list, &trans->changed_domains);
562 }
563
transaction_entry_dec(struct transaction * trans,unsigned int domid)564 void transaction_entry_dec(struct transaction *trans, unsigned int domid)
565 {
566 struct changed_domain *d;
567
568 list_for_each_entry(d, &trans->changed_domains, list)
569 if (d->domid == domid) {
570 d->nbentry--;
571 return;
572 }
573
574 d = talloc(trans, struct changed_domain);
575 if (!d) {
576 /* Let the transaction fail. */
577 trans->fail = true;
578 return;
579 }
580 d->domid = domid;
581 d->nbentry = -1;
582 list_add_tail(&d->list, &trans->changed_domains);
583 }
584
conn_delete_all_transactions(struct connection * conn)585 void conn_delete_all_transactions(struct connection *conn)
586 {
587 struct transaction *trans;
588
589 while ((trans = list_top(&conn->transaction_list,
590 struct transaction, list))) {
591 list_del(&trans->list);
592 talloc_free(trans);
593 }
594
595 assert(conn->transaction == NULL);
596
597 conn->transaction_started = 0;
598 }
599
check_transactions(struct hashtable * hash)600 int check_transactions(struct hashtable *hash)
601 {
602 struct connection *conn;
603 struct transaction *trans;
604 struct accessed_node *i;
605 char *tname, *tnode;
606
607 list_for_each_entry(conn, &connections, list) {
608 list_for_each_entry(trans, &conn->transaction_list, list) {
609 tname = talloc_asprintf(trans, "%"PRIu64,
610 trans->generation);
611 if (!tname || !remember_string(hash, tname))
612 goto nomem;
613
614 list_for_each_entry(i, &trans->accessed, list) {
615 if (!i->ta_node)
616 continue;
617 tnode = transaction_get_node_name(tname, trans,
618 i->node);
619 if (!tnode || !remember_string(hash, tnode))
620 goto nomem;
621 talloc_free(tnode);
622 }
623
624 talloc_free(tname);
625 }
626 }
627
628 return 0;
629
630 nomem:
631 talloc_free(tname);
632 return ENOMEM;
633 }
634
635 /*
636 * Local variables:
637 * mode: C
638 * c-file-style: "linux"
639 * indent-tabs-mode: t
640 * c-basic-offset: 8
641 * tab-width: 8
642 * End:
643 */
644