/*
Simple prototype Xen Store Daemon providing simple tree-like database.
Copyright (C) 2005 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; If not, see .
*/
#include
#include
#include
#include
#ifndef NO_SOCKETS
#include
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "utils.h"
#include "list.h"
#include "talloc.h"
#include "xenstore_lib.h"
#include "xenstored_core.h"
#include "xenstored_watch.h"
#include "xenstored_transaction.h"
#include "xenstored_domain.h"
#include "xenstored_control.h"
#include "tdb.h"
#ifndef NO_SOCKETS
#if defined(HAVE_SYSTEMD)
#define XEN_SYSTEMD_ENABLED 1
#endif
#endif
#if defined(XEN_SYSTEMD_ENABLED)
#include
#endif
extern xenevtchn_handle *xce_handle; /* in xenstored_domain.c */
static int xce_pollfd_idx = -1;
static struct pollfd *fds;
static unsigned int current_array_size;
static unsigned int nr_fds;
static int sock = -1;
static int ro_sock = -1;
#define ROUNDUP(_x, _w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
static bool verbose = false;
LIST_HEAD(connections);
int tracefd = -1;
static bool recovery = true;
static int reopen_log_pipe[2];
static int reopen_log_pipe0_pollfd_idx = -1;
char *tracefile = NULL;
TDB_CONTEXT *tdb_ctx = NULL;
static const char *sockmsg_string(enum xsd_sockmsg_type type);
#define log(...) \
do { \
char *s = talloc_asprintf(NULL, __VA_ARGS__); \
if (s) { \
trace("%s\n", s); \
syslog(LOG_ERR, "%s\n", s); \
talloc_free(s); \
} else { \
trace("talloc failure during logging\n"); \
syslog(LOG_ERR, "talloc failure during logging\n"); \
} \
} while (0)
int quota_nb_entry_per_domain = 1000;
int quota_nb_watch_per_domain = 128;
int quota_max_entry_size = 2048; /* 2K */
int quota_max_transaction = 10;
int quota_nb_perms_per_node = 5;
void trace(const char *fmt, ...)
{
va_list arglist;
char *str;
char sbuf[1024];
int ret, dummy;
if (tracefd < 0)
return;
/* try to use a static buffer */
va_start(arglist, fmt);
ret = vsnprintf(sbuf, 1024, fmt, arglist);
va_end(arglist);
if (ret <= 1024) {
dummy = write(tracefd, sbuf, ret);
return;
}
/* fail back to dynamic allocation */
va_start(arglist, fmt);
str = talloc_vasprintf(NULL, fmt, arglist);
va_end(arglist);
if (str) {
dummy = write(tracefd, str, strlen(str));
talloc_free(str);
}
}
static void trace_io(const struct connection *conn,
const struct buffered_data *data,
int out)
{
unsigned int i;
time_t now;
struct tm *tm;
#ifdef HAVE_DTRACE
dtrace_io(conn, data, out);
#endif
if (tracefd < 0)
return;
now = time(NULL);
tm = localtime(&now);
trace("%s %p %04d%02d%02d %02d:%02d:%02d %s (",
out ? "OUT" : "IN", conn,
tm->tm_year + 1900, tm->tm_mon + 1,
tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec,
sockmsg_string(data->hdr.msg.type));
for (i = 0; i < data->hdr.msg.len; i++)
trace("%c", (data->buffer[i] != '\0') ? data->buffer[i] : ' ');
trace(")\n");
}
void trace_create(const void *data, const char *type)
{
trace("CREATE %s %p\n", type, data);
}
void trace_destroy(const void *data, const char *type)
{
trace("DESTROY %s %p\n", type, data);
}
/**
* Signal handler for SIGHUP, which requests that the trace log is reopened
* (in the main loop). A single byte is written to reopen_log_pipe, to awaken
* the poll() in the main loop.
*/
static void trigger_reopen_log(int signal __attribute__((unused)))
{
char c = 'A';
int dummy;
dummy = write(reopen_log_pipe[1], &c, 1);
}
void close_log(void)
{
if (tracefd >= 0)
close(tracefd);
tracefd = -1;
}
void reopen_log(void)
{
if (tracefile) {
close_log();
tracefd = open(tracefile, O_WRONLY|O_CREAT|O_APPEND, 0600);
if (tracefd < 0)
perror("Could not open tracefile");
else
trace("\n***\n");
}
}
static bool write_messages(struct connection *conn)
{
int ret;
struct buffered_data *out;
out = list_top(&conn->out_list, struct buffered_data, list);
if (out == NULL)
return true;
if (out->inhdr) {
if (verbose)
xprintf("Writing msg %s (%.*s) out to %p\n",
sockmsg_string(out->hdr.msg.type),
out->hdr.msg.len,
out->buffer, conn);
ret = conn->write(conn, out->hdr.raw + out->used,
sizeof(out->hdr) - out->used);
if (ret < 0)
return false;
out->used += ret;
if (out->used < sizeof(out->hdr))
return true;
out->inhdr = false;
out->used = 0;
/* Second write might block if non-zero. */
if (out->hdr.msg.len && !conn->domain)
return true;
}
ret = conn->write(conn, out->buffer + out->used,
out->hdr.msg.len - out->used);
if (ret < 0)
return false;
out->used += ret;
if (out->used != out->hdr.msg.len)
return true;
trace_io(conn, out, 1);
list_del(&out->list);
talloc_free(out);
return true;
}
static int destroy_conn(void *_conn)
{
struct connection *conn = _conn;
/* Flush outgoing if possible, but don't block. */
if (!conn->domain) {
struct pollfd pfd;
pfd.fd = conn->fd;
pfd.events = POLLOUT;
while (!list_empty(&conn->out_list)
&& poll(&pfd, 1, 0) == 1)
if (!write_messages(conn))
break;
close(conn->fd);
}
if (conn->target)
talloc_unlink(conn, conn->target);
list_del(&conn->list);
trace_destroy(conn, "connection");
return 0;
}
/* This function returns index inside the array if succeed, -1 if fail */
static int set_fd(int fd, short events)
{
int ret;
if (current_array_size < nr_fds + 1) {
struct pollfd *new_fds = NULL;
unsigned long newsize;
/* Round up to 2^8 boundary, in practice this just
* make newsize larger than current_array_size.
*/
newsize = ROUNDUP(nr_fds + 1, 8);
new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
if (!new_fds)
goto fail;
fds = new_fds;
memset(&fds[0] + current_array_size, 0,
sizeof(struct pollfd ) * (newsize-current_array_size));
current_array_size = newsize;
}
fds[nr_fds].fd = fd;
fds[nr_fds].events = events;
ret = nr_fds;
nr_fds++;
return ret;
fail:
syslog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
return -1;
}
static void initialize_fds(int *p_sock_pollfd_idx, int *p_ro_sock_pollfd_idx,
int *ptimeout)
{
struct connection *conn;
struct wrl_timestampt now;
if (fds)
memset(fds, 0, sizeof(struct pollfd) * current_array_size);
nr_fds = 0;
*ptimeout = -1;
if (sock != -1)
*p_sock_pollfd_idx = set_fd(sock, POLLIN|POLLPRI);
if (ro_sock != -1)
*p_ro_sock_pollfd_idx = set_fd(ro_sock, POLLIN|POLLPRI);
if (reopen_log_pipe[0] != -1)
reopen_log_pipe0_pollfd_idx =
set_fd(reopen_log_pipe[0], POLLIN|POLLPRI);
if (xce_handle != NULL)
xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle),
POLLIN|POLLPRI);
wrl_gettime_now(&now);
wrl_log_periodic(now);
list_for_each_entry(conn, &connections, list) {
if (conn->domain) {
wrl_check_timeout(conn->domain, now, ptimeout);
if (domain_can_read(conn) ||
(domain_can_write(conn) &&
!list_empty(&conn->out_list)))
*ptimeout = 0;
} else {
short events = POLLIN|POLLPRI;
if (!list_empty(&conn->out_list))
events |= POLLOUT;
conn->pollfd_idx = set_fd(conn->fd, events);
}
}
}
/*
* If it fails, returns NULL and sets errno.
* Temporary memory allocations will be done with ctx.
*/
struct node *read_node(struct connection *conn, const void *ctx,
const char *name)
{
TDB_DATA key, data;
struct xs_tdb_record_hdr *hdr;
struct node *node;
node = talloc(ctx, struct node);
if (!node) {
errno = ENOMEM;
return NULL;
}
node->name = talloc_strdup(node, name);
if (!node->name) {
talloc_free(node);
errno = ENOMEM;
return NULL;
}
if (transaction_prepend(conn, name, &key))
return NULL;
data = tdb_fetch(tdb_ctx, key);
if (data.dptr == NULL) {
if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) {
node->generation = NO_GENERATION;
access_node(conn, node, NODE_ACCESS_READ, NULL);
errno = ENOENT;
} else {
log("TDB error on read: %s", tdb_errorstr(tdb_ctx));
errno = EIO;
}
talloc_free(node);
return NULL;
}
node->parent = NULL;
talloc_steal(node, data.dptr);
/* Datalen, childlen, number of permissions */
hdr = (void *)data.dptr;
node->generation = hdr->generation;
node->perms.num = hdr->num_perms;
node->datalen = hdr->datalen;
node->childlen = hdr->childlen;
/* Permissions are struct xs_permissions. */
node->perms.p = hdr->perms;
if (domain_adjust_node_perms(node)) {
talloc_free(node);
return NULL;
}
/* Data is binary blob (usually ascii, no nul). */
node->data = node->perms.p + hdr->num_perms;
/* Children is strings, nul separated. */
node->children = node->data + node->datalen;
access_node(conn, node, NODE_ACCESS_READ, NULL);
return node;
}
int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node,
bool no_quota_check)
{
TDB_DATA data;
void *p;
struct xs_tdb_record_hdr *hdr;
if (domain_adjust_node_perms(node))
return errno;
data.dsize = sizeof(*hdr)
+ node->perms.num * sizeof(node->perms.p[0])
+ node->datalen + node->childlen;
if (!no_quota_check && domain_is_unprivileged(conn) &&
data.dsize >= quota_max_entry_size) {
errno = ENOSPC;
return errno;
}
data.dptr = talloc_size(node, data.dsize);
hdr = (void *)data.dptr;
hdr->generation = node->generation;
hdr->num_perms = node->perms.num;
hdr->datalen = node->datalen;
hdr->childlen = node->childlen;
memcpy(hdr->perms, node->perms.p,
node->perms.num * sizeof(*node->perms.p));
p = hdr->perms + node->perms.num;
memcpy(p, node->data, node->datalen);
p += node->datalen;
memcpy(p, node->children, node->childlen);
/* TDB should set errno, but doesn't even set ecode AFAICT. */
if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) {
corrupt(conn, "Write of %s failed", key->dptr);
errno = EIO;
return errno;
}
return 0;
}
static int write_node(struct connection *conn, struct node *node,
bool no_quota_check)
{
TDB_DATA key;
if (access_node(conn, node, NODE_ACCESS_WRITE, &key))
return errno;
return write_node_raw(conn, &key, node, no_quota_check);
}
enum xs_perm_type perm_for_conn(struct connection *conn,
const struct node_perms *perms)
{
unsigned int i;
enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
if (!conn->can_write)
mask &= ~XS_PERM_WRITE;
/* Owners and tools get it all... */
if (!domain_is_unprivileged(conn) || perms->p[0].id == conn->id
|| (conn->target && perms->p[0].id == conn->target->id))
return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
for (i = 1; i < perms->num; i++)
if (!(perms->p[i].perms & XS_PERM_IGNORE) &&
(perms->p[i].id == conn->id ||
(conn->target && perms->p[i].id == conn->target->id)))
return perms->p[i].perms & mask;
return perms->p[0].perms & mask;
}
/*
* Get name of node parent.
* Temporary memory allocations are done with ctx.
*/
char *get_parent(const void *ctx, const char *node)
{
char *parent;
char *slash = strrchr(node + 1, '/');
parent = slash ? talloc_asprintf(ctx, "%.*s", (int)(slash - node), node)
: talloc_strdup(ctx, "/");
if (!parent)
errno = ENOMEM;
return parent;
}
/*
* What do parents say?
* Temporary memory allocations are done with ctx.
*/
static int ask_parents(struct connection *conn, const void *ctx,
const char *name, enum xs_perm_type *perm)
{
struct node *node;
do {
name = get_parent(ctx, name);
if (!name)
return errno;
node = read_node(conn, ctx, name);
if (node)
break;
if (errno == ENOMEM)
return errno;
} while (!streq(name, "/"));
/* No permission at root? We're in trouble. */
if (!node) {
corrupt(conn, "No permissions file at root");
*perm = XS_PERM_NONE;
return 0;
}
*perm = perm_for_conn(conn, &node->perms);
return 0;
}
/*
* We have a weird permissions system. You can allow someone into a
* specific node without allowing it in the parents. If it's going to
* fail, however, we don't want the errno to indicate any information
* about the node.
* Temporary memory allocations are done with ctx.
*/
static int errno_from_parents(struct connection *conn, const void *ctx,
const char *node, int errnum,
enum xs_perm_type perm)
{
enum xs_perm_type parent_perm = XS_PERM_NONE;
/* We always tell them about memory failures. */
if (errnum == ENOMEM)
return errnum;
if (ask_parents(conn, ctx, node, &parent_perm))
return errno;
if (parent_perm & perm)
return errnum;
return EACCES;
}
/*
* If it fails, returns NULL and sets errno.
* Temporary memory allocations are done with ctx.
*/
static struct node *get_node(struct connection *conn,
const void *ctx,
const char *name,
enum xs_perm_type perm)
{
struct node *node;
if (!name || !is_valid_nodename(name)) {
errno = EINVAL;
return NULL;
}
node = read_node(conn, ctx, name);
/* If we don't have permission, we don't have node. */
if (node) {
if ((perm_for_conn(conn, &node->perms) & perm) != perm) {
errno = EACCES;
node = NULL;
}
}
/* Clean up errno if they weren't supposed to know. */
if (!node && errno != ENOMEM)
errno = errno_from_parents(conn, ctx, name, errno, perm);
return node;
}
static struct buffered_data *new_buffer(void *ctx)
{
struct buffered_data *data;
data = talloc_zero(ctx, struct buffered_data);
if (data == NULL)
return NULL;
data->inhdr = true;
return data;
}
/* Return length of string (including nul) at this offset.
* If there is no nul, returns 0 for failure.
*/
static unsigned int get_string(const struct buffered_data *data,
unsigned int offset)
{
const char *nul;
if (offset >= data->used)
return 0;
nul = memchr(data->buffer + offset, 0, data->used - offset);
if (!nul)
return 0;
return nul - (data->buffer + offset) + 1;
}
/* Break input into vectors, return the number, fill in up to num of them.
* Always returns the actual number of nuls in the input. Stores the
* positions of the starts of the nul-terminated strings in vec.
* Callers who use this and then rely only on vec[] will
* ignore any data after the final nul.
*/
unsigned int get_strings(struct buffered_data *data,
char *vec[], unsigned int num)
{
unsigned int off, i, len;
off = i = 0;
while ((len = get_string(data, off)) != 0) {
if (i < num)
vec[i] = data->buffer + off;
i++;
off += len;
}
return i;
}
static void send_error(struct connection *conn, int error)
{
unsigned int i;
for (i = 0; error != xsd_errors[i].errnum; i++) {
if (i == ARRAY_SIZE(xsd_errors) - 1) {
eprintf("xenstored: error %i untranslatable", error);
i = 0; /* EINVAL */
break;
}
}
send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
strlen(xsd_errors[i].errstring) + 1);
}
void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
const void *data, unsigned int len)
{
struct buffered_data *bdata;
if ( len > XENSTORE_PAYLOAD_MAX ) {
send_error(conn, E2BIG);
return;
}
/* Replies reuse the request buffer, events need a new one. */
if (type != XS_WATCH_EVENT) {
bdata = conn->in;
/* Drop asynchronous responses, e.g. errors for watch events. */
if (!bdata)
return;
bdata->inhdr = true;
bdata->used = 0;
conn->in = NULL;
} else {
/* Message is a child of the connection for auto-cleanup. */
bdata = new_buffer(conn);
/*
* Allocation failure here is unfortunate: we have no way to
* tell anybody about it.
*/
if (!bdata)
return;
}
if (len <= DEFAULT_BUFFER_SIZE)
bdata->buffer = bdata->default_buffer;
else
bdata->buffer = talloc_array(bdata, char, len);
if (!bdata->buffer) {
if (type == XS_WATCH_EVENT) {
/* Same as above: no way to tell someone. */
talloc_free(bdata);
return;
}
/* re-establish request buffer for sending ENOMEM. */
conn->in = bdata;
send_error(conn, ENOMEM);
return;
}
/* Update relevant header fields and fill in the message body. */
bdata->hdr.msg.type = type;
bdata->hdr.msg.len = len;
memcpy(bdata->buffer, data, len);
/* Queue for later transmission. */
list_add_tail(&bdata->list, &conn->out_list);
return;
}
/* Some routines (write, mkdir, etc) just need a non-error return */
void send_ack(struct connection *conn, enum xsd_sockmsg_type type)
{
send_reply(conn, type, "OK", sizeof("OK"));
}
static bool valid_chars(const char *node)
{
/* Nodes can have lots of crap. */
return (strspn(node,
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789-/_@") == strlen(node));
}
bool is_valid_nodename(const char *node)
{
/* Must start in /. */
if (!strstarts(node, "/"))
return false;
/* Cannot end in / (unless it's just "/"). */
if (strends(node, "/") && !streq(node, "/"))
return false;
/* No double //. */
if (strstr(node, "//"))
return false;
if (strlen(node) > XENSTORE_ABS_PATH_MAX)
return false;
return valid_chars(node);
}
/* We expect one arg in the input: return NULL otherwise.
* The payload must contain exactly one nul, at the end.
*/
const char *onearg(struct buffered_data *in)
{
if (!in->used || get_string(in, 0) != in->used)
return NULL;
return in->buffer;
}
static char *perms_to_strings(const void *ctx, const struct node_perms *perms,
unsigned int *len)
{
unsigned int i;
char *strings = NULL;
char buffer[MAX_STRLEN(unsigned int) + 1];
for (*len = 0, i = 0; i < perms->num; i++) {
if (!xs_perm_to_string(&perms->p[i], buffer, sizeof(buffer)))
return NULL;
strings = talloc_realloc(ctx, strings, char,
*len + strlen(buffer) + 1);
if (!strings)
return NULL;
strcpy(strings + *len, buffer);
*len += strlen(buffer) + 1;
}
return strings;
}
char *canonicalize(struct connection *conn, const void *ctx, const char *node)
{
const char *prefix;
if (!node || (node[0] == '/') || (node[0] == '@'))
return (char *)node;
prefix = get_implicit_path(conn);
if (prefix)
return talloc_asprintf(ctx, "%s/%s", prefix, node);
return (char *)node;
}
static struct node *get_node_canonicalized(struct connection *conn,
const void *ctx,
const char *name,
char **canonical_name,
enum xs_perm_type perm)
{
char *tmp_name;
if (!canonical_name)
canonical_name = &tmp_name;
*canonical_name = canonicalize(conn, ctx, name);
return get_node(conn, ctx, *canonical_name, perm);
}
static int send_directory(struct connection *conn, struct buffered_data *in)
{
struct node *node;
node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
if (!node)
return errno;
send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
return 0;
}
static int send_directory_part(struct connection *conn,
struct buffered_data *in)
{
unsigned int off, len, maxlen, genlen;
char *child, *data;
struct node *node;
char gen[24];
if (xs_count_strings(in->buffer, in->used) != 2)
return EINVAL;
/* First arg is node name. */
node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ);
if (!node)
return errno;
/* Second arg is childlist offset. */
off = atoi(in->buffer + strlen(in->buffer) + 1);
genlen = snprintf(gen, sizeof(gen), "%"PRIu64, node->generation) + 1;
/* Offset behind list: just return a list with an empty string. */
if (off >= node->childlen) {
gen[genlen] = 0;
send_reply(conn, XS_DIRECTORY_PART, gen, genlen + 1);
return 0;
}
len = 0;
maxlen = XENSTORE_PAYLOAD_MAX - genlen - 1;
child = node->children + off;
while (len + strlen(child) < maxlen) {
len += strlen(child) + 1;
child += strlen(child) + 1;
if (off + len == node->childlen)
break;
}
data = talloc_array(in, char, genlen + len + 1);
if (!data)
return ENOMEM;
memcpy(data, gen, genlen);
memcpy(data + genlen, node->children + off, len);
if (off + len == node->childlen) {
data[genlen + len] = 0;
len++;
}
send_reply(conn, XS_DIRECTORY_PART, data, genlen + len);
return 0;
}
static int do_read(struct connection *conn, struct buffered_data *in)
{
struct node *node;
node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
if (!node)
return errno;
send_reply(conn, XS_READ, node->data, node->datalen);
return 0;
}
static void delete_node_single(struct connection *conn, struct node *node)
{
TDB_DATA key;
if (access_node(conn, node, NODE_ACCESS_DELETE, &key))
return;
if (tdb_delete(tdb_ctx, key) != 0) {
corrupt(conn, "Could not delete '%s'", node->name);
return;
}
domain_entry_dec(conn, node);
}
/* Must not be / */
static char *basename(const char *name)
{
return strrchr(name, '/') + 1;
}
static struct node *construct_node(struct connection *conn, const void *ctx,
const char *name)
{
const char *base;
unsigned int baselen;
struct node *parent, *node;
char *children, *parentname = get_parent(ctx, name);
if (!parentname)
return NULL;
/* If parent doesn't exist, create it. */
parent = read_node(conn, parentname, parentname);
if (!parent)
parent = construct_node(conn, ctx, parentname);
if (!parent)
return NULL;
/* Add child to parent. */
base = basename(name);
baselen = strlen(base) + 1;
children = talloc_array(ctx, char, parent->childlen + baselen);
if (!children)
goto nomem;
memcpy(children, parent->children, parent->childlen);
memcpy(children + parent->childlen, base, baselen);
parent->children = children;
parent->childlen += baselen;
/* Allocate node */
node = talloc(ctx, struct node);
if (!node)
goto nomem;
node->name = talloc_strdup(node, name);
if (!node->name)
goto nomem;
/* Inherit permissions, except unprivileged domains own what they create */
node->perms.num = parent->perms.num;
node->perms.p = talloc_memdup(node, parent->perms.p,
node->perms.num * sizeof(*node->perms.p));
if (!node->perms.p)
goto nomem;
if (domain_is_unprivileged(conn))
node->perms.p[0].id = conn->id;
/* No children, no data */
node->children = node->data = NULL;
node->childlen = node->datalen = 0;
node->parent = parent;
return node;
nomem:
errno = ENOMEM;
return NULL;
}
static int destroy_node(void *_node)
{
struct node *node = _node;
TDB_DATA key;
if (streq(node->name, "/"))
corrupt(NULL, "Destroying root node!");
key.dptr = (void *)node->name;
key.dsize = strlen(node->name);
tdb_delete(tdb_ctx, key);
domain_entry_dec(talloc_parent(node), node);
return 0;
}
static struct node *create_node(struct connection *conn, const void *ctx,
const char *name,
void *data, unsigned int datalen)
{
struct node *node, *i;
node = construct_node(conn, ctx, name);
if (!node)
return NULL;
node->data = data;
node->datalen = datalen;
/*
* We write out the nodes bottom up.
* All new created nodes will have i->parent set, while the final
* node will be already existing and won't have i->parent set.
* New nodes are subject to quota handling.
* Initially set a destructor for all new nodes removing them from
* TDB again and undoing quota accounting for the case of an error
* during the write loop.
*/
for (i = node; i; i = i->parent) {
/* i->parent is set for each new node, so check quota. */
if (i->parent &&
domain_entry(conn) >= quota_nb_entry_per_domain) {
errno = ENOSPC;
return NULL;
}
if (write_node(conn, i, false))
return NULL;
/* Account for new node, set destructor for error case. */
if (i->parent) {
domain_entry_inc(conn, i);
talloc_set_destructor(i, destroy_node);
}
}
/* OK, now remove destructors so they stay around */
for (i = node; i->parent; i = i->parent)
talloc_set_destructor(i, NULL);
return node;
}
/* path, data... */
static int do_write(struct connection *conn, struct buffered_data *in)
{
unsigned int offset, datalen;
struct node *node;
char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
char *name;
/* Extra "strings" can be created by binary data. */
if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
return EINVAL;
offset = strlen(vec[0]) + 1;
datalen = in->used - offset;
node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE);
if (!node) {
/* No permissions, invalid input? */
if (errno != ENOENT)
return errno;
node = create_node(conn, in, name, in->buffer + offset,
datalen);
if (!node)
return errno;
} else {
node->data = in->buffer + offset;
node->datalen = datalen;
if (write_node(conn, node, false))
return errno;
}
fire_watches(conn, in, name, node, false, NULL);
send_ack(conn, XS_WRITE);
return 0;
}
static int do_mkdir(struct connection *conn, struct buffered_data *in)
{
struct node *node;
char *name;
node = get_node_canonicalized(conn, in, onearg(in), &name,
XS_PERM_WRITE);
/* If it already exists, fine. */
if (!node) {
/* No permissions? */
if (errno != ENOENT)
return errno;
node = create_node(conn, in, name, NULL, 0);
if (!node)
return errno;
fire_watches(conn, in, name, node, false, NULL);
}
send_ack(conn, XS_MKDIR);
return 0;
}
/* Delete memory using memmove. */
static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
{
memmove(mem + off, mem + off + len, total - off - len);
}
static void remove_child_entry(struct connection *conn, struct node *node,
size_t offset)
{
size_t childlen = strlen(node->children + offset);
memdel(node->children, offset, childlen + 1, node->childlen);
node->childlen -= childlen + 1;
if (write_node(conn, node, true))
corrupt(conn, "Can't update parent node '%s'", node->name);
}
static void delete_child(struct connection *conn,
struct node *node, const char *childname)
{
unsigned int i;
for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
if (streq(node->children+i, childname)) {
remove_child_entry(conn, node, i);
return;
}
}
corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
}
static int delete_node(struct connection *conn, const void *ctx,
struct node *parent, struct node *node)
{
char *name;
/* Delete children. */
while (node->childlen) {
struct node *child;
name = talloc_asprintf(node, "%s/%s", node->name,
node->children);
child = name ? read_node(conn, node, name) : NULL;
if (child) {
if (delete_node(conn, ctx, node, child))
return errno;
} else {
trace("delete_node: Error deleting child '%s/%s'!\n",
node->name, node->children);
/* Quit deleting. */
errno = ENOMEM;
return errno;
}
talloc_free(name);
}
fire_watches(conn, ctx, node->name, node, true, NULL);
delete_node_single(conn, node);
delete_child(conn, parent, basename(node->name));
talloc_free(node);
return 0;
}
static int _rm(struct connection *conn, const void *ctx, struct node *node,
const char *name)
{
/*
* Deleting node by node, so the result is always consistent even in
* case of a failure.
*/
struct node *parent;
char *parentname = get_parent(ctx, name);
if (!parentname)
return errno;
parent = read_node(conn, ctx, parentname);
if (!parent)
return (errno == ENOMEM) ? ENOMEM : EINVAL;
node->parent = parent;
/*
* Fire the watches now, when we can still see the node permissions.
* This fine as we are single threaded and the next possible read will
* be handled only after the node has been really removed.
*/
fire_watches(conn, ctx, name, node, false, NULL);
return delete_node(conn, ctx, parent, node);
}
static int do_rm(struct connection *conn, struct buffered_data *in)
{
struct node *node;
int ret;
char *name;
char *parentname;
node = get_node_canonicalized(conn, in, onearg(in), &name,
XS_PERM_WRITE);
if (!node) {
/* Didn't exist already? Fine, if parent exists. */
if (errno == ENOENT) {
parentname = get_parent(in, name);
if (!parentname)
return errno;
node = read_node(conn, in, parentname);
if (node) {
send_ack(conn, XS_RM);
return 0;
}
/* Restore errno, just in case. */
if (errno != ENOMEM)
errno = ENOENT;
}
return errno;
}
if (streq(name, "/"))
return EINVAL;
ret = _rm(conn, in, node, name);
if (ret)
return ret;
send_ack(conn, XS_RM);
return 0;
}
static int do_get_perms(struct connection *conn, struct buffered_data *in)
{
struct node *node;
char *strings;
unsigned int len;
node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
if (!node)
return errno;
strings = perms_to_strings(node, &node->perms, &len);
if (!strings)
return errno;
send_reply(conn, XS_GET_PERMS, strings, len);
return 0;
}
static int do_set_perms(struct connection *conn, struct buffered_data *in)
{
struct node_perms perms, old_perms;
char *name, *permstr;
struct node *node;
perms.num = xs_count_strings(in->buffer, in->used);
if (perms.num < 2)
return EINVAL;
perms.num--;
if (domain_is_unprivileged(conn) &&
perms.num > quota_nb_perms_per_node)
return ENOSPC;
permstr = in->buffer + strlen(in->buffer) + 1;
perms.p = talloc_array(in, struct xs_permissions, perms.num);
if (!perms.p)
return ENOMEM;
if (!xs_strings_to_perms(perms.p, perms.num, permstr))
return errno;
/* First arg is node name. */
if (strstarts(in->buffer, "@")) {
if (set_perms_special(conn, in->buffer, &perms))
return errno;
send_ack(conn, XS_SET_PERMS);
return 0;
}
/* We must own node to do this (tools can do this too). */
node = get_node_canonicalized(conn, in, in->buffer, &name,
XS_PERM_WRITE | XS_PERM_OWNER);
if (!node)
return errno;
/* Unprivileged domains may not change the owner. */
if (domain_is_unprivileged(conn) &&
perms.p[0].id != node->perms.p[0].id)
return EPERM;
old_perms = node->perms;
domain_entry_dec(conn, node);
node->perms = perms;
domain_entry_inc(conn, node);
if (write_node(conn, node, false))
return errno;
fire_watches(conn, in, name, node, false, &old_perms);
send_ack(conn, XS_SET_PERMS);
return 0;
}
static struct {
const char *str;
int (*func)(struct connection *conn, struct buffered_data *in);
unsigned int flags;
#define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */
#define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. */
} const wire_funcs[XS_TYPE_COUNT] = {
[XS_CONTROL] =
{ "CONTROL", do_control, XS_FLAG_PRIV },
[XS_DIRECTORY] = { "DIRECTORY", send_directory },
[XS_READ] = { "READ", do_read },
[XS_GET_PERMS] = { "GET_PERMS", do_get_perms },
[XS_WATCH] =
{ "WATCH", do_watch, XS_FLAG_NOTID },
[XS_UNWATCH] =
{ "UNWATCH", do_unwatch, XS_FLAG_NOTID },
[XS_TRANSACTION_START] = { "TRANSACTION_START", do_transaction_start },
[XS_TRANSACTION_END] = { "TRANSACTION_END", do_transaction_end },
[XS_INTRODUCE] =
{ "INTRODUCE", do_introduce, XS_FLAG_PRIV },
[XS_RELEASE] =
{ "RELEASE", do_release, XS_FLAG_PRIV },
[XS_GET_DOMAIN_PATH] = { "GET_DOMAIN_PATH", do_get_domain_path },
[XS_WRITE] = { "WRITE", do_write },
[XS_MKDIR] = { "MKDIR", do_mkdir },
[XS_RM] = { "RM", do_rm },
[XS_SET_PERMS] = { "SET_PERMS", do_set_perms },
[XS_WATCH_EVENT] = { "WATCH_EVENT", NULL },
[XS_ERROR] = { "ERROR", NULL },
[XS_IS_DOMAIN_INTRODUCED] =
{ "IS_DOMAIN_INTRODUCED", do_is_domain_introduced, XS_FLAG_PRIV },
[XS_RESUME] =
{ "RESUME", do_resume, XS_FLAG_PRIV },
[XS_SET_TARGET] =
{ "SET_TARGET", do_set_target, XS_FLAG_PRIV },
[XS_RESET_WATCHES] = { "RESET_WATCHES", do_reset_watches },
[XS_DIRECTORY_PART] = { "DIRECTORY_PART", send_directory_part },
};
/*
* Keep the connection alive but stop processing any new request or sending
* reponse. This is to allow sending @releaseDomain watch event at the correct
* moment and/or to allow the connection to restart (not yet implemented).
*
* All watches, transactions, buffers will be freed.
*/
static void ignore_connection(struct connection *conn)
{
struct buffered_data *out, *tmp;
trace("CONN %p ignored\n", conn);
conn->is_ignored = true;
conn_delete_all_watches(conn);
conn_delete_all_transactions(conn);
list_for_each_entry_safe(out, tmp, &conn->out_list, list) {
list_del(&out->list);
talloc_free(out);
}
talloc_free(conn->in);
conn->in = NULL;
}
static const char *sockmsg_string(enum xsd_sockmsg_type type)
{
if ((unsigned int)type < ARRAY_SIZE(wire_funcs) && wire_funcs[type].str)
return wire_funcs[type].str;
return "**UNKNOWN**";
}
/* Process "in" for conn: "in" will vanish after this conversation, so
* we can talloc off it for temporary variables. May free "conn".
*/
static void process_message(struct connection *conn, struct buffered_data *in)
{
struct transaction *trans;
enum xsd_sockmsg_type type = in->hdr.msg.type;
int ret;
if ((unsigned int)type >= XS_TYPE_COUNT || !wire_funcs[type].func) {
eprintf("Client unknown operation %i", type);
send_error(conn, ENOSYS);
return;
}
if ((wire_funcs[type].flags & XS_FLAG_PRIV) &&
domain_is_unprivileged(conn)) {
send_error(conn, EACCES);
return;
}
trans = (wire_funcs[type].flags & XS_FLAG_NOTID)
? NULL : transaction_lookup(conn, in->hdr.msg.tx_id);
if (IS_ERR(trans)) {
send_error(conn, -PTR_ERR(trans));
return;
}
assert(conn->transaction == NULL);
conn->transaction = trans;
ret = wire_funcs[type].func(conn, in);
if (ret)
send_error(conn, ret);
conn->transaction = NULL;
}
static void consider_message(struct connection *conn)
{
if (verbose)
xprintf("Got message %s len %i from %p\n",
sockmsg_string(conn->in->hdr.msg.type),
conn->in->hdr.msg.len, conn);
process_message(conn, conn->in);
assert(conn->in == NULL);
}
/*
* Errors in reading or allocating here means we get out of sync, so we mark
* the connection as ignored.
*/
static void handle_input(struct connection *conn)
{
int bytes;
struct buffered_data *in;
if (!conn->in) {
conn->in = new_buffer(conn);
/* In case of no memory just try it again next time. */
if (!conn->in)
return;
}
in = conn->in;
/* Not finished header yet? */
if (in->inhdr) {
if (in->used != sizeof(in->hdr)) {
bytes = conn->read(conn, in->hdr.raw + in->used,
sizeof(in->hdr) - in->used);
if (bytes < 0)
goto bad_client;
in->used += bytes;
if (in->used != sizeof(in->hdr))
return;
if (in->hdr.msg.len > XENSTORE_PAYLOAD_MAX) {
syslog(LOG_ERR, "Client tried to feed us %i",
in->hdr.msg.len);
goto bad_client;
}
}
if (in->hdr.msg.len <= DEFAULT_BUFFER_SIZE)
in->buffer = in->default_buffer;
else
in->buffer = talloc_array(in, char, in->hdr.msg.len);
/* In case of no memory just try it again next time. */
if (!in->buffer)
return;
in->used = 0;
in->inhdr = false;
}
bytes = conn->read(conn, in->buffer + in->used,
in->hdr.msg.len - in->used);
if (bytes < 0)
goto bad_client;
in->used += bytes;
if (in->used != in->hdr.msg.len)
return;
trace_io(conn, in, 0);
consider_message(conn);
return;
bad_client:
ignore_connection(conn);
}
static void handle_output(struct connection *conn)
{
/* Ignore the connection if an error occured */
if (!write_messages(conn))
ignore_connection(conn);
}
struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
{
struct connection *new;
new = talloc_zero(talloc_autofree_context(), struct connection);
if (!new)
return NULL;
new->fd = -1;
new->pollfd_idx = -1;
new->write = write;
new->read = read;
new->can_write = true;
new->is_ignored = false;
new->transaction_started = 0;
INIT_LIST_HEAD(&new->out_list);
INIT_LIST_HEAD(&new->watches);
INIT_LIST_HEAD(&new->transaction_list);
list_add_tail(&new->list, &connections);
talloc_set_destructor(new, destroy_conn);
trace_create(new, "connection");
return new;
}
#ifdef NO_SOCKETS
static void accept_connection(int sock, bool canwrite)
{
}
#else
static int writefd(struct connection *conn, const void *data, unsigned int len)
{
int rc;
while ((rc = write(conn->fd, data, len)) < 0) {
if (errno == EAGAIN) {
rc = 0;
break;
}
if (errno != EINTR)
break;
}
return rc;
}
static int readfd(struct connection *conn, void *data, unsigned int len)
{
int rc;
while ((rc = read(conn->fd, data, len)) < 0) {
if (errno == EAGAIN) {
rc = 0;
break;
}
if (errno != EINTR)
break;
}
/* Reading zero length means we're done with this connection. */
if ((rc == 0) && (len != 0)) {
errno = EBADF;
rc = -1;
}
return rc;
}
static void accept_connection(int sock, bool canwrite)
{
int fd;
struct connection *conn;
fd = accept(sock, NULL, NULL);
if (fd < 0)
return;
conn = new_connection(writefd, readfd);
if (conn) {
conn->fd = fd;
conn->can_write = canwrite;
} else
close(fd);
}
#endif
static int tdb_flags;
/* We create initial nodes manually. */
static void manual_node(const char *name, const char *child)
{
struct node *node;
struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE };
node = talloc_zero(NULL, struct node);
if (!node)
barf_perror("Could not allocate initial node %s", name);
node->name = name;
node->perms.p = &perms;
node->perms.num = 1;
node->children = (char *)child;
if (child)
node->childlen = strlen(child) + 1;
if (write_node(NULL, node, false))
barf_perror("Could not create initial node %s", name);
talloc_free(node);
}
static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...)
{
va_list ap;
char *s;
va_start(ap, fmt);
s = talloc_vasprintf(NULL, fmt, ap);
va_end(ap);
if (s) {
trace("TDB: %s\n", s);
syslog(LOG_ERR, "TDB: %s", s);
if (verbose)
xprintf("TDB: %s", s);
talloc_free(s);
} else {
trace("talloc failure during logging\n");
syslog(LOG_ERR, "talloc failure during logging\n");
}
}
static void setup_structure(void)
{
char *tdbname;
tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
if (!tdbname)
barf_perror("Could not create tdbname");
if (!(tdb_flags & TDB_INTERNAL))
unlink(tdbname);
tdb_ctx = tdb_open_ex(tdbname, 7919, tdb_flags, O_RDWR|O_CREAT|O_EXCL,
0640, &tdb_logger, NULL);
if (!tdb_ctx)
barf_perror("Could not create tdb file %s", tdbname);
manual_node("/", "tool");
manual_node("/tool", "xenstored");
manual_node("/tool/xenstored", NULL);
check_store();
}
static unsigned int hash_from_key_fn(void *k)
{
char *str = k;
unsigned int hash = 5381;
char c;
while ((c = *str++))
hash = ((hash << 5) + hash) + (unsigned int)c;
return hash;
}
static int keys_equal_fn(void *key1, void *key2)
{
return 0 == strcmp((char *)key1, (char *)key2);
}
static char *child_name(const char *s1, const char *s2)
{
if (strcmp(s1, "/")) {
return talloc_asprintf(NULL, "%s/%s", s1, s2);
}
else {
return talloc_asprintf(NULL, "/%s", s2);
}
}
int remember_string(struct hashtable *hash, const char *str)
{
char *k = malloc(strlen(str) + 1);
if (!k)
return 0;
strcpy(k, str);
return hashtable_insert(hash, k, (void *)1);
}
/**
* A node has a children field that names the children of the node, separated
* by NULs. We check whether there are entries in there that are duplicated
* (and if so, delete the second one), and whether there are any that do not
* have a corresponding child node (and if so, delete them). Each valid child
* is then recursively checked.
*
* No deleting is performed if the recovery flag is cleared (i.e. -R was
* passed on the command line).
*
* As we go, we record each node in the given reachable hashtable. These
* entries will be used later in clean_store.
*/
static int check_store_(const char *name, struct hashtable *reachable)
{
struct node *node = read_node(NULL, name, name);
int ret = 0;
if (node) {
size_t i = 0;
struct hashtable * children =
create_hashtable(16, hash_from_key_fn, keys_equal_fn);
if (!remember_string(reachable, name)) {
hashtable_destroy(children, 0);
log("check_store: ENOMEM");
return ENOMEM;
}
while (i < node->childlen && !ret) {
struct node *childnode;
size_t childlen = strlen(node->children + i);
char * childname = child_name(node->name,
node->children + i);
if (!childname) {
log("check_store: ENOMEM");
ret = ENOMEM;
break;
}
childnode = read_node(NULL, childname, childname);
if (childnode) {
if (hashtable_search(children, childname)) {
log("check_store: '%s' is duplicated!",
childname);
if (recovery) {
remove_child_entry(NULL, node,
i);
i -= childlen + 1;
}
}
else {
if (!remember_string(children,
childname)) {
log("check_store: ENOMEM");
talloc_free(childnode);
talloc_free(childname);
ret = ENOMEM;
break;
}
ret = check_store_(childname,
reachable);
}
} else if (errno != ENOMEM) {
log("check_store: No child '%s' found!\n",
childname);
if (recovery) {
remove_child_entry(NULL, node, i);
i -= childlen + 1;
}
} else {
log("check_store: ENOMEM");
ret = ENOMEM;
}
talloc_free(childnode);
talloc_free(childname);
i += childlen + 1;
}
hashtable_destroy(children, 0 /* Don't free values (they are
all (void *)1) */);
talloc_free(node);
} else if (errno != ENOMEM) {
/* Impossible, because no database should ever be without the
root, and otherwise, we've just checked in our caller
(which made a recursive call to get here). */
log("check_store: No child '%s' found: impossible!", name);
} else {
log("check_store: ENOMEM");
ret = ENOMEM;
}
return ret;
}
/**
* Helper to clean_store below.
*/
static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val,
void *private)
{
struct hashtable *reachable = private;
char *slash;
char * name = talloc_strndup(NULL, key.dptr, key.dsize);
if (!name) {
log("clean_store: ENOMEM");
return 1;
}
if (name[0] != '/') {
slash = strchr(name, '/');
if (slash)
*slash = 0;
}
if (!hashtable_search(reachable, name)) {
log("clean_store: '%s' is orphaned!", name);
if (recovery) {
tdb_delete(tdb, key);
}
}
talloc_free(name);
return 0;
}
/**
* Given the list of reachable nodes, iterate over the whole store, and
* remove any that were not reached.
*/
static void clean_store(struct hashtable *reachable)
{
tdb_traverse(tdb_ctx, &clean_store_, reachable);
}
void check_store(void)
{
char * root = talloc_strdup(NULL, "/");
struct hashtable * reachable =
create_hashtable(16, hash_from_key_fn, keys_equal_fn);
if (!reachable) {
log("check_store: ENOMEM");
return;
}
log("Checking store ...");
if (!check_store_(root, reachable) &&
!check_transactions(reachable))
clean_store(reachable);
log("Checking store complete.");
hashtable_destroy(reachable, 0 /* Don't free values (they are all
(void *)1) */);
talloc_free(root);
}
/* Something is horribly wrong: check the store. */
void corrupt(struct connection *conn, const char *fmt, ...)
{
va_list arglist;
char *str;
int saved_errno = errno;
va_start(arglist, fmt);
str = talloc_vasprintf(NULL, fmt, arglist);
va_end(arglist);
log("corruption detected by connection %i: err %s: %s",
conn ? (int)conn->id : -1, strerror(saved_errno), str);
check_store();
}
#ifndef NO_SOCKETS
static void destroy_fds(void)
{
if (sock >= 0)
close(sock);
if (ro_sock >= 0)
close(ro_sock);
}
static void init_sockets(void)
{
struct sockaddr_un addr;
const char *soc_str = xs_daemon_socket();
const char *soc_str_ro = xs_daemon_socket_ro();
/* Create sockets for them to listen to. */
atexit(destroy_fds);
sock = socket(PF_UNIX, SOCK_STREAM, 0);
if (sock < 0)
barf_perror("Could not create socket");
ro_sock = socket(PF_UNIX, SOCK_STREAM, 0);
if (ro_sock < 0)
barf_perror("Could not create socket");
/* FIXME: Be more sophisticated, don't mug running daemon. */
unlink(soc_str);
unlink(soc_str_ro);
addr.sun_family = AF_UNIX;
if(strlen(soc_str) >= sizeof(addr.sun_path))
barf_perror("socket string '%s' too long", soc_str);
strcpy(addr.sun_path, soc_str);
if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
barf_perror("Could not bind socket to %s", soc_str);
if(strlen(soc_str_ro) >= sizeof(addr.sun_path))
barf_perror("socket string '%s' too long", soc_str_ro);
strcpy(addr.sun_path, soc_str_ro);
if (bind(ro_sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
barf_perror("Could not bind socket to %s", soc_str_ro);
if (chmod(soc_str, 0600) != 0
|| chmod(soc_str_ro, 0660) != 0)
barf_perror("Could not chmod sockets");
if (listen(sock, 1) != 0 || listen(ro_sock, 1) != 0)
barf_perror("Could not listen on sockets");
}
#endif
static void usage(void)
{
fprintf(stderr,
"Usage:\n"
"\n"
" xenstored \n"
"\n"
"where options may include:\n"
"\n"
" -D, --no-domain-init to state that xenstored should not initialise dom0,\n"
" -F, --pid-file giving a file for the daemon's pid to be written,\n"
" -H, --help to output this message,\n"
" -N, --no-fork to request that the daemon does not fork,\n"
" -P, --output-pid to request that the pid of the daemon is output,\n"
" -T, --trace-file giving the file for logging, and\n"
" -E, --entry-nb limit the number of entries per domain,\n"
" -S, --entry-size limit the size of entry per domain, and\n"
" -W, --watch-nb limit the number of watches per domain,\n"
" -t, --transaction limit the number of transaction allowed per domain,\n"
" -A, --perm-nb limit the number of permissions per node,\n"
" -R, --no-recovery to request that no recovery should be attempted when\n"
" the store is corrupted (debug only),\n"
" -I, --internal-db store database in memory, not on disk\n"
" -V, --verbose to request verbose execution.\n");
}
static struct option options[] = {
{ "no-domain-init", 0, NULL, 'D' },
{ "entry-nb", 1, NULL, 'E' },
{ "pid-file", 1, NULL, 'F' },
{ "event", 1, NULL, 'e' },
{ "master-domid", 1, NULL, 'm' },
{ "help", 0, NULL, 'H' },
{ "no-fork", 0, NULL, 'N' },
{ "priv-domid", 1, NULL, 'p' },
{ "output-pid", 0, NULL, 'P' },
{ "entry-size", 1, NULL, 'S' },
{ "trace-file", 1, NULL, 'T' },
{ "transaction", 1, NULL, 't' },
{ "perm-nb", 1, NULL, 'A' },
{ "no-recovery", 0, NULL, 'R' },
{ "internal-db", 0, NULL, 'I' },
{ "verbose", 0, NULL, 'V' },
{ "watch-nb", 1, NULL, 'W' },
{ NULL, 0, NULL, 0 } };
extern void dump_conn(struct connection *conn);
int dom0_domid = 0;
int dom0_event = 0;
int priv_domid = 0;
int main(int argc, char *argv[])
{
int opt;
int sock_pollfd_idx = -1, ro_sock_pollfd_idx = -1;
bool dofork = true;
bool outputpid = false;
bool no_domain_init = false;
const char *pidfile = NULL;
int timeout;
while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:T:RVW:", options,
NULL)) != -1) {
switch (opt) {
case 'D':
no_domain_init = true;
break;
case 'E':
quota_nb_entry_per_domain = strtol(optarg, NULL, 10);
break;
case 'F':
pidfile = optarg;
break;
case 'H':
usage();
return 0;
case 'N':
dofork = false;
break;
case 'P':
outputpid = true;
break;
case 'R':
recovery = false;
break;
case 'S':
quota_max_entry_size = strtol(optarg, NULL, 10);
break;
case 't':
quota_max_transaction = strtol(optarg, NULL, 10);
break;
case 'T':
tracefile = optarg;
break;
case 'I':
tdb_flags = TDB_INTERNAL|TDB_NOLOCK;
break;
case 'V':
verbose = true;
break;
case 'W':
quota_nb_watch_per_domain = strtol(optarg, NULL, 10);
break;
case 'A':
quota_nb_perms_per_node = strtol(optarg, NULL, 10);
break;
case 'e':
dom0_event = strtol(optarg, NULL, 10);
break;
case 'm':
dom0_domid = strtol(optarg, NULL, 10);
break;
case 'p':
priv_domid = strtol(optarg, NULL, 10);
break;
}
}
if (optind != argc)
barf("%s: No arguments desired", argv[0]);
reopen_log();
/* make sure xenstored directories exist */
/* Errors ignored here, will be reported when we open files */
mkdir(xs_daemon_rundir(), 0755);
mkdir(xs_daemon_rootdir(), 0755);
if (dofork) {
openlog("xenstored", 0, LOG_DAEMON);
daemonize();
}
if (pidfile)
write_pidfile(pidfile);
/* Talloc leak reports go to stderr, which is closed if we fork. */
if (!dofork)
talloc_enable_leak_report_full();
/* Don't kill us with SIGPIPE. */
signal(SIGPIPE, SIG_IGN);
talloc_enable_null_tracking();
#ifndef NO_SOCKETS
init_sockets();
#endif
init_pipe(reopen_log_pipe);
/* Setup the database */
setup_structure();
/* Listen to hypervisor. */
if (!no_domain_init)
domain_init();
/* Restore existing connections. */
restore_existing_connections();
if (outputpid) {
printf("%ld\n", (long)getpid());
fflush(stdout);
}
/* redirect to /dev/null now we're ready to accept connections */
if (dofork)
finish_daemonize();
signal(SIGHUP, trigger_reopen_log);
if (tracefile)
tracefile = talloc_strdup(NULL, tracefile);
/* Get ready to listen to the tools. */
initialize_fds(&sock_pollfd_idx, &ro_sock_pollfd_idx, &timeout);
/* Tell the kernel we're up and running. */
xenbus_notify_running();
#if defined(XEN_SYSTEMD_ENABLED)
sd_notify(1, "READY=1");
fprintf(stderr, SD_NOTICE "xenstored is ready\n");
#endif
/* Main loop. */
for (;;) {
struct connection *conn, *next;
if (poll(fds, nr_fds, timeout) < 0) {
if (errno == EINTR)
continue;
barf_perror("Poll failed");
}
if (reopen_log_pipe0_pollfd_idx != -1) {
if (fds[reopen_log_pipe0_pollfd_idx].revents
& ~POLLIN) {
close(reopen_log_pipe[0]);
close(reopen_log_pipe[1]);
init_pipe(reopen_log_pipe);
} else if (fds[reopen_log_pipe0_pollfd_idx].revents
& POLLIN) {
char c;
if (read(reopen_log_pipe[0], &c, 1) != 1)
barf_perror("read failed");
reopen_log();
}
reopen_log_pipe0_pollfd_idx = -1;
}
if (sock_pollfd_idx != -1) {
if (fds[sock_pollfd_idx].revents & ~POLLIN) {
barf_perror("sock poll failed");
break;
} else if (fds[sock_pollfd_idx].revents & POLLIN) {
accept_connection(sock, true);
sock_pollfd_idx = -1;
}
}
if (ro_sock_pollfd_idx != -1) {
if (fds[ro_sock_pollfd_idx].revents & ~POLLIN) {
barf_perror("ro sock poll failed");
break;
} else if (fds[ro_sock_pollfd_idx].revents & POLLIN) {
accept_connection(ro_sock, false);
ro_sock_pollfd_idx = -1;
}
}
if (xce_pollfd_idx != -1) {
if (fds[xce_pollfd_idx].revents & ~POLLIN) {
barf_perror("xce_handle poll failed");
break;
} else if (fds[xce_pollfd_idx].revents & POLLIN) {
handle_event();
xce_pollfd_idx = -1;
}
}
next = list_entry(connections.next, typeof(*conn), list);
if (&next->list != &connections)
talloc_increase_ref_count(next);
while (&next->list != &connections) {
conn = next;
next = list_entry(conn->list.next,
typeof(*conn), list);
if (&next->list != &connections)
talloc_increase_ref_count(next);
if (conn->domain) {
if (domain_can_read(conn))
handle_input(conn);
if (talloc_free(conn) == 0)
continue;
talloc_increase_ref_count(conn);
if (domain_can_write(conn) &&
!list_empty(&conn->out_list))
handle_output(conn);
if (talloc_free(conn) == 0)
continue;
} else {
if (conn->pollfd_idx != -1) {
if (fds[conn->pollfd_idx].revents
& ~(POLLIN|POLLOUT))
talloc_free(conn);
else if ((fds[conn->pollfd_idx].revents
& POLLIN) &&
!conn->is_ignored)
handle_input(conn);
}
if (talloc_free(conn) == 0)
continue;
talloc_increase_ref_count(conn);
if (conn->pollfd_idx != -1) {
if (fds[conn->pollfd_idx].revents
& ~(POLLIN|POLLOUT))
talloc_free(conn);
else if ((fds[conn->pollfd_idx].revents
& POLLOUT) &&
!conn->is_ignored)
handle_output(conn);
}
if (talloc_free(conn) == 0)
continue;
conn->pollfd_idx = -1;
}
}
initialize_fds(&sock_pollfd_idx, &ro_sock_pollfd_idx, &timeout);
}
}
/*
* Local variables:
* mode: C
* c-file-style: "linux"
* indent-tabs-mode: t
* c-basic-offset: 8
* tab-width: 8
* End:
*/