Re: [Xen-devel] [PATCH v4 3/4] xenstore: rework of transaction handling

2017-03-31 Thread Juergen Gross
On 31/03/17 13:00, Wei Liu wrote:
> On Thu, Mar 30, 2017 at 06:41:11PM +0200, Juergen Gross wrote:
> [...]
>>>
node->generation = generation++;
 -  return;
 +  if (conn && !conn->transaction)
 +  wrl_apply_debit_direct(conn);
 +  }
 +
 +  if (!conn || !conn->transaction) {
 +  if (key)
 +  set_tdb_key(node->name, key);
 +  return 0;
}
  
trans = conn->transaction;
  
 -  node->generation = generation + trans->trans_gen++;
 +  trans_name = transaction_get_node_name(node, trans, node->name);
 +  if (!trans_name)
 +  goto nomem;
  
 -  list_for_each_entry(i, >changes, list) {
 -  if (streq(i->node, node->name)) {
 -  if (recurse)
 -  i->recurse = recurse;
 -  return;
 -  }
 -  }
 -
 -  i = talloc(trans, struct changed_node);
 +  i = find_accessed_node(trans, node->name);
if (!i) {
 -  /* All we can do is let the transaction fail. */
 -  generation++;
 -  return;
 +  i = talloc_zero(trans, struct accessed_node);
 +  if (!i)
 +  goto nomem;
 +  i->node = talloc_strdup(i, node->name);
 +  if (!i->node)
 +  goto nomem;
 +
 +  introduce = true;
 +  i->ta_node = false;
 +
 +  /* We have to verify read nodes only if we didn't write them. */
 +  if (type == NODE_ACCESS_READ) {
 +  i->generation = node->generation;
 +  i->check_gen = true;
 +  if (node->generation != NO_GENERATION) {
 +  set_tdb_key(trans_name, _key);
 +  ret = write_node_raw(conn, _key, node);
 +  if (ret)
 +  goto err;
 +  i->ta_node = true;
 +  }
 +  }
>>>
>>> I think the current code structure is a bit confusing.
>>>
>>> For write types, the call to write_node_raw is outside of this function
>>> but for read type it is inside this function.
>>
>> This is due to the different purpose of write_node_raw() in both
>> cases:
>>
>> In the read case we write an _additional_ node into the transaction,
>> while in the write case it is just the user's operation.
>>
>>>
 +  list_add_tail(>list, >accessed);
}
 -  i->node = talloc_strdup(i, node->name);
 -  if (!i->node) {
 -  /* All we can do is let the transaction fail. */
 -  generation++;
 +
 +  if (type != NODE_ACCESS_READ)
 +  i->modified = true;
 +
 +  if (introduce && type == NODE_ACCESS_DELETE)
 +  /* Nothing to delete. */
 +  return -1;
 +
 +  if (key) {
 +  set_tdb_key(trans_name, key);
 +  if (type == NODE_ACCESS_WRITE)
 +  i->ta_node = true;
 +  if (type == NODE_ACCESS_DELETE)
 +  i->ta_node = false;
>>>
>>> The same caveat made me wonder if ta_node was really what it meant to
>>> be because I couldn't find where write_node_raw was.
>>>
>>> Can I suggest you make them consistent? Either lift the write_node_raw
>>> for read to read_node or push down the write_node_raw in delete_node and
>>> write_node here?
>>>
>>> If I misunderstand how this works, please tell me...
>>
>> As I already said: in the read case writing the node is depending
>> on the context and lifting it up would seem rather strange: why
>> should reading a node contain a write_node_raw() call?
>>
>> In the write case I believe we should keep write_node_raw() where
>> it belongs: in the appropriate function modifying the node.
>>
>> All actions in access_node() are transaction specific and _additional_
>> in the transaction case.
> 
> 
> OK, my line of reasoning is that whatever DB operation is necessary to
> make a functionality work should be consistent. That means even having a
> DB write in read is OK.  But I think your explanation is fine, too.
> 
> May I then suggest adding the following to the write for read type in
> access_node?
> 
>   /* Additional transaction-specific node for read type. We only have to 
>* verify read nodes only if we didn't write them.
>*
>* The node is created and written to DB here to distinguish from the
>* write types.
>*/

Will do.

Thanks,


Juergen


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 3/4] xenstore: rework of transaction handling

2017-03-31 Thread Wei Liu
On Thu, Mar 30, 2017 at 06:41:11PM +0200, Juergen Gross wrote:
[...]
> > 
> >>node->generation = generation++;
> >> -  return;
> >> +  if (conn && !conn->transaction)
> >> +  wrl_apply_debit_direct(conn);
> >> +  }
> >> +
> >> +  if (!conn || !conn->transaction) {
> >> +  if (key)
> >> +  set_tdb_key(node->name, key);
> >> +  return 0;
> >>}
> >>  
> >>trans = conn->transaction;
> >>  
> >> -  node->generation = generation + trans->trans_gen++;
> >> +  trans_name = transaction_get_node_name(node, trans, node->name);
> >> +  if (!trans_name)
> >> +  goto nomem;
> >>  
> >> -  list_for_each_entry(i, >changes, list) {
> >> -  if (streq(i->node, node->name)) {
> >> -  if (recurse)
> >> -  i->recurse = recurse;
> >> -  return;
> >> -  }
> >> -  }
> >> -
> >> -  i = talloc(trans, struct changed_node);
> >> +  i = find_accessed_node(trans, node->name);
> >>if (!i) {
> >> -  /* All we can do is let the transaction fail. */
> >> -  generation++;
> >> -  return;
> >> +  i = talloc_zero(trans, struct accessed_node);
> >> +  if (!i)
> >> +  goto nomem;
> >> +  i->node = talloc_strdup(i, node->name);
> >> +  if (!i->node)
> >> +  goto nomem;
> >> +
> >> +  introduce = true;
> >> +  i->ta_node = false;
> >> +
> >> +  /* We have to verify read nodes only if we didn't write them. */
> >> +  if (type == NODE_ACCESS_READ) {
> >> +  i->generation = node->generation;
> >> +  i->check_gen = true;
> >> +  if (node->generation != NO_GENERATION) {
> >> +  set_tdb_key(trans_name, _key);
> >> +  ret = write_node_raw(conn, _key, node);
> >> +  if (ret)
> >> +  goto err;
> >> +  i->ta_node = true;
> >> +  }
> >> +  }
> > 
> > I think the current code structure is a bit confusing.
> > 
> > For write types, the call to write_node_raw is outside of this function
> > but for read type it is inside this function.
> 
> This is due to the different purpose of write_node_raw() in both
> cases:
> 
> In the read case we write an _additional_ node into the transaction,
> while in the write case it is just the user's operation.
> 
> > 
> >> +  list_add_tail(>list, >accessed);
> >>}
> >> -  i->node = talloc_strdup(i, node->name);
> >> -  if (!i->node) {
> >> -  /* All we can do is let the transaction fail. */
> >> -  generation++;
> >> +
> >> +  if (type != NODE_ACCESS_READ)
> >> +  i->modified = true;
> >> +
> >> +  if (introduce && type == NODE_ACCESS_DELETE)
> >> +  /* Nothing to delete. */
> >> +  return -1;
> >> +
> >> +  if (key) {
> >> +  set_tdb_key(trans_name, key);
> >> +  if (type == NODE_ACCESS_WRITE)
> >> +  i->ta_node = true;
> >> +  if (type == NODE_ACCESS_DELETE)
> >> +  i->ta_node = false;
> > 
> > The same caveat made me wonder if ta_node was really what it meant to
> > be because I couldn't find where write_node_raw was.
> > 
> > Can I suggest you make them consistent? Either lift the write_node_raw
> > for read to read_node or push down the write_node_raw in delete_node and
> > write_node here?
> > 
> > If I misunderstand how this works, please tell me...
> 
> As I already said: in the read case writing the node is depending
> on the context and lifting it up would seem rather strange: why
> should reading a node contain a write_node_raw() call?
> 
> In the write case I believe we should keep write_node_raw() where
> it belongs: in the appropriate function modifying the node.
> 
> All actions in access_node() are transaction specific and _additional_
> in the transaction case.


OK, my line of reasoning is that whatever DB operation is necessary to
make a functionality work should be consistent. That means even having a
DB write in read is OK.  But I think your explanation is fine, too.

May I then suggest adding the following to the write for read type in
access_node?

  /* Additional transaction-specific node for read type. We only have to 
   * verify read nodes only if we didn't write them.
   *
   * The node is created and written to DB here to distinguish from the
   * write types.
   */

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 3/4] xenstore: rework of transaction handling

2017-03-30 Thread Juergen Gross
On 30/03/17 17:41, Wei Liu wrote:
> On Thu, Mar 30, 2017 at 03:11:48PM +0200, Juergen Gross wrote:
> [...]
>> +
>> +/*
>> + * A node has been accessed.
>> + *
>> + * Modifying accesses (write, delete) always update the generation (global 
>> and
>> + * node->generation).
>> + *
>> + * Accesses in a transaction will be added to the list of accessed nodes
>> + * if not already done. Read type accesses will copy the node to the
>> + * transaction specific data base part, write type accesses go there
>> + * anyway.
>> + *
>> + * If not NULL, key will be supplied with name and length of name of the 
>> node
>> + * to be accessed in the data base.
>> + */
>> +int access_node(struct connection *conn, struct node *node,
>> +enum node_access_type type, TDB_DATA *key)
>> +{
>> +struct accessed_node *i = NULL;
>>  struct transaction *trans;
>> +TDB_DATA local_key;
>> +const char *trans_name = NULL;
>> +int ret;
>> +bool introduce = false;
>>  
>> -if (!conn || !conn->transaction) {
>> +if (type != NODE_ACCESS_READ) {
>>  /* They're changing the global database. */
> 
> This comment should be moved below.

Right.

> 
>>  node->generation = generation++;
>> -return;
>> +if (conn && !conn->transaction)
>> +wrl_apply_debit_direct(conn);
>> +}
>> +
>> +if (!conn || !conn->transaction) {
>> +if (key)
>> +set_tdb_key(node->name, key);
>> +return 0;
>>  }
>>  
>>  trans = conn->transaction;
>>  
>> -node->generation = generation + trans->trans_gen++;
>> +trans_name = transaction_get_node_name(node, trans, node->name);
>> +if (!trans_name)
>> +goto nomem;
>>  
>> -list_for_each_entry(i, >changes, list) {
>> -if (streq(i->node, node->name)) {
>> -if (recurse)
>> -i->recurse = recurse;
>> -return;
>> -}
>> -}
>> -
>> -i = talloc(trans, struct changed_node);
>> +i = find_accessed_node(trans, node->name);
>>  if (!i) {
>> -/* All we can do is let the transaction fail. */
>> -generation++;
>> -return;
>> +i = talloc_zero(trans, struct accessed_node);
>> +if (!i)
>> +goto nomem;
>> +i->node = talloc_strdup(i, node->name);
>> +if (!i->node)
>> +goto nomem;
>> +
>> +introduce = true;
>> +i->ta_node = false;
>> +
>> +/* We have to verify read nodes only if we didn't write them. */
>> +if (type == NODE_ACCESS_READ) {
>> +i->generation = node->generation;
>> +i->check_gen = true;
>> +if (node->generation != NO_GENERATION) {
>> +set_tdb_key(trans_name, _key);
>> +ret = write_node_raw(conn, _key, node);
>> +if (ret)
>> +goto err;
>> +i->ta_node = true;
>> +}
>> +}
> 
> I think the current code structure is a bit confusing.
> 
> For write types, the call to write_node_raw is outside of this function
> but for read type it is inside this function.

This is due to the different purpose of write_node_raw() in both
cases:

In the read case we write an _additional_ node into the transaction,
while in the write case it is just the user's operation.

> 
>> +list_add_tail(>list, >accessed);
>>  }
>> -i->node = talloc_strdup(i, node->name);
>> -if (!i->node) {
>> -/* All we can do is let the transaction fail. */
>> -generation++;
>> +
>> +if (type != NODE_ACCESS_READ)
>> +i->modified = true;
>> +
>> +if (introduce && type == NODE_ACCESS_DELETE)
>> +/* Nothing to delete. */
>> +return -1;
>> +
>> +if (key) {
>> +set_tdb_key(trans_name, key);
>> +if (type == NODE_ACCESS_WRITE)
>> +i->ta_node = true;
>> +if (type == NODE_ACCESS_DELETE)
>> +i->ta_node = false;
> 
> The same caveat made me wonder if ta_node was really what it meant to
> be because I couldn't find where write_node_raw was.
> 
> Can I suggest you make them consistent? Either lift the write_node_raw
> for read to read_node or push down the write_node_raw in delete_node and
> write_node here?
> 
> If I misunderstand how this works, please tell me...

As I already said: in the read case writing the node is depending
on the context and lifting it up would seem rather strange: why
should reading a node contain a write_node_raw() call?

In the write case I believe we should keep write_node_raw() where
it belongs: in the appropriate function modifying the node.

All actions in access_node() are transaction specific and 

Re: [Xen-devel] [PATCH v4 3/4] xenstore: rework of transaction handling

2017-03-30 Thread Wei Liu
On Thu, Mar 30, 2017 at 03:11:48PM +0200, Juergen Gross wrote:
[...]
> +
> +/*
> + * A node has been accessed.
> + *
> + * Modifying accesses (write, delete) always update the generation (global 
> and
> + * node->generation).
> + *
> + * Accesses in a transaction will be added to the list of accessed nodes
> + * if not already done. Read type accesses will copy the node to the
> + * transaction specific data base part, write type accesses go there
> + * anyway.
> + *
> + * If not NULL, key will be supplied with name and length of name of the node
> + * to be accessed in the data base.
> + */
> +int access_node(struct connection *conn, struct node *node,
> + enum node_access_type type, TDB_DATA *key)
> +{
> + struct accessed_node *i = NULL;
>   struct transaction *trans;
> + TDB_DATA local_key;
> + const char *trans_name = NULL;
> + int ret;
> + bool introduce = false;
>  
> - if (!conn || !conn->transaction) {
> + if (type != NODE_ACCESS_READ) {
>   /* They're changing the global database. */

This comment should be moved below.

>   node->generation = generation++;
> - return;
> + if (conn && !conn->transaction)
> + wrl_apply_debit_direct(conn);
> + }
> +
> + if (!conn || !conn->transaction) {
> + if (key)
> + set_tdb_key(node->name, key);
> + return 0;
>   }
>  
>   trans = conn->transaction;
>  
> - node->generation = generation + trans->trans_gen++;
> + trans_name = transaction_get_node_name(node, trans, node->name);
> + if (!trans_name)
> + goto nomem;
>  
> - list_for_each_entry(i, >changes, list) {
> - if (streq(i->node, node->name)) {
> - if (recurse)
> - i->recurse = recurse;
> - return;
> - }
> - }
> -
> - i = talloc(trans, struct changed_node);
> + i = find_accessed_node(trans, node->name);
>   if (!i) {
> - /* All we can do is let the transaction fail. */
> - generation++;
> - return;
> + i = talloc_zero(trans, struct accessed_node);
> + if (!i)
> + goto nomem;
> + i->node = talloc_strdup(i, node->name);
> + if (!i->node)
> + goto nomem;
> +
> + introduce = true;
> + i->ta_node = false;
> +
> + /* We have to verify read nodes only if we didn't write them. */
> + if (type == NODE_ACCESS_READ) {
> + i->generation = node->generation;
> + i->check_gen = true;
> + if (node->generation != NO_GENERATION) {
> + set_tdb_key(trans_name, _key);
> + ret = write_node_raw(conn, _key, node);
> + if (ret)
> + goto err;
> + i->ta_node = true;
> + }
> + }

I think the current code structure is a bit confusing.

For write types, the call to write_node_raw is outside of this function
but for read type it is inside this function.

> + list_add_tail(>list, >accessed);
>   }
> - i->node = talloc_strdup(i, node->name);
> - if (!i->node) {
> - /* All we can do is let the transaction fail. */
> - generation++;
> +
> + if (type != NODE_ACCESS_READ)
> + i->modified = true;
> +
> + if (introduce && type == NODE_ACCESS_DELETE)
> + /* Nothing to delete. */
> + return -1;
> +
> + if (key) {
> + set_tdb_key(trans_name, key);
> + if (type == NODE_ACCESS_WRITE)
> + i->ta_node = true;
> + if (type == NODE_ACCESS_DELETE)
> + i->ta_node = false;

The same caveat made me wonder if ta_node was really what it meant to
be because I couldn't find where write_node_raw was.

Can I suggest you make them consistent? Either lift the write_node_raw
for read to read_node or push down the write_node_raw in delete_node and
write_node here?

If I misunderstand how this works, please tell me...

> + }
> +
> + return 0;
> +
> +nomem:
> + ret = ENOMEM;
> +err:
> + talloc_free((void *)trans_name);
> + talloc_free(i);
> + trans->fail = true;
> + errno = ret;
> + return ret;
> +}
> +
> +/*
> + * Finalize transaction:
> + * Walk through accessed nodes and check generation against global data.
> + * If all entries match, read the transaction entries and write them without
> + * transaction prepended. Delete all transaction specific nodes in the data
> + * base.
> + */
[...]
> @@ -269,7 +525,7 @@ void transaction_entry_inc(struct transaction *trans, 
> unsigned int domid)
>   d = talloc(trans, struct changed_domain);
>   if (!d) {
>

[Xen-devel] [PATCH v4 3/4] xenstore: rework of transaction handling

2017-03-30 Thread Juergen Gross
The handling of transactions in xenstored is rather clumsy today:

- Each transaction in progress is keeping a local copy of the complete
  xenstore data base
- A transaction will fail as soon as any node is being modified outside
  the transaction

This is leading to a very bad behavior in case of a large xenstore.
Memory consumption of xenstored is much higher than necessary and with
many domains up transactions failures will be more and more common.

Instead of keeping a complete copy of the data base for each
transaction store the transaction data in the same data base as the
normal xenstore entries prepended with the transaction in the single
nodes either read or modified. At the end of the transaction walk
through all nodes accessed and check for conflicting modifications.
In case no conflicts are found write all modified nodes to the data
base without transaction identifier.

Following tests have been performed:
- create/destroy of various domains, including HVM with ioemu-stubdom
  (xenstored and xenstore-stubdom)
- multiple concurrent runs of xs-test over several minutes
  (xenstored and xenstore-stubdom)
- test for memory leaks of xenstored by dumping talloc reports before
  and after the tests

Signed-off-by: Juergen Gross 
---
V4: remove access_type parameter from transaction_prepend() (Liu Wei)
remove a comment referencing tdb_context()
---
 tools/xenstore/xenstored_core.c| 146 +--
 tools/xenstore/xenstored_core.h|  17 +-
 tools/xenstore/xenstored_domain.c  |  24 +-
 tools/xenstore/xenstored_domain.h  |   2 +-
 tools/xenstore/xenstored_transaction.c | 425 +++--
 tools/xenstore/xenstored_transaction.h |  18 +-
 6 files changed, 460 insertions(+), 172 deletions(-)

diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index ee4c9e1..a76736d 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -81,9 +81,8 @@ static bool recovery = true;
 static int reopen_log_pipe[2];
 static int reopen_log_pipe0_pollfd_idx = -1;
 char *tracefile = NULL;
-static TDB_CONTEXT *tdb_ctx = NULL;
+TDB_CONTEXT *tdb_ctx = NULL;
 
-static void corrupt(struct connection *conn, const char *fmt, ...);
 static const char *sockmsg_string(enum xsd_sockmsg_type type);
 
 #define log(...)   \
@@ -105,24 +104,6 @@ int quota_nb_watch_per_domain = 128;
 int quota_max_entry_size = 2048; /* 2K */
 int quota_max_transaction = 10;
 
-TDB_CONTEXT *tdb_context(struct connection *conn)
-{
-   /* conn = NULL used in manual_node at setup. */
-   if (!conn || !conn->transaction)
-   return tdb_ctx;
-   return tdb_transaction_context(conn->transaction);
-}
-
-bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
-{
-   if (!(tdb_ctx->flags & TDB_INTERNAL))
-   if (rename(newname, xs_daemon_tdb()) != 0)
-   return false;
-   tdb_close(tdb_ctx);
-   tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
-   return true;
-}
-
 void trace(const char *fmt, ...)
 {
va_list arglist;
@@ -385,35 +366,38 @@ static struct node *read_node(struct connection *conn, 
const void *ctx,
TDB_DATA key, data;
struct xs_tdb_record_hdr *hdr;
struct node *node;
-   TDB_CONTEXT * context = tdb_context(conn);
 
-   key.dptr = (void *)name;
-   key.dsize = strlen(name);
-   data = tdb_fetch(context, key);
+   node = talloc(ctx, struct node);
+   if (!node) {
+   errno = ENOMEM;
+   return NULL;
+   }
+   node->name = talloc_strdup(node, name);
+   if (!node->name) {
+   talloc_free(node);
+   errno = ENOMEM;
+   return NULL;
+   }
+
+   if (transaction_prepend(conn, name, ))
+   return NULL;
+
+   data = tdb_fetch(tdb_ctx, key);
 
if (data.dptr == NULL) {
-   if (tdb_error(context) == TDB_ERR_NOEXIST)
+   if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) {
+   node->generation = NO_GENERATION;
+   access_node(conn, node, NODE_ACCESS_READ, NULL);
errno = ENOENT;
-   else {
-   log("TDB error on read: %s", tdb_errorstr(context));
+   } else {
+   log("TDB error on read: %s", tdb_errorstr(tdb_ctx));
errno = EIO;
}
-   return NULL;
-   }
-
-   node = talloc(ctx, struct node);
-   if (!node) {
-   errno = ENOMEM;
-   return NULL;
-   }
-   node->name = talloc_strdup(node, name);
-   if (!node->name) {
talloc_free(node);
-   errno = ENOMEM;
return NULL;
}
+
node->parent = NULL;
-   node->tdb = tdb_context(conn);
talloc_steal(node,