Re: [Qemu-devel] Re: [RFC PATCH v3 2/3] block: call the snapshot handlers of the protocol drivers

2010-05-17 Thread MORITA Kazutaka
On Mon, May 17, 2010 at 9:20 PM, Kevin Wolf kw...@redhat.com wrote:
 Am 17.05.2010 14:19, schrieb MORITA Kazutaka:
 At Mon, 17 May 2010 13:08:08 +0200,
 Kevin Wolf wrote:

 Am 17.05.2010 12:19, schrieb MORITA Kazutaka:

  int bdrv_snapshot_goto(BlockDriverState *bs,
                         const char *snapshot_id)
  {
      BlockDriver *drv = bs->drv;
 +    int ret, open_ret;
 +
      if (!drv)
          return -ENOMEDIUM;
 -    if (!drv->bdrv_snapshot_goto)
 -        return -ENOTSUP;
 -    return drv->bdrv_snapshot_goto(bs, snapshot_id);
 +    if (drv->bdrv_snapshot_goto)
 +        return drv->bdrv_snapshot_goto(bs, snapshot_id);
 +
 +    if (bs->file) {
 +        drv->bdrv_close(bs);
 +        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
 +        open_ret = drv->bdrv_open(bs, bs->open_flags);
 +        if (open_ret < 0) {
 +            bdrv_delete(bs);

 I think you mean bs->file here.

 Kevin

 This is an error when re-opening the format driver, so what we should
 delete here is not bs->file but bs, isn't it?  If we fail to open bs
 here, the drive doesn't seem to work anymore.

 But bdrv_delete basically means freeing it. That is almost guaranteed to
 lead to crashes because that BlockDriverState is still in use in other
 places.

 One additional case of use after free is in the very next line:

 +            bs->drv = NULL;

 You can't do that when bs is freed, obviously. But I think just setting
 bs->drv to NULL without bdrv_deleting it before is the better way. It
 will fail any requests (with -ENOMEDIUM), but can't produce crashes.
 This is also what bdrv_commit does in such situations.

 In this state, we don't access the underlying file any more, so we could
 delete bs->file - this is why I thought you actually meant to do that.


I'm sorry for the confusion.  I understand what we should do here.
I'll fix it for the next post.
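
(For reference, a minimal sketch of the error path being suggested here:
keep bs, delete the underlying file, and clear bs->drv so that further
requests fail with -ENOMEDIUM. The v4 posting of this patch further down
takes this shape.)

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            /* re-opening the format driver failed */
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }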

Thanks,

Kazutaka



[Qemu-devel] [PATCH] add support for protocol driver create_options

2010-05-19 Thread MORITA Kazutaka
This patch enables protocol drivers to use their create options which
are not supported by the format.  For example, protocol drivers can use
a backing_file option with raw format.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block.c   |7 +++
 block.h   |1 +
 qemu-img.c|   49 ++---
 qemu-option.c |   52 +---
 qemu-option.h |2 ++
 5 files changed, 85 insertions(+), 26 deletions(-)

diff --git a/block.c b/block.c
index 48d8468..0ab9424 100644
--- a/block.c
+++ b/block.c
@@ -56,7 +56,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t 
sector_num,
 uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
  const uint8_t *buf, int nb_sectors);
-static BlockDriver *find_protocol(const char *filename);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
 QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -210,7 +209,7 @@ int bdrv_create_file(const char* filename, 
QEMUOptionParameter *options)
 {
 BlockDriver *drv;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (drv == NULL) {
 drv = bdrv_find_format("file");
 }
@@ -283,7 +282,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 return drv;
 }
 
-static BlockDriver *find_protocol(const char *filename)
+BlockDriver *bdrv_find_protocol(const char *filename)
 {
 BlockDriver *drv1;
 char protocol[128];
@@ -469,7 +468,7 @@ int bdrv_file_open(BlockDriverState **pbs, const char 
*filename, int flags)
 BlockDriver *drv;
 int ret;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (!drv) {
 return -ENOENT;
 }
diff --git a/block.h b/block.h
index 24efeb6..9034ebb 100644
--- a/block.h
+++ b/block.h
@@ -54,6 +54,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
+BlockDriver *bdrv_find_protocol(const char *filename);
 BlockDriver *bdrv_find_format(const char *format_name);
 BlockDriver *bdrv_find_whitelisted_format(const char *format_name);
 int bdrv_create(BlockDriver *drv, const char* filename,
diff --git a/qemu-img.c b/qemu-img.c
index d3c30a7..8ae7184 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -252,8 +252,8 @@ static int img_create(int argc, char **argv)
 const char *base_fmt = NULL;
 const char *filename;
 const char *base_filename = NULL;
-BlockDriver *drv;
-QEMUOptionParameter *param = NULL;
+BlockDriver *drv, *proto_drv;
+QEMUOptionParameter *param = NULL, *create_options = NULL;
 char *options = NULL;
 
 flags = 0;
@@ -286,33 +286,42 @@ static int img_create(int argc, char **argv)
 }
 }
 
+/* Get the filename */
+if (optind >= argc)
+help();
+filename = argv[optind++];
+
 /* Find driver and parse its options */
 drv = bdrv_find_format(fmt);
 if (!drv)
 error("Unknown file format '%s'", fmt);
 
+proto_drv = bdrv_find_protocol(filename);
+if (!proto_drv)
+error("Unknown protocol '%s'", filename);
+
+create_options = append_option_parameters(create_options,
+  drv->create_options);
+create_options = append_option_parameters(create_options,
+  proto_drv->create_options);
+
 if (options && !strcmp(options, "?")) {
-print_option_help(drv->create_options);
+print_option_help(create_options);
 return 0;
 }
 
 /* Create parameter list with default values */
-param = parse_option_parameters("", drv->create_options, param);
+param = parse_option_parameters(, create_options, param);
 set_option_parameter_int(param, BLOCK_OPT_SIZE, -1);
 
 /* Parse -o options */
 if (options) {
-param = parse_option_parameters(options, drv->create_options, param);
+param = parse_option_parameters(options, create_options, param);
 if (param == NULL) {
 error("Invalid options for file format '%s'.", fmt);
 }
 }
 
-/* Get the filename */
-if (optind >= argc)
-help();
-filename = argv[optind++];
-
 /* Add size to parameters */
 if (optind < argc) {
 set_option_parameter(param, BLOCK_OPT_SIZE, argv[optind++]);
@@ -362,6 +371,7 @@ static int img_create(int argc, char **argv)
 puts();
 
 ret = bdrv_create(drv, filename, param);
+free_option_parameters(create_options);
 free_option_parameters(param);
 
 if (ret < 0) {
@@ -543,14 +553,14 @@ static int img_convert(int argc, char **argv)
 {
 int c, ret, n, n1, bs_n, bs_i, flags, cluster_size, cluster_sectors;
 const char *fmt, *out_fmt, *out_baseimg, *out_filename;
-BlockDriver *drv;
+BlockDriver *drv, *proto_drv;
 BlockDriverState **bs, *out_bs;
 int64_t

Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-20 Thread MORITA Kazutaka
At Fri, 21 May 2010 00:16:46 +0200,
Christian Brunner wrote:
 
 2010/5/20 Anthony Liguori anth...@codemonkey.ws:
  With new approaches like Sheepdog or Ceph, things are getting a lot
  cheaper and you can scale your system without disrupting your service.
  The concepts are quite similar to what Amazon is doing in their EC2
  environment, but they certainly won't publish it as OpenSource anytime
  soon.
 
  Both projects have advantages and disadvantages. Ceph is a bit more
  universal as it implements a whole filesystem. Sheepdog is more
  feature complete in regards of managing images (e.g. snapshots). Both

I think a major difference is that Sheepdog servers act fully
autonomously.  No Sheepdog server has a fixed role such as a monitor
server, and Sheepdog doesn't require any configuration of the list of
nodes in the cluster.


  projects require some additional work to become stable, but they are
  on a good way.
 
  I would really like to see both drivers in the qemu tree, as they are
  the key to a design shift in how storage in the datacenter is being
  built.
 
 
  I'd be more interested in enabling people to build these types of storage
  systems without touching qemu.
 
 You could do this by using Yehuda's rbd kernel driver, but I think
 that it would be better to avoid this additional layer.
 

I agree.  In addition, if the storage client is a qemu driver, the
storage system can support some features specific to qemu, such as live
snapshots from the qemu monitor.

Regards,

Kazutaka




Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-21 Thread MORITA Kazutaka
At Fri, 21 May 2010 06:28:42 +0100,
Stefan Hajnoczi wrote:
 
 On Thu, May 20, 2010 at 11:16 PM, Christian Brunner c...@muc.de wrote:
  2010/5/20 Anthony Liguori anth...@codemonkey.ws:
  Both sheepdog and ceph ultimately transmit I/O over a socket to a central
  daemon, right?  So could we not standardize a protocol for this that both
  sheepdog and ceph could implement?
 
  There is no central daemon. The concept is that they talk to many
  storage nodes at the same time. Data is distributed and replicated
  over many nodes in the network. The mechanism to do this is quite
  complex. I don't know about sheepdog, but in Ceph this is called RADOS
  (reliable autonomic distributed object store). Sheepdog and Ceph may
  look similar, but this is where they act different. I don't think that
  it would be possible to implement a common protocol.
 
 I believe Sheepdog has a local daemon on each node.  The QEMU storage
 backend talks to the daemon on the same node, which then does the real
 network communication with the rest of the distributed storage system.

Yes.  This is because Sheepdog doesn't have any configuration of the
cluster membership, as I mentioned in another mail, so the driver
doesn't know which node to access other than localhost.

  So I think we're not talking about a network protocol here, we're
 talking about a common interface that can be used by QEMU and other
 programs to take advantage of Ceph, Sheepdog, etc services available
 on the local node.
 
 Haven't looked into your patch enough yet, but does librados talk
 directly over the network or does it connect to a local daemon/driver?
 

AFAIK, librados accesses the storage directly over the network, so I
think it is difficult to define a common interface.


Thanks,

Kazutaka




[Qemu-devel] Re: [PATCH] add support for protocol driver create_options

2010-05-24 Thread MORITA Kazutaka
At Fri, 21 May 2010 13:40:31 +0200,
Kevin Wolf wrote:
 
 Am 20.05.2010 07:36, schrieb MORITA Kazutaka:
  This patch enables protocol drivers to use their create options which
  are not supported by the format.  For example, protocol drivers can use
  a backing_file option with raw format.
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 
 Hm, this is not stackable, right? Though I do see that making it
 stackable would require some bigger changes, so maybe we can get away
 with claiming that this approach covers everything that happens in practice.
 
 If we accept that this is the desired behaviour, the code looks good to me.
 
As you say, this patch isn't stackable; we must specify an image name
with at most 1 format and 1 protocol.  I cannot think of a situation
where we would want to use more than one protocol to create qemu images,
so this seems to be enough to me.
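
For illustration, this is the kind of invocation the patch enables (the
image names here are hypothetical); backing_file is accepted as a
protocol-level option even though the format is raw:

    $ qemu-img create -f raw -o backing_file=sheepdog:base sheepdog:clone 4G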

Thanks,

Kazutaka



[Qemu-devel] Re: [PATCH] add support for protocol driver create_options

2010-05-24 Thread MORITA Kazutaka
At Fri, 21 May 2010 18:57:36 +0200,
Kevin Wolf wrote:
 
 Am 20.05.2010 07:36, schrieb MORITA Kazutaka:
  +
  +/*
  + * Append an option list (list) to an option list (dest).
  + *
  + * If dest is NULL, a new copy of list is created.
  + *
  + * Returns a pointer to the first element of dest (or the newly allocated 
  copy)
  + */
  +QEMUOptionParameter *append_option_parameters(QEMUOptionParameter *dest,
  +QEMUOptionParameter *list)
  +{
  +size_t num_options, num_dest_options;
  +
  +num_options = count_option_parameters(dest);
  +num_dest_options = num_options;
  +
  +num_options += count_option_parameters(list);
  +
  +dest = qemu_realloc(dest, (num_options + 1) * 
  sizeof(QEMUOptionParameter));
  +
  +while (list && list->name) {
  +if (get_option_parameter(dest, list->name) == NULL) {
  +dest[num_dest_options++] = *list;
 
 You need to add a dest[num_dest_options].name = NULL; here. Otherwise
 the next loop iteration works on uninitialized memory and possibly an
 unterminated list. I got a segfault for that reason.
 

I forgot to add it, sorry.
Fixed version is below.
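
(For reference, the corrected loop is roughly the following; this is a
sketch that assumes the rest of the loop body from the original posting,
including the list++ advance:)

    while (list && list->name) {
        if (get_option_parameter(dest, list->name) == NULL) {
            dest[num_dest_options++] = *list;
            dest[num_dest_options].name = NULL;  /* keep the array terminated */
        }
        list++;
    }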

Thanks,

Kazutaka

==
This patch enables protocol drivers to use their create options which
are not supported by the format.  For example, protocol drivers can use
a backing_file option with raw format.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block.c   |7 +++
 block.h   |1 +
 qemu-img.c|   49 ++---
 qemu-option.c |   53 ++---
 qemu-option.h |2 ++
 5 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/block.c b/block.c
index 202f895..3ed35ed 100644
--- a/block.c
+++ b/block.c
@@ -56,7 +56,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t 
sector_num,
 uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
  const uint8_t *buf, int nb_sectors);
-static BlockDriver *find_protocol(const char *filename);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
 QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -210,7 +209,7 @@ int bdrv_create_file(const char* filename, 
QEMUOptionParameter *options)
 {
 BlockDriver *drv;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (drv == NULL) {
 drv = bdrv_find_format("file");
 }
@@ -283,7 +282,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 return drv;
 }
 
-static BlockDriver *find_protocol(const char *filename)
+BlockDriver *bdrv_find_protocol(const char *filename)
 {
 BlockDriver *drv1;
 char protocol[128];
@@ -469,7 +468,7 @@ int bdrv_file_open(BlockDriverState **pbs, const char 
*filename, int flags)
 BlockDriver *drv;
 int ret;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (!drv) {
 return -ENOENT;
 }
diff --git a/block.h b/block.h
index b95a9c0..bd11cc0 100644
--- a/block.h
+++ b/block.h
@@ -54,6 +54,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
+BlockDriver *bdrv_find_protocol(const char *filename);
 BlockDriver *bdrv_find_format(const char *format_name);
 BlockDriver *bdrv_find_whitelisted_format(const char *format_name);
 int bdrv_create(BlockDriver *drv, const char* filename,
diff --git a/qemu-img.c b/qemu-img.c
index d3c30a7..8ae7184 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -252,8 +252,8 @@ static int img_create(int argc, char **argv)
 const char *base_fmt = NULL;
 const char *filename;
 const char *base_filename = NULL;
-BlockDriver *drv;
-QEMUOptionParameter *param = NULL;
+BlockDriver *drv, *proto_drv;
+QEMUOptionParameter *param = NULL, *create_options = NULL;
 char *options = NULL;
 
 flags = 0;
@@ -286,33 +286,42 @@ static int img_create(int argc, char **argv)
 }
 }
 
+/* Get the filename */
+if (optind >= argc)
+help();
+filename = argv[optind++];
+
 /* Find driver and parse its options */
 drv = bdrv_find_format(fmt);
 if (!drv)
 error("Unknown file format '%s'", fmt);
 
+proto_drv = bdrv_find_protocol(filename);
+if (!proto_drv)
+error("Unknown protocol '%s'", filename);
+
+create_options = append_option_parameters(create_options,
+  drv->create_options);
+create_options = append_option_parameters(create_options,
+  proto_drv->create_options);
+
 if (options && !strcmp(options, "?")) {
-print_option_help(drv->create_options);
+print_option_help(create_options);
 return 0;
 }
 
 /* Create parameter list with default values */
-param = parse_option_parameters("", drv->create_options, param);
+param = parse_option_parameters("", create_options

Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-24 Thread MORITA Kazutaka
At Sun, 23 May 2010 15:01:59 +0300,
Avi Kivity wrote:
 
 On 05/21/2010 12:29 AM, Anthony Liguori wrote:
 
  I'd be more interested in enabling people to build these types of 
  storage systems without touching qemu.
 
  Both sheepdog and ceph ultimately transmit I/O over a socket to a 
  central daemon, right? 
 
 That incurs an extra copy.
 
  So could we not standardize a protocol for this that both sheepdog and 
  ceph could implement?
 
 The protocol already exists, nbd.  It doesn't support snapshotting etc. 
 but we could extend it.
 

I have no objection to using another protocol for Sheepdog support, but
I think the nbd protocol is unsuitable for a large storage pool with
many VM images.  This is because the nbd protocol doesn't support
specifying a file name to open.  If we used nbd with such a storage
system, the server would need to listen on as many ports as there are VM
images.  As far as I can see from the protocol, it looks difficult to
extend it without breaking backward compatibility.

Regards,

Kazutaka

 But IMO what's needed is a plugin API for the block layer.
 



Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-24 Thread MORITA Kazutaka
At Mon, 24 May 2010 14:05:58 +0300,
Avi Kivity wrote:
 
 On 05/24/2010 10:12 AM, MORITA Kazutaka wrote:
  At Sun, 23 May 2010 15:01:59 +0300,
  Avi Kivity wrote:
 
  On 05/21/2010 12:29 AM, Anthony Liguori wrote:
   
  I'd be more interested in enabling people to build these types of
  storage systems without touching qemu.
 
  Both sheepdog and ceph ultimately transmit I/O over a socket to a
  central daemon, right?
 
  That incurs an extra copy.
 
   
  So could we not standardize a protocol for this that both sheepdog and
  ceph could implement?
 
  The protocol already exists, nbd.  It doesn't support snapshotting etc.
  but we could extend it.
 
   
  I have no objection to use another protocol for Sheepdog support, but
  I think nbd protocol is unsuitable for the large storage pool with
  many VM images.  It is because nbd protocol doesn't support specifing
  a file name to open.  If we use nbd with such a storage system, the
  server needs to listen ports as many as the number of VM images.  As
  far as I see the protocol, It looks difficult to extend it without
  breaking backward compatibility.
 
 
 The server would be local and talk over a unix domain socket, perhaps 
 anonymous.
 
 nbd has other issues though, such as requiring a copy and no support for 
 metadata operations such as snapshot and file size extension.
 

Sorry, my explanation was unclear.  I'm not sure how running servers
on localhost can solve the problem.

What I wanted to say was that we cannot specify which VM image to use.
With the nbd protocol, the command line arguments are as follows:

 $ qemu nbd:hostname:port

As this syntax shows, with the nbd protocol the client cannot pass the VM
image name to the server.

Regards,

Kazutaka



Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-24 Thread MORITA Kazutaka
At Mon, 24 May 2010 14:56:29 +0300,
Avi Kivity wrote:
 
 On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
 
  The server would be local and talk over a unix domain socket, perhaps
  anonymous.
 
  nbd has other issues though, such as requiring a copy and no support for
  metadata operations such as snapshot and file size extension.
 
   
  Sorry, my explanation was unclear.  I'm not sure how running servers
  on localhost can solve the problem.
 
 
 The local server can convert from the local (nbd) protocol to the remote 
 (sheepdog, ceph) protocol.
 
  What I wanted to say was that we cannot specify the image of VM. With
  nbd protocol, command line arguments are as follows:
 
$ qemu nbd:hostname:port
 
  As this syntax shows, with nbd protocol the client cannot pass the VM
  image name to the server.
 
 
 We would extend it to allow it to connect to a unix domain socket:
 
qemu nbd:unix:/path/to/socket
 
 The server at the other end would associate the socket with a filename 
 and forward it to the server using the remote protocol.
 

Thank you for the explanation.  Sheepdog could achieve the desired
behavior by creating socket files for all the VM images when the
daemon starts up.

 However, I don't think nbd would be a good protocol.  My preference 
 would be for a plugin API, or for a new local protocol that uses 
 splice() to avoid copies.
 

Both would be okay for Sheepdog.  I want to take whichever approach is
more suitable for qemu.

Thanks,

Kazutaka



Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-25 Thread MORITA Kazutaka
At Mon, 24 May 2010 14:16:32 -0500,
Anthony Liguori wrote:
 
 On 05/24/2010 06:56 AM, Avi Kivity wrote:
  On 05/24/2010 02:42 PM, MORITA Kazutaka wrote:
 
  The server would be local and talk over a unix domain socket, perhaps
  anonymous.
 
  nbd has other issues though, such as requiring a copy and no support 
  for
  metadata operations such as snapshot and file size extension.
 
  Sorry, my explanation was unclear.  I'm not sure how running servers
  on localhost can solve the problem.
 
  The local server can convert from the local (nbd) protocol to the 
  remote (sheepdog, ceph) protocol.
 
  What I wanted to say was that we cannot specify the image of VM. With
  nbd protocol, command line arguments are as follows:
 
$ qemu nbd:hostname:port
 
  As this syntax shows, with nbd protocol the client cannot pass the VM
  image name to the server.
 
  We would extend it to allow it to connect to a unix domain socket:
 
qemu nbd:unix:/path/to/socket
 
 nbd is a no-go because it only supports a single, synchronous I/O 
 operation at a time and has no mechanism for extensibility.
 
 If we go this route, I think two options are worth considering.  The 
 first would be a purely socket based approach where we just accepted the 
 extra copy.
 
 The other potential approach would be shared memory based.  We export 
 all guest ram as shared memory along with a small bounce buffer pool.  
 We would then use a ring queue (potentially even using virtio-blk) and 
 an eventfd for notification.
 

The shared memory approach assumes that there is a local server which
can talk to the storage system.  But Ceph doesn't require a local
server, and Sheepdog may be extended to support VMs running outside the
storage system.  We could run a local daemon that works only as a
proxy, but that doesn't look like a clean approach to me.  So I think a
socket based approach is the right way to go.

BTW, is it really necessary to design a common interface?  The way
Sheepdog replicates data is different from Ceph's, so I think it is not
possible to define a common protocol, as Christian says.

Regards,

Kazutaka

  The server at the other end would associate the socket with a filename 
  and forward it to the server using the remote protocol.
 
  However, I don't think nbd would be a good protocol.  My preference 
  would be for a plugin API, or for a new local protocol that uses 
  splice() to avoid copies.
 
 I think a good shared memory implementation would be preferable to 
 plugins.  I think it's worth attempting to do a plugin interface for the 
 block layer but I strongly suspect it would not be sufficient.
 
 I would not want to see plugins that interacted with BlockDriverState 
 directly, for instance.  We change it far too often.  Our main loop 
 functions are also not terribly stable so I'm not sure how we would 
 handle that (unless we forced all block plugins to be in a separate thread).
 



[Qemu-devel] Re: [PATCH] add support for protocol driver create_options

2010-05-25 Thread MORITA Kazutaka
At Tue, 25 May 2010 15:43:17 +0200,
Kevin Wolf wrote:
 
 Am 24.05.2010 08:34, schrieb MORITA Kazutaka:
  At Fri, 21 May 2010 18:57:36 +0200,
  Kevin Wolf wrote:
 
  Am 20.05.2010 07:36, schrieb MORITA Kazutaka:
  +
  +/*
  + * Append an option list (list) to an option list (dest).
  + *
  + * If dest is NULL, a new copy of list is created.
  + *
  + * Returns a pointer to the first element of dest (or the newly 
  allocated copy)
  + */
  +QEMUOptionParameter *append_option_parameters(QEMUOptionParameter *dest,
  +QEMUOptionParameter *list)
  +{
  +size_t num_options, num_dest_options;
  +
  +num_options = count_option_parameters(dest);
  +num_dest_options = num_options;
  +
  +num_options += count_option_parameters(list);
  +
  +dest = qemu_realloc(dest, (num_options + 1) * 
  sizeof(QEMUOptionParameter));
  +
  +while (list && list->name) {
  +if (get_option_parameter(dest, list->name) == NULL) {
  +dest[num_dest_options++] = *list;
 
  You need to add a dest[num_dest_options].name = NULL; here. Otherwise
  the next loop iteration works on uninitialized memory and possibly an
  unterminated list. I got a segfault for that reason.
 
  
  I forgot to add it, sorry.
  Fixed version is below.
  
  Thanks,
  
  Kazutaka
  
  ==
  This patch enables protocol drivers to use their create options which
  are not supported by the format.  For example, protocol drivers can use
  a backing_file option with raw format.
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 
 $ ./qemu-img create -f qcow2 -o cluster_size=4k /tmp/test.qcow2 4G
 Unknown option 'cluster_size'
 qemu-img: Invalid options for file format 'qcow2'.
 
 I think you added another num_dest_options++ which shouldn't be there.
 

Sorry again.  I wrongly added `dest[num_dest_options++].name = NULL;'
instead of `dest[num_dest_options].name = NULL;'.
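
(The difference is only the stray post-increment:)

    dest[num_dest_options++].name = NULL;  /* wrong: leaves the terminator one slot
                                              behind the next copied option, so the
                                              list appears to end early */
    dest[num_dest_options].name = NULL;    /* right: terminate without advancing */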

Thanks,

Kazutaka

==
This patch enables protocol drivers to use their create options which
are not supported by the format.  For example, protocol drivers can use
a backing_file option with raw format.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block.c   |7 +++
 block.h   |1 +
 qemu-img.c|   49 ++---
 qemu-option.c |   53 ++---
 qemu-option.h |2 ++
 5 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/block.c b/block.c
index 6e7766a..f881f10 100644
--- a/block.c
+++ b/block.c
@@ -56,7 +56,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t 
sector_num,
 uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
  const uint8_t *buf, int nb_sectors);
-static BlockDriver *find_protocol(const char *filename);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
 QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -210,7 +209,7 @@ int bdrv_create_file(const char* filename, 
QEMUOptionParameter *options)
 {
 BlockDriver *drv;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (drv == NULL) {
 drv = bdrv_find_format("file");
 }
@@ -283,7 +282,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 return drv;
 }
 
-static BlockDriver *find_protocol(const char *filename)
+BlockDriver *bdrv_find_protocol(const char *filename)
 {
 BlockDriver *drv1;
 char protocol[128];
@@ -478,7 +477,7 @@ int bdrv_file_open(BlockDriverState **pbs, const char 
*filename, int flags)
 BlockDriver *drv;
 int ret;
 
-drv = find_protocol(filename);
+drv = bdrv_find_protocol(filename);
 if (!drv) {
 return -ENOENT;
 }
diff --git a/block.h b/block.h
index 24efeb6..9034ebb 100644
--- a/block.h
+++ b/block.h
@@ -54,6 +54,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
+BlockDriver *bdrv_find_protocol(const char *filename);
 BlockDriver *bdrv_find_format(const char *format_name);
 BlockDriver *bdrv_find_whitelisted_format(const char *format_name);
 int bdrv_create(BlockDriver *drv, const char* filename,
diff --git a/qemu-img.c b/qemu-img.c
index cb007b7..ea091f0 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -252,8 +252,8 @@ static int img_create(int argc, char **argv)
 const char *base_fmt = NULL;
 const char *filename;
 const char *base_filename = NULL;
-BlockDriver *drv;
-QEMUOptionParameter *param = NULL;
+BlockDriver *drv, *proto_drv;
+QEMUOptionParameter *param = NULL, *create_options = NULL;
 char *options = NULL;
 
 flags = 0;
@@ -286,33 +286,42 @@ static int img_create(int argc, char **argv)
 }
 }
 
+/* Get the filename */
+if (optind >= argc)
+help();
+filename = argv[optind++];
+
 /* Find driver and parse its options */
 drv = bdrv_find_format(fmt);
 if (!drv)
 error(Unknown

Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-25 Thread MORITA Kazutaka
At Tue, 25 May 2010 10:12:53 -0700 (PDT),
Sage Weil wrote:
 
 On Tue, 25 May 2010, Avi Kivity wrote:
   What's the reason for not having these drivers upstream? Do we gain
   anything by hiding them from our users and requiring them to install the
   drivers separately from somewhere else?
  
  
  Six months.
 
 FWIW, we (Ceph) aren't complaining about the 6 month lag time (and I don't 
 think the Sheepdog guys are either).
 
I agree.  We aren't complaining about it.

 From our perspective, the current BlockDriver abstraction is ideal, as it 
 represents the reality of qemu's interaction with storage.  Any 'external' 
 interface will be inferior to that in one way or another.  But either way, 
 we are perfectly willing to work with you to all to keep in sync with any 
 future BlockDriver API improvements.  It is worth our time investment even 
 if the API is less stable.
 
I agree.

 The ability to dynamically load a shared object using the existing api 
 would make development a bit easier, but I'm not convinced it's better for 
 for users.  I think having ceph and sheepdog upstream with qemu will serve 
 end users best, and we at least are willing to spend the time to help 
 maintain that code in qemu.git.
 
I agree.

Regards,

Kazutaka



[Qemu-devel] [RFC PATCH v4 2/3] block: call the snapshot handlers of the protocol drivers

2010-05-27 Thread MORITA Kazutaka
When snapshot handlers are not defined in the format driver, it is
better to call those of the protocol driver.  This enables us to
implement snapshot support in the protocol driver.

We need to call the bdrv_close() and bdrv_open() handlers of the format
driver before and after the bdrv_snapshot_goto() call of the protocol.  This
is because the contents of the block driver state may need to be changed
after loading the vmstate.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block.c |   61 +++--
 1 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/block.c b/block.c
index da0dc47..cf80dbf 100644
--- a/block.c
+++ b/block.c
@@ -1697,9 +1697,11 @@ int bdrv_save_vmstate(BlockDriverState *bs, const 
uint8_t *buf,
 BlockDriver *drv = bs->drv;
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_save_vmstate)
-return -ENOTSUP;
-return drv->bdrv_save_vmstate(bs, buf, pos, size);
+if (drv->bdrv_save_vmstate)
+return drv->bdrv_save_vmstate(bs, buf, pos, size);
+if (bs->file)
+return bdrv_save_vmstate(bs->file, buf, pos, size);
+return -ENOTSUP;
 }
 
 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
@@ -1708,9 +1710,11 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
 BlockDriver *drv = bs->drv;
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_load_vmstate)
-return -ENOTSUP;
-return drv->bdrv_load_vmstate(bs, buf, pos, size);
+if (drv->bdrv_load_vmstate)
+return drv->bdrv_load_vmstate(bs, buf, pos, size);
+if (bs->file)
+return bdrv_load_vmstate(bs->file, buf, pos, size);
+return -ENOTSUP;
 }
 
 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
@@ -1734,20 +1738,37 @@ int bdrv_snapshot_create(BlockDriverState *bs,
 BlockDriver *drv = bs->drv;
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_snapshot_create)
-return -ENOTSUP;
-return drv->bdrv_snapshot_create(bs, sn_info);
+if (drv->bdrv_snapshot_create)
+return drv->bdrv_snapshot_create(bs, sn_info);
+if (bs->file)
+return bdrv_snapshot_create(bs->file, sn_info);
+return -ENOTSUP;
 }
 
 int bdrv_snapshot_goto(BlockDriverState *bs,
const char *snapshot_id)
 {
 BlockDriver *drv = bs->drv;
+int ret, open_ret;
+
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_snapshot_goto)
-return -ENOTSUP;
-return drv->bdrv_snapshot_goto(bs, snapshot_id);
+if (drv->bdrv_snapshot_goto)
+return drv->bdrv_snapshot_goto(bs, snapshot_id);
+
+if (bs->file) {
+drv->bdrv_close(bs);
+ret = bdrv_snapshot_goto(bs->file, snapshot_id);
+open_ret = drv->bdrv_open(bs, bs->open_flags);
+if (open_ret < 0) {
+bdrv_delete(bs->file);
+bs->drv = NULL;
+return open_ret;
+}
+return ret;
+}
+
+return -ENOTSUP;
 }
 
 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
@@ -1755,9 +1776,11 @@ int bdrv_snapshot_delete(BlockDriverState *bs, const 
char *snapshot_id)
 BlockDriver *drv = bs->drv;
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_snapshot_delete)
-return -ENOTSUP;
-return drv->bdrv_snapshot_delete(bs, snapshot_id);
+if (drv->bdrv_snapshot_delete)
+return drv->bdrv_snapshot_delete(bs, snapshot_id);
+if (bs->file)
+return bdrv_snapshot_delete(bs->file, snapshot_id);
+return -ENOTSUP;
 }
 
 int bdrv_snapshot_list(BlockDriverState *bs,
@@ -1766,9 +1789,11 @@ int bdrv_snapshot_list(BlockDriverState *bs,
 BlockDriver *drv = bs->drv;
 if (!drv)
 return -ENOMEDIUM;
-if (!drv->bdrv_snapshot_list)
-return -ENOTSUP;
-return drv->bdrv_snapshot_list(bs, psn_info);
+if (drv->bdrv_snapshot_list)
+return drv->bdrv_snapshot_list(bs, psn_info);
+if (bs->file)
+return bdrv_snapshot_list(bs->file, psn_info);
+return -ENOTSUP;
 }
 
 #define NB_SUFFIXES 4
-- 
1.5.6.5




[Qemu-devel] [RFC PATCH v4 1/3] close all the block drivers before the qemu process exits

2010-05-27 Thread MORITA Kazutaka
This patch calls the close handler of the block driver before the qemu
process exits.

This is necessary because the sheepdog block driver releases the lock
of VM images in the close handler.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block.c |9 +
 block.h |1 +
 vl.c|1 +
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index 24c63f6..da0dc47 100644
--- a/block.c
+++ b/block.c
@@ -646,6 +646,15 @@ void bdrv_close(BlockDriverState *bs)
 }
 }
 
+void bdrv_close_all(void)
+{
+BlockDriverState *bs;
+
+QTAILQ_FOREACH(bs, &bdrv_states, list) {
+bdrv_close(bs);
+}
+}
+
 void bdrv_delete(BlockDriverState *bs)
 {
 /* remove from list, if necessary */
diff --git a/block.h b/block.h
index 756670d..25744b1 100644
--- a/block.h
+++ b/block.h
@@ -123,6 +123,7 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
 /* Ensure contents are flushed to disk.  */
 void bdrv_flush(BlockDriverState *bs);
 void bdrv_flush_all(void);
+void bdrv_close_all(void);
 
 int bdrv_has_zero_init(BlockDriverState *bs);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
diff --git a/vl.c b/vl.c
index 7121cd0..8ffe36f 100644
--- a/vl.c
+++ b/vl.c
@@ -1992,6 +1992,7 @@ static void main_loop(void)
 vm_stop(r);
 }
 }
+bdrv_close_all();
 pause_all_vcpus();
 }
 
-- 
1.5.6.5




[Qemu-devel] [RFC PATCH v4 3/3] block: add sheepdog driver for distributed storage support

2010-05-27 Thread MORITA Kazutaka
Sheepdog is a distributed storage system for QEMU. It provides highly
available block level storage volumes to VMs like Amazon EBS.  This
patch adds a qemu block driver for Sheepdog.

Sheepdog features are:
- No node in the cluster is special (no metadata node, no control
  node, etc)
- Linear scalability in performance and capacity
- No single point of failure
- Autonomous management (zero configuration)
- Useful volume management support such as snapshot and cloning
- Thin provisioning
- Autonomous load balancing

More details are available at the project site:
http://www.osrg.net/sheepdog/

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 Makefile.objs|2 +-
 block/sheepdog.c | 1835 ++
 2 files changed, 1836 insertions(+), 1 deletions(-)
 create mode 100644 block/sheepdog.c

diff --git a/Makefile.objs b/Makefile.objs
index 1a942e5..527a754 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += parallels.o nbd.o blkdebug.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/sheepdog.c b/block/sheepdog.c
new file mode 100644
index 000..68545e8
--- /dev/null
+++ b/block/sheepdog.c
@@ -0,0 +1,1835 @@
+/*
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ */
+#include <netdb.h>
+#include <netinet/tcp.h>
+
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block_int.h"
+
+#define SD_PROTO_VER 0x01
+
+#define SD_DEFAULT_ADDR "localhost:7000"
+
+#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
+#define SD_OP_READ_OBJ   0x02
+#define SD_OP_WRITE_OBJ  0x03
+
+#define SD_OP_NEW_VDI0x11
+#define SD_OP_LOCK_VDI   0x12
+#define SD_OP_RELEASE_VDI0x13
+#define SD_OP_GET_VDI_INFO   0x14
+#define SD_OP_READ_VDIS  0x15
+
+#define SD_FLAG_CMD_WRITE0x01
+#define SD_FLAG_CMD_COW  0x02
+
+#define SD_RES_SUCCESS   0x00 /* Success */
+#define SD_RES_UNKNOWN   0x01 /* Unknown error */
+#define SD_RES_NO_OBJ0x02 /* No object found */
+#define SD_RES_EIO   0x03 /* I/O error */
+#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
+#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
+#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
+#define SD_RES_VDI_LOCKED0x07 /* Vdi is locked */
+#define SD_RES_NO_VDI0x08 /* No vdi found */
+#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
+#define SD_RES_VDI_READ  0x0A /* Cannot read requested vdi */
+#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
+#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
+#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
+#define SD_RES_NO_TAG0x0E /* Requested tag is not found */
+#define SD_RES_STARTUP   0x0F /* Sheepdog is on starting up */
+#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
+#define SD_RES_SHUTDOWN  0x11 /* Sheepdog is shutting down */
+#define SD_RES_NO_MEM0x12 /* Cannot allocate memory */
+#define SD_RES_FULL_VDI  0x13 /* we already have the maximum vdis */
+#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
+#define SD_RES_NO_SPACE  0x15 /* Server has no room for new objects */
+#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Sheepdog is waiting for a format 
operation */
+#define SD_RES_WAIT_FOR_JOIN0x17 /* Sheepdog is waiting for other nodes 
joining */
+#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+
+/*
+ * Object ID rules
+ *
+ *  0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): vdi object space
+ * 56 - 59 ( 4 bits): reserved vdi object space
+ * 60 - 63 ( 4 bits): object type identifier space
+ */
+
+#define VDI_SPACE_SHIFT   32
+#define VDI_BIT (UINT64_C(1) << 63)
+#define VMSTATE_BIT (UINT64_C(1) << 62)
+#define MAX_DATA_OBJS (1ULL << 20)
+#define MAX_CHILDREN 1024
+#define SD_MAX_VDI_LEN 256
+#define SD_NR_VDIS   (1U << 24)
+#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
+
+#define SD_INODE_SIZE (sizeof(SheepdogInode))
+#define CURRENT_VDI_ID 0
+
+typedef struct SheepdogReq {
+   uint8_t proto_ver;
+   uint8_t opcode;
+   uint16_t    flags;
+   uint32_t    epoch;
+   uint32_t    id

[Qemu-devel] Re: [RFC PATCH v4 0/3] Sheepdog: distributed storage system for QEMU

2010-06-03 Thread MORITA Kazutaka
At Wed, 02 Jun 2010 12:49:02 +0200,
Kevin Wolf wrote:
 
 Am 28.05.2010 04:44, schrieb MORITA Kazutaka:
  Hi all,
  
  This patch adds a block driver for Sheepdog distributed storage
  system.  Please consider for inclusion.
 
 Hint for next time: You should remove the RFC from the subject line if
 you think the patch is ready for inclusion. Otherwise I might miss this
 and think you only want comments on it.
 

Thanks for the advice. I'll do so the next time.

  MORITA Kazutaka (3):
close all the block drivers before the qemu process exits
block: call the snapshot handlers of the protocol drivers
block: add sheepdog driver for distributed storage support
 
 Thanks, I have applied the first two patches to the block branch, they
 look good to me. I'll send some comments for the third one (though it's
 only coding style until now).
 

Thanks a lot.

Kazutaka



[Qemu-devel] Re: [RFC PATCH v4 3/3] block: add sheepdog driver for distributed storage support

2010-06-03 Thread MORITA Kazutaka
At Tue, 01 Jun 2010 09:58:04 -0500,
Thanks for your comments!

Chris Krumme wrote:
 
 On 05/27/2010 09:44 PM, MORITA Kazutaka wrote:
  Sheepdog is a distributed storage system for QEMU. It provides highly

  +
  +static int connect_to_sdog(const char *addr)
  +{
  +   char buf[64];
  +   char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
  +   char name[256], *p;
  +   int fd, ret;
  +   struct addrinfo hints, *res, *res0;
  +   int port = 0;
  +
  +   if (!addr) {
  +   addr = SD_DEFAULT_ADDR;
  +   }
  +
  +   strcpy(name, addr);
 
 
 Can strlen(addr) be > sizeof(name)?
 

Yes, we should check the length of addr.  This could cause an overflow.
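
(One way to do the bounded copy, as a sketch: the explicit length check
and error message are illustrative, and pstrcpy() is QEMU's bounded
string copy helper:)

    if (strlen(addr) >= sizeof(name)) {
        error_report("hostname too long, %s\n", addr);
        return -1;
    }
    pstrcpy(name, sizeof(name), addr);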

  +
  +   p = name;
  +   while (*p) {
  +   if (*p == ':') {
  +   *p++ = '\0';
 
 
 May also need to check for p < name + sizeof(name).
 

p should be NULL-terminated, so the check is not required, I think.

  +   break;
  +   } else {
  +   p++;
  +   }
  +   }
  +
  +   if (*p == '\0') {
  +   error_report("cannot find a port number, %s\n", name);
  +   return -1;
  +   }
  +   port = strtol(p, NULL, 10);
 
 
 Are negative numbers valid here?
 

No. It is better to use strtoul.
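
(A sketch of what the stricter parsing could look like; the range check
and error message are illustrative:)

    char *endp;
    unsigned long val;

    val = strtoul(p, &endp, 10);
    if (*endp != '\0' || val == 0 || val > 65535) {
        error_report("invalid port number, %s\n", p);
        return -1;
    }
    port = val;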


  +
  +static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
  +char *vdi, int vdi_len, uint32_t *snapid)
  +{
  +   char *p, *q;
  +   int nr_sep;
  +
  +   p = q = strdup(filename);
  +
  +   if (!p) {
 
 
 I think Qemu has a version of strdup that will not return NULL.
 

Yes. We can use qemu_strdup here.


  +
  +/* TODO: error cleanups */
  +static int sd_open(BlockDriverState *bs, const char *filename, int flags)
  +{
  +   int ret, fd;
  +   uint32_t vid = 0;
  +   BDRVSheepdogState *s = bs->opaque;
  +   char vdi[256];
  +   uint32_t snapid;
  +   int for_snapshot = 0;
  +   char *buf;
  +
  +   strstart(filename, "sheepdog:", (const char **)filename);
  +
  +   buf = qemu_malloc(SD_INODE_SIZE);
  +
  +   memset(vdi, 0, sizeof(vdi));
  +   if (parse_vdiname(s, filename, vdi, sizeof(vdi), &snapid) < 0) {
  +   goto out;
  +   }
  +   s->fd = get_sheep_fd(s);
  +   if (s->fd < 0) {
 
 
 buf is not freed, goto out maybe.
 

Yes, we should goto out here.


  +
  +static int do_sd_create(const char *addr, char *filename, char *tag,
  +   int64_t total_sectors, uint32_t base_vid,
  +   uint32_t *vdi_id, int snapshot)
  +{
  +   SheepdogVdiReq hdr;
  +   SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
  +   int fd, ret;
  +   unsigned int wlen, rlen = 0;
  +   char buf[SD_MAX_VDI_LEN];
  +
  +   fd = connect_to_sdog(addr);
  +   if (fd < 0) {
  +   return -1;
  +   }
  +
  +   strncpy(buf, filename, SD_MAX_VDI_LEN);
  +
  +   memset(&hdr, 0, sizeof(hdr));
  +   hdr.opcode = SD_OP_NEW_VDI;
  +   hdr.base_vdi_id = base_vid;
  +
  +   wlen = SD_MAX_VDI_LEN;
  +
  +   hdr.flags = SD_FLAG_CMD_WRITE;
  +   hdr.snapid = snapshot;
  +
  +   hdr.data_length = wlen;
  +   hdr.vdi_size = total_sectors * 512;
 
 
 There is another patch on the list changing 512 to a define for sector size.
 

OK. We'll define SECTOR_SIZE.
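
(A sketch of the change; the define name follows the suggestion above:)

    #define SECTOR_SIZE 512
    /* ... */
    hdr.vdi_size = total_sectors * SECTOR_SIZE;
    /* ... */
    total_sectors = options->value.n / SECTOR_SIZE;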


  +
  +   ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
  +
  +   close(fd);
  +
  +   if (ret) {
  +   return -1;
  +   }
  +
  +   if (rsp->result != SD_RES_SUCCESS) {
  +   error_report("%s, %s\n", sd_strerror(rsp->result), filename);
  +   return -1;
  +   }
  +
  +   if (vdi_id) {
  +   *vdi_id = rsp->vdi_id;
  +   }
  +
  +   return 0;
  +}
  +
  +static int sd_create(const char *filename, QEMUOptionParameter *options)
  +{
  +   int ret;
  +   uint32_t vid = 0;
  +   int64_t total_sectors = 0;
  +   char *backing_file = NULL;
  +
  +   strstart(filename, "sheepdog:", (const char **)filename);
  +
  +   while (options && options->name) {
  +   if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
  +   total_sectors = options->value.n / 512;

 Use define.
  +   } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
  +   backing_file = options->value.s;
  +   }
  +   options++;
  +   }
  +
  +   if (backing_file) {
  +   BlockDriverState bs;
  +   char vdi[SD_MAX_VDI_LEN];
  +   uint32_t snapid;
  +
  +   strstart(backing_file, "sheepdog:", (const char **)backing_file);
  +   memset(&bs, 0, sizeof(bs));
  +
  +   bs.opaque = qemu_malloc(sizeof(BDRVSheepdogState));
 
 
 bs seems to have a short life span, is opaque getting freed?
 

No, we should free it.
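
(A sketch of the cleanup, assuming the temporary BlockDriverState from
the quoted code goes out of scope right after the backing vdi is
resolved:)

    bs.opaque = qemu_malloc(sizeof(BDRVSheepdogState));
    /* ... use the temporary state to look up the backing vdi ... */
    qemu_free(bs.opaque);   /* release it before returning */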


  +
  +static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo 
  *sn_info)
  +{
  +   BDRVSheepdogState *s = bs->opaque;
  +   int ret, fd;
  +   uint32_t new_vid;
  +   SheepdogInode *inode;
  +   unsigned int datalen;
  +   uint64_t offset;
  +
  +   dprintf(sn_info: name %s id_str %s s: name %s vm_state_size %d 
  +   is_current %d\n, sn_info-name, sn_info-id_str,
  +   s

[Qemu-devel] Re: [RFC PATCH v4 3/3] block: add sheepdog driver for distributed storage support

2010-06-03 Thread MORITA Kazutaka
At Wed, 02 Jun 2010 15:55:42 +0200,
Kevin Wolf wrote:
 
 Am 28.05.2010 04:44, schrieb MORITA Kazutaka:
  Sheepdog is a distributed storage system for QEMU. It provides highly
  available block level storage volumes to VMs like Amazon EBS.  This
  patch adds a qemu block driver for Sheepdog.
  
  Sheepdog features are:
  - No node in the cluster is special (no metadata node, no control
node, etc)
  - Linear scalability in performance and capacity
  - No single point of failure
  - Autonomous management (zero configuration)
  - Useful volume management support such as snapshot and cloning
  - Thin provisioning
  - Autonomous load balancing
  
  The more details are available at the project site:
  http://www.osrg.net/sheepdog/
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
  ---
   Makefile.objs|2 +-
   block/sheepdog.c | 1835 
  ++
   2 files changed, 1836 insertions(+), 1 deletions(-)
   create mode 100644 block/sheepdog.c
 
 One general thing: The code uses some mix of spaces and tabs for
 indentation, with the greatest part using tabs. According to
 CODING_STYLE it should consistently use four spaces instead.
 

OK.  I'll fix the indentation according to CODING_STYLE.


  +
  +typedef struct SheepdogInode {
  +   char name[SD_MAX_VDI_LEN];
  +   uint64_t ctime;
  +   uint64_t snap_ctime;
  +   uint64_t vm_clock_nsec;
  +   uint64_t vdi_size;
  +   uint64_t vm_state_size;
  +   uint16_t copy_policy;
  +   uint8_t  nr_copies;
  +   uint8_t  block_size_shift;
  +   uint32_t snap_id;
  +   uint32_t vdi_id;
  +   uint32_t parent_vdi_id;
  +   uint32_t child_vdi_id[MAX_CHILDREN];
  +   uint32_t data_vdi_id[MAX_DATA_OBJS];
 
 Wow, this is a huge array. :-)
 
 So Sheepdog has a fixed limit of 16 TB, right?
 

MAX_DATA_OBJS is (1 << 20), and the size of an object is 4 MB.  So the
limit of the Sheepdog image size is 4 TB.
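
(The arithmetic, for reference:)

    max image size = MAX_DATA_OBJS * SD_DATA_OBJ_SIZE
                   = (1 << 20) * (1 << 22) bytes
                   = 4 TB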

These values are hard-coded, and I guess they should be configurable.


 
  +} SheepdogInode;
  +

  +
  +static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
  +{
  +   SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
  +
  +   acb->canceled = 1;
  +}
 
 Does this provide the right semantics? You haven't really cancelled the
 request, but you pretend to. So you actually complete the request in the
 background and then throw the return code away.
 
 I seem to remember that posix-aio-compat.c waits at this point for
 completion of the requests, calls the callbacks and only afterwards
 returns from aio_cancel when no more requests are in flight.
 
 Or if you can really cancel requests, it would be the best option, of
 course.
 

Sheepdog cannot cancel requests which have already been sent to the
servers.  So, as you say, we pretend to cancel the requests without
waiting for their completion.  However, is there any situation
where pretending to cancel causes problems in practice?

To wait for completion of the requests here, we may need to create
another thread for processing I/O like posix-aio-compat.c.


  +
  +static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
  +   int write)
 
 I've spent at least 15 minutes figuring out what this function does. I
 think I've got it now more or less, but I've come to the conclusion that
 this code needs more comments.
 
 I'd suggest to add a header comment to all non-trivial functions and
 maybe somewhere on the top a general description of how things work.
 
 As far as I understood now, there are basically two parts of request
 handling:
 
 1. The request is sent to the server. Its AIOCB is saved in a list in
 the BDRVSheepdogState. It doesn't pass a callback or anything for the
 completion.
 
 2. aio_read_response is registered as a fd handler to the sheepdog
 connection. When the server responds, it searches the right AIOCB in the
 list and the second part of request handling starts.
 
 do_send_recv is the function that is used to do all communication with
 the server. The iov stuff looks like it's only used for some data, but
 seems this is not true - it's also used for the metadata of the protocol.
 
 Did I understand it right so far?
 

Yes, exactly.  I'll add comments to make the code more readable.


  +{
  +   struct msghdr msg;
  +   int ret, diff;
  +
  +   memset(&msg, 0, sizeof(msg));
  +   msg.msg_iov = iov;
  +   msg.msg_iovlen = 1;
  +
  +   len += offset;
  +
  +   while (iov->iov_len < len) {
  +   len -= iov->iov_len;
  +
  +   iov++;
  +   msg.msg_iovlen++;
  +   }
 
 You're counting the number of elements in the iov here. qemu_iovec would
 already have these (and also len), wouldn't it make sense to use it as
 the abstraction? Though I'm not sure where these iovecs come from, so
 the answer might be no.
 

We use struct msghdr for sendmsg/recvmsg here, so using the iovec
directly looks simpler.


  +
  +   diff = iov->iov_len - len;
  +   iov->iov_len -= diff;
  +
  +   while (msg.msg_iov->iov_len

[Qemu-devel] Re: [RFC PATCH v4 3/3] block: add sheepdog driver for distributed storage support

2010-06-06 Thread MORITA Kazutaka
At Fri, 04 Jun 2010 13:04:00 +0200,
Kevin Wolf wrote:
 
 Am 03.06.2010 18:23, schrieb MORITA Kazutaka:
  +static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
  +{
  + SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
  +
  + acb->canceled = 1;
  +}
 
  Does this provide the right semantics? You haven't really cancelled the
  request, but you pretend to. So you actually complete the request in the
  background and then throw the return code away.
 
  I seem to remember that posix-aio-compat.c waits at this point for
  completion of the requests, calls the callbacks and only afterwards
  returns from aio_cancel when no more requests are in flight.
 
  Or if you can really cancel requests, it would be the best option, of
  course.
 
  
  Sheepdog cannot cancel the requests which are already sent to the
  servers.  So, as you say, we pretend to cancel the requests without
  waiting for completion of them.  However, are there any situation
  where pretending to cancel causes problems in practice?
 
 I'm not sure how often it would happen in practice, but if the guest OS
 thinks the old value is on disk when in fact the new one is, this could
 lead to corruption. I think if it can happen, even without evidence that
 it actually does, it's already relevant enough.
 

I agree.

  To wait for completion of the requests here, we may need to create
  another thread for processing I/O like posix-aio-compat.c.
 
 I don't think you need a thread to get the same behaviour, you just need
 to call the fd handlers like in the main loop. It would probably be the
 first driver doing this, though, and it's not an often used code path,
 so it might be a bad idea.
 
 Maybe it's reasonable to just complete the request with -EIO? This way
 the guest couldn't make any assumption about the data written. On the
 other hand, it could be unhappy about failed requests, but that's
 probably better than corruption.
 

Completing with -EIO looks good to me.  Thanks for the advice.
I'll send an updated patch tomorrow.
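
(A sketch of what the updated handler might look like, assuming the
AIOCB embeds the usual BlockDriverAIOCB 'common' field:)

    static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
    {
        SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;

        /* the request cannot be recalled from the servers, so complete it
         * with -EIO; the guest then makes no assumption about the data */
        acb->common.cb(acb->common.opaque, -EIO);
        acb->canceled = 1;
    }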

Regards,

Kazutaka



[Qemu-devel] [PATCH v5] block: add sheepdog driver for distributed storage support

2010-06-07 Thread MORITA Kazutaka
Sheepdog is a distributed storage system for QEMU. It provides highly
available block level storage volumes to VMs like Amazon EBS.  This
patch adds a qemu block driver for Sheepdog.

Sheepdog features are:
- No node in the cluster is special (no metadata node, no control
  node, etc)
- Linear scalability in performance and capacity
- No single point of failure
- Autonomous management (zero configuration)
- Useful volume management support such as snapshot and cloning
- Thin provisioning
- Autonomous load balancing

More details are available at the project site:
http://www.osrg.net/sheepdog/

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
Changes from v4 to v5 are:
 - address the comments on the sheepdog driver (Thanks Kevin, Chris!)
 -- fix coding style
 -- fix aio_cancel handling
 -- fix an overflow bug when copying the hostname
 -- add comments to the non-trivial functions
 - remove already applied patches from the patchset

Changes from v3 to v4 are:
 - fix error handling in bdrv_snapshot_goto.

Changes from v2 to v3 are:

 - add drv->bdrv_close() and drv->bdrv_open() before and after
   bdrv_snapshot_goto() call of the protocol.
 - address the review comments on the sheepdog driver code.

Changes from v1 to v2 are:

 - rebase onto git://repo.or.cz/qemu/kevin.git block
 - modify the sheepdog driver as a protocol driver
 - add new patch to call the snapshot handler of the protocol

 Makefile.objs|2 +-
 block/sheepdog.c | 1905 ++
 2 files changed, 1906 insertions(+), 1 deletions(-)
 create mode 100644 block/sheepdog.c

diff --git a/Makefile.objs b/Makefile.objs
index 54dec26..070db8a 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += parallels.o nbd.o blkdebug.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/sheepdog.c b/block/sheepdog.c
new file mode 100644
index 000..a9477a5
--- /dev/null
+++ b/block/sheepdog.c
@@ -0,0 +1,1905 @@
+/*
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ */
+#include <netdb.h>
+#include <netinet/tcp.h>
+
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block_int.h"
+
+#define SD_PROTO_VER 0x01
+
+#define SD_DEFAULT_ADDR "localhost"
+#define SD_DEFAULT_PORT 7000
+
+#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
+#define SD_OP_READ_OBJ   0x02
+#define SD_OP_WRITE_OBJ  0x03
+
+#define SD_OP_NEW_VDI0x11
+#define SD_OP_LOCK_VDI   0x12
+#define SD_OP_RELEASE_VDI0x13
+#define SD_OP_GET_VDI_INFO   0x14
+#define SD_OP_READ_VDIS  0x15
+
+#define SD_FLAG_CMD_WRITE0x01
+#define SD_FLAG_CMD_COW  0x02
+
+#define SD_RES_SUCCESS   0x00 /* Success */
+#define SD_RES_UNKNOWN   0x01 /* Unknown error */
+#define SD_RES_NO_OBJ0x02 /* No object found */
+#define SD_RES_EIO   0x03 /* I/O error */
+#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
+#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
+#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
+#define SD_RES_VDI_LOCKED0x07 /* Vdi is locked */
+#define SD_RES_NO_VDI0x08 /* No vdi found */
+#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
+#define SD_RES_VDI_READ  0x0A /* Cannot read requested vdi */
+#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
+#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
+#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */
+#define SD_RES_NO_TAG0x0E /* Requested tag is not found */
+#define SD_RES_STARTUP   0x0F /* Sheepdog is on starting up */
+#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */
+#define SD_RES_SHUTDOWN  0x11 /* Sheepdog is shutting down */
+#define SD_RES_NO_MEM0x12 /* Cannot allocate memory */
+#define SD_RES_FULL_VDI  0x13 /* we already have the maximum vdis */
+#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */
+#define SD_RES_NO_SPACE  0x15 /* Server has no room for new objects */
+#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */
+#define SD_RES_WAIT_FOR_JOIN0x17 /* Waiting for other nodes joining */
+#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+
+/*
+ * Object ID rules
+ *
+ *  0 - 19 (20 bits

Re: [Qemu-devel] [PATCH v4] savevm: Really verify if a drive supports snapshots

2010-06-07 Thread MORITA Kazutaka
At Fri,  4 Jun 2010 16:35:59 -0300,
Miguel Di Ciurcio Filho wrote:
 
 Both bdrv_can_snapshot() and bdrv_has_snapshot() do not work as advertised.
 
 First issue: Their names imply different purposes, but they do the same thing
 and have exactly the same code. Maybe copied and pasted and forgotten?
 bdrv_has_snapshot() is called in various places for actually checking if there
 is snapshots or not.
 
 Second issue: the way bdrv_can_snapshot() verifies whether a block driver
 supports snapshots or not does not catch all cases, e.g. a raw image.
 
 So when do_savevm() is called, the first thing it does is set a global
 BlockDriverState to save the VM memory state by calling get_bs_snapshots().
 
 static BlockDriverState *get_bs_snapshots(void)
 {
 BlockDriverState *bs;
 DriveInfo *dinfo;
 
 if (bs_snapshots)
 return bs_snapshots;
 QTAILQ_FOREACH(dinfo, drives, next) {
 bs = dinfo-bdrv;
 if (bdrv_can_snapshot(bs))
 goto ok;
 }
 return NULL;
  ok:
 bs_snapshots = bs;
 return bs;
 }
 
 bdrv_can_snapshot() may return a BlockDriverState that does not support
 snapshots and do_savevm() goes on.
 
 Later on in do_savevm(), we find:
 
 QTAILQ_FOREACH(dinfo, drives, next) {
 bs1 = dinfo-bdrv;
 if (bdrv_has_snapshot(bs1)) {
 /* Write VM state size only to the image that contains the state 
 */
 sn-vm_state_size = (bs == bs1 ? vm_state_size : 0);
 ret = bdrv_snapshot_create(bs1, sn);
 if (ret  0) {
 monitor_printf(mon, Error while creating snapshot on '%s'\n,
bdrv_get_device_name(bs1));
 }
 }
 }
 
 bdrv_has_snapshot(bs1) is not checking whether the device supports or has
 snapshots, as explained above. Only in bdrv_snapshot_create() is the device
 actually checked for snapshot support.
 
 So, in cases where the first device supports snapshots and the second does
 not, the snapshot on the first will happen anyway. I believe this is not good
 behavior. It should be an all-or-nothing process.
 
 This patch addresses these issues by making bdrv_can_snapshot() actually do
 what it must do and by enforcing better tests to avoid errors in the middle of
 do_savevm(). bdrv_has_snapshot() is removed and replaced by bdrv_can_snapshot()
 where appropriate.
 
 bdrv_can_snapshot() was moved from savevm.c to block.c. It makes more sense 
 to me.
 
 The loadvm_state() function was also updated to enforce that, when loading a
 VM, at least all writable devices must support snapshots.
 
 Signed-off-by: Miguel Di Ciurcio Filho miguel.fi...@gmail.com
 ---
  block.c  |   11 +++
  block.h  |1 +
  savevm.c |   58 --
  3 files changed, 48 insertions(+), 22 deletions(-)
 
 diff --git a/block.c b/block.c
 index cd70730..ace3cdb 100644
 --- a/block.c
 +++ b/block.c
 @@ -1720,6 +1720,17 @@ void bdrv_debug_event(BlockDriverState *bs, 
 BlkDebugEvent event)
  /**/
  /* handling of snapshots */
  
 +int bdrv_can_snapshot(BlockDriverState *bs)
 +{
 +BlockDriver *drv = bs-drv;
 +if (!drv || !drv-bdrv_snapshot_create || bdrv_is_removable(bs) ||
 +bdrv_is_read_only(bs)) {
 +return 0;
 +}
 +
 +return 1;
 +}
 +

The underlying protocol could support snapshots, so I think we should
check against bs-file too.

--- a/block.c
+++ b/block.c
@@ -1671,6 +1671,9 @@ int bdrv_can_snapshot(BlockDriverState *bs)
 BlockDriver *drv = bs-drv;
 if (!drv || !drv-bdrv_snapshot_create || bdrv_is_removable(bs) ||
 bdrv_is_read_only(bs)) {
+if (bs-file) {
+return bdrv_can_snapshot(bs-file);
+}
 return 0;
 }
 
Regards,

Kazutaka



[Qemu-devel] Re: [PATCH v5] block: add sheepdog driver for distributed storage support

2010-06-15 Thread MORITA Kazutaka
At Tue, 15 Jun 2010 10:24:14 +0200,
Kevin Wolf wrote:
 
 Am 14.06.2010 21:48, schrieb MORITA Kazutaka:
  3) qemu-io aio_read/write doesn't seem to work well with it. I only get
  the result of the AIO request when I exit qemu-io. This may be a qemu-io
  problem or a Sheepdog one. We need to look into this, qemu-io is
  important for testing and debugging (particularly for qemu-iotests)
 
  Sheepdog receives responses from the server in the fd handler to the
  socket connection.  But, while qemu-io executes aio_read/aio_write, it
  doesn't call qemu_aio_wait() and the fd handler isn't invoked at all.
  This seems to be the reason of the problem.
  
  I'm not sure this is a qemu-io problem or a Sheepdog one.  If it is a
  qemu-io problem, we need to call qemu_aio_wait() somewhere in the
  command_loop(), I guess.  If it is a Sheepdog problem, we need to
  consider another mechanism to receive responses...
 
 Not sure either.
 
 I think posix-aio-compat needs fd handlers to be called, too, and it
 kind of works. I'm saying kind of because after an aio_read/write
 command qemu-io exits (it doesn't with Sheepdog). And when exiting there
 is a qemu_aio_wait(), so this explains why you get a result there.
 
 I guess it's a bug in the posix-aio-compat case rather than with Sheepdog.
 
It seems that fgets() is interrupted by a signal in fetchline() and
qemu-io exits.

BTW, I think we should call the fd handlers when user input is idle
and the fds become ready.  I'll send the patch later.

 The good news is that if qemu-iotests works with only one aio_read/write
 command before qemu-io exits, it's going to work with Sheepdog, too.
 
Great!


Thanks,

Kazutaka



[Qemu-devel] [PATCH 1/2] qemu-io: retry fgets() when errno is EINTR

2010-06-15 Thread MORITA Kazutaka
posix-aio-compat sends a signal in aio operations, so we should
consider that fgets() could be interrupted here.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 cmd.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/cmd.c b/cmd.c
index 2336334..460df92 100644
--- a/cmd.c
+++ b/cmd.c
@@ -272,7 +272,10 @@ fetchline(void)
return NULL;
printf(%s, get_prompt());
fflush(stdout);
+again:
if (!fgets(line, MAXREADLINESZ, stdin)) {
+   if (errno == EINTR)
+   goto again;
free(line);
return NULL;
}
-- 
1.5.6.5




[Qemu-devel] [PATCH 2/2] qemu-io: check registered fds in command_loop()

2010-06-15 Thread MORITA Kazutaka
Some block drivers use an aio handler and run their I/O completion
routines in it.  However, the handler is not invoked if we only do
aio_read/write, because the registered fds are not checked at all.

This patch registers a command processing function as an fd handler
for standard input, and calls qemu_aio_wait() in command_loop().  Any
other handlers can be invoked when user input is idle.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 cmd.c |   53 +++--
 1 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/cmd.c b/cmd.c
index 460df92..2b66e24 100644
--- a/cmd.c
+++ b/cmd.c
@@ -24,6 +24,7 @@
 #include getopt.h
 
 #include cmd.h
+#include qemu-aio.h
 
 #define _(x)   x   /* not gettext support yet */
 
@@ -149,6 +150,37 @@ add_args_command(
args_func = af;
 }
 
+static char *get_prompt(void);
+
+static void do_command(void *opaque)
+{
+   int c;
+   int *done = opaque;
+   char*input;
+   char**v;
+   const cmdinfo_t *ct;
+
+   if ((input = fetchline()) == NULL) {
+   *done = 1;
+   return;
+   }
+   v = breakline(input, c);
+   if (c) {
+   ct = find_command(v[0]);
+   if (ct)
+   *done = command(ct, c, v);
+   else
+   fprintf(stderr, _(command \%s\ not found\n),
+   v[0]);
+   }
+   doneline(input, v);
+
+   if (*done == 0) {
+   printf(%s, get_prompt());
+   fflush(stdout);
+   }
+}
+
 void
 command_loop(void)
 {
@@ -186,20 +218,15 @@ command_loop(void)
free(cmdline);
return;
}
+
+   printf(%s, get_prompt());
+   fflush(stdout);
+
+   qemu_aio_set_fd_handler(STDIN_FILENO, do_command, NULL, NULL, NULL, 
done);
while (!done) {
-   if ((input = fetchline()) == NULL)
-   break;
-   v = breakline(input, c);
-   if (c) {
-   ct = find_command(v[0]);
-   if (ct)
-   done = command(ct, c, v);
-   else
-   fprintf(stderr, _(command \%s\ not found\n),
-   v[0]);
-   }
-   doneline(input, v);
+   qemu_aio_wait();
}
+   qemu_aio_set_fd_handler(STDIN_FILENO, NULL, NULL, NULL, NULL, NULL);
 }
 
 /* from libxcmd/input.c */
@@ -270,8 +297,6 @@ fetchline(void)
 
if (!line)
return NULL;
-   printf(%s, get_prompt());
-   fflush(stdout);
 again:
if (!fgets(line, MAXREADLINESZ, stdin)) {
if (errno == EINTR)
-- 
1.5.6.5




[Qemu-devel] [PATCH 0/2] qemu-io: fix aio_read/write problems

2010-06-15 Thread MORITA Kazutaka
Hi,

This patchset fixes the following qemu-io problems:

 - Qemu-io exits suddenly when we do aio_read/write to drivers which
   use posix-aio-compat.

 - We cannot get the results of aio_read/write if we don't do other
   operations.  This problem occurs when the block driver uses an fd
   handler for I/O completion.

Thanks,

Kazutaka


MORITA Kazutaka (2):
  qemu-io: retry fgets() when errno is EINTR
  qemu-io: check registered fds in command_loop()

 cmd.c |   56 ++--
 1 files changed, 42 insertions(+), 14 deletions(-)




Re: [Qemu-devel] Re: [PATCH 1/2] qemu-io: retry fgets() when errno is EINTR

2010-06-16 Thread MORITA Kazutaka
At Wed, 16 Jun 2010 13:04:47 +0200,
Kevin Wolf wrote:
 
 Am 15.06.2010 19:53, schrieb MORITA Kazutaka:
  posix-aio-compat sends a signal in aio operations, so we should
  consider that fgets() could be interrupted here.
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
  ---
   cmd.c |3 +++
   1 files changed, 3 insertions(+), 0 deletions(-)
  
  diff --git a/cmd.c b/cmd.c
  index 2336334..460df92 100644
  --- a/cmd.c
  +++ b/cmd.c
  @@ -272,7 +272,10 @@ fetchline(void)
  return NULL;
  printf(%s, get_prompt());
  fflush(stdout);
  +again:
  if (!fgets(line, MAXREADLINESZ, stdin)) {
  +   if (errno == EINTR)
  +   goto again;
  free(line);
  return NULL;
  }
 
 This looks like a loop replaced by goto (and braces are missing). What
 about this instead?
 
 do {
 ret = fgets(...)
 } while (ret == NULL  errno == EINTR)
 
 if (ret == NULL) {
fail
 }
 

I agree.

However, it seems that my second patch has already solved the
problem.  We now register the readline routine as an aio handler, so
fgets() does not block and cannot return with EINTR.

This patch is no longer needed, sorry.

Thanks,

Kazutaka



Re: [Qemu-devel] Re: [PATCH 1/2] qemu-io: retry fgets() when errno is EINTRg

2010-06-17 Thread MORITA Kazutaka
At Thu, 17 Jun 2010 18:18:18 +0100,
Jamie Lokier wrote:
 
 Kevin Wolf wrote:
  Am 16.06.2010 18:52, schrieb MORITA Kazutaka:
   At Wed, 16 Jun 2010 13:04:47 +0200,
   Kevin Wolf wrote:
  
   Am 15.06.2010 19:53, schrieb MORITA Kazutaka:
   posix-aio-compat sends a signal in aio operations, so we should
   consider that fgets() could be interrupted here.
  
   Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
   ---
cmd.c |3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
  
   diff --git a/cmd.c b/cmd.c
   index 2336334..460df92 100644
   --- a/cmd.c
   +++ b/cmd.c
   @@ -272,7 +272,10 @@ fetchline(void)
   return NULL;
   printf(%s, get_prompt());
   fflush(stdout);
   +again:
   if (!fgets(line, MAXREADLINESZ, stdin)) {
   +   if (errno == EINTR)
   +   goto again;
   free(line);
   return NULL;
   }
  
   This looks like a loop replaced by goto (and braces are missing). What
   about this instead?
  
   do {
   ret = fgets(...)
   } while (ret == NULL  errno == EINTR)
  
   if (ret == NULL) {
  fail
   }
  
   
   I agree.
   
   However, it seems that my second patch have already solved the
   problem.  We register this readline routines as an aio handler now, so
   fgets() does not block and cannot return with EINTR.
   
   This patch looks no longer needed, sorry.
  
  Good point. Thanks for having a look.
 
 Anyway, are you sure stdio functions can be interrupted with EINTR?
 Linus reminds us that some stdio functions have to retry internally
 anyway:
 
 http://comments.gmane.org/gmane.comp.version-control.git/18285
 

I think it is a separate question whether fgets() retries internally when
a read system call is interrupted.  We should handle EINTR if the
system call can return it.  I think read() doesn't return with EINTR
if it doesn't block on Linux, but that may not be true on
other operating systems.
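
For reference, the usual retry-on-EINTR idiom around a raw read() looks
roughly like this (a generic sketch, not the qemu-io code itself):

#include <errno.h>
#include <unistd.h>

/* Read up to count bytes, retrying if the call is interrupted by a signal. */
static ssize_t read_retry(int fd, void *buf, size_t count)
{
    ssize_t ret;

    do {
        ret = read(fd, buf, count);
    } while (ret < 0 && errno == EINTR);

    return ret;
}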

I'm sending the fixed patch.  I'm not sure this patch is really needed,
but it doesn't hurt anyway.

=
posix-aio-compat sends a signal in aio operations, so we should
consider that fgets() could be interrupted here.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 cmd.c |   14 +-
 1 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cmd.c b/cmd.c
index aee2a38..733bacd 100644
--- a/cmd.c
+++ b/cmd.c
@@ -293,14 +293,18 @@ fetchline(void)
 char *
 fetchline(void)
 {
-   char*p, *line = malloc(MAXREADLINESZ);
+   char*p, *line = malloc(MAXREADLINESZ), *ret;
 
if (!line)
return NULL;
-   if (!fgets(line, MAXREADLINESZ, stdin)) {
-   free(line);
-   return NULL;
-   }
+do {
+ret = fgets(line, MAXREADLINESZ, stdin);
+} while (ret == NULL  errno == EINTR);
+
+if (ret == NULL) {
+free(line);
+return NULL;
+}
p = line + strlen(line);
if (p != line  p[-1] == '\n')
p[-1] = '\0';
-- 
1.5.6.5




[Qemu-devel] [PATCH v2] qemu-io: check registered fds in command_loop()

2010-06-20 Thread MORITA Kazutaka
Some block drivers use an aio handler and do I/O completion routines
in it.  However, the handler is not invoked if we only do
aio_read/write, because registered fds are not checked at all.

This patch registers an aio handler for standard input which checks
whether we can read a command without blocking, and calls
qemu_aio_wait() in command_loop().  Any other handlers can be invoked
when user input is idle.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---

It seems that the QEMU aio implementation doesn't allow calling
qemu_aio_wait() from inside an aio handler, so the previous patch is
broken.

This patch only checks in the aio handler that standard input is ready
to read a line, and invokes the command in command_loop().

I think this also fixes the problem which occurs in qemu-iotests.

 cmd.c |   33 ++---
 1 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/cmd.c b/cmd.c
index 2336334..db2c9c4 100644
--- a/cmd.c
+++ b/cmd.c
@@ -24,6 +24,7 @@
 #include getopt.h
 
 #include cmd.h
+#include qemu-aio.h
 
 #define _(x)   x   /* not gettext support yet */
 
@@ -149,10 +150,20 @@ add_args_command(
args_func = af;
 }
 
+static void prep_fetchline(void *opaque)
+{
+int *fetchable = opaque;
+
+qemu_aio_set_fd_handler(STDIN_FILENO, NULL, NULL, NULL, NULL, NULL);
+*fetchable= 1;
+}
+
+static char *get_prompt(void);
+
 void
 command_loop(void)
 {
-   int c, i, j = 0, done = 0;
+   int c, i, j = 0, done = 0, fetchable = 0, prompted = 0;
char*input;
char**v;
const cmdinfo_t *ct;
@@ -186,7 +197,21 @@ command_loop(void)
free(cmdline);
return;
}
+
while (!done) {
+if (!prompted) {
+printf(%s, get_prompt());
+fflush(stdout);
+qemu_aio_set_fd_handler(STDIN_FILENO, prep_fetchline, NULL, NULL,
+NULL, fetchable);
+prompted = 1;
+}
+
+qemu_aio_wait();
+
+if (!fetchable) {
+continue;
+}
if ((input = fetchline()) == NULL)
break;
v = breakline(input, c);
@@ -199,7 +224,11 @@ command_loop(void)
v[0]);
}
doneline(input, v);
+
+prompted = 0;
+fetchable = 0;
}
+qemu_aio_set_fd_handler(STDIN_FILENO, NULL, NULL, NULL, NULL, NULL);
 }
 
 /* from libxcmd/input.c */
@@ -270,8 +299,6 @@ fetchline(void)
 
if (!line)
return NULL;
-   printf(%s, get_prompt());
-   fflush(stdout);
if (!fgets(line, MAXREADLINESZ, stdin)) {
free(line);
return NULL;
-- 
1.5.6.5




[Qemu-devel] [PATCH] qemu-img: avoid calling exit(1) to release resources properly

2010-06-20 Thread MORITA Kazutaka
This patch removes exit(1) from error(), and properly releases
resources such as block drivers and allocated memory.

For testing the Sheepdog block driver with qemu-iotests, it is
necessary to call bdrv_delete() before the program exits, because the
driver releases the lock on VM images in its close handler.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 qemu-img.c |  235 +++-
 1 files changed, 184 insertions(+), 51 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index ea091f0..fe8a577 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -39,14 +39,13 @@ typedef struct img_cmd_t {
 /* Default to cache=writeback as data integrity is not important for qemu-tcg. 
*/
 #define BDRV_O_FLAGS BDRV_O_CACHE_WB
 
-static void QEMU_NORETURN error(const char *fmt, ...)
+static void error(const char *fmt, ...)
 {
 va_list ap;
 va_start(ap, fmt);
 fprintf(stderr, qemu-img: );
 vfprintf(stderr, fmt, ap);
 fprintf(stderr, \n);
-exit(1);
 va_end(ap);
 }
 
@@ -197,57 +196,76 @@ static BlockDriverState *bdrv_new_open(const char 
*filename,
 char password[256];
 
 bs = bdrv_new();
-if (!bs)
+if (!bs) {
 error(Not enough memory);
+goto fail;
+}
 if (fmt) {
 drv = bdrv_find_format(fmt);
-if (!drv)
+if (!drv) {
 error(Unknown file format '%s', fmt);
+goto fail;
+}
 } else {
 drv = NULL;
 }
 if (bdrv_open(bs, filename, flags, drv)  0) {
 error(Could not open '%s', filename);
+goto fail;
 }
 if (bdrv_is_encrypted(bs)) {
 printf(Disk image '%s' is encrypted.\n, filename);
-if (read_password(password, sizeof(password))  0)
+if (read_password(password, sizeof(password))  0) {
 error(No password given);
-if (bdrv_set_key(bs, password)  0)
+goto fail;
+}
+if (bdrv_set_key(bs, password)  0) {
 error(invalid password);
+goto fail;
+}
 }
 return bs;
+fail:
+if (bs) {
+bdrv_delete(bs);
+}
+return NULL;
 }
 
-static void add_old_style_options(const char *fmt, QEMUOptionParameter *list,
+static int add_old_style_options(const char *fmt, QEMUOptionParameter *list,
 int flags, const char *base_filename, const char *base_fmt)
 {
 if (flags  BLOCK_FLAG_ENCRYPT) {
 if (set_option_parameter(list, BLOCK_OPT_ENCRYPT, on)) {
 error(Encryption not supported for file format '%s', fmt);
+return -1;
 }
 }
 if (flags  BLOCK_FLAG_COMPAT6) {
 if (set_option_parameter(list, BLOCK_OPT_COMPAT6, on)) {
 error(VMDK version 6 not supported for file format '%s', fmt);
+return -1;
 }
 }
 
 if (base_filename) {
 if (set_option_parameter(list, BLOCK_OPT_BACKING_FILE, base_filename)) 
{
 error(Backing file not supported for file format '%s', fmt);
+return -1;
 }
 }
 if (base_fmt) {
 if (set_option_parameter(list, BLOCK_OPT_BACKING_FMT, base_fmt)) {
 error(Backing file format not supported for file format '%s', 
fmt);
+return -1;
 }
 }
+return 0;
 }
 
 static int img_create(int argc, char **argv)
 {
-int c, ret, flags;
+int c, ret = 0, flags;
 const char *fmt = raw;
 const char *base_fmt = NULL;
 const char *filename;
@@ -293,12 +311,16 @@ static int img_create(int argc, char **argv)
 
 /* Find driver and parse its options */
 drv = bdrv_find_format(fmt);
-if (!drv)
+if (!drv) {
 error(Unknown file format '%s', fmt);
+return 1;
+}
 
 proto_drv = bdrv_find_protocol(filename);
-if (!proto_drv)
+if (!proto_drv) {
 error(Unknown protocol '%s', filename);
+return 1;
+}
 
 create_options = append_option_parameters(create_options,
   drv-create_options);
@@ -307,7 +329,7 @@ static int img_create(int argc, char **argv)
 
 if (options  !strcmp(options, ?)) {
 print_option_help(create_options);
-return 0;
+goto out;
 }
 
 /* Create parameter list with default values */
@@ -319,6 +341,8 @@ static int img_create(int argc, char **argv)
 param = parse_option_parameters(options, create_options, param);
 if (param == NULL) {
 error(Invalid options for file format '%s'., fmt);
+ret = -1;
+goto out;
 }
 }
 
@@ -328,7 +352,10 @@ static int img_create(int argc, char **argv)
 }
 
 /* Add old-style options to parameters */
-add_old_style_options(fmt, param, flags, base_filename, base_fmt);
+ret = add_old_style_options(fmt, param, flags, base_filename, base_fmt);
+if (ret  0) {
+goto out;
+}
 
 // The size for the image must always be specified, with one

[Qemu-devel] [PATCH v6] block: add sheepdog driver for distributed storage support

2010-06-20 Thread MORITA Kazutaka
Sheepdog is a distributed storage system for QEMU. It provides highly
available block level storage volumes to VMs like Amazon EBS.  This
patch adds a qemu block driver for Sheepdog.

Sheepdog features are:
- No node in the cluster is special (no metadata node, no control
  node, etc)
- Linear scalability in performance and capacity
- No single point of failure
- Autonomous management (zero configuration)
- Useful volume management support such as snapshot and cloning
- Thin provisioning
- Autonomous load balancing

The more details are available at the project site:
http://www.osrg.net/sheepdog/

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---

I've addressed the comments and tested with a version of qemu-iotests
hacked for Sheepdog.

This version changes the inode data format to support a snapshot tag
name, so to test this patch, please pull the latest sheepdog server
code:
  git://sheepdog.git.sourceforge.net/gitroot/sheepdog/sheepdog next

Sheepdog passes almost all test cases against the raw format, but fails in
the following ones:
 - 005: Sheepdog cannot support an image larger than 4 TB, so it fails to
create a 5 TB image.
 - 012: Sheepdog images are not files, so we cannot make them read-only
with chmod.

Thanks,
Kazutaka


Changes from v5 to v6 are:
 - support a snapshot name
 - support resizing images (stretching only) to pass a qemu-iotests check
 - fix compile errors on the WIN32 environment
 - initialize an array to avoid a valgrind warning
 - remove an aio handler when it is no longer needed

Changes from v4 to v5 are:
 - address the comments to the sheepdog driver (Thanks Kevin, Chris!)
 -- fix a coding style
 -- fix aio_cancel handling
 -- fix an overflow bug in coping hostname
 -- add comments to the non-trivial functions
 - remove already applied patches from the patchset

Changes from v3 to v4 are:
 - fix error handling in bdrv_snapshot_goto.

Changes from v2 to v3 are:

 - add drv-bdrv_close() and drv-bdrv_open() before and after
   bdrv_snapshot_goto() call of the protocol.
 - address the review comments on the sheepdog driver code.

Changes from v1 to v2 are:

 - rebase onto git://repo.or.cz/qemu/kevin.git block
 - modify the sheepdog driver as a protocol driver
 - add new patch to call the snapshot handler of the protocol


 Makefile.objs|2 +-
 block/sheepdog.c | 2036 ++
 2 files changed, 2037 insertions(+), 1 deletions(-)
 create mode 100644 block/sheepdog.c

diff --git a/Makefile.objs b/Makefile.objs
index 2bfb6d1..4c37182 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
-block-nested-y += parallels.o nbd.o blkdebug.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/sheepdog.c b/block/sheepdog.c
new file mode 100644
index 000..69a2494
--- /dev/null
+++ b/block/sheepdog.c
@@ -0,0 +1,2036 @@
+/*
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ */
+#ifdef _WIN32
+#include windows.h
+#include winsock2.h
+#include ws2tcpip.h
+#else
+#include netdb.h
+#include netinet/tcp.h
+
+#define closesocket(s) close(s)
+#endif
+
+#include qemu-common.h
+#include qemu-error.h
+#include qemu_socket.h
+#include block_int.h
+
+#define SD_PROTO_VER 0x01
+
+#define SD_DEFAULT_ADDR localhost
+#define SD_DEFAULT_PORT 7000
+
+#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
+#define SD_OP_READ_OBJ   0x02
+#define SD_OP_WRITE_OBJ  0x03
+
+#define SD_OP_NEW_VDI0x11
+#define SD_OP_LOCK_VDI   0x12
+#define SD_OP_RELEASE_VDI0x13
+#define SD_OP_GET_VDI_INFO   0x14
+#define SD_OP_READ_VDIS  0x15
+
+#define SD_FLAG_CMD_WRITE0x01
+#define SD_FLAG_CMD_COW  0x02
+
+#define SD_RES_SUCCESS   0x00 /* Success */
+#define SD_RES_UNKNOWN   0x01 /* Unknown error */
+#define SD_RES_NO_OBJ0x02 /* No object found */
+#define SD_RES_EIO   0x03 /* I/O error */
+#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
+#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
+#define SD_RES_SYSTEM_ERROR  0x06 /* System error */
+#define SD_RES_VDI_LOCKED0x07 /* Vdi is locked */
+#define SD_RES_NO_VDI0x08 /* No vdi found */
+#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */
+#define SD_RES_VDI_READ  0x0A /* Cannot read requested

Re: [Qemu-devel] [PATCH 1/2] qemu-img check: Distinguish different kinds of errors

2010-07-06 Thread MORITA Kazutaka
At Fri,  2 Jul 2010 19:14:59 +0200,
Kevin Wolf wrote:
 
 People think that their images are corrupted when in fact there are just some
 leaked clusters. Differentiating several error cases should make the messages
 more comprehensible.
 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  block.c|   10 ++--
  block.h|   10 -
  qemu-img.c |   62 +--
  3 files changed, 63 insertions(+), 19 deletions(-)
 
 diff --git a/block.c b/block.c
 index dd6dd76..b0ceef0 100644
 --- a/block.c
 +++ b/block.c
 @@ -710,15 +710,19 @@ DeviceState *bdrv_get_attached(BlockDriverState *bs)
  /*
   * Run consistency checks on an image
   *
 - * Returns the number of errors or -errno when an internal error occurs
 + * Returns 0 if the check could be completed (it doesn't mean that the image 
 is
 + * free of errors) or -errno when an internal error occured. The results of 
 the
 + * check are stored in res.
   */
 -int bdrv_check(BlockDriverState *bs)
 +int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
  {
  if (bs-drv-bdrv_check == NULL) {
  return -ENOTSUP;
  }
  
 -return bs-drv-bdrv_check(bs);
 +memset(res, 0, sizeof(*res));
 +res-corruptions = bs-drv-bdrv_check(bs);
 +return res-corruptions  0 ? res-corruptions : 0;
  }
  
  /* commit COW file into the raw image */
 diff --git a/block.h b/block.h
 index 3d03b3e..c2a7e4c 100644
 --- a/block.h
 +++ b/block.h
 @@ -74,7 +74,6 @@ void bdrv_close(BlockDriverState *bs);
  int bdrv_attach(BlockDriverState *bs, DeviceState *qdev);
  void bdrv_detach(BlockDriverState *bs, DeviceState *qdev);
  DeviceState *bdrv_get_attached(BlockDriverState *bs);
 -int bdrv_check(BlockDriverState *bs);
  int bdrv_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
  int bdrv_write(BlockDriverState *bs, int64_t sector_num,
 @@ -97,6 +96,15 @@ int bdrv_change_backing_file(BlockDriverState *bs,
  const char *backing_file, const char *backing_fmt);
  void bdrv_register(BlockDriver *bdrv);
  
 +
 +typedef struct BdrvCheckResult {
 +int corruptions;
 +int leaks;
 +int check_errors;
 +} BdrvCheckResult;
 +
 +int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res);
 +
  /* async block I/O */
  typedef struct BlockDriverAIOCB BlockDriverAIOCB;
  typedef void BlockDriverCompletionFunc(void *opaque, int ret);
 diff --git a/qemu-img.c b/qemu-img.c
 index 700af21..1782ac9 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
 @@ -425,11 +425,20 @@ out:
  return 0;
  }
  
 +/*
 + * Checks an image for consistency. Exit codes:
 + *
 + * 0 - Check completed, image is good
 + * 1 - Check not completed because of internal errors
 + * 2 - Check completed, image is corrupted
 + * 3 - Check completed, image has leaked clusters, but is good otherwise
 + */
  static int img_check(int argc, char **argv)
  {
  int c, ret;
  const char *filename, *fmt;
  BlockDriverState *bs;
 +BdrvCheckResult result;
  
  fmt = NULL;
  for(;;) {
 @@ -453,28 +462,51 @@ static int img_check(int argc, char **argv)
  if (!bs) {
  return 1;
  }
 -ret = bdrv_check(bs);
 -switch(ret) {
 -case 0:
 -printf(No errors were found on the image.\n);
 -break;
 -case -ENOTSUP:
 +ret = bdrv_check(bs, result);
 +
 +if (ret == -ENOTSUP) {
  error(This image format does not support checks);
 -break;
 -default:
 -if (ret  0) {
 -error(An error occurred during the check);
 -} else {
 -printf(%d errors were found on the image.\n, ret);
 +return 1;

Is it okay to call bdrv_delete(bs) before returning?  It is necessary for
the sheepdog driver to pass qemu-iotests.

Kazutaka


--- a/qemu-img.c
+++ b/qemu-img.c
@@ -466,6 +466,7 @@ static int img_check(int argc, char **argv)
 
 if (ret == -ENOTSUP) {
 error(This image format does not support checks);
+bdrv_delete(bs);
 return 1;
 }
 



[Qemu-devel] [PATCH] sheepdog: fix compile error on systems without TCP_CORK

2010-07-07 Thread MORITA Kazutaka
WIN32 is not the only system which doesn't have TCP_CORK (e.g. OS X
doesn't have it either).

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---

Betts, I think this patch fixes the compile error.  Can you try this
one?

 block/sheepdog.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 69a2494..81aa564 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -889,7 +889,7 @@ static int aio_flush_request(void *opaque)
 return !QLIST_EMPTY(s-outstanding_aio_head);
 }
 
-#ifdef _WIN32
+#if !defined(SOL_TCP) || !defined(TCP_CORK)
 
 static int set_cork(int fd, int v)
 {
-- 
1.5.6.5




[Qemu-devel] [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-20 Thread MORITA Kazutaka

Hi everyone,

Sheepdog is a distributed storage system for KVM/QEMU. It provides
highly available block level storage volumes to VMs like Amazon EBS.
Sheepdog supports advanced volume management features such as snapshot,
cloning, and thin provisioning. Sheepdog runs on several tens or hundreds
of nodes, and the architecture is fully symmetric; there is no central
node such as a meta-data server.

The following list describes the features of Sheepdog.

* Linear scalability in performance and capacity
* No single point of failure
* Redundant architecture (data is written to multiple nodes)
- Tolerance against network failure
* Zero configuration (newly added machines will join the cluster 
automatically)
- Autonomous load balancing
* Snapshot
- Online snapshot from qemu-monitor
* Clone from a snapshot volume
* Thin provisioning
- Amazon EBS API support (to use from a Eucalyptus instance)

(* = current features, - = on our todo list)

More details and download links are here:

http://www.osrg.net/sheepdog/

Note that the code is still in an early stage.
There are some critical TODO items:

- VM image deletion support
- Support architectures other than X86_64
- Data recovery
- Free space management
- Guarantee reliability and availability under heavy load
- Performance improvement
- Reclaim unused blocks
- More documentation

We hope to find people interested in working together.
Enjoy!


Here are examples:

- create images

$ kvm-img create -f sheepdog Alice's Disk 256G
$ kvm-img create -f sheepdog Bob's Disk 256G

- list images

$ shepherd info -t vdi
   4 : Alice's Disk  256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
16:17:18, tag:0, current
   8 : Bob's Disk256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
16:29:20, tag:0, current

- start up a virtual machine

$ kvm --drive format=sheepdog,file=Alice's Disk

- create a snapshot

$ kvm-img snapshot -c name sheepdog:Alice's Disk

- clone from a snapshot

$ kvm-img create -b sheepdog:Alice's Disk:0 -f sheepdog Charlie's Disk


Thanks.

--
MORITA, Kazutaka

NTT Cyber Space Labs
OSS Computing Project
Kernel Group
E-mail: morita.kazut...@lab.ntt.co.jp





[Qemu-devel] Re: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-23 Thread MORITA Kazutaka
Hello,

Does the following patch work for you?

diff --git a/sheep/work.c b/sheep/work.c
index 4df8dc0..45f362d 100644
--- a/sheep/work.c
+++ b/sheep/work.c
@@ -28,6 +28,7 @@
 #include syscall.h
 #include sys/types.h
 #include linux/types.h
+#define _LINUX_FCNTL_H
 #include linux/signalfd.h

 #include list.h


On Wed, Oct 21, 2009 at 5:45 PM, Nikolai K. Bochev
n.boc...@grandstarco.com wrote:
 Hello,

 I am getting the following error trying to compile sheepdog on Ubuntu 9.10 ( 
 2.6.31-14 x64 ) :

 cd shepherd; make
 make[1]: Entering directory 
 `/home/shiny/Packages/sheepdog-2009102101/shepherd'
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE shepherd.c 
 -o shepherd.o
 shepherd.c: In function ‘main’:
 shepherd.c:300: warning: dereferencing pointer ‘hdr.55’ does break 
 strict-aliasing rules
 shepherd.c:300: note: initialized from here
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE treeview.c 
 -o treeview.o
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE 
 ../lib/event.c -o ../lib/event.o
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE 
 ../lib/net.c -o ../lib/net.o
 ../lib/net.c: In function ‘write_object’:
 ../lib/net.c:358: warning: ‘vosts’ may be used uninitialized in this function
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE 
 ../lib/logger.c -o ../lib/logger.o
 cc shepherd.o treeview.o ../lib/event.o ../lib/net.o ../lib/logger.o -o 
 shepherd -lncurses -lcrypto
 make[1]: Leaving directory `/home/shiny/Packages/sheepdog-2009102101/shepherd'
 cd sheep; make
 make[1]: Entering directory `/home/shiny/Packages/sheepdog-2009102101/sheep'
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE sheep.c -o 
 sheep.o
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE store.c -o 
 store.o
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE net.c -o 
 net.o
 cc -c -g -O2 -Wall -Wstrict-prototypes -I../include -D_GNU_SOURCE work.c -o 
 work.o
 In file included from /usr/include/asm/fcntl.h:1,
                 from /usr/include/linux/fcntl.h:4,
                 from /usr/include/linux/signalfd.h:13,
                 from work.c:31:
 /usr/include/asm-generic/fcntl.h:117: error: redefinition of ‘struct flock’
 /usr/include/asm-generic/fcntl.h:140: error: redefinition of ‘struct flock64’
 make[1]: *** [work.o] Error 1
 make[1]: Leaving directory `/home/shiny/Packages/sheepdog-2009102101/sheep'
 make: *** [all] Error 2

 I have all the required libs installed. Patching and compiling qemu-kvm went 
 flawless.

 - Original Message -
 From: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 To: k...@vger.kernel.org, qemu-devel@nongnu.org, linux-fsde...@vger.kernel.org
 Sent: Wednesday, October 21, 2009 8:13:47 AM
 Subject: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

 Hi everyone,

 Sheepdog is a distributed storage system for KVM/QEMU. It provides
 highly available block level storage volumes to VMs like Amazon EBS.
 Sheepdog supports advanced volume management features such as snapshot,
 cloning, and thin provisioning. Sheepdog runs on several tens or hundreds
 of nodes, and the architecture is fully symmetric; there is no central
 node such as a meta-data server.

 The following list describes the features of Sheepdog.

     * Linear scalability in performance and capacity
     * No single point of failure
     * Redundant architecture (data is written to multiple nodes)
     - Tolerance against network failure
     * Zero configuration (newly added machines will join the cluster 
 automatically)
     - Autonomous load balancing
     * Snapshot
     - Online snapshot from qemu-monitor
     * Clone from a snapshot volume
     * Thin provisioning
     - Amazon EBS API support (to use from a Eucalyptus instance)

 (* = current features, - = on our todo list)

 More details and download links are here:

 http://www.osrg.net/sheepdog/

 Note that the code is still in an early stage.
 There are some critical TODO items:

     - VM image deletion support
     - Support architectures other than X86_64
     - Data recoverys
     - Free space management
     - Guarantee reliability and availability under heavy load
     - Performance improvement
     - Reclaim unused blocks
     - More documentation

 We hope finding people interested in working together.
 Enjoy!


 Here are examples:

 - create images

 $ kvm-img create -f sheepdog Alice's Disk 256G
 $ kvm-img create -f sheepdog Bob's Disk 256G

 - list images

 $ shepherd info -t vdi
    4 : Alice's Disk  256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
 16:17:18, tag:        0, current
    8 : Bob's Disk    256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
 16:29:20, tag:        0, current

 - start up a virtual machine

 $ kvm --drive format=sheepdog,file=Alice's Disk

 - create a snapshot

 $ kvm-img snapshot -c name sheepdog:Alice's Disk

 - clone from a snapshot

 $ kvm-img create -b sheepdog:Alice's

[Qemu-devel] Re: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-23 Thread MORITA Kazutaka
We use JGroups (Java library) for reliable multicast communication in
our cluster manager daemon. We don't worry about the performance much
since the cluster manager daemon is not involved in the I/O path. We
might think about moving to corosync if it is more stable than
JGroups.

On Wed, Oct 21, 2009 at 6:08 PM, Dietmar Maurer diet...@proxmox.com wrote:
 Quite interesting. But would it be possible to use corosync for the cluster 
 communication? The point is that we need corosync anyways for pacemaker, it 
 is written in C (high performance) and seem to implement the feature you need?

 -Original Message-
 From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
 Behalf Of MORITA Kazutaka
 Sent: Mittwoch, 21. Oktober 2009 07:14
 To: k...@vger.kernel.org; qemu-devel@nongnu.org; linux-
 fsde...@vger.kernel.org
 Subject: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

 Hi everyone,

 Sheepdog is a distributed storage system for KVM/QEMU. It provides
 highly available block level storage volumes to VMs like Amazon EBS.
 Sheepdog supports advanced volume management features such as snapshot,
 cloning, and thin provisioning. Sheepdog runs on several tens or
 hundreds
 of nodes, and the architecture is fully symmetric; there is no central
 node such as a meta-data server.

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html




-- 
MORITA, Kazutaka

NTT Cyber Space Labs
OSS Computing Project
Kernel Group
E-mail: morita.kazut...@lab.ntt.co.jp




Re: [Qemu-devel] Re: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-23 Thread MORITA Kazutaka
On Fri, Oct 23, 2009 at 8:10 PM, Alexander Graf ag...@suse.de wrote:

 On 23.10.2009, at 12:41, MORITA Kazutaka wrote:

 On Fri, Oct 23, 2009 at 12:30 AM, Avi Kivity a...@redhat.com wrote:

 How is load balancing implemented?  Can you move an image transparently

 while a guest is running?  Will an image be moved closer to its guest?

 Sheepdog uses consistent hashing to decide where objects are stored; I/O
 load is balanced across the nodes. When a new node is added or an
 existing node is removed, the hash table changes and the data is moved
 across nodes automatically and transparently.

 We plan to implement a mechanism to distribute the data not randomly
 but intelligently; we could use machine load, the locations of VMs, etc.

 What exactly does balanced mean? Can it cope with individual nodes having
 more disk space than others?

I mean that objects are uniformly distributed over the nodes by the hash
function.  Distribution that takes free disk space into account is one of
our TODOs.
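
To illustrate the basic idea, here is a self-contained consistent-hashing
sketch (the hash function and names are made up; this is not the actual
sheepdog placement code):

#include <stdint.h>
#include <stdio.h>

/* FNV-1a, used here only as a stand-in for the real hash. */
static uint64_t hash_str(const char *s)
{
    uint64_t h = 0xcbf29ce484222325ULL;

    while (*s) {
        h ^= (unsigned char)*s++;
        h *= 0x100000001b3ULL;
    }
    return h;
}

/* Pick the first node clockwise from the object's position on the ring. */
static const char *place(const char *object, const char **nodes, int n)
{
    uint64_t oh = hash_str(object);
    const char *next = NULL, *first = NULL;
    uint64_t next_h = UINT64_MAX, first_h = UINT64_MAX;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t nh = hash_str(nodes[i]);

        if (nh < first_h) {            /* smallest position, for wrap-around */
            first_h = nh;
            first = nodes[i];
        }
        if (nh >= oh && nh < next_h) { /* first node at or after the object */
            next_h = nh;
            next = nodes[i];
        }
    }
    return next ? next : first;
}

int main(void)
{
    const char *nodes[] = { "node-a", "node-b", "node-c" };

    /* Adding or removing a node only moves the objects whose hashes fall
     * between the affected ring positions. */
    printf("obj-42 is placed on %s\n", place("obj-42", nodes, 3));
    return 0;
}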

 Do you support multiple guests accessing the same image?

 A VM image can be attached to any VM, but only to one VM at a time; multiple
 running VMs cannot access the same VM image.

 What about read-only access? Imagine you'd have 5 kvm instances each
 accessing it using -snapshot.

By creating new clone images from an existing snapshot image, you can do
a similar thing.
Sheepdog can create a clone image instantly.
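
For example, roughly reusing the command syntax from the announcement
(the image and clone names here are made up):

$ kvm-img snapshot -c name sheepdog:"Alice's Disk"
$ kvm-img create -b sheepdog:"Alice's Disk":0 -f sheepdog "Reader 1"
$ kvm-img create -b sheepdog:"Alice's Disk":0 -f sheepdog "Reader 2"
$ kvm --drive format=sheepdog,file="Reader 1"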

-- 
MORITA, Kazutaka

NTT Cyber Space Labs
OSS Computing Project
Kernel Group
E-mail: morita.kazut...@lab.ntt.co.jp




Re: [Qemu-devel] [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-23 Thread MORITA Kazutaka
Hi,

Thanks for many comments.

Sheepdog git trees are created.

  Sheepdog server
git://sheepdog.git.sourceforge.net/gitroot/sheepdog/sheepdog

  Sheepdog client
git://sheepdog.git.sourceforge.net/gitroot/sheepdog/qemu-kvm

Please try!

On Wed, Oct 21, 2009 at 2:13 PM, MORITA Kazutaka
morita.kazut...@lab.ntt.co.jp wrote:
 Hi everyone,

 Sheepdog is a distributed storage system for KVM/QEMU. It provides
 highly available block level storage volumes to VMs like Amazon EBS.
 Sheepdog supports advanced volume management features such as snapshot,
 cloning, and thin provisioning. Sheepdog runs on several tens or hundreds
 of nodes, and the architecture is fully symmetric; there is no central
 node such as a meta-data server.

 The following list describes the features of Sheepdog.

    * Linear scalability in performance and capacity
    * No single point of failure
    * Redundant architecture (data is written to multiple nodes)
    - Tolerance against network failure
    * Zero configuration (newly added machines will join the cluster
 automatically)
    - Autonomous load balancing
    * Snapshot
    - Online snapshot from qemu-monitor
    * Clone from a snapshot volume
    * Thin provisioning
    - Amazon EBS API support (to use from a Eucalyptus instance)

 (* = current features, - = on our todo list)

 More details and download links are here:

 http://www.osrg.net/sheepdog/

 Note that the code is still in an early stage.
 There are some critical TODO items:

    - VM image deletion support
    - Support architectures other than X86_64
    - Data recoverys
    - Free space management
    - Guarantee reliability and availability under heavy load
    - Performance improvement
    - Reclaim unused blocks
    - More documentation

 We hope finding people interested in working together.
 Enjoy!


 Here are examples:

 - create images

 $ kvm-img create -f sheepdog Alice's Disk 256G
 $ kvm-img create -f sheepdog Bob's Disk 256G

 - list images

 $ shepherd info -t vdi
   4 : Alice's Disk  256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
 16:17:18, tag:        0, current
   8 : Bob's Disk    256 GB (allocated: 0 MB, shared: 0 MB), 2009-10-15
 16:29:20, tag:        0, current

 - start up a virtual machine

 $ kvm --drive format=sheepdog,file=Alice's Disk

 - create a snapshot

 $ kvm-img snapshot -c name sheepdog:Alice's Disk

 - clone from a snapshot

 $ kvm-img create -b sheepdog:Alice's Disk:0 -f sheepdog Charlie's Disk


 Thanks.

 --
 MORITA, Kazutaka

 NTT Cyber Space Labs
 OSS Computing Project
 Kernel Group
 E-mail: morita.kazut...@lab.ntt.co.jp







-- 
MORITA, Kazutaka

NTT Cyber Space Labs
OSS Computing Project
Kernel Group
E-mail: morita.kazut...@lab.ntt.co.jp




Re: [Qemu-devel] [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-23 Thread MORITA Kazutaka
On Sat, Oct 24, 2009 at 4:45 AM, Javier Guerra jav...@guerrag.com wrote:
 On Fri, Oct 23, 2009 at 2:39 PM, MORITA Kazutaka
 morita.kazut...@lab.ntt.co.jp wrote:
 Thanks for many comments.

 Sheepdog git trees are created.

 great!

 is there any client (no matter how crude) besides the patched
 KVM/Qemu?  it would make it far easier to hack around...

No, there isn't. Sorry.
I think we should provide a test client as soon as possible.

-- 
MORITA, Kazutaka

NTT Cyber Space Labs
OSS Computing Project
Kernel Group
E-mail: morita.kazut...@lab.ntt.co.jp




[Qemu-devel] Re: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-26 Thread MORITA Kazutaka

On 2009/10/25 17:51, Dietmar Maurer wrote:

Do you support multiple guests accessing the same image?

A VM image can be attached to any VM, but only to one VM at a time; multiple
running VMs cannot access the same VM image.


I guess this is a problem when you want to do live migrations?


Yes, because Sheepdog locks a VM image when it is opened.
To avoid this problem, locking must be delayed until migration has completed.
This is also a TODO item.

--
MORITA Kazutaka







[Qemu-devel] Re: [ANNOUNCE] Sheepdog: Distributed Storage System for KVM

2009-10-27 Thread MORITA Kazutaka

On 2009/10/21 14:13, MORITA Kazutaka wrote:

Hi everyone,

Sheepdog is a distributed storage system for KVM/QEMU. It provides
highly available block level storage volumes to VMs like Amazon EBS.
Sheepdog supports advanced volume management features such as snapshot,
cloning, and thin provisioning. Sheepdog runs on several tens or hundreds
of nodes, and the architecture is fully symmetric; there is no central
node such as a meta-data server.


We added some pages to the Sheepdog website:

 Design: http://www.osrg.net/sheepdog/design.html
 FAQ   : http://www.osrg.net/sheepdog/faq.html

The Sheepdog mailing list is also ready to use (thanks to Tomasz)

 Subscribe/Unsubscribe/Preferences
   http://lists.wpkg.org/mailman/listinfo/sheepdog
 Archive
   http://lists.wpkg.org/pipermail/sheepdog/

We are always looking for developers or users interested in
participating in the Sheepdog project!

Thanks.

MORITA Kazutaka




Re: [Qemu-devel] [PATCH] get rid of private bitmap functions in block/sheepdog.c, use generic ones

2011-03-14 Thread MORITA Kazutaka
On Thu, Mar 10, 2011 at 11:03 PM, Michael Tokarev m...@tls.msk.ru wrote:
 qemu now has generic bitmap functions,
 so don't redefine them in sheepdog.c,
 use common header instead.  A small cleanup.

 There's only one function which is actually
 used in sheepdog and gets replaced with
 a generic one (simplified):

 - static inline int test_bit(int nr, const volatile unsigned long *addr)
 + static inline int test_bit(int nr, const unsigned long *addr)
  {
 -  return ((1UL  (nr % BITS_PER_LONG))
             ((unsigned long*)addr)[nr / BITS_PER_LONG])) != 0;
 +  return 1UL  (addr[nr / BITS_PER_LONG]  (nr  (BITS_PER_LONG-1)));
  }

 The body is equivalent, but the argument is not: there's a
 volatile in there.  What it is used for - I'm not sure.

 Signed-off-by: Michael Tokarev m...@tls.msk.ru

Looks good.  Thanks!

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



[Qemu-devel] [PATCH 0/3] sheepdog: fix aio related issues

2011-03-29 Thread MORITA Kazutaka
This patchset fixes the Sheepdog AIO problems pointed out in:
  http://lists.gnu.org/archive/html/qemu-devel/2011-02/msg02495.html
  http://lists.gnu.org/archive/html/qemu-devel/2011-02/msg02474.html

Thanks,

Kazutaka


MORITA Kazutaka (3):
  sheepdog: make send/recv operations non-blocking
  sheepdog: allow cancellation of I/Os which are not processed yet
  sheepdog: avoid accessing a buffer of the canceled I/O request

 block/sheepdog.c |  462 +++---
 1 files changed, 334 insertions(+), 128 deletions(-)




[Qemu-devel] [PATCH 3/3] sheepdog: avoid accessing a buffer of the canceled I/O request

2011-03-29 Thread MORITA Kazutaka
We cannot access the buffer of a canceled I/O request because its
AIOCB callback has already been called and the buffer is no longer valid.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   12 ++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index ed98701..6f60721 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -79,6 +79,7 @@
 #define SD_DATA_OBJ_SIZE (UINT64_C(1)  22)
 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
 #define SECTOR_SIZE 512
+#define BUF_SIZE 4096
 
 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0
@@ -900,8 +901,15 @@ static void aio_read_response(void *opaque)
 }
 conn_state = C_IO_DATA;
 case C_IO_DATA:
-ret = do_readv(fd, acb-qiov-iov, aio_req-data_len - done,
-   aio_req-iov_offset + done);
+if (acb-canceled) {
+char tmp_buf[BUF_SIZE];
+int len = MIN(aio_req-data_len - done, sizeof(tmp_buf));
+
+ret = do_read(fd, tmp_buf, len, 0);
+} else {
+ret = do_readv(fd, acb-qiov-iov, aio_req-data_len - done,
+   aio_req-iov_offset + done);
+}
 if (ret  0) {
 error_report(failed to get the data, %s\n, strerror(errno));
 conn_state = C_IO_CLOSED;
-- 
1.5.6.5




[Qemu-devel] [PATCH 2/3] sheepdog: allow cancellation of I/Os which are not processed yet

2011-03-29 Thread MORITA Kazutaka
We can cancel I/O requests safely if they have not been sent to the servers yet.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   37 +
 1 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index cedf806..ed98701 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -421,6 +421,43 @@ static void sd_finish_aiocb(SheepdogAIOCB *acb)
 static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
+BDRVSheepdogState *s = blockacb-bs-opaque;
+AIOReq *areq, *next, *oldest_send_req = NULL;
+
+if (acb-bh) {
+/*
+ * sd_readv_writev_bh_cb() is not called yet, so we can
+ * release this safely
+ */
+qemu_bh_delete(acb-bh);
+acb-bh = NULL;
+qemu_aio_release(acb);
+return;
+}
+
+QLIST_FOREACH(areq, s-outstanding_aio_head, outstanding_aio_siblings) {
+if (areq-state == AIO_SEND_OBJREQ) {
+oldest_send_req = areq;
+}
+}
+
+QLIST_FOREACH_SAFE(areq, s-outstanding_aio_head,
+   outstanding_aio_siblings, next) {
+if (areq-state == AIO_RECV_OBJREQ) {
+continue;
+}
+if (areq-state == AIO_SEND_OBJREQ  areq == oldest_send_req) {
+/* the oldest AIO_SEND_OBJREQ request could be being sent */
+continue;
+}
+free_aio_req(s, areq);
+}
+
+if (QLIST_EMPTY(acb-aioreq_head)) {
+/* there is no outstanding request */
+qemu_aio_release(acb);
+return;
+}
 
 /*
  * Sheepdog cannot cancel the requests which are already sent to
-- 
1.5.6.5




[Qemu-devel] [PATCH 1/3] sheepdog: make send/recv operations non-blocking

2011-03-29 Thread MORITA Kazutaka
This patch avoids retrying send/recv in the AIO path when the sheepdog
connection is not ready for the operation.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |  417 +-
 1 files changed, 289 insertions(+), 128 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index a54e0de..cedf806 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -242,6 +242,19 @@ static inline int is_snapshot(struct SheepdogInode *inode)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
 
+enum ConnectionState {
+C_IO_HEADER,
+C_IO_DATA,
+C_IO_END,
+C_IO_CLOSED,
+};
+
+enum AIOReqState {
+AIO_PENDING,/* not ready for sending this request */
+AIO_SEND_OBJREQ,/* send this request */
+AIO_RECV_OBJREQ,/* receive a result of this request */
+};
+
 typedef struct AIOReq {
 SheepdogAIOCB *aiocb;
 unsigned int iov_offset;
@@ -253,6 +266,9 @@ typedef struct AIOReq {
 uint8_t flags;
 uint32_t id;
 
+enum AIOReqState state;
+struct SheepdogObjReq hdr;
+
 QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
 QLIST_ENTRY(AIOReq) aioreq_siblings;
 } AIOReq;
@@ -348,12 +364,14 @@ static const char * sd_strerror(int err)
  * 1. In the sd_aio_readv/writev, read/write requests are added to the
  *QEMU Bottom Halves.
  *
- * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
- *requests to the server and link the requests to the
- *outstanding_list in the BDRVSheepdogState.  we exits the
- *function without waiting for receiving the response.
+ * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we set up the
+ *I/O requests to the server and link the requests to the
+ *outstanding_list in the BDRVSheepdogState.
+ *
+ * 3. We send the request in aio_send_request, the fd handler to the
+ *sheepdog connection.
  *
- * 3. We receive the response in aio_read_response, the fd handler to
+ * 4. We receive the response in aio_read_response, the fd handler to
  *the sheepdog connection.  If metadata update is needed, we send
  *the write request to the vdi object in sd_write_done, the write
  *completion function.  The AIOCB callback is not called until all
@@ -377,8 +395,6 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 aio_req-flags = flags;
 aio_req-id = s-aioreq_seq_num++;
 
-QLIST_INSERT_HEAD(s-outstanding_aio_head, aio_req,
-  outstanding_aio_siblings);
 QLIST_INSERT_HEAD(acb-aioreq_head, aio_req, aioreq_siblings);
 
 return aio_req;
@@ -640,20 +656,17 @@ static int do_readv_writev(int sockfd, struct iovec *iov, 
int len,
 again:
 ret = do_send_recv(sockfd, iov, len, iov_offset, write);
 if (ret  0) {
-if (errno == EINTR || errno == EAGAIN) {
+if (errno == EINTR) {
 goto again;
 }
+if (errno == EAGAIN) {
+return 0;
+}
 error_report(failed to recv a rsp, %s\n, strerror(errno));
-return 1;
-}
-
-iov_offset += ret;
-len -= ret;
-if (len) {
-goto again;
+return -errno;
 }
 
-return 0;
+return ret;
 }
 
 static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
@@ -666,30 +679,30 @@ static int do_writev(int sockfd, struct iovec *iov, int 
len, int iov_offset)
 return do_readv_writev(sockfd, iov, len, iov_offset, 1);
 }
 
-static int do_read_write(int sockfd, void *buf, int len, int write)
+static int do_read_write(int sockfd, void *buf, int len, int skip, int write)
 {
 struct iovec iov;
 
 iov.iov_base = buf;
-iov.iov_len = len;
+iov.iov_len = len + skip;
 
-return do_readv_writev(sockfd, iov, len, 0, write);
+return do_readv_writev(sockfd, iov, len, skip, write);
 }
 
-static int do_read(int sockfd, void *buf, int len)
+static int do_read(int sockfd, void *buf, int len, int skip)
 {
-return do_read_write(sockfd, buf, len, 0);
+return do_read_write(sockfd, buf, len, skip, 0);
 }
 
-static int do_write(int sockfd, void *buf, int len)
+static int do_write(int sockfd, void *buf, int len, int skip)
 {
-return do_read_write(sockfd, buf, len, 1);
+return do_read_write(sockfd, buf, len, skip, 1);
 }
 
 static int send_req(int sockfd, SheepdogReq *hdr, void *data,
 unsigned int *wlen)
 {
-int ret;
+int ret, done = 0;
 struct iovec iov[2];
 
 iov[0].iov_base = hdr;
@@ -700,19 +713,23 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
*data,
 iov[1].iov_len = *wlen;
 }
 
-ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
-if (ret) {
-error_report(failed to send a req, %s\n, strerror(errno));
-ret = -1;
+while (done  sizeof(*hdr) + *wlen) {
+ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen - done, done);
+if (ret  0) {
+error_report(failed to send a req

Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines

2012-01-06 Thread MORITA Kazutaka
At Tue, 3 Jan 2012 08:13:51 +,
Stefan Hajnoczi wrote:
 
 On Mon, Jan 02, 2012 at 10:38:11PM +, Stefan Hajnoczi wrote:
  On Mon, Jan 2, 2012 at 3:39 PM, Christoph Hellwig h...@lst.de wrote:
   On Fri, Dec 30, 2011 at 10:35:01AM +, Stefan Hajnoczi wrote:
   If you can reproduce this bug and suspect coroutines are involved then I
  
   It's entirely reproducable.
  
   I've played around a bit and switched from the ucontext to the gthreads
   coroutine implementation.  The result seems odd, but starts to make sense.
  
   Running the workload I now get the following message from qemu:
  
   Co-routine re-entered recursively
  
   and the gdb backtrace looks like:
  
   (gdb) bt
   #0  0x7f2fff36f405 in *__GI_raise (sig=optimized out)
      at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
   #1  0x7f2fff372680 in *__GI_abort () at abort.c:92
   #2  0x7f30019a6616 in qemu_coroutine_enter (co=0x7f3004d4d7b0, 
   opaque=0x0)
      at qemu-coroutine.c:53
   #3  0x7f30019a5e82 in qemu_co_queue_next_bh (opaque=optimized out)
      at qemu-coroutine-lock.c:43
   #4  0x7f30018d5a72 in qemu_bh_poll () at async.c:71
   #5  0x7f3001982990 in main_loop_wait (nonblocking=optimized out)
      at main-loop.c:472
   #6  0x7f30018cf714 in main_loop () at /home/hch/work/qemu/vl.c:1481
   #7  main (argc=optimized out, argv=optimized out, envp=optimized 
   out)
      at /home/hch/work/qemu/vl.c:3479
  
   adding some printks suggest this happens when calling add_aio_request from
   aio_read_response when either delaying creates, or updating metadata,
   although not everytime one of these cases happens.
  
   I've tried to understand how the recursive calling happens, but 
   unfortunately
   the whole coroutine code lacks any sort of documentation how it should
   behave or what it asserts about the callers.
  
   I don't have a sheepdog setup here but if there's an easy way to
   reproduce please let me know and I'll take a look.
  
   With the small patch below applied to the sheppdog source I can reproduce
   the issue on my laptop using the following setup:
  
   for port in 7000 7001 7002; do
      mkdir -p /mnt/sheepdog/$port
      /usr/sbin/sheep -p $port -c local /mnt/sheepdog/$port
      sleep 2
   done
  
   collie cluster format
   collie vdi create test 20G
  
   then start a qemu instance that uses the the sheepdog volume using the
   following device and drive lines:
  
          -drive if=none,file=sheepdog:test,cache=none,id=test \
          -device virtio-blk-pci,drive=test,id=testdev \
  
   finally, in the guest run:
  
          dd if=/dev/zero of=/dev/vdX bs=67108864 count=128 oflag=direct
  
  Thanks for these instructions.  I can reproduce the issue here.
  
  It seems suspicious the way that BDRVSheepdogState->co_recv and
  ->co_send work.  The code adds select(2) read/write callback functions
  on the sheepdog socket file descriptor.  When the socket becomes
  writeable or readable the co_send or co_recv coroutines are entered.
  So far, so good, this is how a coroutine is integrated into the main
  loop of QEMU.
  
  The problem is that this patch is mixing things.  The co_recv
  coroutine runs aio_read_response(), which invokes send_pending_req().
  send_pending_req() invokes add_aio_request().  That function isn't
  suitable for co_recv's context because it actually sends data and hits
  a few blocking (yield) points.  It takes a coroutine mutex - but the
  select(2) read callback is still in place.  We're now still in the
  aio_read_response() call chain except we're actually not reading at
  all, we're trying to write!  And we'll get spurious wakeups if there
  is any data readable on the socket.
  
  So the co_recv coroutine has two things in the system that will try to 
  enter it:
  1. The select(2) read callback on the sheepdog socket.
  2. The aio_add_request() blocking operations, including a coroutine mutex.
  
  This is bad, a yielded coroutine should only have one thing that will
  enter it.  It's rare that it makes sense to have multiple things
  entering a coroutine.
  
  It's late here but I hope this gives Kazutaka some thoughts on what is
  causing the issue with this patch.
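
To make that invariant concrete, here is a minimal sketch (not the sheepdog
code, and not the fix that was eventually merged) of a receive coroutine that
keeps a single wake-up source armed at any time.  The Conn type and the
read_ready()/recv_loop() names are made up for illustration; the
qemu_coroutine_*, CoMutex and qemu_aio_set_fd_handler() signatures are assumed
to match the QEMU tree of this era.

typedef struct Conn {
    int fd;
    Coroutine *co_recv;      /* coroutine parked in recv_loop() */
    CoMutex lock;
} Conn;

/* Wake-up source #1: the select(2) read callback. */
static void read_ready(void *opaque)
{
    Conn *c = opaque;

    qemu_coroutine_enter(c->co_recv, NULL);
}

static void coroutine_fn recv_loop(void *opaque)
{
    Conn *c = opaque;

    for (;;) {
        qemu_coroutine_yield();              /* resumed only by read_ready() */

        /* Before blocking on a second wake-up source (the CoMutex), drop
         * the first one; with both armed, the fd callback and the mutex
         * could each try to enter this coroutine, which can end in the
         * "Co-routine re-entered recursively" abort seen above. */
        qemu_aio_set_fd_handler(c->fd, NULL, NULL, NULL, NULL);
        qemu_co_mutex_lock(&c->lock);        /* wake-up source #2 */
        /* ... send follow-up requests, update metadata ... */
        qemu_co_mutex_unlock(&c->lock);
        qemu_aio_set_fd_handler(c->fd, read_ready, NULL, NULL, c);
    }
}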
 
 Poked around some more this morning.  Here is a patch that isolates the
 bug:
 
 ---8---
 diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
 index 26ad76b..0d7a03f 100644
 --- a/qemu-coroutine-lock.c
 +++ b/qemu-coroutine-lock.c
 @@ -59,6 +59,16 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
   QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
   qemu_coroutine_yield();
   assert(qemu_in_coroutine());
  +{
  +Coroutine *co;
  +
  +QTAILQ_FOREACH(co, &queue->entries, co_queue_next) {
  +assert(co != self);
  +}
  +QTAILQ_FOREACH(co, &unlock_bh_queue, co_queue_next) {
 +assert(co != self);
 +}
 +}
  }
  
  void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue)
 ---8---
 
 

[Qemu-devel] [PATCH] sheepdog: add data preallocation support

2011-05-21 Thread MORITA Kazutaka
This introduces a qemu-img create option for sheepdog which allows the
data to be preallocated (note that sheepdog always preallocates
metadata).  This is necessary to use Sheepdog volumes as backend
storage for an iSCSI target.  More information is available at
https://sourceforge.net/apps/trac/sheepdog/wiki/General%20Protocol%20Support

The option is disabled by default and you need to enable it like the
following:

qemu-img create sheepdog:test -o preallocation=data 1G

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
Signed-off-by: FUJITA Tomonori fujita.tomon...@lab.ntt.co.jp
---
 block/sheepdog.c |   73 +-
 1 files changed, 72 insertions(+), 1 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 0392ca8..38ca9aa 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1292,6 +1292,57 @@ static int do_sd_create(char *filename, int64_t vdi_size,
 return 0;
 }
 
+static int sd_prealloc(uint32_t vid, int64_t vdi_size)
+{
+int fd, ret;
+SheepdogInode *inode;
+char *buf;
+unsigned long idx, max_idx;
+
+fd = connect_to_sdog(NULL, NULL);
+if (fd < 0) {
+return -EIO;
+}
+
+inode = qemu_malloc(sizeof(*inode));
+buf = qemu_malloc(SD_DATA_OBJ_SIZE);
+
+ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid),
+  0, sizeof(*inode), 0);
+
+max_idx = (vdi_size + SD_DATA_OBJ_SIZE - 1) / SD_DATA_OBJ_SIZE;
+
+for (idx = 0; idx < max_idx; idx++) {
+uint64_t oid;
+oid = vid_to_data_oid(vid, idx);
+
+if (inode->data_vdi_id[idx]) {
+ret = read_object(fd, buf, vid_to_vdi_oid(inode->data_vdi_id[idx]),
+  1, SD_DATA_OBJ_SIZE, 0);
+if (ret)
+goto out;
+} else {
+memset(buf, 0, SD_DATA_OBJ_SIZE);
+}
+
+ret = write_object(fd, buf, oid, 1, SD_DATA_OBJ_SIZE, 0, 1);
+if (ret)
+goto out;
+
+inode->data_vdi_id[idx] = vid;
+ret = write_object(fd, (char *)inode, vid_to_vdi_oid(vid),
+   1, sizeof(*inode), 0, 0);
+if (ret)
+goto out;
+}
+out:
+free(inode);
+free(buf);
+closesocket(fd);
+
+return ret;
+}
+
 static int sd_create(const char *filename, QEMUOptionParameter *options)
 {
 int ret;
@@ -1301,6 +1352,7 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 BDRVSheepdogState s;
 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
 uint32_t snapid;
+int prealloc = 0;
 
 strstart(filename, sheepdog:, (const char **)filename);
 
@@ -1317,6 +1369,16 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 vdi_size = options-value.n;
 } else if (!strcmp(options-name, BLOCK_OPT_BACKING_FILE)) {
 backing_file = options-value.s;
+} else if (!strcmp(options-name, BLOCK_OPT_PREALLOC)) {
+if (!options-value.s || !strcmp(options-value.s, off)) {
+prealloc = 0;
+} else if (!strcmp(options-value.s, data)) {
+prealloc = 1;
+} else {
+error_report(Invalid preallocation mode: '%s'\n,
+options-value.s);
+return -EINVAL;
+}
 }
 options++;
 }
@@ -1354,7 +1416,11 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 bdrv_delete(bs);
 }
 
-return do_sd_create((char *)vdi, vdi_size, base_vid, vid, 0, s.addr, 
s.port);
+ret = do_sd_create((char *)vdi, vdi_size, base_vid, vid, 0, s.addr, 
s.port);
+if (!prealloc || ret)
+return ret;
+
+return sd_prealloc(vid, vdi_size);
 }
 
 static void sd_close(BlockDriverState *bs)
@@ -1990,6 +2056,11 @@ static QEMUOptionParameter sd_create_options[] = {
 .type = OPT_STRING,
 .help = File name of a base image
 },
+{
+.name = BLOCK_OPT_PREALLOC,
+.type = OPT_STRING,
+.help = Preallocation mode (allowed values: off, data)
+},
 { NULL }
 };
 
-- 
1.5.6.5




Re: [Qemu-devel] [PATCH] sheepdog: add data preallocation support

2011-05-23 Thread MORITA Kazutaka
At Mon, 23 May 2011 10:19:13 +0100,
Stefan Hajnoczi wrote:
 
 On Sat, May 21, 2011 at 1:35 PM, MORITA Kazutaka
 morita.kazut...@lab.ntt.co.jp wrote:
  +static int sd_prealloc(uint32_t vid, int64_t vdi_size)
  +{
  +    int fd, ret;
  +    SheepdogInode *inode;
  +    char *buf;
  +    unsigned long idx, max_idx;
 [...]
  +    max_idx = (vdi_size + SD_DATA_OBJ_SIZE - 1) / SD_DATA_OBJ_SIZE;
  +
  +    for (idx = 0; idx  max_idx; idx++) {
 
 Do you want to use uint64_t here instead of unsigned long, which may
 be too small on 32-bit hosts?

The index of a Sheepdog data object is within 32-bit range, so using
an unsigned long is safe here.
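
For example, assuming the values from the Sheepdog source of this era
(SD_DATA_OBJ_SIZE of 4 MB and a maximum VDI size of 4 TB; neither number is
quoted in this thread), max_idx is at most 4 TB / 4 MB = 2^20, which fits
easily in 32 bits and therefore in an unsigned long on both 32-bit and
64-bit hosts.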

Thanks,

Kazutaka



Re: [Qemu-devel] [PATCH] sheepdog: add data preallocation support

2011-07-05 Thread MORITA Kazutaka
At Fri, 01 Jul 2011 10:29:09 +0200,
Kevin Wolf wrote:
 
 Am 21.05.2011 14:35, schrieb MORITA Kazutaka:
  This introduces a qemu-img create option for sheepdog which allows the
  data to be preallocated (note that sheepdog always preallocates
  metadata).  This is necessary to use Sheepdog volumes as a backend
  storage for iSCSI target.  More information is available at
  https://sourceforge.net/apps/trac/sheepdog/wiki/General%20Protocol%20Support
  
  The option is disabled by default and you need to enable it like the
  following:
  
  qemu-img create sheepdog:test -o preallocation=data 1G
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
  Signed-off-by: FUJITA Tomonori fujita.tomon...@lab.ntt.co.jp
 
 Hm, looks like I forgot about this patch and nobody pinged me...
 
   block/sheepdog.c |   73 
  +-
   1 files changed, 72 insertions(+), 1 deletions(-)
  
  diff --git a/block/sheepdog.c b/block/sheepdog.c
  index 0392ca8..38ca9aa 100644
  --- a/block/sheepdog.c
  +++ b/block/sheepdog.c
  @@ -1292,6 +1292,57 @@ static int do_sd_create(char *filename, int64_t 
  vdi_size,
   return 0;
   }
   
  +static int sd_prealloc(uint32_t vid, int64_t vdi_size)
 
 General comment to this function: Wouldn't it be easier to call the
 existing bdrv_write function in a loop instead of reimplementing the
 write manually? Though there may be a reason for it that I'm just missing.

Yes, it's much easier!  I'll rewrite this function with those, thanks.

 
  +{
  +int fd, ret;
  +SheepdogInode *inode;
  +char *buf;
  +unsigned long idx, max_idx;
  +
  +fd = connect_to_sdog(NULL, NULL);
  +if (fd  0) {
  +return -EIO;
  +}
  +
  +inode = qemu_malloc(sizeof(*inode));
  +buf = qemu_malloc(SD_DATA_OBJ_SIZE);
  +
  +ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid),
  +  0, sizeof(*inode), 0);
 
 No error handling?
 
  +
  +max_idx = (vdi_size + SD_DATA_OBJ_SIZE - 1) / SD_DATA_OBJ_SIZE;
  +
  +for (idx = 0; idx  max_idx; idx++) {
  +uint64_t oid;
  +oid = vid_to_data_oid(vid, idx);
  +
  +if (inode-data_vdi_id[idx]) {
  +ret = read_object(fd, buf, 
  vid_to_vdi_oid(inode-data_vdi_id[idx]),
  +  1, SD_DATA_OBJ_SIZE, 0);
  +if (ret)
  +goto out;
 
 Missing braces.
 
 Also, what is this if branch doing? Is it to ensure that we don't
 overwrite existing data? But then, isn't an image always empty when we
 preallocate it?

This branch handles a cloned image, which is created with the -b
option.  It reads data from the backing file (read_object returns
zero when it succeeds) instead of filling the buffer with zeros.

 
  +} else {
  +memset(buf, 0, SD_DATA_OBJ_SIZE);
  +}
  +
  +ret = write_object(fd, buf, oid, 1, SD_DATA_OBJ_SIZE, 0, 1);
  +if (ret)
  +goto out;
 
 Braces
 
  +
  +inode-data_vdi_id[idx] = vid;
  +ret = write_object(fd, (char *)inode, vid_to_vdi_oid(vid),
  +   1, sizeof(*inode), 0, 0);
  +if (ret)
  +goto out;
 
 Same here
 
  +}
  +out:
  +free(inode);
  +free(buf);
  +closesocket(fd);
  +
  +return ret;
  +}
  +
   static int sd_create(const char *filename, QEMUOptionParameter *options)
   {
   int ret;
  @@ -1301,6 +1352,7 @@ static int sd_create(const char *filename, 
  QEMUOptionParameter *options)
   BDRVSheepdogState s;
   char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
   uint32_t snapid;
  +int prealloc = 0;
   
   strstart(filename, sheepdog:, (const char **)filename);
   
  @@ -1317,6 +1369,16 @@ static int sd_create(const char *filename, 
  QEMUOptionParameter *options)
   vdi_size = options-value.n;
   } else if (!strcmp(options-name, BLOCK_OPT_BACKING_FILE)) {
   backing_file = options-value.s;
  +} else if (!strcmp(options-name, BLOCK_OPT_PREALLOC)) {
  +if (!options-value.s || !strcmp(options-value.s, off)) {
  +prealloc = 0;
  +} else if (!strcmp(options-value.s, data)) {
  +prealloc = 1;
  +} else {
  +error_report(Invalid preallocation mode: '%s'\n,
  +options-value.s);
  +return -EINVAL;
  +}
   }
   options++;
   }
  @@ -1354,7 +1416,11 @@ static int sd_create(const char *filename, 
  QEMUOptionParameter *options)
   bdrv_delete(bs);
   }
   
  -return do_sd_create((char *)vdi, vdi_size, base_vid, vid, 0, s.addr, 
  s.port);
  +ret = do_sd_create((char *)vdi, vdi_size, base_vid, vid, 0, s.addr, 
  s.port);
  +if (!prealloc || ret)
  +return ret;
 
 And here.
 
  +
  +return sd_prealloc(vid, vdi_size);
   }
   
   static void sd_close(BlockDriverState *bs

[Qemu-devel] [PATCH v2] sheepdog: add full data preallocation support

2011-07-05 Thread MORITA Kazutaka
This introduces qemu-img create option for sheepdog which allows the
data to be fully preallocated (note that sheepdog always preallocates
metadata).

The option is disabled by default and you need to enable it like the
following:

qemu-img create sheepdog:test -o preallocation=full 1G

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
Signed-off-by: FUJITA Tomonori fujita.tomon...@lab.ntt.co.jp
---
 block/sheepdog.c |   71 +++--
 1 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 0392ca8..5b1eb57 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1292,6 +1292,49 @@ static int do_sd_create(char *filename, int64_t vdi_size,
 return 0;
 }
 
+static int sd_prealloc(const char *filename)
+{
+BlockDriverState *bs = NULL;
+uint32_t idx, max_idx;
+int64_t vdi_size;
+void *buf = qemu_mallocz(SD_DATA_OBJ_SIZE);
+int ret;
+
+ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+if (ret < 0) {
+goto out;
+}
+
+vdi_size = bdrv_getlength(bs);
+if (vdi_size < 0) {
+ret = vdi_size;
+goto out;
+}
+max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+for (idx = 0; idx < max_idx; idx++) {
+/*
+ * The created image can be a cloned image, so we need to read
+ * a data from the source image.
+ */
+ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+if (ret < 0) {
+goto out;
+}
+ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+if (ret < 0) {
+goto out;
+}
+}
+out:
+if (bs) {
+bdrv_delete(bs);
+}
+qemu_free(buf);
+
+return ret;
+}
+
 static int sd_create(const char *filename, QEMUOptionParameter *options)
 {
 int ret;
@@ -1301,13 +1344,15 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 BDRVSheepdogState s;
 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
 uint32_t snapid;
+int prealloc = 0;
+const char *vdiname;
 
-strstart(filename, sheepdog:, (const char **)filename);
+strstart(filename, sheepdog:, vdiname);
 
 memset(s, 0, sizeof(s));
 memset(vdi, 0, sizeof(vdi));
 memset(tag, 0, sizeof(tag));
-if (parse_vdiname(&s, filename, vdi, &snapid, tag) < 0) {
+if (parse_vdiname(&s, vdiname, vdi, &snapid, tag) < 0) {
 error_report("invalid filename\n");
 return -EINVAL;
 }
@@ -1317,6 +1362,16 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 vdi_size = options-value.n;
 } else if (!strcmp(options-name, BLOCK_OPT_BACKING_FILE)) {
 backing_file = options-value.s;
+} else if (!strcmp(options-name, BLOCK_OPT_PREALLOC)) {
+if (!options-value.s || !strcmp(options-value.s, off)) {
+prealloc = 0;
+} else if (!strcmp(options-value.s, full)) {
+prealloc = 1;
+} else {
+error_report(Invalid preallocation mode: '%s',
+ options-value.s);
+return -EINVAL;
+}
 }
 options++;
 }
@@ -1354,7 +1409,12 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 bdrv_delete(bs);
 }
 
-return do_sd_create((char *)vdi, vdi_size, base_vid, vid, 0, s.addr, 
s.port);
+ret = do_sd_create(vdi, vdi_size, base_vid, vid, 0, s.addr, s.port);
+if (!prealloc || ret) {
+return ret;
+}
+
+return sd_prealloc(filename);
 }
 
 static void sd_close(BlockDriverState *bs)
@@ -1990,6 +2050,11 @@ static QEMUOptionParameter sd_create_options[] = {
 .type = OPT_STRING,
 .help = File name of a base image
 },
+{
+.name = BLOCK_OPT_PREALLOC,
+.type = OPT_STRING,
+.help = Preallocation mode (allowed values: off, full)
+},
 { NULL }
 };
 
-- 
1.7.2.5




Re: [Qemu-devel] [Sheepdog] [PATCH] sheepdog: add data preallocation support

2011-07-08 Thread MORITA Kazutaka
At Wed, 06 Jul 2011 09:53:32 +0200,
Kevin Wolf wrote:
 
 Am 05.07.2011 20:21, schrieb MORITA Kazutaka:
  +
  +max_idx = (vdi_size + SD_DATA_OBJ_SIZE - 1) / SD_DATA_OBJ_SIZE;
  +
  +for (idx = 0; idx  max_idx; idx++) {
  +uint64_t oid;
  +oid = vid_to_data_oid(vid, idx);
  +
  +if (inode-data_vdi_id[idx]) {
  +ret = read_object(fd, buf, 
  vid_to_vdi_oid(inode-data_vdi_id[idx]),
  +  1, SD_DATA_OBJ_SIZE, 0);
  +if (ret)
  +goto out;
 
  Missing braces.
 
  Also, what is this if branch doing? Is it to ensure that we don't
  overwrite existing data? But then, isn't an image always empty when we
  preallocate it?
  
  This branch is for handling a cloned image, which is created with -b
  option.  This branch reads data from the backing file (read_object
  returns zero when it succeeds) instead of filling buffer with zero.
 
 Oh, I see. You support preallocation even with backing files. And
 suddenly it makes perfect sense. :-)
 
 (Although after completing preallocation, you won't need the backing
 file any more as all of it has been copied into the image. Maybe we
 should drop the reference then?)

Though we could drop it, Sheepdog uses the reference to show the VM
image relationships in a tree format, as VMware does.  So as far as
the Sheepdog protocol is concerned, I think we should keep it.


Thanks,

Kazutaka



[Qemu-devel] [PATCH] sheepdog: use coroutines

2011-08-12 Thread MORITA Kazutaka
This makes the sheepdog block driver support bdrv_co_readv/writev
instead of bdrv_aio_readv/writev.

With this patch, Sheepdog network I/O becomes fully asynchronous.  The
block driver yields back when send/recv returns EAGAIN, and is resumed
when the sheepdog network connection is ready for the operation.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |  150 +
 1 files changed, 93 insertions(+), 57 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index e150ac0..e283c34 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -274,7 +274,7 @@ struct SheepdogAIOCB {
 int ret;
 enum AIOCBState aiocb_type;
 
-QEMUBH *bh;
+Coroutine *coroutine;
 void (*aio_done_func)(SheepdogAIOCB *);
 
 int canceled;
@@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
 char *port;
 int fd;
 
+CoMutex lock;
+Coroutine *co_send;
+Coroutine *co_recv;
+
 uint32_t aioreq_seq_num;
 QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
 } BDRVSheepdogState;
@@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
 /*
  * Sheepdog I/O handling:
  *
- * 1. In the sd_aio_readv/writev, read/write requests are added to the
- *QEMU Bottom Halves.
- *
- * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
- *requests to the server and link the requests to the
- *outstanding_list in the BDRVSheepdogState.  we exits the
- *function without waiting for receiving the response.
+ * 1. In sd_co_rw_vector, we send the I/O requests to the server and
+ *link the requests to the outstanding_list in the
+ *BDRVSheepdogState.  The function exits without waiting for
+ *receiving the response.
  *
- * 3. We receive the response in aio_read_response, the fd handler to
+ * 2. We receive the response in aio_read_response, the fd handler to
  *the sheepdog connection.  If metadata update is needed, we send
  *the write request to the vdi object in sd_write_done, the write
- *completion function.  The AIOCB callback is not called until all
- *the requests belonging to the AIOCB are finished.
+ *completion function.  We switch back to sd_co_readv/writev after
+ *all the requests belonging to the AIOCB are finished.
  */
 
 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq 
*aio_req)
 static void sd_finish_aiocb(SheepdogAIOCB *acb)
 {
 if (!acb-canceled) {
-acb-common.cb(acb-common.opaque, acb-ret);
+qemu_coroutine_enter(acb-coroutine, NULL);
 }
 qemu_aio_release(acb);
 }
@@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
  * Sheepdog cannot cancel the requests which are already sent to
  * the servers, so we just complete the request with -EIO here.
  */
-acb-common.cb(acb-common.opaque, -EIO);
+acb-ret = -EIO;
+qemu_coroutine_enter(acb-coroutine, NULL);
 acb-canceled = 1;
 }
 
@@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 
 acb-aio_done_func = NULL;
 acb-canceled = 0;
-acb-bh = NULL;
+acb-coroutine = qemu_coroutine_self();
 acb-ret = 0;
 QLIST_INIT(acb-aioreq_head);
 return acb;
 }
 
-static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
-{
-if (acb-bh) {
-error_report(bug: %d %d, acb-aiocb_type, acb-aiocb_type);
-return -EIO;
-}
-
-acb-bh = qemu_bh_new(cb, acb);
-qemu_bh_schedule(acb-bh);
-return 0;
-}
-
 #ifdef _WIN32
 
 struct msghdr {
@@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, 
int len,
 again:
 ret = do_send_recv(sockfd, iov, len, iov_offset, write);
 if (ret  0) {
-if (errno == EINTR || errno == EAGAIN) {
+if (errno == EINTR) {
+goto again;
+}
+if (errno == EAGAIN) {
+if (qemu_in_coroutine()) {
+qemu_coroutine_yield();
+}
 goto again;
 }
 error_report(failed to recv a rsp, %s, strerror(errno));
@@ -793,14 +789,14 @@ static void aio_read_response(void *opaque)
 unsigned long idx;
 
 if (QLIST_EMPTY(s-outstanding_aio_head)) {
-return;
+goto out;
 }
 
 /* read a header */
 ret = do_read(fd, rsp, sizeof(rsp));
 if (ret) {
 error_report(failed to get the header, %s, strerror(errno));
-return;
+goto out;
 }
 
 /* find the right aio_req from the outstanding_aio list */
@@ -811,7 +807,7 @@ static void aio_read_response(void *opaque)
 }
 if (!aio_req) {
 error_report(cannot find aio_req %x, rsp.id);
-return;
+goto out;
 }
 
 acb = aio_req-aiocb;
@@ -847,7 +843,7 @@ static void aio_read_response(void *opaque)
aio_req-iov_offset

Re: [Qemu-devel] [PATCH] sheepdog: use coroutines

2011-08-23 Thread MORITA Kazutaka
At Tue, 23 Aug 2011 14:29:50 +0200,
Kevin Wolf wrote:
 
 Am 12.08.2011 14:33, schrieb MORITA Kazutaka:
  This makes the sheepdog block driver support bdrv_co_readv/writev
  instead of bdrv_aio_readv/writev.
  
  With this patch, Sheepdog network I/O becomes fully asynchronous.  The
  block driver yields back when send/recv returns EAGAIN, and is resumed
  when the sheepdog network connection is ready for the operation.
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
  ---
   block/sheepdog.c |  150 
  +
   1 files changed, 93 insertions(+), 57 deletions(-)
  
  diff --git a/block/sheepdog.c b/block/sheepdog.c
  index e150ac0..e283c34 100644
  --- a/block/sheepdog.c
  +++ b/block/sheepdog.c
  @@ -274,7 +274,7 @@ struct SheepdogAIOCB {
   int ret;
   enum AIOCBState aiocb_type;
   
  -QEMUBH *bh;
  +Coroutine *coroutine;
   void (*aio_done_func)(SheepdogAIOCB *);
   
   int canceled;
  @@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
   char *port;
   int fd;
   
  +CoMutex lock;
  +Coroutine *co_send;
  +Coroutine *co_recv;
  +
   uint32_t aioreq_seq_num;
   QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
   } BDRVSheepdogState;
  @@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
   /*
* Sheepdog I/O handling:
*
  - * 1. In the sd_aio_readv/writev, read/write requests are added to the
  - *QEMU Bottom Halves.
  - *
  - * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
  - *requests to the server and link the requests to the
  - *outstanding_list in the BDRVSheepdogState.  we exits the
  - *function without waiting for receiving the response.
  + * 1. In sd_co_rw_vector, we send the I/O requests to the server and
  + *link the requests to the outstanding_list in the
  + *BDRVSheepdogState.  The function exits without waiting for
  + *receiving the response.
*
  - * 3. We receive the response in aio_read_response, the fd handler to
  + * 2. We receive the response in aio_read_response, the fd handler to
*the sheepdog connection.  If metadata update is needed, we send
*the write request to the vdi object in sd_write_done, the write
  - *completion function.  The AIOCB callback is not called until all
  - *the requests belonging to the AIOCB are finished.
  + *completion function.  We switch back to sd_co_readv/writev after
  + *all the requests belonging to the AIOCB are finished.
*/
   
   static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB 
  *acb,
  @@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, 
  AIOReq *aio_req)
   static void sd_finish_aiocb(SheepdogAIOCB *acb)
   {
   if (!acb-canceled) {
  -acb-common.cb(acb-common.opaque, acb-ret);
  +qemu_coroutine_enter(acb-coroutine, NULL);
   }
   qemu_aio_release(acb);
   }
  @@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
* Sheepdog cannot cancel the requests which are already sent to
* the servers, so we just complete the request with -EIO here.
*/
  -acb-common.cb(acb-common.opaque, -EIO);
  +acb-ret = -EIO;
  +qemu_coroutine_enter(acb-coroutine, NULL);
   acb-canceled = 1;
   }
   
  @@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState 
  *bs, QEMUIOVector *qiov,
   
   acb-aio_done_func = NULL;
   acb-canceled = 0;
  -acb-bh = NULL;
  +acb-coroutine = qemu_coroutine_self();
   acb-ret = 0;
   QLIST_INIT(acb-aioreq_head);
   return acb;
   }
   
  -static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
  -{
  -if (acb-bh) {
  -error_report(bug: %d %d, acb-aiocb_type, acb-aiocb_type);
  -return -EIO;
  -}
  -
  -acb-bh = qemu_bh_new(cb, acb);
  -qemu_bh_schedule(acb-bh);
  -return 0;
  -}
  -
   #ifdef _WIN32
   
   struct msghdr {
  @@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec 
  *iov, int len,
   again:
   ret = do_send_recv(sockfd, iov, len, iov_offset, write);
   if (ret  0) {
  -if (errno == EINTR || errno == EAGAIN) {
  +if (errno == EINTR) {
  +goto again;
  +}
  +if (errno == EAGAIN) {
  +if (qemu_in_coroutine()) {
  +qemu_coroutine_yield();
  +}
 
 Who reenters the coroutine if we yield here?

The fd handlers, co_write_request() and co_read_response(), will call
qemu_coroutine_enter() for the coroutine.  It will be restarted after
the sheepdog network connection becomes ready.
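
As a rough illustration of that pattern (these are not the actual
block/sheepdog.c handlers, which do more bookkeeping, and arm_fd_handlers()
is invented for the sketch; the qemu_aio_set_fd_handler() signature is
assumed from the patches quoted elsewhere in this thread):

/* Illustrative sketch only: the fd callbacks simply re-enter whichever
 * coroutine is parked waiting for the socket to become ready again. */
static void co_read_response(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    if (s->co_recv) {
        qemu_coroutine_enter(s->co_recv, NULL);  /* reader yielded on EAGAIN */
    }
}

static void co_write_request(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    if (s->co_send) {
        qemu_coroutine_enter(s->co_send, NULL);  /* writer yielded on EAGAIN */
    }
}

static void arm_fd_handlers(BDRVSheepdogState *s)
{
    /* the write callback is only needed while a send is actually pending */
    qemu_aio_set_fd_handler(s->fd, co_read_response,
                            s->co_send ? co_write_request : NULL,
                            NULL, s);
}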

 
 And of course for a coroutine based block driver I would like to see
 much less code in callback functions. But it's a good start anyway.

Yes, actually they can be much simpler.  This patch adds asynchronous
I/O support with a minimal change based on a coroutine, and I think
code

Re: [Qemu-devel] [PATCH 09/12] sheepdog: move coroutine send/recv function to generic code

2011-09-08 Thread MORITA Kazutaka
At Thu,  8 Sep 2011 17:25:02 +0200,
Paolo Bonzini wrote:
 
 Outside coroutines, avoid busy waiting on EAGAIN by temporarily
 making the socket blocking.
 
 The API of qemu_recvv/qemu_sendv is slightly different from
 do_readv/do_writev because they do not handle coroutines.  It
 returns the number of bytes written before encountering an
 EAGAIN.  The specificity of yielding on EAGAIN is entirely in
 qemu-coroutine.c.
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  block/sheepdog.c |  221 +
  cutils.c |  108 ++
  qemu-common.h|3 +
  qemu-coroutine.c |   71 +
  qemu-coroutine.h |   26 +++
  5 files changed, 229 insertions(+), 200 deletions(-)
 
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index af696a5..188a8d8 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
 @@ -443,129 +443,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState 
 *bs, QEMUIOVector *qiov,
  return acb;
  }
  
 -#ifdef _WIN32
 -
 -struct msghdr {
 -struct iovec *msg_iov;
 -size_tmsg_iovlen;
 -};
 -
 -static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
 -{
 -size_t size = 0;
 -char *buf, *p;
 -int i, ret;
 -
 -/* count the msg size */
 -for (i = 0; i  msg-msg_iovlen; i++) {
 -size += msg-msg_iov[i].iov_len;
 -}
 -buf = g_malloc(size);
 -
 -p = buf;
 -for (i = 0; i  msg-msg_iovlen; i++) {
 -memcpy(p, msg-msg_iov[i].iov_base, msg-msg_iov[i].iov_len);
 -p += msg-msg_iov[i].iov_len;
 -}
 -
 -ret = send(s, buf, size, flags);
 -
 -g_free(buf);
 -return ret;
 -}
 -
 -static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
 -{
 -size_t size = 0;
 -char *buf, *p;
 -int i, ret;
 -
 -/* count the msg size */
 -for (i = 0; i  msg-msg_iovlen; i++) {
 -size += msg-msg_iov[i].iov_len;
 -}
 -buf = g_malloc(size);
 -
 -ret = qemu_recv(s, buf, size, flags);
 -if (ret  0) {
 -goto out;
 -}
 -
 -p = buf;
 -for (i = 0; i  msg-msg_iovlen; i++) {
 -memcpy(msg-msg_iov[i].iov_base, p, msg-msg_iov[i].iov_len);
 -p += msg-msg_iov[i].iov_len;
 -}
 -out:
 -g_free(buf);
 -return ret;
 -}
 -
 -#endif
 -
 -/*
 - * Send/recv data with iovec buffers
 - *
 - * This function send/recv data from/to the iovec buffer directly.
 - * The first `offset' bytes in the iovec buffer are skipped and next
 - * `len' bytes are used.
 - *
 - * For example,
 - *
 - *   do_send_recv(sockfd, iov, len, offset, 1);
 - *
 - * is equals to
 - *
 - *   char *buf = malloc(size);
 - *   iov_to_buf(iov, iovcnt, buf, offset, size);
 - *   send(sockfd, buf, size, 0);
 - *   free(buf);
 - */
 -static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
 -int write)
 -{
 -struct msghdr msg;
 -int ret, diff;
 -
 -memset(msg, 0, sizeof(msg));
 -msg.msg_iov = iov;
 -msg.msg_iovlen = 1;
 -
 -len += offset;
 -
 -while (iov-iov_len  len) {
 -len -= iov-iov_len;
 -
 -iov++;
 -msg.msg_iovlen++;
 -}
 -
 -diff = iov-iov_len - len;
 -iov-iov_len -= diff;
 -
 -while (msg.msg_iov-iov_len = offset) {
 -offset -= msg.msg_iov-iov_len;
 -
 -msg.msg_iov++;
 -msg.msg_iovlen--;
 -}
 -
 -msg.msg_iov-iov_base = (char *) msg.msg_iov-iov_base + offset;
 -msg.msg_iov-iov_len -= offset;
 -
 -if (write) {
 -ret = sendmsg(sockfd, msg, 0);
 -} else {
 -ret = recvmsg(sockfd, msg, 0);
 -}
 -
 -msg.msg_iov-iov_base = (char *) msg.msg_iov-iov_base - offset;
 -msg.msg_iov-iov_len += offset;
 -
 -iov-iov_len += diff;
 -return ret;
 -}
 -
  static int connect_to_sdog(const char *addr, const char *port)
  {
  char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
 @@ -618,65 +495,6 @@ success:
  return fd;
  }
  
 -static int do_readv_writev(int sockfd, struct iovec *iov, int len,
 -   int iov_offset, int write)
 -{
 -int ret;
 -again:
 -ret = do_send_recv(sockfd, iov, len, iov_offset, write);
 -if (ret  0) {
 -if (errno == EINTR) {
 -goto again;
 -}
 -if (errno == EAGAIN) {
 -if (qemu_in_coroutine()) {
 -qemu_coroutine_yield();
 -}
 -goto again;
 -}
 -error_report(failed to recv a rsp, %s, strerror(errno));
 -return 1;
 -}
 -
 -iov_offset += ret;
 -len -= ret;
 -if (len) {
 -goto again;
 -}
 -
 -return 0;
 -}
 -
 -static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
 -{
 -return do_readv_writev(sockfd, iov, len, iov_offset, 0);
 -}
 -
 -static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
 -{
 -return do_readv_writev(sockfd, iov, len

Re: [Qemu-devel] [PATCH v2 09/12] sheepdog: move coroutine send/recv function to generic code

2011-09-12 Thread MORITA Kazutaka
At Fri,  9 Sep 2011 10:11:38 +0200,
Paolo Bonzini wrote:
 
 Outside coroutines, avoid busy waiting on EAGAIN by temporarily
 making the socket blocking.
 
 The API of qemu_recvv/qemu_sendv is slightly different from
 do_readv/do_writev because they do not handle coroutines.  It
 returns the number of bytes written before encountering an
 EAGAIN.  The specificity of yielding on EAGAIN is entirely in
 qemu-coroutine.c.
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
   Thanks for the review.  I checked with qemu-io that all of
 
   readv -v 0 524288 (x8)
 readv -v 0 262144 (x16)
 readv -v 0 1024 (x4096)
 readv -v 0 1536 (x2730) 1024
 readv -v 0 1024 512 (x2730) 1024
 
   work and produce the same output, while previously they would fail.
   Looks like it's hard to trigger the code just with qemu.
 
  block/sheepdog.c |  225 
 ++
  cutils.c |  103 +
  qemu-common.h|3 +
  qemu-coroutine.c |   70 +
  qemu-coroutine.h |   26 ++
  5 files changed, 225 insertions(+), 202 deletions(-)

Thanks, this passed qemu-iotests on Sheepdog.

Reviewed-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [PATCH v2 01/15] sheepdog: add coroutine_fn markers

2011-09-16 Thread MORITA Kazutaka
At Fri, 16 Sep 2011 16:25:38 +0200,
Paolo Bonzini wrote:
 
 This makes the following patch easier to review.
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  block/sheepdog.c |   14 +++---
  1 files changed, 7 insertions(+), 7 deletions(-)

Reviewed-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [PATCH v2 02/15] add socket_set_block

2011-09-16 Thread MORITA Kazutaka
At Fri, 16 Sep 2011 16:25:39 +0200,
Paolo Bonzini wrote:
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  oslib-posix.c |7 +++
  oslib-win32.c |6 ++
  qemu_socket.h |1 +
  3 files changed, 14 insertions(+), 0 deletions(-)

Reviewed-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [PATCH v2 03/15] sheepdog: move coroutine send/recv function to generic code

2011-09-17 Thread MORITA Kazutaka
At Fri, 16 Sep 2011 16:25:40 +0200,
Paolo Bonzini wrote:
 
 Outside coroutines, avoid busy waiting on EAGAIN by temporarily
 making the socket blocking.
 
 The API of qemu_recvv/qemu_sendv is slightly different from
 do_readv/do_writev because they do not handle coroutines.  It
 returns the number of bytes written before encountering an
 EAGAIN.  The specificity of yielding on EAGAIN is entirely in
 qemu-coroutine.c.
 
 Reviewed-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  block/sheepdog.c |  225 
 ++
  cutils.c |  177 ++
  qemu-common.h|   30 +++
  3 files changed, 230 insertions(+), 202 deletions(-)

It seems this patch causes a compile error of qemu-ga.

Other things I noticed:

  static int send_req(int sockfd, SheepdogReq *hdr, void *data,
  unsigned int *wlen)
  {
 @@ -691,10 +509,9 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
 *data,
  iov[1].iov_len = *wlen;
  }
  
 -ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
 -if (ret) {
 +ret = qemu_sendv(sockfd, iov, sizeof(*hdr) + *wlen, 0);

This is wrong because qemu_sendv() may return a smaller value than
(sizeof(*hdr) + *wlen).  We need to do something like qemu_write_full()
here, looping until all of the bytes have been sent.
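
Roughly the kind of loop meant here, as a sketch only (it assumes the
qemu_sendv(sockfd, iov, len, iov_offset) prototype from this patch, inside
QEMU where qemu-common.h provides it, and is not the fix that was actually
applied):

static int send_full(int sockfd, struct iovec *iov, int len)
{
    int done = 0;

    /* qemu_sendv() may stop early, so keep going until every byte of the
     * iovec has been pushed out, mirroring what qemu_write_full() does
     * for flat buffers.  A negative return is a real error. */
    while (done < len) {
        int ret = qemu_sendv(sockfd, iov, len - done, done);
        if (ret < 0) {
            return ret;
        }
        done += ret;
    }
    return done;
}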

 +if (ret  0) {
  error_report(failed to send a req, %s, strerror(errno));
 -ret = -1;
  }
  
  return ret;
 @@ -704,17 +521,19 @@ static int do_req(int sockfd, SheepdogReq *hdr, void 
 *data,
unsigned int *wlen, unsigned int *rlen)
  {
  int ret;
 +struct iovec iov;
  
 +socket_set_block(sockfd);
  ret = send_req(sockfd, hdr, data, wlen);
 -if (ret) {
 -ret = -1;
 +if (ret  0) {
  goto out;
  }
  
 -ret = do_read(sockfd, hdr, sizeof(*hdr));
 -if (ret) {
 +iov.iov_base = hdr;
 +iov.iov_len = sizeof(*hdr);
 +ret = qemu_recvv(sockfd, iov, sizeof(*hdr), 0);

qemu_recvv() may also return a smaller value than sizeof(*hdr) here.

 +if (ret  0) {
  error_report(failed to get a rsp, %s, strerror(errno));
 -ret = -1;
  goto out;
  }
  
 @@ -723,15 +542,17 @@ static int do_req(int sockfd, SheepdogReq *hdr, void 
 *data,
  }
  
  if (*rlen) {
 -ret = do_read(sockfd, data, *rlen);
 -if (ret) {
 +iov.iov_base = data;
 +iov.iov_len = *rlen;
 +ret = qemu_recvv(sockfd, iov, *rlen, 0);

Same here.

 +if (ret  0) {
  error_report(failed to get the data, %s, strerror(errno));
 -ret = -1;
  goto out;
  }
  }
  ret = 0;
  out:
 +socket_set_nonblock(sockfd);
  return ret;
  }
  

[snip]

 +
 +/*
 + * Send/recv data with iovec buffers
 + *
 + * This function send/recv data from/to the iovec buffer directly.
 + * The first `offset' bytes in the iovec buffer are skipped and next
 + * `len' bytes are used.
 + *
 + * For example,
 + *
 + *   do_sendv_recvv(sockfd, iov, len, offset, 1);
 + *
 + * is equal to
 + *
 + *   char *buf = malloc(size);
 + *   iov_to_buf(iov, iovcnt, buf, offset, size);
 + *   send(sockfd, buf, size, 0);
 + *   free(buf);
 + */
 +static int do_sendv_recvv(int sockfd, struct iovec *iov, int len, int offset,
 +  int do_sendv)
 +{
 +int ret, diff, iovlen;
 +struct iovec *last_iov;
 +
 +/* last_iov is inclusive, so count from one.  */
 +iovlen = 1;
 +last_iov = iov;
 +len += offset;
 +
 +while (last_iov->iov_len < len) {
 +len -= last_iov->iov_len;
 +
 +last_iov++;
 +iovlen++;
 +}
 +
 +diff = last_iov->iov_len - len;
 +last_iov->iov_len -= diff;
 +
 +while (iov->iov_len <= offset) {
 +offset -= iov->iov_len;
 +
 +iov++;
 +iovlen--;
 +}
 +
 +iov->iov_base = (char *) iov->iov_base + offset;
 +iov->iov_len -= offset;
 +
 +{
 +#ifdef CONFIG_IOVEC
 +struct msghdr msg;
 +memset(&msg, 0, sizeof(msg));
 +msg.msg_iov = iov;
 +msg.msg_iovlen = iovlen;
 +
 +do {
 +if (do_sendv) {
 +ret = sendmsg(sockfd, &msg, 0);
 +} else {
 +ret = recvmsg(sockfd, &msg, 0);
 +}
 +} while (ret == -1 && errno == EINTR);
 +#else
 +struct iovec *p = iov;
 +ret = 0;
 +while (iovlen > 0) {
 +int rc;
 +if (do_sendv) {
 +rc = send(sockfd, p->iov_base, p->iov_len, 0);
 +} else {
 +rc = qemu_recv(sockfd, p->iov_base, p->iov_len, 0);
 +}
 +if (rc == -1) {
 +if (errno == EINTR) {
 +continue;
 +}
 +if (ret == 0) {
 +ret = -1;
 +}
 +break;
 +}
 +iovlen--, p

Re: [Qemu-devel] [PATCH v2 04/15] coroutine-io: handle zero returns from recv

2011-09-17 Thread MORITA Kazutaka
At Fri, 16 Sep 2011 16:25:41 +0200,
Paolo Bonzini wrote:
 
 When the other side is shutdown, read returns zero (writes return EPIPE).
 In this case, care must be taken to avoid infinite loops.  This error
 was already present in sheepdog.
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  cutils.c |8 +++-
  1 files changed, 7 insertions(+), 1 deletions(-)
 
 diff --git a/cutils.c b/cutils.c
 index b302020..295187f 100644
 --- a/cutils.c
 +++ b/cutils.c
 @@ -501,8 +501,11 @@ static int do_sendv_recvv(int sockfd, struct iovec *iov, 
 int len, int offset,
  }
  break;
  }
 -iovlen--, p++;
 +if (rc == 0) {
 +break;
 +}
  ret += rc;
 +iovlen--, p++;
  }
  #endif
  }
 @@ -567,6 +570,9 @@ int coroutine_fn qemu_co_sendv(int sockfd, struct iovec 
 *iov,
  }
  break;
  }
 +if (ret == 0) {
 +break;
 +}
  total += ret, len -= ret;
  }

When EPIPE is set, write() returns -1 doesn't it?

It looks like qemu_co_recvv() handles a zero return correctly, so I
think this patch is not needed.
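
For reference, the hazard the commit message describes, shown with a
plain-socket helper (illustrative only, not QEMU code):

#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* A "read exactly len bytes" loop must treat a zero return from recv()
 * (the peer shut down the connection) as end of stream; retrying on it
 * would spin forever waiting for bytes that will never arrive. */
static ssize_t recv_full(int sockfd, void *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t ret = recv(sockfd, (char *)buf + done, len - done, 0);
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;          /* real error */
        }
        if (ret == 0) {
            break;              /* connection closed: report a short read */
        }
        done += ret;
    }
    return done;
}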


Thanks,

Kazutaka



Re: [Qemu-devel] [PATCH v2 03/15] sheepdog: move coroutine send/recv function to generic code

2011-09-17 Thread MORITA Kazutaka
At Sat, 17 Sep 2011 16:49:22 +0200,
Paolo Bonzini wrote:
 
 On 09/17/2011 08:29 AM, MORITA Kazutaka wrote:
+#else
+struct iovec *p = iov;
+ret = 0;
+while (iovlen  0) {
+int rc;
+if (do_sendv) {
+rc = send(sockfd, p-iov_base, p-iov_len, 0);
+} else {
+rc = qemu_recv(sockfd, p-iov_base, p-iov_len, 0);
+}
+if (rc == -1) {
+if (errno == EINTR) {
+continue;
+}
+if (ret == 0) {
+ret = -1;
+}
+break;
+}
+iovlen--, p++;
+ret += rc;
+}
  This code can be called inside coroutines with a non-blocking fd, so
  should we avoid busy waiting?
 
 It doesn't busy wait, it exits with EAGAIN.  I'll squash in here the 

Oops, you're right.  Sorry for the noise.

Thanks,

Kazutaka


 first hunk of patch 4, which is needed.
 
 qemu_co_recvv already handles reads that return zero, unlike sheepdog's 
 do_readv_writev.  I probably moved it there inadvertently while moving 
 code around to cutils.c, but in order to fix qemu-ga I need to create a 
 new file qemu-coroutine-io.c.
 
 Kevin, do you want me to resubmit everything, or are you going to apply 
 some more patches to the block branch (5 to 12 should be fine)?
 
 Paolo
 



[Qemu-devel] [PATCH 6/6] sheepdog: traverse pending_list from the first for each time

2012-06-26 Thread MORITA Kazutaka
The pending list can be modified in another coroutine context
(sd_co_rw_vector), so we need to traverse the list from the beginning
again after sending each pending request.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   22 --
 1 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index f6cd517..6e73efb 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -634,21 +634,31 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState 
*s, AIOReq *aio_req,
struct iovec *iov, int niov, int create,
enum AIOCBState aiocb_type);
 
+
+static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
+{
+AIOReq *aio_req;
+
+QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
+if (aio_req->oid == oid) {
+return aio_req;
+}
+}
+
+return NULL;
+}
+
 /*
  * This function searchs pending requests to the object `oid', and
  * sends them.
  */
 static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
 {
-AIOReq *aio_req, *next;
+AIOReq *aio_req;
 SheepdogAIOCB *acb;
 int ret;
 
-QLIST_FOREACH_SAFE(aio_req, s-pending_aio_head, aio_siblings, next) {
-if (aio_req-oid != oid) {
-continue;
-}
-
+while ((aio_req = find_pending_req(s, oid)) != NULL) {
 acb = aio_req-aiocb;
 /* move aio_req from pending list to inflight one */
 QLIST_REMOVE(aio_req, aio_siblings);
-- 
1.7.2.5




[Qemu-devel] [PATCH 3/6] sheepdog: use coroutine based socket functions in coroutine context

2012-06-26 Thread MORITA Kazutaka
This removes blocking network I/Os in coroutine context.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   10 --
 1 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 0b49c6d..5dc1d7a 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -541,11 +541,18 @@ static coroutine_fn int send_co_req(int sockfd, 
SheepdogReq *hdr, void *data,
 return ret;
 }
 
+static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+  unsigned int *wlen, unsigned int *rlen);
+
 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
   unsigned int *wlen, unsigned int *rlen)
 {
 int ret;
 
+if (qemu_in_coroutine()) {
+return do_co_req(sockfd, hdr, data, wlen, rlen);
+}
+
 socket_set_block(sockfd);
 ret = send_req(sockfd, hdr, data, wlen);
 if (ret  0) {
@@ -1642,7 +1649,6 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 int ret;
 
 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
-/* TODO: shouldn't block here */
 ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
 if (ret < 0) {
 return ret;
@@ -1710,7 +1716,7 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 hdr.opcode = SD_OP_FLUSH_VDI;
 hdr.oid = vid_to_vdi_oid(inode-vdi_id);
 
-ret = do_co_req(s-flush_fd, (SheepdogReq *)hdr, NULL, wlen, rlen);
+ret = do_req(s-flush_fd, (SheepdogReq *)hdr, NULL, wlen, rlen);
 if (ret) {
 error_report(failed to send a request to the sheep);
 return ret;
-- 
1.7.2.5




[Qemu-devel] [PATCH 0/6] sheepdog: various fixes

2012-06-26 Thread MORITA Kazutaka
See individual patches for details.

MORITA Kazutaka (6):
  sheepdog: fix dprintf format strings
  sheepdog: restart I/O when socket becomes ready in do_co_req()
  sheepdog: use coroutine based socket functions in coroutine context
  sheepdog: make sure we don't free aiocb before sending all requests
  sheepdog: split outstanding list into inflight and pending
  sheepdog: traverse pending_list from the first for each time

 block/sheepdog.c |  130 +
 1 files changed, 81 insertions(+), 49 deletions(-)

-- 
1.7.2.5




[Qemu-devel] [PATCH 1/6] sheepdog: fix dprintf format strings

2012-06-26 Thread MORITA Kazutaka
This fixes warnings about dprintf format in debug mode.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 8877f45..afd06aa 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1571,11 +1571,11 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 }
 
 if (create) {
-dprintf(update ino (% PRIu32) % PRIu64  % PRIu64
- % PRIu64 \n, inode-vdi_id, oid,
+dprintf(update ino (% PRIu32 ) % PRIu64  % PRIu64  %ld\n,
+inode-vdi_id, oid,
 vid_to_data_oid(inode-data_vdi_id[idx], idx), idx);
 oid = vid_to_data_oid(inode-vdi_id, idx);
-dprintf(new oid %lx\n, oid);
+dprintf(new oid % PRIx64 \n, oid);
 }
 
 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, 
done);
@@ -1726,7 +1726,7 @@ static int sd_snapshot_create(BlockDriverState *bs, 
QEMUSnapshotInfo *sn_info)
 SheepdogInode *inode;
 unsigned int datalen;
 
-dprintf(sn_info: name %s id_str %s s: name %s vm_state_size %d 
+dprintf(sn_info: name %s id_str %s s: name %s vm_state_size % PRId64  
 is_snapshot %d\n, sn_info-name, sn_info-id_str,
 s-name, sn_info-vm_state_size, s-is_snapshot);
 
-- 
1.7.2.5




[Qemu-devel] [PATCH 4/6] sheepdog: make sure we don't free aiocb before sending all requests

2012-06-26 Thread MORITA Kazutaka
This patch increments the pending counter before sending requests, and
makes sure that the aiocb is not freed while sending them.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   29 -
 1 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 5dc1d7a..d4e5e3a 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -260,7 +260,6 @@ typedef struct AIOReq {
 uint32_t id;
 
 QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
-QLIST_ENTRY(AIOReq) aioreq_siblings;
 } AIOReq;
 
 enum AIOCBState {
@@ -283,8 +282,7 @@ struct SheepdogAIOCB {
 void (*aio_done_func)(SheepdogAIOCB *);
 
 int canceled;
-
-QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
+int nr_pending;
 };
 
 typedef struct BDRVSheepdogState {
@@ -388,19 +386,19 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 
 QLIST_INSERT_HEAD(s-outstanding_aio_head, aio_req,
   outstanding_aio_siblings);
-QLIST_INSERT_HEAD(acb-aioreq_head, aio_req, aioreq_siblings);
 
+acb-nr_pending++;
 return aio_req;
 }
 
-static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
+static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
 {
 SheepdogAIOCB *acb = aio_req-aiocb;
+
 QLIST_REMOVE(aio_req, outstanding_aio_siblings);
-QLIST_REMOVE(aio_req, aioreq_siblings);
 g_free(aio_req);
 
-return !QLIST_EMPTY(acb-aioreq_head);
+acb-nr_pending--;
 }
 
 static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
@@ -446,7 +444,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb-canceled = 0;
 acb-coroutine = qemu_coroutine_self();
 acb-ret = 0;
-QLIST_INIT(acb-aioreq_head);
+acb-nr_pending = 0;
 return acb;
 }
 
@@ -663,7 +661,7 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState 
*s, uint64_t oid, ui
 if (ret < 0) {
 error_report(add_aio_request is failed);
 free_aio_req(s, aio_req);
-if (QLIST_EMPTY(acb-aioreq_head)) {
+if (!acb-nr_pending) {
 sd_finish_aiocb(acb);
 }
 }
@@ -684,7 +682,6 @@ static void coroutine_fn aio_read_response(void *opaque)
 int ret;
 AIOReq *aio_req = NULL;
 SheepdogAIOCB *acb;
-int rest;
 unsigned long idx;
 
 if (QLIST_EMPTY(s-outstanding_aio_head)) {
@@ -755,8 +752,8 @@ static void coroutine_fn aio_read_response(void *opaque)
 error_report(%s, sd_strerror(rsp.result));
 }
 
-rest = free_aio_req(s, aio_req);
-if (!rest) {
+free_aio_req(s, aio_req);
+if (!acb-nr_pending) {
 /*
  * We've finished all requests which belong to the AIOCB, so
  * we can switch back to sd_co_readv/writev now.
@@ -1568,6 +1565,12 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 }
 }
 
+/*
+ * Make sure we don't free the aiocb before we are done with all requests.
+ * This additional reference is dropped at the end of this function.
+ */
+acb-nr_pending++;
+
 while (done != total) {
 uint8_t flags = 0;
 uint64_t old_oid = 0;
@@ -1636,7 +1639,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 done += len;
 }
 out:
-if (QLIST_EMPTY(acb-aioreq_head)) {
+if (!--acb-nr_pending) {
 return acb-ret;
 }
 return 1;
-- 
1.7.2.5




[Qemu-devel] [PATCH 5/6] sheepdog: split outstanding list into inflight and pending

2012-06-26 Thread MORITA Kazutaka
outstanding_list_head is used for both pending and inflight requests.
This patch splits it and improves readability.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   49 -
 1 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d4e5e3a..f6cd517 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -259,7 +259,7 @@ typedef struct AIOReq {
 uint8_t flags;
 uint32_t id;
 
-QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
+QLIST_ENTRY(AIOReq) aio_siblings;
 } AIOReq;
 
 enum AIOCBState {
@@ -305,7 +305,8 @@ typedef struct BDRVSheepdogState {
 Coroutine *co_recv;
 
 uint32_t aioreq_seq_num;
-QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
+QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
+QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
 } BDRVSheepdogState;
 
 static const char * sd_strerror(int err)
@@ -356,7 +357,7 @@ static const char * sd_strerror(int err)
  * Sheepdog I/O handling:
  *
  * 1. In sd_co_rw_vector, we send the I/O requests to the server and
- *link the requests to the outstanding_list in the
+ *link the requests to the inflight_list in the
  *BDRVSheepdogState.  The function exits without waiting for
  *receiving the response.
  *
@@ -384,9 +385,6 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 aio_req-flags = flags;
 aio_req-id = s-aioreq_seq_num++;
 
-QLIST_INSERT_HEAD(s-outstanding_aio_head, aio_req,
-  outstanding_aio_siblings);
-
 acb-nr_pending++;
 return aio_req;
 }
@@ -395,7 +393,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 {
 SheepdogAIOCB *acb = aio_req-aiocb;
 
-QLIST_REMOVE(aio_req, outstanding_aio_siblings);
+QLIST_REMOVE(aio_req, aio_siblings);
 g_free(aio_req);
 
 acb-nr_pending--;
@@ -640,22 +638,21 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState 
*s, AIOReq *aio_req,
  * This function searchs pending requests to the object `oid', and
  * sends them.
  */
-static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid, 
uint32_t id)
+static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
 {
 AIOReq *aio_req, *next;
 SheepdogAIOCB *acb;
 int ret;
 
-QLIST_FOREACH_SAFE(aio_req, s-outstanding_aio_head,
-   outstanding_aio_siblings, next) {
-if (id == aio_req-id) {
-continue;
-}
+QLIST_FOREACH_SAFE(aio_req, s-pending_aio_head, aio_siblings, next) {
 if (aio_req-oid != oid) {
 continue;
 }
 
 acb = aio_req-aiocb;
+/* move aio_req from pending list to inflight one */
+QLIST_REMOVE(aio_req, aio_siblings);
+QLIST_INSERT_HEAD(s-inflight_aio_head, aio_req, aio_siblings);
 ret = add_aio_request(s, aio_req, acb-qiov-iov,
   acb-qiov-niov, 0, acb-aiocb_type);
 if (ret < 0) {
@@ -684,7 +681,7 @@ static void coroutine_fn aio_read_response(void *opaque)
 SheepdogAIOCB *acb;
 unsigned long idx;
 
-if (QLIST_EMPTY(s-outstanding_aio_head)) {
+if (QLIST_EMPTY(s-inflight_aio_head)) {
 goto out;
 }
 
@@ -695,8 +692,8 @@ static void coroutine_fn aio_read_response(void *opaque)
 goto out;
 }
 
-/* find the right aio_req from the outstanding_aio list */
-QLIST_FOREACH(aio_req, s-outstanding_aio_head, outstanding_aio_siblings) 
{
+/* find the right aio_req from the inflight aio list */
+QLIST_FOREACH(aio_req, s-inflight_aio_head, aio_siblings) {
 if (aio_req-id == rsp.id) {
 break;
 }
@@ -734,7 +731,7 @@ static void coroutine_fn aio_read_response(void *opaque)
  * create requests are not allowed, so we search the
  * pending requests here.
  */
-send_pending_req(s, vid_to_data_oid(s-inode.vdi_id, idx), rsp.id);
+send_pending_req(s, vid_to_data_oid(s-inode.vdi_id, idx));
 }
 break;
 case AIOCB_READ_UDATA:
@@ -786,7 +783,8 @@ static int aio_flush_request(void *opaque)
 {
 BDRVSheepdogState *s = opaque;
 
-return !QLIST_EMPTY(s-outstanding_aio_head);
+return !QLIST_EMPTY(s-inflight_aio_head) ||
+!QLIST_EMPTY(s-pending_aio_head);
 }
 
 static int set_nodelay(int fd)
@@ -1103,7 +1101,8 @@ static int sd_open(BlockDriverState *bs, const char 
*filename, int flags)
 
 strstart(filename, sheepdog:, (const char **)filename);
 
-QLIST_INIT(s-outstanding_aio_head);
+QLIST_INIT(s-inflight_aio_head);
+QLIST_INIT(s-pending_aio_head);
 s-fd = -1;
 
 memset(vdi, 0, sizeof(vdi));
@@ -1465,6 +1464,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 iov.iov_len = sizeof(s-inode);
 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s

[Qemu-devel] [PATCH 2/6] sheepdog: restart I/O when socket becomes ready in do_co_req()

2012-06-26 Thread MORITA Kazutaka
Currently, no one reenters the yielded coroutine.  This fixes it.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   14 ++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index afd06aa..0b49c6d 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -577,10 +577,21 @@ out:
 return ret;
 }
 
+static void restart_co_req(void *opaque)
+{
+Coroutine *co = opaque;
+
+qemu_coroutine_enter(co, NULL);
+}
+
 static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
   unsigned int *wlen, unsigned int *rlen)
 {
 int ret;
+Coroutine *co;
+
+co = qemu_coroutine_self();
+qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
 
 socket_set_block(sockfd);
 ret = send_co_req(sockfd, hdr, data, wlen);
@@ -588,6 +599,8 @@ static coroutine_fn int do_co_req(int sockfd, SheepdogReq 
*hdr, void *data,
 goto out;
 }
 
+qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
+
 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
+if (ret < sizeof(*hdr)) {
 error_report("failed to get a rsp, %s", strerror(errno));
@@ -609,6 +622,7 @@ static coroutine_fn int do_co_req(int sockfd, SheepdogReq 
*hdr, void *data,
 }
 ret = 0;
 out:
+qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
 socket_set_nonblock(sockfd);
 return ret;
 }
-- 
1.7.2.5




Re: [Qemu-devel] [sheepdog] [PATCH] sheepdog: do not blindly memset all read buffers

2012-06-28 Thread MORITA Kazutaka
At Wed, 27 Jun 2012 18:22:58 +0200,
Christoph Hellwig wrote:
 
 Only buffers that map to unallocated blocks need to be zeroed.
 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 Index: qemu/block/sheepdog.c
 ===
 --- qemu.orig/block/sheepdog.c2012-06-27 18:02:41.849867899 +0200
 +++ qemu/block/sheepdog.c 2012-06-27 18:08:35.929865783 +0200
 @@ -1556,18 +1556,24 @@ static int coroutine_fn sd_co_rw_vector(
  
  len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
  
 -if (!inode->data_vdi_id[idx]) {
 -if (acb->aiocb_type == AIOCB_READ_UDATA) {
 +
 +switch (acb->aiocb_type) {
 +case AIOCB_READ_UDATA:
 +if (!inode->data_vdi_id[idx]) {
 +qemu_iovec_memset_skip(acb->qiov, 0, len, offset);

'offset' is the offset within the sheepdog object.  I think it should be
'done', since we need to pass the number of bytes to skip in the iovec.

  goto done;
  }
 -
 -create = 1;
 -} else if (acb->aiocb_type == AIOCB_WRITE_UDATA &&
 -!is_data_obj_writable(inode, idx)) {
 -/* Copy-On-Write */
 -create = 1;
 -old_oid = oid;
 -flags = SD_FLAG_CMD_COW;
 +break;
 + case AIOCB_WRITE_UDATA:

Wrong indentation.

Thanks,

Kazutaka



Re: [Qemu-devel] [sheepdog] [PATCH] sheepdog: do not blindly memset all read buffers

2012-07-02 Thread MORITA Kazutaka
At Fri, 29 Jun 2012 17:38:24 +0200,
Christoph Hellwig wrote:
  
goto done;
}
   -
   -create = 1;
   -} else if (acb-aiocb_type == AIOCB_WRITE_UDATA
   -!is_data_obj_writable(inode, idx)) {
   -/* Copy-On-Write */
   -create = 1;
   -old_oid = oid;
   -flags = SD_FLAG_CMD_COW;
   +break;
   + case AIOCB_WRITE_UDATA:
  
  Wrong indentation.
 
 Where?  At least I can't find anything obvious and checkpath.pl is fine
 with the patch, too.

It seems that there is a redundant space before case AIOCB_WRITE_UDATA:.

==
$ ./scripts/checkpatch.pl your.patch
ERROR: switch and case should be at the same indent
#92: FILE: block/sheepdog.c:1560:
+switch (acb-aiocb_type) {
[...]
+ case AIOCB_WRITE_UDATA:

total: 1 errors, 0 warnings, 55 lines checked

c.patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.



Re: [Qemu-devel] [PATCH 3/6] sheepdog: use coroutine based socket functions in coroutine context

2012-07-04 Thread MORITA Kazutaka
At Tue, 03 Jul 2012 15:15:03 +0200,
Kevin Wolf wrote:
 
 Am 27.06.2012 00:26, schrieb MORITA Kazutaka:
  This removes blocking network I/Os in coroutine context.
  
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
  ---
   block/sheepdog.c |   10 --
   1 files changed, 8 insertions(+), 2 deletions(-)
  
  diff --git a/block/sheepdog.c b/block/sheepdog.c
  index 0b49c6d..5dc1d7a 100644
  --- a/block/sheepdog.c
  +++ b/block/sheepdog.c
  @@ -541,11 +541,18 @@ static coroutine_fn int send_co_req(int sockfd, 
  SheepdogReq *hdr, void *data,
   return ret;
   }
   
  +static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
  +  unsigned int *wlen, unsigned int *rlen);
  +
   static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 unsigned int *wlen, unsigned int *rlen)
   {
   int ret;
   
  +if (qemu_in_coroutine()) {
  +return do_co_req(sockfd, hdr, data, wlen, rlen);
  +}
  +
   socket_set_block(sockfd);
   ret = send_req(sockfd, hdr, data, wlen);
   if (ret  0) {
 
 How about replacing the non-coroutine implementation by code that
 creates a new coroutine and executes do_co_req() as well? This would
 reduce some code duplication.

Indeed.  I'll send a patch for it soon.

Thanks,

Kazutaka



[Qemu-devel] [PATCH] sheepdog: always use coroutine-based network functions

2012-07-04 Thread MORITA Kazutaka
This reduces some code duplication.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |  113 ++---
 1 files changed, 47 insertions(+), 66 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 6e73efb..4958672 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -498,26 +498,6 @@ success:
 return fd;
 }
 
-static int send_req(int sockfd, SheepdogReq *hdr, void *data,
-unsigned int *wlen)
-{
-int ret;
-
-ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
-if (ret  sizeof(*hdr)) {
-error_report(failed to send a req, %s, strerror(errno));
-return -errno;
-}
-
-ret = qemu_send_full(sockfd, data, *wlen, 0);
-if (ret  *wlen) {
-error_report(failed to send a req, %s, strerror(errno));
-ret = -errno;
-}
-
-return ret;
-}
-
 static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
 unsigned int *wlen)
 {
@@ -537,49 +517,6 @@ static coroutine_fn int send_co_req(int sockfd, 
SheepdogReq *hdr, void *data,
 return ret;
 }
 
-static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
-  unsigned int *wlen, unsigned int *rlen);
-
-static int do_req(int sockfd, SheepdogReq *hdr, void *data,
-  unsigned int *wlen, unsigned int *rlen)
-{
-int ret;
-
-if (qemu_in_coroutine()) {
-return do_co_req(sockfd, hdr, data, wlen, rlen);
-}
-
-socket_set_block(sockfd);
-ret = send_req(sockfd, hdr, data, wlen);
-if (ret  0) {
-goto out;
-}
-
-ret = qemu_recv_full(sockfd, hdr, sizeof(*hdr), 0);
-if (ret  sizeof(*hdr)) {
-error_report(failed to get a rsp, %s, strerror(errno));
-ret = -errno;
-goto out;
-}
-
-if (*rlen  hdr-data_length) {
-*rlen = hdr-data_length;
-}
-
-if (*rlen) {
-ret = qemu_recv_full(sockfd, data, *rlen, 0);
-if (ret  *rlen) {
-error_report(failed to get the data, %s, strerror(errno));
-ret = -errno;
-goto out;
-}
-}
-ret = 0;
-out:
-socket_set_nonblock(sockfd);
-return ret;
-}
-
 static void restart_co_req(void *opaque)
 {
 Coroutine *co = opaque;
@@ -587,11 +524,26 @@ static void restart_co_req(void *opaque)
 qemu_coroutine_enter(co, NULL);
 }
 
-static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
-  unsigned int *wlen, unsigned int *rlen)
+typedef struct SheepdogReqCo {
+int sockfd;
+SheepdogReq *hdr;
+void *data;
+unsigned int *wlen;
+unsigned int *rlen;
+int ret;
+bool finished;
+} SheepdogReqCo;
+
+static coroutine_fn void do_co_req(void *opaque)
 {
 int ret;
 Coroutine *co;
+SheepdogReqCo *srco = opaque;
+int sockfd = srco-sockfd;
+SheepdogReq *hdr = srco-hdr;
+void *data = srco-data;
+unsigned int *wlen = srco-wlen;
+unsigned int *rlen = srco-rlen;
 
 co = qemu_coroutine_self();
 qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
@@ -627,7 +579,36 @@ static coroutine_fn int do_co_req(int sockfd, SheepdogReq 
*hdr, void *data,
 out:
 qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
 socket_set_nonblock(sockfd);
-return ret;
+
+srco-ret = ret;
+srco-finished = true;
+}
+
+static int do_req(int sockfd, SheepdogReq *hdr, void *data,
+  unsigned int *wlen, unsigned int *rlen)
+{
+Coroutine *co;
+SheepdogReqCo srco = {
+.sockfd = sockfd,
+.hdr = hdr,
+.data = data,
+.wlen = wlen,
+.rlen = rlen,
+.ret = 0,
+.finished = false,
+};
+
+if (qemu_in_coroutine()) {
+do_co_req(srco);
+} else {
+co = qemu_coroutine_create(do_co_req);
+qemu_coroutine_enter(co, srco);
+while (!srco.finished) {
+qemu_aio_wait();
+}
+}
+
+return srco.ret;
 }
 
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-- 
1.7.2.5




Re: [Qemu-devel] [sheepdog] [PATCH v2] sheepdog: do not blindly memset all read buffers

2012-07-09 Thread MORITA Kazutaka
At Sun, 8 Jul 2012 21:33:16 +0200,
Christoph Hellwig wrote:
 
 Only buffers that map to unallocated blocks need to be zeroed.
 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 ---
  block/sheepdog.c |   28 ++--
  1 file changed, 18 insertions(+), 10 deletions(-)
 
 Index: qemu/block/sheepdog.c
 ===
  --- qemu.orig/block/sheepdog.c	2012-07-08 21:20:12.896597656 +0200
 +++ qemu/block/sheepdog.c 2012-07-08 21:20:56.563264061 +0200
 @@ -1556,18 +1556,26 @@ static int coroutine_fn sd_co_rw_vector(
  
  len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
  
 -if (!inode-data_vdi_id[idx]) {
 -if (acb-aiocb_type == AIOCB_READ_UDATA) {
 +switch (acb-aiocb_type) {
 +case AIOCB_READ_UDATA:
 +if (!inode-data_vdi_id[idx]) {
 +qemu_iovec_memset_skip(acb-qiov, 0, len, done);
  goto done;
  }
 -
 -create = 1;
 -} else if (acb-aiocb_type == AIOCB_WRITE_UDATA
 -!is_data_obj_writable(inode, idx)) {
 -/* Copy-On-Write */
 -create = 1;
 -old_oid = oid;
 -flags = SD_FLAG_CMD_COW;
 +break;
 +case AIOCB_WRITE_UDATA:
 +if (!inode-data_vdi_id[idx]) {
 +create = 1;
 +}
 +if (!is_data_obj_writable(inode, idx)) {
 +/* Copy-On-Write */
 +create = 1;
 +old_oid = oid;
 +flags = SD_FLAG_CMD_COW;
 +}
 +break;
 +default:
 +break;
  }
  
  if (create) {
 -- 

This version no longer contains the hunk that removed the blind memset
from sd_co_readv, which the previous patch had.

The others look good to me.

Thanks,

Kazutaka



Re: [Qemu-devel] [PATCH v3] sheepdog: do not blindly memset all read buffers

2012-07-09 Thread MORITA Kazutaka
At Mon, 9 Jul 2012 16:34:13 +0200,
Christoph Hellwig wrote:
 
 Only buffers that map to unallocated blocks need to be zeroed.
 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 ---
  block/sheepdog.c |   37 ++---
  1 file changed, 18 insertions(+), 19 deletions(-)

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [PATCH v4] sheepdog: do not blindly memset all read buffers

2012-07-10 Thread MORITA Kazutaka
At Tue, 10 Jul 2012 16:12:27 +0200,
Christoph Hellwig wrote:
 
 Only buffers that map to unallocated blocks need to be zeroed.
 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 ---
  block/sheepdog.c |   37 ++---
  1 file changed, 18 insertions(+), 19 deletions(-)

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



[Qemu-devel] [PATCH] sheepdog: fix co_recv coroutine context

2012-01-31 Thread MORITA Kazutaka
The co_recv coroutine has two things that will try to enter it:

  1. The select(2) read callback on the sheepdog socket.
  2. The aio_add_request() blocking operations, including a coroutine
 mutex.

This patch fixes it by setting co_recv to NULL before sending data.

In future, we should make the sheepdog driver fully coroutine-based
and simplify request handling.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 9416400..00276f6f 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -629,6 +629,9 @@ static void coroutine_fn aio_read_response(void *opaque)
 
 switch (acb-aiocb_type) {
 case AIOCB_WRITE_UDATA:
+/* this coroutine context is no longer suitable for co_recv
+ * because we may send data to update vdi objects */
+s-co_recv = NULL;
 if (!is_data_obj(aio_req-oid)) {
 break;
 }
-- 
1.7.2.5




Re: [Qemu-devel] [PATCH v3] sheepdog: implement SD_OP_FLUSH_VDI operation

2012-04-03 Thread MORITA Kazutaka
At Tue,  3 Apr 2012 16:35:00 +0800,
Liu Yuan wrote:
  
 +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
 +{
 +BDRVSheepdogState *s = bs-opaque;
 +SheepdogObjReq hdr = { 0 };
 +SheepdogObjRsp *rsp = (SheepdogObjRsp *)hdr;
 +SheepdogInode *inode = s-inode;
 +int ret;
 +unsigned int wlen = 0, rlen = 0;
 +
 +if (!s-cache_enabled) {
 +return 0;
 +}
 +
 +hdr.opcode = SD_OP_FLUSH_VDI;
 +hdr.oid = vid_to_vdi_oid(inode-vdi_id);
 +
 +ret = do_co_req(s-fd, (SheepdogReq *)hdr, NULL, wlen, rlen);

This will mix the flush request with I/O requests on s->fd.  We can
read from s->fd only in aio_read_response() and write to it only in
add_aio_request(), so please create another connection for flush
operations.
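
Just to illustrate the idea, a rough sketch (flush_fd and the error
handling here are my assumptions; the helpers are the existing ones in
the driver):

    /* open a dedicated socket so flush requests never interleave with
     * the async I/O traffic on s->fd */
    s->flush_fd = connect_to_sdog(s->addr, s->port);
    if (s->flush_fd < 0) {
        return s->flush_fd;
    }

    /* then, in sd_co_flush_to_disk(), send the flush on that socket */
    hdr.opcode = SD_OP_FLUSH_VDI;
    hdr.oid = vid_to_vdi_oid(inode->vdi_id);
    ret = do_co_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);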

Thanks,

Kazutaka



Re: [Qemu-devel] [PATCH v4 UPDATE] sheepdog: implement SD_OP_FLUSH_VDI operation

2012-04-03 Thread MORITA Kazutaka
At Tue,  3 Apr 2012 18:08:10 +0800,
Liu Yuan wrote:
 
 From: Liu Yuan tailai...@taobao.com
 
 Flush operation is supposed to flush the write-back cache of
 sheepdog cluster.
 
 By issuing flush operation, we can assure the Guest of data
 reaching the sheepdog cluster storage.
 
 Cc: Kevin Wolf kw...@redhat.com
 Cc: Michael Tokarev m...@tls.msk.ru
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Cc: Stefan Hajnoczi stefa...@gmail.com
 Signed-off-by: Liu Yuan tailai...@taobao.com
 ---
  UPDATE: use flush_fd instead of s-fd.
 
  block/sheepdog.c |  142 -
  1 files changed, 128 insertions(+), 14 deletions(-)

Thanks, looks good to me.

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] RE. [PATCH] sheepdog: fix send req helpers

2012-04-03 Thread MORITA Kazutaka
At Tue,  3 Apr 2012 18:04:21 +0800,
Liu Yuan wrote:
 
 From: Liu Yuan tailai...@taobao.com
 
 Yes, I think so. Here is the patch.
 
 Subject: [PATCH] sheepdog: fix send req helpers
 From: Liu Yuan tailai...@taobao.com
 
 We should return if reading of the header fails.
 
 Cc: Kevin Wolf kw...@redhat.com
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Signed-off-by: Liu Yuan tailai...@taobao.com
 ---
  block/sheepdog.c |2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [PATCH 02/22] sheepdog: avoid a few buffer overruns

2012-05-09 Thread MORITA Kazutaka
At Wed, 09 May 2012 11:46:10 +0200,
Kevin Wolf wrote:
 
 Am 09.05.2012 11:23, schrieb Jim Meyering:
  From: Jim Meyering meyer...@redhat.com
  
  * parse_vdiname: Use pstrcpy, not strncpy, when the destination
  buffer must be NUL-terminated.
  * sd_open: Likewise, avoid buffer overrun.
  * do_sd_create: Likewise.  Leave the preceding memset, since
  pstrcpy does not NUL-fill, and filename needs that.
  * sd_snapshot_create: Add a comment/question.
  * find_vdi_name: Remove a useless memset.
  * sd_snapshot_goto: Remove a useless memset.
  Use pstrcpy to NUL-terminate, because find_vdi_name requires
  that its vdi arg (filename parameter) be NUL-terminated.
  It seems ok not to NUL-fill the buffer.
  Do the same for snapid: remove useless memset-0 (instead,
  zero tag[0]).  Use pstrcpy, not strncpy.
  * sd_snapshot_list: Use pstrcpy, not strncpy to write
  into the -name member.  Each must be NUL-terminated.
  
  Signed-off-by: Jim Meyering meyer...@redhat.com
 
 Acked-by: Kevin Wolf kw...@redhat.com
 
 Kazutaka, can you have a look? I think you may want to send patches on
 top to improve the places where Jim just put a comment.

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp

I'll send a patch on top of this to address Jim's comments.

Kazutaka

 
 Kevin
 
  ---
   block/sheepdog.c | 34 ++
   1 file changed, 22 insertions(+), 12 deletions(-)
  
  diff --git a/block/sheepdog.c b/block/sheepdog.c
  index 0ed6b19..f2579da 100644
  --- a/block/sheepdog.c
  +++ b/block/sheepdog.c
  @@ -852,14 +852,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const 
  char *filename,
   s-port = 0;
   }
  
  -strncpy(vdi, p, SD_MAX_VDI_LEN);
  +pstrcpy(vdi, SD_MAX_VDI_LEN, p);
  
   p = strchr(vdi, ':');
   if (p) {
   *p++ = '\0';
   *snapid = strtoul(p, NULL, 10);
   if (*snapid == 0) {
  -strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
  +pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
   }
   } else {
   *snapid = CURRENT_VDI_ID; /* search current vdi */
  @@ -886,7 +886,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char 
  *filename, uint32_t snapid,
   return -1;
   }
  
  -memset(buf, 0, sizeof(buf));
  +/* This pair of strncpy calls ensures that the buffer is zero-filled,
  + * which is desirable since we'll soon be sending those bytes, and
  + * don't want the send_req to read uninitialized data.
  + */
   strncpy(buf, filename, SD_MAX_VDI_LEN);
   strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
  
  @@ -1129,7 +1132,7 @@ static int sd_open(BlockDriverState *bs, const char 
  *filename, int flags)
   s-max_dirty_data_idx = 0;
  
   bs-total_sectors = s-inode.vdi_size / SECTOR_SIZE;
  -strncpy(s-name, vdi, sizeof(s-name));
  +pstrcpy(s-name, sizeof(s-name), vdi);
   qemu_co_mutex_init(s-lock);
   g_free(buf);
   return 0;
  @@ -1157,8 +1160,11 @@ static int do_sd_create(char *filename, int64_t 
  vdi_size,
   return -EIO;
   }
  
  +/* FIXME: would it be better to fail (e.g., return -EIO) when filename
  + * does not fit in buf?  For now, just truncate and avoid buffer 
  overrun.
  + */
   memset(buf, 0, sizeof(buf));
  -strncpy(buf, filename, SD_MAX_VDI_LEN);
  +pstrcpy(buf, sizeof(buf), filename);
  
   memset(hdr, 0, sizeof(hdr));
   hdr.opcode = SD_OP_NEW_VDI;
  @@ -1709,6 +1715,9 @@ static int sd_snapshot_create(BlockDriverState *bs, 
  QEMUSnapshotInfo *sn_info)
  
   s-inode.vm_state_size = sn_info-vm_state_size;
   s-inode.vm_clock_nsec = sn_info-vm_clock_nsec;
  +/* It appears that inode.tag does not require a NUL terminator,
  + * which means this use of strncpy is ok.
  + */
   strncpy(s-inode.tag, sn_info-name, sizeof(s-inode.tag));
   /* we don't need to update entire object */
   datalen = SD_INODE_SIZE - sizeof(s-inode.data_vdi_id);
  @@ -1771,13 +1780,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, 
  const char *snapshot_id)
  
   memcpy(old_s, s, sizeof(BDRVSheepdogState));
  
  -memset(vdi, 0, sizeof(vdi));
  -strncpy(vdi, s-name, sizeof(vdi));
  +pstrcpy(vdi, sizeof(vdi), s-name);
  
  -memset(tag, 0, sizeof(tag));
   snapid = strtoul(snapshot_id, NULL, 10);
  -if (!snapid) {
  -strncpy(tag, s-name, sizeof(tag));
  +if (snapid) {
  +tag[0] = 0;
  +} else {
  +pstrcpy(tag, sizeof(tag), s-name);
   }
  
   ret = find_vdi_name(s, vdi, snapid, tag, vid, 1);
  @@ -1905,8 +1914,9 @@ static int sd_snapshot_list(BlockDriverState *bs, 
  QEMUSnapshotInfo **psn_tab)
  
   snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), 
  %u,
inode.snap_id);
  -strncpy(sn_tab[found].name, inode.tag,
  -MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
  +pstrcpy(sn_tab[found

[Qemu-devel] [PATCH 1.1 1/4] sheepdog: mark image as snapshot when tag is specified

2012-05-16 Thread MORITA Kazutaka
When a snapshot tag is specified in the filename, the opened image is
a snapshot.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index e01d371..776a1cc 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1103,7 +1103,7 @@ static int sd_open(BlockDriverState *bs, const char 
*filename, int flags)
 }
 }
 
-if (snapid) {
+if (snapid || tag[0] != '\0') {
 dprintf(% PRIx32  snapshot inode was open.\n, vid);
 s-is_snapshot = 1;
 }
-- 
1.7.2.5




[Qemu-devel] [PATCH 1.1 2/4] sheepdog: fix return value of do_load_save_vm_state

2012-05-16 Thread MORITA Kazutaka
bdrv_save_vmstate and bdrv_load_vmstate should return the number of
processed bytes on success.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 776a1cc..146a221 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1932,7 +1932,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 int64_t pos, int size, int load)
 {
 int fd, create;
-int ret = 0;
+int ret = 0, done = 0;
 unsigned int data_len;
 uint64_t vmstate_oid;
 uint32_t vdi_index;
@@ -1971,10 +1971,14 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 
 pos += data_len;
 size -= data_len;
-ret += data_len;
+done += data_len;
 }
 cleanup:
 closesocket(fd);
+
+if (done)
+return done;
+
 return ret;
 }
 
-- 
1.7.2.5




[Qemu-devel] [PATCH 1.1 3/4] sheepdog: return -errno on error

2012-05-16 Thread MORITA Kazutaka
On error, BlockDriver APIs should return -errno instead of -1.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   78 +++--
 1 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 146a221..42f2ee8 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -468,7 +468,7 @@ static int connect_to_sdog(const char *addr, const char 
*port)
 if (ret) {
 error_report(unable to get address info %s, %s,
  addr, strerror(errno));
-return -1;
+return -errno;
 }
 
 for (res = res0; res; res = res-ai_next) {
@@ -495,7 +495,7 @@ static int connect_to_sdog(const char *addr, const char 
*port)
 dprintf(connected to %s:%s\n, addr, port);
 goto success;
 }
-fd = -1;
+fd = -errno;
 error_report(failed connect to %s:%s, addr, port);
 success:
 freeaddrinfo(res0);
@@ -510,12 +510,13 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
*data,
 ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
 if (ret  sizeof(*hdr)) {
 error_report(failed to send a req, %s, strerror(errno));
-return ret;
+return -errno;
 }
 
 ret = qemu_send_full(sockfd, data, *wlen, 0);
 if (ret  *wlen) {
 error_report(failed to send a req, %s, strerror(errno));
+ret = -errno;
 }
 
 return ret;
@@ -553,6 +554,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 ret = qemu_recv_full(sockfd, hdr, sizeof(*hdr), 0);
 if (ret  sizeof(*hdr)) {
 error_report(failed to get a rsp, %s, strerror(errno));
+ret = -errno;
 goto out;
 }
 
@@ -564,6 +566,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 ret = qemu_recv_full(sockfd, data, *rlen, 0);
 if (ret  *rlen) {
 error_report(failed to get the data, %s, strerror(errno));
+ret = -errno;
 goto out;
 }
 }
@@ -587,6 +590,7 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void 
*data,
 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
 if (ret  sizeof(*hdr)) {
 error_report(failed to get a rsp, %s, strerror(errno));
+ret = -errno;
 goto out;
 }
 
@@ -598,6 +602,7 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void 
*data,
 ret = qemu_co_recv(sockfd, data, *rlen);
 if (ret  *rlen) {
 error_report(failed to get the data, %s, strerror(errno));
+ret = -errno;
 goto out;
 }
 }
@@ -787,7 +792,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 fd = connect_to_sdog(s-addr, s-port);
 if (fd  0) {
 error_report(%s, strerror(errno));
-return -1;
+return fd;
 }
 
 socket_set_nonblock(fd);
@@ -796,7 +801,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 if (ret) {
 error_report(%s, strerror(errno));
 closesocket(fd);
-return -1;
+return -errno;
 }
 
 qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s);
@@ -883,7 +888,7 @@ static int find_vdi_name(BDRVSheepdogState *s, char 
*filename, uint32_t snapid,
 
 fd = connect_to_sdog(s-addr, s-port);
 if (fd  0) {
-return -1;
+return fd;
 }
 
 memset(buf, 0, sizeof(buf));
@@ -904,14 +909,17 @@ static int find_vdi_name(BDRVSheepdogState *s, char 
*filename, uint32_t snapid,
 
 ret = do_req(fd, (SheepdogReq *)hdr, buf, wlen, rlen);
 if (ret) {
-ret = -1;
 goto out;
 }
 
 if (rsp-result != SD_RES_SUCCESS) {
 error_report(cannot get vdi info, %s, %s %d %s,
  sd_strerror(rsp-result), filename, snapid, tag);
-ret = -1;
+if (rsp-result == SD_RES_NO_VDI) {
+ret = -ENOENT;
+} else {
+ret = -EIO;
+}
 goto out;
 }
 *vid = rsp-vdi_id;
@@ -980,7 +988,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState 
*s, AIOReq *aio_req,
 if (ret  0) {
 qemu_co_mutex_unlock(s-lock);
 error_report(failed to send a req, %s, strerror(errno));
-return -EIO;
+return -errno;
 }
 
 if (wlen) {
@@ -988,7 +996,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState 
*s, AIOReq *aio_req,
 if (ret  0) {
 qemu_co_mutex_unlock(s-lock);
 error_report(failed to send a data, %s, strerror(errno));
-return -EIO;
+return -errno;
 }
 }
 
@@ -1038,7 +1046,7 @@ static int read_write_object(int fd, char *buf, uint64_t 
oid, int copies,
 ret = do_req(fd, (SheepdogReq *)hdr, buf, wlen, rlen);
 if (ret) {
 error_report(failed to send a request to the sheep);
-return -1;
+return ret;
 }
 
 switch (rsp-result) {
@@ -1046,7 +1054,7 @@ static int read_write_object(int fd, char *buf

[Qemu-devel] [PATCH 1.1 4/4] sheepdog: use heap instead of stack for BDRVSheepdogState

2012-05-16 Thread MORITA Kazutaka
bdrv_create() is called in coroutine context now, so we cannot use
more than 1 MB of stack in the function when the ucontext coroutine
backend is used.  This patch allocates BDRVSheepdogState, whose size
is 4 MB, on the heap in sd_create().

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   35 ++-
 1 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 42f2ee8..b614a49 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1249,24 +1249,26 @@ out:
 
 static int sd_create(const char *filename, QEMUOptionParameter *options)
 {
-int ret;
+int ret = 0;
 uint32_t vid = 0, base_vid = 0;
 int64_t vdi_size = 0;
 char *backing_file = NULL;
-BDRVSheepdogState s;
+BDRVSheepdogState *s;
 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
 uint32_t snapid;
 int prealloc = 0;
 const char *vdiname;
 
+s = g_malloc0(sizeof(BDRVSheepdogState));
+
 strstart(filename, sheepdog:, vdiname);
 
-memset(s, 0, sizeof(s));
 memset(vdi, 0, sizeof(vdi));
 memset(tag, 0, sizeof(tag));
-if (parse_vdiname(s, vdiname, vdi, snapid, tag)  0) {
+if (parse_vdiname(s, vdiname, vdi, snapid, tag)  0) {
 error_report(invalid filename);
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 
 while (options  options-name) {
@@ -1282,7 +1284,8 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 } else {
 error_report(Invalid preallocation mode: '%s',
  options-value.s);
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 }
 options++;
@@ -1290,7 +1293,8 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 
 if (vdi_size  SD_MAX_VDI_SIZE) {
 error_report(too big image size);
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 
 if (backing_file) {
@@ -1302,12 +1306,13 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 drv = bdrv_find_protocol(backing_file);
 if (!drv || strcmp(drv-protocol_name, sheepdog) != 0) {
 error_report(backing_file must be a sheepdog image);
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 
 ret = bdrv_file_open(bs, backing_file, 0);
 if (ret  0) {
-return ret;
+goto out;
 }
 
 s = bs-opaque;
@@ -1315,19 +1320,23 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 if (!is_snapshot(s-inode)) {
 error_report(cannot clone from a non snapshot vdi);
 bdrv_delete(bs);
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 
 base_vid = s-inode.vdi_id;
 bdrv_delete(bs);
 }
 
-ret = do_sd_create(vdi, vdi_size, base_vid, vid, 0, s.addr, s.port);
+ret = do_sd_create(vdi, vdi_size, base_vid, vid, 0, s-addr, s-port);
 if (!prealloc || ret) {
-return ret;
+goto out;
 }
 
-return sd_prealloc(filename);
+ret = sd_prealloc(filename);
+out:
+g_free(s);
+return ret;
 }
 
 static void sd_close(BlockDriverState *bs)
-- 
1.7.2.5




[Qemu-devel] [PATCH 1.1 0/4] sheepdog: various sheepdog fixes

2012-05-16 Thread MORITA Kazutaka
This patchset contains various fixes for Sheepdog.  See individual
patches for details.

MORITA Kazutaka (4):
  sheepdog: mark image as snapshot when tag is specified
  sheepdog: fix return value of do_load_save_vm_state
  sheepdog: return -errno on error
  sheepdog: use heap instead of stack for BDRVSheepdogState

 block/sheepdog.c |  121 +-
 1 files changed, 74 insertions(+), 47 deletions(-)

-- 
1.7.2.5




[Qemu-devel] [PATCH 1.1 v2] sheepdog: fix return value of do_load_save_vm_state

2012-05-29 Thread MORITA Kazutaka
bdrv_save_vmstate and bdrv_load_vmstate should return the vmstate size
on success, and -errno on error.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
Changes from v1
 - return an error for short reads/writes
 - fix a coding style problem

 block/sheepdog.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 6d52277..f46ca8f 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1957,7 +1957,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 int64_t pos, int size, int load)
 {
 int fd, create;
-int ret = 0;
+int ret = 0, remaining = size;
 unsigned int data_len;
 uint64_t vmstate_oid;
 uint32_t vdi_index;
@@ -1968,11 +1968,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 return fd;
 }
 
-while (size) {
+while (remaining) {
 vdi_index = pos / SD_DATA_OBJ_SIZE;
 offset = pos % SD_DATA_OBJ_SIZE;
 
-data_len = MIN(size, SD_DATA_OBJ_SIZE);
+data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
 
 vmstate_oid = vid_to_vmstate_oid(s-inode.vdi_id, vdi_index);
 
@@ -1993,9 +1993,9 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 }
 
 pos += data_len;
-size -= data_len;
-ret += data_len;
+remaining -= data_len;
 }
+ret = size;
 cleanup:
 closesocket(fd);
 return ret;
-- 
1.7.2.5




[Qemu-devel] [PATCH 1.1] sheepdog: add coroutine_fn markers to coroutine functions

2012-05-29 Thread MORITA Kazutaka
Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index f46ca8f..046f52a 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -522,8 +522,8 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
*data,
 return ret;
 }
 
-static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
-   unsigned int *wlen)
+static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+   unsigned int *wlen)
 {
 int ret;
 
@@ -540,6 +540,7 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, void 
*data,
 
 return ret;
 }
+
 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
   unsigned int *wlen, unsigned int *rlen)
 {
@@ -576,8 +577,8 @@ out:
 return ret;
 }
 
-static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
- unsigned int *wlen, unsigned int *rlen)
+static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen, unsigned int *rlen)
 {
 int ret;
 
-- 
1.7.2.5




Re: [Qemu-devel] [PATCH 1.1] sheepdog: add coroutine_fn markers to coroutine functions

2012-05-29 Thread MORITA Kazutaka
At Tue, 29 May 2012 18:30:17 +0200,
Andreas Färber wrote:
 
 Am 29.05.2012 18:22, schrieb MORITA Kazutaka:
  Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 
 Patch is tab-damaged.

Oops, I'll send a new version.

Thanks,

Kazutaka


 
 Andreas
 
  ---
   block/sheepdog.c |9 +
   1 files changed, 5 insertions(+), 4 deletions(-)
  
  diff --git a/block/sheepdog.c b/block/sheepdog.c
  index f46ca8f..046f52a 100644
  --- a/block/sheepdog.c
  +++ b/block/sheepdog.c
  @@ -522,8 +522,8 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
  *data,
   return ret;
   }
   
  -static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
  -   unsigned int *wlen)
  +static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void 
  *data,
  +   unsigned int *wlen)
   {
   int ret;
   
  @@ -540,6 +540,7 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, 
  void *data,
   
   return ret;
   }
  +
   static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 unsigned int *wlen, unsigned int *rlen)
   {
  @@ -576,8 +577,8 @@ out:
   return ret;
   }
   
  -static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
  - unsigned int *wlen, unsigned int *rlen)
  +static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
  + unsigned int *wlen, unsigned int *rlen)
   {
   int ret;
   
 
 
 -- 
 SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
 GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
 



[Qemu-devel] [PATCH 1.1 v2] sheepdog: add coroutine_fn markers to coroutine functions

2012-05-29 Thread MORITA Kazutaka
Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---

Changes from v1:
 - use spaces for indentation

 block/sheepdog.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index f46ca8f..8877f45 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -522,8 +522,8 @@ static int send_req(int sockfd, SheepdogReq *hdr, void 
*data,
 return ret;
 }
 
-static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
-   unsigned int *wlen)
+static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+unsigned int *wlen)
 {
 int ret;
 
@@ -540,6 +540,7 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, void 
*data,
 
 return ret;
 }
+
 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
   unsigned int *wlen, unsigned int *rlen)
 {
@@ -576,8 +577,8 @@ out:
 return ret;
 }
 
-static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
- unsigned int *wlen, unsigned int *rlen)
+static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+  unsigned int *wlen, unsigned int *rlen)
 {
 int ret;
 
-- 
1.7.2.5




Re: [Qemu-devel] [PATCH] sheepdog: implement SD_OP_FLUSH_VDI operation

2012-04-20 Thread MORITA Kazutaka
At Fri, 20 Apr 2012 20:05:48 +0200,
Christoph Hellwig wrote:
 
 On Tue, Apr 03, 2012 at 01:35:50AM +0800, Liu Yuan wrote:
  From: Liu Yuan tailai...@taobao.com
  
  Flush operation is supposed to flush the write-back cache of
  sheepdog cluster.
  
  By issuing flush operation, we can assure the Guest of data
  reaching the sheepdog cluster storage.
 
 How does qemu know that the cache might need flushing in the backend?
 
 If the backend expects a flush we need to make sure one is sent after
 every write if the user selects cache=writethrough or cache=directsync.
 Given how expensive that is you should also consider adding an equivalent
 to the FUA flag for writes in that case, or simply allow the qemu driver
 to tell the cluster that it needs to operate in writethrough mode for
 this VDI.

His patch sets the SD_FLAG_CMD_CACHE flag for writes only when the
user selects cache=writeback or cache=none.  If SD_FLAG_CMD_CACHE is
not set in the request, Sheepdog servers are forced to flush the cache
like FUA writes do.
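
A rough sketch of the request-side logic being described (only an
illustration; s->cache_enabled is the state the patch keeps, the rest
is assumed):

    if (s->cache_enabled) {
        /* cache=writeback or cache=none: the server may keep the write
         * in its cache, to be flushed later via SD_OP_FLUSH_VDI */
        hdr.flags |= SD_FLAG_CMD_CACHE;
    }
    /* without SD_FLAG_CMD_CACHE the server flushes before replying,
     * i.e. the write behaves like a FUA write */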

Thanks,

Kazutaka



Re: [Qemu-devel] [PATCH] sheepdog: implement SD_OP_FLUSH_VDI operation

2012-04-23 Thread MORITA Kazutaka
At Mon, 23 Apr 2012 08:08:46 +0200,
Christoph Hellwig wrote:
 
 On Fri, Apr 20, 2012 at 12:15:36PM -0700, MORITA Kazutaka wrote:
  His patch sets the SD_FLAG_CMD_CACHE flag for writes only when the
  user selects cache=writeback or cache=none.  If SD_FLAG_CMD_CACHE is
  not set in the request, Sheepdog servers are forced to flush the cache
  like FUA commands.
 
 Ok, I missed that part and it thus seems ok.  What still sems missing
 is error handling in case the sheepdog cluster doesn't actually support
 the new flag.  What happens if cache=none is specified with a cluster
 not actually supporting it?  Remember that cache=none is the default for
 many management frontends to qemu.

SD_FLAG_CMD_CACHE is ignored by older versions of Sheepdog, so even if
we specify cache=writeback or cache=none, the data is always written
with O_DSYNC and cannot be cached in the server's page cache or in a
volatile disk cache.  I think it is safe.

It seems that there is another version problem with this patch;
bdrv_co_flush_to_disk() results in an error with older Sheepdog
versions, which don't support SD_OP_FLUSH_VDI.  The simple fix, I
think, is to increment SD_PROTO_VER and prevent the newer qemu from
connecting to an older Sheepdog cluster.

Thanks,

Kazutaka



[Qemu-devel] [PATCH] sheepdog: switch to writethrough mode if cluster doesn't support flush

2012-05-02 Thread MORITA Kazutaka
This is necessary for qemu to work with older versions of Sheepdog,
which don't support SD_OP_FLUSH_VDI.

Signed-off-by: MORITA Kazutaka morita.kazut...@gmail.com
---
 block/sheepdog.c |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 0ed6b19..e01d371 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1678,6 +1678,14 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 return ret;
 }
 
+if (rsp-result == SD_RES_INVALID_PARMS) {
+dprintf(disable write cache since the server doesn't support it\n);
+
+s-cache_enabled = 0;
+closesocket(s-flush_fd);
+return 0;
+}
+
 if (rsp-result != SD_RES_SUCCESS) {
 error_report(%s, sd_strerror(rsp-result));
 return -EIO;
-- 
1.7.2.5




[Qemu-devel] [PATCH] sheepdog: fix savevm and loadvm

2012-08-29 Thread MORITA Kazutaka
This patch sets up the data to be sent to Sheepdog correctly (the data
pointer is now advanced after each chunk and each chunk is bounded by
the offset within the object), fixing savevm and loadvm operations on
a Sheepdog image.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index df4f441..e0753ee 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1986,7 +1986,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 vdi_index = pos / SD_DATA_OBJ_SIZE;
 offset = pos % SD_DATA_OBJ_SIZE;
 
-data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
 
 vmstate_oid = vid_to_vmstate_oid(s-inode.vdi_id, vdi_index);
 
@@ -2007,6 +2007,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, 
uint8_t *data,
 }
 
 pos += data_len;
+data += data_len;
 remaining -= data_len;
 }
 ret = size;
-- 
1.7.2.5




[Qemu-devel] [PATCH] sheepdog: use bool for boolean variables

2012-10-06 Thread MORITA Kazutaka
This improves readability.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c |   70 +++---
 1 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 4742f8a..3c88c59 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -201,12 +201,12 @@ static inline uint64_t fnv_64a_buf(void *buf, size_t len, 
uint64_t hval)
 return hval;
 }
 
-static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
+static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
 {
 return inode-vdi_id == inode-data_vdi_id[idx];
 }
 
-static inline int is_data_obj(uint64_t oid)
+static inline bool is_data_obj(uint64_t oid)
 {
 return !(VDI_BIT  oid);
 }
@@ -231,7 +231,7 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, 
uint32_t idx)
 return ((uint64_t)vid  VDI_SPACE_SHIFT) | idx;
 }
 
-static inline int is_snapshot(struct SheepdogInode *inode)
+static inline bool is_snapshot(struct SheepdogInode *inode)
 {
 return !!inode-snap_ctime;
 }
@@ -281,7 +281,7 @@ struct SheepdogAIOCB {
 Coroutine *coroutine;
 void (*aio_done_func)(SheepdogAIOCB *);
 
-int canceled;
+bool canceled;
 int nr_pending;
 };
 
@@ -292,8 +292,8 @@ typedef struct BDRVSheepdogState {
 uint32_t max_dirty_data_idx;
 
 char name[SD_MAX_VDI_LEN];
-int is_snapshot;
-uint8_t cache_enabled;
+bool is_snapshot;
+bool cache_enabled;
 
 char *addr;
 char *port;
@@ -417,7 +417,7 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
  */
 acb-ret = -EIO;
 qemu_coroutine_enter(acb-coroutine, NULL);
-acb-canceled = 1;
+acb-canceled = true;
 }
 
 static AIOPool sd_aio_pool = {
@@ -439,7 +439,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb-nb_sectors = nb_sectors;
 
 acb-aio_done_func = NULL;
-acb-canceled = 0;
+acb-canceled = false;
 acb-coroutine = qemu_coroutine_self();
 acb-ret = 0;
 acb-nr_pending = 0;
@@ -613,7 +613,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
 }
 
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-   struct iovec *iov, int niov, int create,
+   struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type);
 
 
@@ -646,7 +646,7 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState 
*s, uint64_t oid)
 QLIST_REMOVE(aio_req, aio_siblings);
 QLIST_INSERT_HEAD(s-inflight_aio_head, aio_req, aio_siblings);
 ret = add_aio_request(s, aio_req, acb-qiov-iov,
-  acb-qiov-niov, 0, acb-aiocb_type);
+  acb-qiov-niov, false, acb-aiocb_type);
 if (ret  0) {
 error_report(add_aio_request is failed);
 free_aio_req(s, aio_req);
@@ -940,7 +940,7 @@ out:
 }
 
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
-   struct iovec *iov, int niov, int create,
+   struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type)
 {
 int nr_copies = s-inode.nr_copies;
@@ -1019,7 +1019,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState 
*s, AIOReq *aio_req,
 
 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
  unsigned int datalen, uint64_t offset,
- int write, int create, uint8_t cache)
+ bool write, bool create, bool cache)
 {
 SheepdogObjReq hdr;
 SheepdogObjRsp *rsp = (SheepdogObjRsp *)hdr;
@@ -1068,18 +1068,18 @@ static int read_write_object(int fd, char *buf, 
uint64_t oid, int copies,
 }
 
 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-   unsigned int datalen, uint64_t offset, uint8_t cache)
+   unsigned int datalen, uint64_t offset, bool cache)
 {
-return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
- cache);
+return read_write_object(fd, buf, oid, copies, datalen, offset, false,
+ false, cache);
 }
 
 static int write_object(int fd, char *buf, uint64_t oid, int copies,
-unsigned int datalen, uint64_t offset, int create,
-uint8_t cache)
+unsigned int datalen, uint64_t offset, bool create,
+bool cache)
 {
-return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
- cache);
+return read_write_object(fd, buf, oid, copies, datalen, offset, true,
+ create, cache);
 }
 
 static int sd_open(BlockDriverState *bs, const char *filename, int flags

Re: [Qemu-devel] [sheepdog] [PATCH for QEMU v3] sheepdog: add discard/trim support for sheepdog

2013-04-15 Thread MORITA Kazutaka
At Sun, 14 Apr 2013 13:16:44 +0800,
Liu Yuan wrote:
 
 From: Liu Yuan tailai...@taobao.com
 
 The 'TRIM' command from VM that is to release underlying data storage for
 better thin-provision is already supported by the Sheepdog.
 
 This patch adds the TRIM support at QEMU part.
 
 For older Sheepdog that doesn't support it, we return EIO to upper layer.

I think we can safely return 0 without doing anything when the server
doesn't support SD_OP_DISCARD.  Actually, if the block driver doesn't
support the discard operation, bdrv_co_discard() in block.c returns 0.
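
Something along these lines is what I mean (only a sketch; which result
code the old server returns is an assumption on my part):

    switch (rsp->result) {
    case SD_RES_SUCCESS:
        break;
    case SD_RES_INVALID_PARMS:
        /* the server is too old to know the discard operation; behave
         * as if the driver had no discard support and report success */
        return 0;
    default:
        return -EIO;
    }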


 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index 987018e..e852d4e 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
 @@ -34,6 +34,7 @@
  #define SD_OP_GET_VDI_INFO   0x14
  #define SD_OP_READ_VDIS  0x15
  #define SD_OP_FLUSH_VDI  0x16
 +#define SD_OP_DISCARD0x17

This is an opcode for objects, so I prefer SD_OP_DISCARD_OBJ.

  
 +static int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
 + int nb_sectors)

Should add coroutine_fn.

Thanks,

Kazutaka



Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: add discard/trim support for sheepdog

2013-04-15 Thread MORITA Kazutaka
At Mon, 15 Apr 2013 23:52:40 +0800,
Liu Yuan wrote:
 
 diff --git a/block/sheepdog.c b/block/sheepdog.c
 index 987018e..362244a 100644
 --- a/block/sheepdog.c
 +++ b/block/sheepdog.c
 @@ -34,6 +34,7 @@
  #define SD_OP_GET_VDI_INFO   0x14
  #define SD_OP_READ_VDIS  0x15
  #define SD_OP_FLUSH_VDI  0x16
 +#define SD_OP_DISCARD_OBJ0x17

I think SD_OP_DISCARD_OBJ should be 0x05, since we use values less
than 0x10 for the SD_OP_*_OBJ opcodes.

The other parts look good to me.

Thanks,

Kazutaka



Re: [Qemu-devel] [sheepdog] [PATCH v5] sheepdog: add discard/trim support for sheepdog

2013-04-15 Thread MORITA Kazutaka
At Tue, 16 Apr 2013 00:15:04 +0800,
Liu Yuan wrote:
 
 From: Liu Yuan tailai...@taobao.com
 
 The 'TRIM' command from VM that is to release underlying data storage for
 better thin-provision is already supported by the Sheepdog.
 
 This patch adds the TRIM support at QEMU part.
 
 For older Sheepdog that doesn't support it, we return 0(success) to upper 
 layer.
 
 Cc: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
 Cc: Kevin Wolf kw...@redhat.com
 Cc: Stefan Hajnoczi stefa...@redhat.com
 Cc: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Liu Yuan tailai...@taobao.com
 ---
 v5:
  - adjust macro numbering
 v4:
  - adjust discard macro
  - return success when operation is not supported by sheep
  - add coroutine_fn marker
 v3:
  - fix a silly accidental deletion of 'default' in switch clause.
 v2:
  - skip the object when it is not allocated
 
  block/sheepdog.c |   56 
 +-
  1 file changed, 55 insertions(+), 1 deletion(-)

Reviewed-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp



Re: [Qemu-devel] [sheepdog] [PATCH] sheepdog: implement .bdrv_co_is_allocated

2013-04-22 Thread MORITA Kazutaka
At Thu, 18 Apr 2013 19:48:52 +0800,
Liu Yuan wrote:
 
 +static coroutine_fn int
 +sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 +   int *pnum)
 +{
 +BDRVSheepdogState *s = bs-opaque;
 +SheepdogInode *inode = s-inode;
 +unsigned long start = sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE, idx,

It looks better to use BDRV_SECTOR_SIZE.  I'd suggest preparing
another patch to replace all uses of SECTOR_SIZE with BDRV_SECTOR_SIZE.

 +  end = start + (nb_sectors * SECTOR_SIZE) / 
 SD_DATA_OBJ_SIZE;

Using 'start' to calculate 'end' is wrong because 'start' may be
rounded down.

 +
 +for (idx = start; idx = end; idx++) {
 +if (inode-data_vdi_id[idx] == 0) {
 +break;
 +}
 +}
 +if (idx == start) {
 +*pnum = SD_DATA_OBJ_SIZE / SECTOR_SIZE;

Isn't it better to set *pnum to the length of the longest run of
unallocated sectors?

 +return 0;
 +}
 +
 +*pnum = (idx - start) * SD_DATA_OBJ_SIZE / SECTOR_SIZE;
 +return 1;
 +}
 +
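
For the 'end' calculation, something like this is what I have in mind
(sketch only, using qemu's DIV_ROUND_UP so the last partially covered
object is included):

    unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
    unsigned long end = DIV_ROUND_UP((sector_num + nb_sectors) *
                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);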

Thanks,

Kazutaka



[Qemu-devel] [PATCH v4 RESEND 3/5] sheepdog: accept URIs

2013-02-21 Thread MORITA Kazutaka
The URI syntax is consistent with the NBD and Gluster syntax.  The
syntax is

  sheepdog[+tcp]://[host:port]/vdiname[#snapid|#tag]
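
For example (illustrative host, port and VDI names, not taken from the
patch):

  qemu-system-x86_64 -drive file=sheepdog://localhost:7000/Alice#snap1

opens the snapshot tagged 'snap1' of the VDI 'Alice' on the sheep
daemon listening on localhost:7000.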

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c | 139 +--
 qemu-doc.texi|  16 +++
 qemu-options.hx  |  18 ++-
 3 files changed, 117 insertions(+), 56 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 51e75ad..bfa8a00 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -13,6 +13,7 @@
  */
 
 #include qemu-common.h
+#include qemu/uri.h
 #include qemu/error-report.h
 #include qemu/sockets.h
 #include block/block_int.h
@@ -816,8 +817,52 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 return fd;
 }
 
+static int sd_parse_uri(BDRVSheepdogState *s, const char *filename,
+char *vdi, uint32_t *snapid, char *tag)
+{
+URI *uri;
+QueryParams *qp = NULL;
+int ret = 0;
+
+uri = uri_parse(filename);
+if (!uri) {
+return -EINVAL;
+}
+
+if (uri-path == NULL || !strcmp(uri-path, /)) {
+ret = -EINVAL;
+goto out;
+}
+pstrcpy(vdi, SD_MAX_VDI_LEN, uri-path + 1);
+
+/* sheepdog[+tcp]://[host:port]/vdiname */
+s-addr = g_strdup(uri-server ?: SD_DEFAULT_ADDR);
+if (uri-port) {
+s-port = g_strdup_printf(%d, uri-port);
+} else {
+s-port = g_strdup(SD_DEFAULT_PORT);
+}
+
+/* snapshot tag */
+if (uri-fragment) {
+*snapid = strtoul(uri-fragment, NULL, 10);
+if (*snapid == 0) {
+pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri-fragment);
+}
+} else {
+*snapid = CURRENT_VDI_ID; /* search current vdi */
+}
+
+out:
+if (qp) {
+query_params_free(qp);
+}
+uri_free(uri);
+return ret;
+}
+
 /*
- * Parse a filename
+ * Parse a filename (old syntax)
  *
  * filename must be one of the following formats:
  *   1. [vdiname]
@@ -836,9 +881,11 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
  char *vdi, uint32_t *snapid, char *tag)
 {
-char *p, *q;
-int nr_sep;
+char *p, *q, *uri;
+const char *host_spec, *vdi_spec;
+int nr_sep, ret;
 
+strstart(filename, sheepdog:, (const char **)filename);
 p = q = g_strdup(filename);
 
 /* count the number of separators */
@@ -851,38 +898,32 @@ static int parse_vdiname(BDRVSheepdogState *s, const char 
*filename,
 }
 p = q;
 
-/* use the first two tokens as hostname and port number. */
+/* use the first two tokens as host_spec. */
 if (nr_sep = 2) {
-s-addr = p;
+host_spec = p;
 p = strchr(p, ':');
-*p++ = '\0';
-
-s-port = p;
+p++;
 p = strchr(p, ':');
 *p++ = '\0';
 } else {
-s-addr = NULL;
-s-port = 0;
+host_spec = ;
 }
 
-pstrcpy(vdi, SD_MAX_VDI_LEN, p);
+vdi_spec = p;
 
-p = strchr(vdi, ':');
+p = strchr(vdi_spec, ':');
 if (p) {
-*p++ = '\0';
-*snapid = strtoul(p, NULL, 10);
-if (*snapid == 0) {
-pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
-}
-} else {
-*snapid = CURRENT_VDI_ID; /* search current vdi */
+*p++ = '#';
 }
 
-if (s-addr == NULL) {
-g_free(q);
-}
+uri = g_strdup_printf(sheepdog://%s/%s, host_spec, vdi_spec);
 
-return 0;
+ret = sd_parse_uri(s, uri, vdi, snapid, tag);
+
+g_free(q);
+g_free(uri);
+
+return ret;
 }
 
 static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
@@ -1097,16 +1138,19 @@ static int sd_open(BlockDriverState *bs, const char 
*filename, int flags)
 uint32_t snapid;
 char *buf = NULL;
 
-strstart(filename, sheepdog:, (const char **)filename);
-
 QLIST_INIT(s-inflight_aio_head);
 QLIST_INIT(s-pending_aio_head);
 s-fd = -1;
 
 memset(vdi, 0, sizeof(vdi));
 memset(tag, 0, sizeof(tag));
-if (parse_vdiname(s, filename, vdi, snapid, tag)  0) {
-ret = -EINVAL;
+
+if (strstr(filename, ://)) {
+ret = sd_parse_uri(s, filename, vdi, snapid, tag);
+} else {
+ret = parse_vdiname(s, filename, vdi, snapid, tag);
+}
+if (ret  0) {
 goto out;
 }
 s-fd = get_sheep_fd(s);
@@ -1275,17 +1319,17 @@ static int sd_create(const char *filename, 
QEMUOptionParameter *options)
 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
 uint32_t snapid;
 bool prealloc = false;
-const char *vdiname;
 
 s = g_malloc0(sizeof(BDRVSheepdogState));
 
-strstart(filename, sheepdog:, vdiname);
-
 memset(vdi, 0, sizeof(vdi));
 memset(tag, 0, sizeof(tag));
-if (parse_vdiname(s, vdiname, vdi, snapid, tag)  0) {
-error_report(invalid filename);
-ret = -EINVAL;
+if (strstr(filename, ://)) {
+ret = sd_parse_uri(s, filename, vdi, snapid, tag

[Qemu-devel] [PATCH v4 RESEND 2/5] move socket_set_nodelay to osdep.c

2013-02-21 Thread MORITA Kazutaka
Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 block/sheepdog.c   | 11 +--
 gdbstub.c  |  5 ++---
 include/qemu/sockets.h |  1 +
 qemu-char.c|  6 --
 slirp/tcp_subr.c   |  3 +--
 util/osdep.c   |  6 ++
 6 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d466b23..51e75ad 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -787,15 +787,6 @@ static int aio_flush_request(void *opaque)
 !QLIST_EMPTY(s-pending_aio_head);
 }
 
-static int set_nodelay(int fd)
-{
-int ret, opt;
-
-opt = 1;
-ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)opt, sizeof(opt));
-return ret;
-}
-
 /*
  * Return a socket discriptor to read/write objects.
  *
@@ -814,7 +805,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
 
 socket_set_nonblock(fd);
 
-ret = set_nodelay(fd);
+ret = socket_set_nodelay(fd);
 if (ret) {
 error_report(%s, strerror(errno));
 closesocket(fd);
diff --git a/gdbstub.c b/gdbstub.c
index 32dfea9..e414ad9 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2841,7 +2841,7 @@ static void gdb_accept(void)
 GDBState *s;
 struct sockaddr_in sockaddr;
 socklen_t len;
-int val, fd;
+int fd;
 
 for(;;) {
 len = sizeof(sockaddr);
@@ -2858,8 +2858,7 @@ static void gdb_accept(void)
 }
 
 /* set short latency */
-val = 1;
-setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)val, sizeof(val));
+socket_set_nodelay(fd);
 
 s = g_malloc0(sizeof(GDBState));
 s-c_cpu = first_cpu;
diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index 803ae17..6125bf7 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -34,6 +34,7 @@ int inet_aton(const char *cp, struct in_addr *ia);
 int qemu_socket(int domain, int type, int protocol);
 int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen);
 int socket_set_cork(int fd, int v);
+int socket_set_nodelay(int fd);
 void socket_set_block(int fd);
 void socket_set_nonblock(int fd);
 int send_all(int fd, const void *buf, int len1);
diff --git a/qemu-char.c b/qemu-char.c
index e4b0f53..3e16b5b 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2365,12 +2365,6 @@ static void tcp_chr_telnet_init(int fd)
 send(fd, (char *)buf, 3, 0);
 }
 
-static void socket_set_nodelay(int fd)
-{
-int val = 1;
-setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)val, sizeof(val));
-}
-
 static int tcp_chr_add_client(CharDriverState *chr, int fd)
 {
 TCPCharDriver *s = chr-opaque;
diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c
index 317dc07..7b7ad60 100644
--- a/slirp/tcp_subr.c
+++ b/slirp/tcp_subr.c
@@ -430,8 +430,7 @@ void tcp_connect(struct socket *inso)
 setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)opt, sizeof(int));
 opt = 1;
 setsockopt(s, SOL_SOCKET, SO_OOBINLINE, (char *)opt, sizeof(int));
-opt = 1;
-setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (char *)opt, sizeof(int));
+socket_set_nodelay(s);
 
 so-so_fport = addr.sin_port;
 so-so_faddr = addr.sin_addr;
diff --git a/util/osdep.c b/util/osdep.c
index 5b51a03..c408261 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -63,6 +63,12 @@ int socket_set_cork(int fd, int v)
 #endif
 }
 
+int socket_set_nodelay(int fd)
+{
+int v = 1;
+return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, v, sizeof(v));
+}
+
 int qemu_madvise(void *addr, size_t len, int advice)
 {
 if (advice == QEMU_MADV_INVALID) {
-- 
1.8.1.3.566.gaa39828




[Qemu-devel] [PATCH v4 RESEND 1/5] slirp/tcp_subr.c: fix coding style in tcp_connect

2013-02-21 Thread MORITA Kazutaka
Fix coding style in tcp_connect before the next patch.

Signed-off-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp
---
 slirp/tcp_subr.c | 140 ---
 1 file changed, 72 insertions(+), 68 deletions(-)

diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c
index 1542e43..317dc07 100644
--- a/slirp/tcp_subr.c
+++ b/slirp/tcp_subr.c
@@ -384,83 +384,87 @@ int tcp_fconnect(struct socket *so)
  * the time it gets to accept(), so... We simply accept
  * here and SYN the local-host.
  */
-void
-tcp_connect(struct socket *inso)
+void tcp_connect(struct socket *inso)
 {
-   Slirp *slirp = inso-slirp;
-   struct socket *so;
-   struct sockaddr_in addr;
-   socklen_t addrlen = sizeof(struct sockaddr_in);
-   struct tcpcb *tp;
-   int s, opt;
+Slirp *slirp = inso-slirp;
+struct socket *so;
+struct sockaddr_in addr;
+socklen_t addrlen = sizeof(struct sockaddr_in);
+struct tcpcb *tp;
+int s, opt;
 
-   DEBUG_CALL(tcp_connect);
-   DEBUG_ARG(inso = %lx, (long)inso);
+DEBUG_CALL(tcp_connect);
+DEBUG_ARG(inso = %lx, (long)inso);
 
-   /*
-* If it's an SS_ACCEPTONCE socket, no need to socreate()
-* another socket, just use the accept() socket.
-*/
-   if (inso-so_state  SS_FACCEPTONCE) {
-   /* FACCEPTONCE already have a tcpcb */
-   so = inso;
-   } else {
-   if ((so = socreate(slirp)) == NULL) {
-   /* If it failed, get rid of the pending connection */
-   closesocket(accept(inso-s,(struct sockaddr 
*)addr,addrlen));
-   return;
-   }
-   if (tcp_attach(so)  0) {
-   free(so); /* NOT sofree */
-   return;
-   }
-   so-so_laddr = inso-so_laddr;
-   so-so_lport = inso-so_lport;
-   }
+/*
+ * If it's an SS_ACCEPTONCE socket, no need to socreate()
+ * another socket, just use the accept() socket.
+ */
+if (inso-so_state  SS_FACCEPTONCE) {
+/* FACCEPTONCE already have a tcpcb */
+so = inso;
+} else {
+so = socreate(slirp);
+if (so == NULL) {
+/* If it failed, get rid of the pending connection */
+closesocket(accept(inso-s, (struct sockaddr *)addr, addrlen));
+return;
+}
+if (tcp_attach(so)  0) {
+free(so); /* NOT sofree */
+return;
+}
+so-so_laddr = inso-so_laddr;
+so-so_lport = inso-so_lport;
+}
 
-   (void) tcp_mss(sototcpcb(so), 0);
+tcp_mss(sototcpcb(so), 0);
 
-   if ((s = accept(inso-s,(struct sockaddr *)addr,addrlen))  0) {
-   tcp_close(sototcpcb(so)); /* This will sofree() as well */
-   return;
-   }
-   socket_set_nonblock(s);
-   opt = 1;
-   setsockopt(s,SOL_SOCKET,SO_REUSEADDR,(char *)opt,sizeof(int));
-   opt = 1;
-   setsockopt(s,SOL_SOCKET,SO_OOBINLINE,(char *)opt,sizeof(int));
-   opt = 1;
-   setsockopt(s,IPPROTO_TCP,TCP_NODELAY,(char *)opt,sizeof(int));
-
-   so-so_fport = addr.sin_port;
-   so-so_faddr = addr.sin_addr;
-   /* Translate connections from localhost to the real hostname */
-if (so-so_faddr.s_addr == 0 ||
-(so-so_faddr.s_addr  loopback_mask) ==
-(loopback_addr.s_addr  loopback_mask)) {
-so-so_faddr = slirp-vhost_addr;
-}
+s = accept(inso-s, (struct sockaddr *)addr, addrlen);
+if (s  0) {
+tcp_close(sototcpcb(so)); /* This will sofree() as well */
+return;
+}
+socket_set_nonblock(s);
+opt = 1;
+setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)opt, sizeof(int));
+opt = 1;
+setsockopt(s, SOL_SOCKET, SO_OOBINLINE, (char *)opt, sizeof(int));
+opt = 1;
+setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (char *)opt, sizeof(int));
+
+so-so_fport = addr.sin_port;
+so-so_faddr = addr.sin_addr;
+/* Translate connections from localhost to the real hostname */
+if (so-so_faddr.s_addr == 0 ||
+(so-so_faddr.s_addr  loopback_mask) ==
+(loopback_addr.s_addr  loopback_mask)) {
+so-so_faddr = slirp-vhost_addr;
+}
 
-   /* Close the accept() socket, set right state */
-   if (inso-so_state  SS_FACCEPTONCE) {
-   closesocket(so-s); /* If we only accept once, close the 
accept() socket */
-   so-so_state = SS_NOFDREF; /* Don't select it yet, even though 
we have an FD */
-  /* if it's not FACCEPTONCE, it's 
already NOFDREF */
-   }
-   so-s = s;
-   so-so_state |= SS_INCOMING;
+/* Close the accept() socket, set right state */
+if (inso-so_state  SS_FACCEPTONCE) {
+/* If we only accept once, close the accept() socket */
+closesocket(so-s);
+
+/* Don't select it yet, even though we have
