From: levin li <xingke....@taobao.com> This module lets users expose Sheepdog VDIs as block devices in Linux. A user can register a VDI with the kernel, after which it behaves just like a new hard disk added to the computer: the user can create partitions on the disk, format it, or mount it. This gives users an efficient way to use Sheepdog as a distributed storage system.
The usage is easy: after installing the module sheepdev.ko, a proc entry '/proc/sheep' is created, and you can write to this proc entry to control the driver. Add a new block device from an existing sheepdog VDI: # echo "add 127.0.0.1:7070 a5d05d" > /proc/sheep This creates a block device /dev/sheepa, which you can format and mount: # mkfs.ext4 /dev/sheepa # mount -t ext4 /dev/sheepa test Remove a block device from the kernel: # echo "del sheepa" > /proc/sheep Signed-off-by: levin li <xingke....@taobao.com> --- sheepdev/connect.c | 178 ++++++++++++ sheepdev/module.c | 726 ++++++++++++++++++++++++++++++++++++++++++++++ sheepdev/sheep.c | 136 +++++++++ sheepdev/sheep.h | 88 ++++++ sheepdev/sheepdog_proto.h | 290 ++++++++++++++++++ 5 files changed, 1418 insertions(+) create mode 100644 sheepdev/connect.c create mode 100644 sheepdev/module.c create mode 100644 sheepdev/sheep.c create mode 100644 sheepdev/sheep.h create mode 100644 sheepdev/sheepdog_proto.h diff --git a/sheepdev/connect.c b/sheepdev/connect.c new file mode 100644 index 0000000..009a7b9 --- /dev/null +++ b/sheepdev/connect.c @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2012 Taobao Inc. + * + * Levin Li <xingke....@taobao.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "sheep.h" +#include "sheepdog_proto.h" + +int connect_to(struct socket **sock, const char *ip_addr, int port) +{ + int ret; + struct sockaddr_in addr; + + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, sock); + if (ret) { + DBPRT("fail to create socket\n"); + return ret; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + addr.sin_addr.s_addr = in_aton(ip_addr); + + ret = (*sock)->ops->connect(*sock, (struct sockaddr *)&addr, + sizeof(addr), 0); + + if (!ret) + DBPRT("connected to %s:%d\n", ip_addr, port); + + return ret; +} + +int do_read(struct socket *sock, char *buf, const size_t length) +{ + struct msghdr msg; + struct iovec iov; + int ret = 0, received = 0, left = length; + mm_segment_t oldmm; + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + while (left > 0) { + oldmm = get_fs(); + set_fs(KERNEL_DS); + msg.msg_iov->iov_base = buf + received; + msg.msg_iov->iov_len = left; + ret = sock_recvmsg(sock, &msg, left, MSG_WAITALL); + set_fs(oldmm); + if (ret <= 0) + break; + left -= ret; + received += ret; + } + + return ret; +} + +static void forward_iov(struct msghdr *msg, int len) +{ + while (msg->msg_iov->iov_len <= len) { + len -= msg->msg_iov->iov_len; + msg->msg_iov++; + msg->msg_iovlen--; + } + + msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len; + msg->msg_iov->iov_len -= len; +} + + +static int do_write(struct socket *sock, struct msghdr *msg, int len) +{ + int ret; + mm_segment_t oldmm; + +rewrite: + oldmm = get_fs(); + set_fs(KERNEL_DS); + ret = sock_sendmsg(sock, msg, len); + set_fs(oldmm); + + if (ret < 0) { + if (ret == -EINTR) + goto rewrite; + if (ret == -EBUSY) { + DBPRT("busy\n"); + goto rewrite; + } + DBPRT("failed to write to socket: %d\n", ret); + return -EFAULT; + } + + len -= ret; + if (len) { + forward_iov(msg, ret); + goto rewrite; + } + + return 0; +} + +int send_req(struct socket *sock, struct sd_req *hdr, void *data, + 
unsigned int wlen) +{ + int ret; + struct msghdr msg; + struct iovec iov[2]; + + memset(&msg, 0, sizeof(msg)); + + msg.msg_iov = iov; + + msg.msg_iovlen = 1; + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + + if (wlen) { + msg.msg_iovlen++; + iov[1].iov_base = data; + iov[1].iov_len = wlen; + } + + ret = do_write(sock, &msg, sizeof(*hdr) + wlen); + if (ret) { + DBPRT("failed to send request %x, %d\n", hdr->opcode, wlen); + ret = -EFAULT; + } + + return ret; +} + +int exec_req(struct socket *sock, struct sd_req *hdr, void *data) +{ + int ret; + struct sd_rsp *rsp = (struct sd_rsp *)hdr; + unsigned int wlen, rlen; + + if (hdr->flags & SD_FLAG_CMD_WRITE) { + wlen = hdr->data_length; + rlen = 0; + } else { + wlen = 0; + rlen = hdr->data_length; + } + + if (send_req(sock, hdr, data, wlen)) + return -EFAULT; + + ret = do_read(sock, (char *)rsp, sizeof(*rsp)); + if (ret < 0) { + DBPRT("failed to read a response\n"); + return -EFAULT; + } + + if (rlen > rsp->data_length) + rlen = rsp->data_length; + + if (rlen) { + ret = do_read(sock, data, rlen); + if (ret < 0) { + DBPRT("failed to read the response data\n"); + return -EFAULT; + } + } + + return 0; +} diff --git a/sheepdev/module.c b/sheepdev/module.c new file mode 100644 index 0000000..bde57d3 --- /dev/null +++ b/sheepdev/module.c @@ -0,0 +1,726 @@ +/* + * Copyright (C) 2012 Taobao Inc. + * + * Levin Li <xingke....@taobao.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/proc_fs.h> +#include <linux/kthread.h> +#include "sheep.h" + +static int sheepdev_major; +spinlock_t devices_lock; +struct list_head dev_list; +static unsigned long *device_bitmap; +static struct proc_dir_entry *sheep_proc_entry; + +static void sheepdev_get(struct sheepdev *dev) +{ + atomic_inc(&dev->struct_refcnt); +} + +static void sheepdev_put(struct sheepdev *dev) +{ + if (atomic_dec_and_test(&dev->struct_refcnt)) + kfree(dev); +} + +static int add_request(struct sheepdev *dev, struct request *req, uint64_t oid, + int idx) +{ + struct sheep_request *s_req = kmalloc(sizeof(*s_req), GFP_KERNEL); + if (!s_req) + return -EIO; + + s_req->req_id = dev->req_id; + s_req->req = req; + s_req->oid = oid; + s_req->idx = idx; + INIT_LIST_HEAD(&s_req->list); + + spin_lock_irq(&dev->fin_lock); + list_add_tail(&s_req->list, &dev->finish_list); + spin_unlock_irq(&dev->fin_lock); + + if (dev->req_id > UINT_MAX) + dev->req_id = 1; + else + dev->req_id++; + + return 0; +} + +static void sheep_end_request(struct request *req, int ret) +{ + struct request_queue *q = req->q; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_end_request_all(req, ret); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static int sheep_handle_request(struct request *req) +{ + struct req_iterator iter; + struct bio_vec *bvec; + struct gendisk *disk = req->rq_disk; + struct sheepdev *dev = disk->private_data; + unsigned long sector = blk_rq_pos(req); + unsigned long offset = sector * KERNEL_SECTOR_SIZE; + unsigned long nbytes = blk_rq_bytes(req); + int idx = offset / SHEEP_OBJECT_SIZE + 1; + uint64_t oid = vid_to_data_oid(dev->vid, idx); + uint64_t off = offset % 
SHEEP_OBJECT_SIZE; + int ret = 0, len = 0, create = 0; + int write = rq_data_dir(req); + void *sheep_buf = NULL; + + if (!write && dev->inode->data_vdi_id[idx] != dev->vid) { + rq_for_each_segment(bvec, req, iter) { + void *addr = kmap(bvec->bv_page); + memset(addr + bvec->bv_offset, 0, bvec->bv_len); + kunmap(bvec->bv_page); + } + sheep_end_request(req, 0); + return 0; + } else if (!write) { + ret = send_read_req(dev, oid, nbytes, off); + if (ret) + return -EIO; + + ret = add_request(dev, req, oid, idx); + if (ret) + return -EIO; + + return 0; + } + + /* For write requests */ + sheep_buf = kmalloc(nbytes, GFP_KERNEL); + if (!sheep_buf) + return -EIO; + + spin_lock(&dev->creating_lock); + if (!dev->inode->data_vdi_id[idx]) { + dev->inode->data_vdi_id[idx] = 1; + create = 1; + spin_unlock(&dev->creating_lock); + } else if (dev->inode->data_vdi_id[idx] != dev->vid){ + + spin_unlock(&dev->creating_lock); + wait_event_interruptible(dev->creating_wait, + dev->inode->data_vdi_id[idx] == dev->vid); + } else + spin_unlock(&dev->creating_lock); + + rq_for_each_segment(bvec, req, iter) { + void *addr = kmap(bvec->bv_page); + + memcpy(sheep_buf + len, addr + bvec->bv_offset, bvec->bv_len); + len += bvec->bv_len; + + if (rq_iter_last(req, iter)) { + ret = send_write_req(dev, oid, sheep_buf, len, off, + create); + if (ret != SD_RES_SUCCESS) { + kunmap(bvec->bv_page); + ret = -EIO; + goto out; + } + + ret = add_request(dev, req, oid, idx); + if (ret) { + kunmap(bvec->bv_page); + ret = -EIO; + goto out; + } + + if (!create) + goto done; + + /* For create operations we need to update inode data */ + oid = vid_to_vdi_oid(dev->vid); + off = offsetof(struct sheepdog_inode, data_vdi_id); + off += sizeof(uint32_t) * idx; + ret = send_write_req(dev, oid, (char *)&dev->vid, + sizeof(dev->vid), off, 0); + if (ret != SD_RES_SUCCESS) { + kunmap(bvec->bv_page); + ret = -EIO; + goto out; + } + + ret = add_request(dev, req, oid, idx); + if (ret) { + kunmap(bvec->bv_page); + ret = -EIO; + goto 
out; + } +done:; + } + + kunmap(bvec->bv_page); + } + +out: + kfree(sheep_buf); + return ret; +} + +static void sheep_request(struct request_queue *rq) +{ + struct request *req; + struct gendisk *disk; + struct sheepdev *dev; + + while ((req = blk_fetch_request(rq)) != NULL) { + + disk = req->rq_disk; + dev = disk->private_data; + + if (req->cmd_type != REQ_TYPE_FS) { + DBPRT("Skip non-fs request\n"); + __blk_end_request_all(req, -EIO); + } + + spin_lock(&dev->req_lock); + list_add_tail(&req->queuelist, &dev->pending_list); + spin_unlock(&dev->req_lock); + + wake_up_interruptible(&dev->req_wait); + } +} + +static int req_process_func(void *data) +{ + struct sheepdev *dev = (struct sheepdev *)data; + struct request *req; + int ret; + + sheepdev_get(dev); + + while (!kthread_should_stop() || !list_empty(&dev->pending_list)) { + wait_event_interruptible(dev->req_wait, + !list_empty(&dev->pending_list) || + kthread_should_stop()); + + spin_lock(&dev->req_lock); + if (list_empty(&dev->pending_list)) { + spin_unlock(&dev->req_lock); + continue; + } + + req = list_entry(dev->pending_list.next, struct request, + queuelist); + list_del_init(&req->queuelist); + spin_unlock(&dev->req_lock); + + ret = sheep_handle_request(req); + if (ret) + sheep_end_request(req, ret); + else + wake_up_interruptible(&dev->fin_wait); + } + + sheepdev_put(dev); + + return 0; +} + +static int sheepdev_open(struct block_device *blkdev, fmode_t mode) +{ + struct gendisk *disk = blkdev->bd_disk; + struct sheepdev *dev = disk->private_data; + + spin_lock(&dev->dev_lock); + dev->device_refcnt++; + spin_unlock(&dev->dev_lock); + + return 0; +} + +static int sheepdev_release(struct gendisk *disk, fmode_t mode) +{ + struct sheepdev *dev = disk->private_data; + + spin_lock(&dev->dev_lock); + dev->device_refcnt--; + spin_unlock(&dev->dev_lock); + + return 0; +} + +static struct block_device_operations sheepdev_ops = { + .owner = THIS_MODULE, + .open = sheepdev_open, + .release = sheepdev_release, +}; + 
+static int sheep_add_disk(struct sheepdev *dev) +{ + int ret; + struct request_queue *queue; + + dev->disk = alloc_disk(SHEEP_BLKDEV_MINORS); + if (!dev->disk) { + DBPRT("allocate gendisk failure\n"); + ret = -EBUSY; + return ret; + } + queue = blk_init_queue(sheep_request, &dev->que_lock); + /* 4M boundary */ + blk_queue_segment_boundary(queue, 0x3fffff); + dev->disk->major = sheepdev_major; + dev->disk->first_minor = dev->minor * SHEEP_BLKDEV_MINORS; + dev->disk->queue = queue; + dev->disk->fops = &sheepdev_ops; + dev->disk->private_data = dev; + snprintf(dev->disk->disk_name, sizeof(dev->disk->disk_name), + SHEEP_BLKDEV_NAME"%c", dev->minor + 'a'); + + set_capacity(dev->disk, dev->sectors); + add_disk(dev->disk); + + return 0; +} + +static struct sheep_request *find_request(struct sheepdev *dev, int id) +{ + struct sheep_request *req, *t; + + spin_lock_irq(&dev->fin_lock); + list_for_each_entry_safe(req, t, &dev->finish_list, list) { + if (req->req_id != id) + continue; + list_del_init(&req->list); + spin_unlock_irq(&dev->fin_lock); + return req; + } + spin_unlock_irq(&dev->fin_lock); + + return NULL; +} + +static int read_reply(struct sheepdev *dev, int *req_id, int *result, + void **data) +{ + int ret; + struct sd_rsp rsp; + void *buf = NULL; + + *result = 0; + *req_id = 0; + *data = NULL; + + ret = do_read(dev->sock, (char *)&rsp, sizeof(rsp)); + if (ret < 0) { + DBPRT("failed to read response\n"); + return -EIO; + } + + if (rsp.data_length > 0) { + buf = kmalloc(rsp.data_length, GFP_KERNEL); + if (!buf) { + DBPRT("No-mem\n"); + return -ENOMEM; + } + + ret = do_read(dev->sock, buf, rsp.data_length); + if (ret != rsp.data_length) { + kfree(buf); + return -EIO; + } + } + + *req_id = rsp.id; + *result = rsp.result; + *data = buf; + + return 0; +} + +static void cleanup_finish_list(struct sheepdev *dev) +{ + struct sheep_request *req, *t; + + spin_lock(&dev->fin_lock); + list_for_each_entry_safe(req, t, &dev->finish_list, list) { + list_del_init(&req->list); + 
sheep_end_request(req->req, -EIO); + kfree(req); + } + + spin_unlock(&dev->fin_lock); +} + +static int fin_process_func(void *data) +{ + struct sheepdev *dev = data; + struct sheep_request *sheep_req; + struct request *req; + int ret, req_id, res; + + sheepdev_get(dev); + + while (!kthread_should_stop() || !list_empty(&dev->finish_list)) { + void *buf = NULL; + + wait_event_interruptible(dev->fin_wait, + !list_empty(&dev->finish_list) || + kthread_should_stop()); + + spin_lock_irq(&dev->fin_lock); + if (list_empty(&dev->finish_list)) { + spin_unlock_irq(&dev->fin_lock); + continue; + } + spin_unlock_irq(&dev->fin_lock); + + ret = read_reply(dev, &req_id, &res, &buf); + if (ret) { + cleanup_finish_list(dev); + continue; + } + + sheep_req = find_request(dev, req_id); + if (!sheep_req) + goto next; + req = sheep_req->req; + + if (rq_data_dir(req)) { + int idx; + + res = (res != SD_RES_SUCCESS) ? -EIO : 0; + if (sheep_req->oid == vid_to_vdi_oid(dev->vid)) { + /* inode-update response */ + idx = sheep_req->idx; + } else { + /* oridinary write response */ + idx = data_oid_to_idx(sheep_req->oid); + + /* obj already exist */ + if (dev->inode->data_vdi_id[idx] == dev->vid) { + sheep_end_request(req, res); + goto next; + } + } + + spin_lock(&dev->creating_lock); + if (dev->inode->data_vdi_id[idx] == 2) { + /* + * Both obj-write and inode-update are complete + * we can end the write request and wake other + * requests waiting for this object. 
+ */ + dev->inode->data_vdi_id[idx] = dev->vid; + spin_unlock(&dev->creating_lock); + + sheep_end_request(req, res); + wake_up_interruptible(&dev->creating_wait); + + goto next; + } else { + /* + * wait for obj-write or inode-update to complete + */ + dev->inode->data_vdi_id[idx]++; + } + spin_unlock(&dev->creating_lock); + + } else { + int len = 0; + struct req_iterator iter; + struct bio_vec *bvec; + + if (res != SD_RES_SUCCESS) { + sheep_end_request(req, -EIO); + goto next; + } + + rq_for_each_segment(bvec, req, iter) { + void *addr = kmap(bvec->bv_page); + memcpy(addr + bvec->bv_offset, buf + len, + bvec->bv_len); + len += bvec->bv_len; + kunmap(bvec->bv_page); + } + sheep_end_request(req, 0); + } +next: + kfree(buf); + kfree(sheep_req); + } + + sheepdev_put(dev); + return 0; +} + +static int dev_setup(struct sheepdev *dev) +{ + int ret; + + ret = sheep_vdi_setup(dev); + if (ret) { + return ret; + } + + spin_lock_init(&dev->que_lock); + spin_lock_init(&dev->req_lock); + spin_lock_init(&dev->fin_lock); + spin_lock_init(&dev->dev_lock); + spin_lock_init(&dev->creating_lock); + init_waitqueue_head(&dev->req_wait); + init_waitqueue_head(&dev->fin_wait); + init_waitqueue_head(&dev->creating_wait); + INIT_LIST_HEAD(&dev->pending_list); + INIT_LIST_HEAD(&dev->finish_list); + INIT_LIST_HEAD(&dev->dev_list); + + dev->req_id = 1; + dev->req_thread = kthread_run(req_process_func, dev, + "sheep_req"); + dev->fin_thread = kthread_run(fin_process_func, dev, + "sheep_fin"); + + ret = sheep_add_disk(dev); + if (ret) { + return ret; + } + + return 0; +} + +#define MAX_CMD_LEN 64 + +static int process_add_command(char *buf, int len) +{ + int i, ret = 0; + struct sheepdev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + memset(dev, 0, sizeof(*dev)); + + for (i = 0; buf[i] != '\0' && buf[i] != '\n' && + buf[i] != ' ' && buf[i] != ':' && i < len; i++); + + if (buf[i] != ' ' && buf[i] != ':') { + ret = -EINVAL; + goto out; + } + + memcpy(dev->ip_addr, buf, i); + dev->ip_addr[i] 
= '\0'; + if (buf[i] == ' ') { + dev->port = SD_LISTEN_PORT; + buf = &buf[i + 1]; + } else { + /* start from ':' to ' ' */ + char *tmp = &buf[i + 1]; + len -= (i + 1); + for (i = 0; tmp[i] != ' ' && tmp[i] != '\0' && + tmp[i] != '\n' && i < len; i++); + if (tmp[i] != ' ') { + ret = -EINVAL; + goto out; + } + tmp[i] = '\0'; + buf = &tmp[i + 1]; + dev->port = simple_strtol(tmp, NULL, 10); + } + + dev->vid = simple_strtol(buf, NULL, 16); + + spin_lock(&devices_lock); + dev->minor = find_next_zero_bit(device_bitmap, SHEEP_BLKDEV_MINORS, 0); + set_bit(dev->minor, device_bitmap); + spin_unlock(&devices_lock); + + ret = dev_setup(dev); + if (ret) { + clear_bit(dev->minor, device_bitmap); + goto out; + } else { + sheepdev_get(dev); + spin_lock(&devices_lock); + list_add_tail(&dev->dev_list, &dev_list); + spin_unlock(&devices_lock); + } + + return ret; +out: + kfree(dev); + return ret; +} + +static void remove_device(struct sheepdev *dev) +{ + DBPRT("remove device /dev/%s\n", dev->disk->disk_name); + + kthread_stop(dev->req_thread); + kthread_stop(dev->fin_thread); + wake_up_interruptible(&dev->req_wait); + wake_up_interruptible(&dev->fin_wait); + + blk_cleanup_queue(dev->disk->queue); + del_gendisk(dev->disk); + put_disk(dev->disk); + + clear_bit(dev->minor, device_bitmap); + inet_release(dev->sock); + + sheepdev_put(dev); +} + +static int process_del_command(char *buf, int len) +{ + struct sheepdev *dev, *t; + int ret = 0; + + if (buf[len - 1] != '\n') + return -EINVAL; + buf[len - 1] = '\0'; + + spin_lock(&devices_lock); + list_for_each_entry_safe(dev, t, &dev_list, dev_list) { + if (strcmp(buf, dev->disk->disk_name) != 0) + continue; + + spin_lock(&dev->dev_lock); + if (dev->device_refcnt) { + spin_unlock(&dev->dev_lock); + ret = -EBUSY; + } else { + spin_unlock(&dev->dev_lock); + list_del_init(&dev->dev_list); + remove_device(dev); + } + + break; + } + spin_unlock(&devices_lock); + + return ret; +} + +static ssize_t sheep_proc_write(struct file *filp, const char __user 
*buf, + size_t len, loff_t *offset) +{ + char *kern_buf, cmd_buf[MAX_CMD_LEN]; + int i, ret; + + kern_buf = kmalloc(len, GFP_KERNEL); + if (!kern_buf) + return -ENOMEM; + + if (copy_from_user(kern_buf, buf, len)) { + ret = -EINVAL; + goto out; + } + + for (i = 0; kern_buf[i] != '\0' && kern_buf[i] != '\n' && + kern_buf[i] != ' ' && i < len; i++); + + if (i > MAX_CMD_LEN || kern_buf[i] != ' ') { + ret = -EINVAL; + goto out; + } + memcpy(cmd_buf, kern_buf, i); + cmd_buf[i] = '\0'; + if (strcmp(cmd_buf, "add") == 0) { + ret = process_add_command(&kern_buf[i + 1], len - i - 1); + if (ret) + goto out; + } else if (strcmp(cmd_buf, "del") == 0) { + ret = process_del_command(&kern_buf[i + 1], len - i - 1); + if (ret) + goto out; + + } else { + ret = -EINVAL; + goto out; + } + + ret = len; +out: + kfree(kern_buf); + return ret; +} + +static struct file_operations sheep_proc_fops = { + .write = sheep_proc_write, +}; + +static int __init sheep_module_init(void) +{ + int ret; + + DBPRT("Block device driver for Sheepdog\n"); + + spin_lock_init(&devices_lock); + INIT_LIST_HEAD(&dev_list); + device_bitmap = kmalloc(SHEEP_BLKDEV_MINORS / 8, GFP_KERNEL); + if (!device_bitmap) + return -ENOMEM; + memset(device_bitmap, 0, SHEEP_BLKDEV_MINORS / 8); + + /* create proc entry for sheep control */ + sheep_proc_entry = create_proc_entry(PROC_ENTRY_NAME, + S_IFREG | S_IRUGO | S_IWUGO, NULL); + if (!sheep_proc_entry) + return -ENOMEM; + + sheep_proc_entry->proc_fops = &sheep_proc_fops; + + sheepdev_major = register_blkdev(0, SHEEP_BLKDEV_NAME); + if (sheepdev_major < 0) { + ret = sheepdev_major; + goto error; + } + + return 0; + +error: + remove_proc_entry(PROC_ENTRY_NAME, NULL); + return ret; +} + +static void __exit sheep_module_exit(void) +{ + struct sheepdev *dev, *t; + + list_for_each_entry_safe(dev, t, &dev_list, dev_list) { + list_del_init(&dev->dev_list); + remove_device(dev); + } + + remove_proc_entry(PROC_ENTRY_NAME, NULL); + unregister_blkdev(sheepdev_major, SHEEP_BLKDEV_NAME); + 
+ kfree(device_bitmap); + + DBPRT("Sheepdog Block Device Removed.\n"); +} + +module_init(sheep_module_init); +module_exit(sheep_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/sheepdev/sheep.c b/sheepdev/sheep.c new file mode 100644 index 0000000..642a5e7 --- /dev/null +++ b/sheepdev/sheep.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2012 Taobao Inc. + * + * Levin Li <xingke....@taobao.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "sheep.h" +#include "sheepdog_proto.h" + +static void sd_init_req(struct sd_req *req, uint8_t opcode) +{ + memset(req, 0, sizeof(*req)); + req->opcode = opcode; +} + +static int read_object(struct sheepdev *dev, uint64_t oid, void *data, + unsigned int datalen, uint64_t offset) +{ + struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + int ret; + + sd_init_req(&hdr, SD_OP_READ_OBJ); + hdr.id = 0; + hdr.data_length = datalen; + + hdr.obj.oid = oid; + hdr.obj.offset = offset; + + ret = exec_req(dev->sock, &hdr, data); + + if (ret < 0) { + DBPRT("Failed to read object %llx\n", oid); + return SD_RES_EIO; + } + + if (rsp->result != SD_RES_SUCCESS) { + DBPRT("Failed to read object %llx,%d\n", oid, + rsp->result); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +int send_read_req(struct sheepdev *dev, uint64_t oid, + unsigned int datalen, uint64_t offset) +{ + struct sd_req hdr; + int ret; + + sd_init_req(&hdr, SD_OP_READ_OBJ); + hdr.id = dev->req_id; + hdr.data_length = datalen; + + hdr.obj.oid = oid; + hdr.obj.offset = offset; + + ret = send_req(dev->sock, &hdr, NULL, 0); + + if (ret < 0) { + DBPRT("Failed to read object %llx\n", oid); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +int 
send_write_req(struct sheepdev *dev, uint64_t oid, void *data, + unsigned int datalen, uint64_t offset, int create) +{ + struct sd_req hdr; + int ret; + + if (create) + sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); + else + sd_init_req(&hdr, SD_OP_WRITE_OBJ); + + hdr.id = dev->req_id; + hdr.data_length = datalen; + hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT; + + hdr.obj.oid = oid; + hdr.obj.offset = offset; + hdr.obj.copies = dev->inode->nr_copies; + + ret = send_req(dev->sock, &hdr, data, datalen); + + if (ret < 0) { + DBPRT("Failed to write object %llx\n", oid); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +int sheep_vdi_setup(struct sheepdev *dev) +{ + int ret; + struct sheepdog_inode *inode; + + inode = vmalloc(sizeof(*inode)); + if (!inode) + return -ENOMEM; + memset(inode, 0 , sizeof(*inode)); + + ret = connect_to(&dev->sock, dev->ip_addr, dev->port); + if (ret) { + ret = -EFAULT; + goto out; + } + + ret = read_object(dev, vid_to_vdi_oid(dev->vid), inode, + SD_INODE_SIZE, 0); + if (ret != SD_RES_SUCCESS) { + ret = -EFAULT; + goto out; + } + + dev->size = inode->vdi_size - SHEEP_OBJECT_SIZE; + dev->sectors = dev->size / KERNEL_SECTOR_SIZE; + dev->inode = inode; + + return 0; +out: + vfree(inode); + return ret; +} diff --git a/sheepdev/sheep.h b/sheepdev/sheep.h new file mode 100644 index 0000000..55b4062 --- /dev/null +++ b/sheepdev/sheep.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 Taobao Inc. + * + * Levin Li <xingke....@taobao.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef __SHEEP_H_ +#define __SHEEP_H_ + +#include <linux/socket.h> +#include <linux/net.h> +#include <net/sock.h> +#include <linux/tcp.h> +#include <linux/socket.h> +#include <linux/slab.h> +#include <linux/in.h> +#include <linux/list.h> +#include <asm/atomic.h> +#include <net/inet_common.h> +#include "sheepdog_proto.h" + +#define SHEEP_OBJECT_SIZE (4 * 1024 * 1024) + +#define SHEEP_BLKDEV_NAME "sheep" +#define PROC_ENTRY_NAME "sheep" +#define KERNEL_SECTOR_SIZE 512 +#define SHEEP_BLKDEV_MINORS 1024 + +#define DBPRT(fmt, args...) printk(KERN_DEBUG "sheep: " fmt, ##args) + +struct sheepdev { + struct gendisk *disk; + struct socket *sock; + char ip_addr[16]; + unsigned int port; + unsigned int minor; + unsigned int req_id; + unsigned int vid; + unsigned long size; + unsigned long sectors; + atomic_t struct_refcnt; + unsigned int device_refcnt; + spinlock_t dev_lock; + spinlock_t req_lock; + spinlock_t fin_lock; + spinlock_t que_lock; + spinlock_t creating_lock; + struct task_struct *req_thread; + struct task_struct *fin_thread; + wait_queue_head_t req_wait; + wait_queue_head_t fin_wait; + wait_queue_head_t creating_wait; + struct list_head pending_list; + struct list_head finish_list; + struct list_head dev_list; + struct sheepdog_inode *inode; +}; + +struct sheep_request { + int req_id; + int idx; /* idx is only used when update inode */ + uint64_t oid; + struct request *req; + struct list_head list; +}; + +/* connect.c */ +int connect_to(struct socket **sock, const char *addr, int port); +int send_req(struct socket *sock, struct sd_req *hdr, void *data, + unsigned int wlen); +int do_read(struct socket *sock, char *buf, const size_t length); +int exec_req(struct socket *sock, struct sd_req *hdr, void *data); + +/* sheep.c */ +int send_read_req(struct sheepdev *sheepdev, uint64_t oid, + unsigned int datalen, uint64_t offset); +int send_write_req(struct sheepdev *sheepdev, uint64_t oid, void *data, + unsigned int datalen, uint64_t offset, int create); +int 
sheep_vdi_setup(struct sheepdev *sheep_dev); + +#endif diff --git a/sheepdev/sheepdog_proto.h b/sheepdev/sheepdog_proto.h new file mode 100644 index 0000000..3a0452c --- /dev/null +++ b/sheepdev/sheepdog_proto.h @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __SHEEPDOG_PROTO_H__ +#define __SHEEPDOG_PROTO_H__ + +#define UINT64_MAX (18446744073709551615ULL) +#define UINT64_C(x) ((x) + (UINT64_MAX - UINT64_MAX)) + +/* This or later version supports trimming zero sectors from read response */ +#define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02 + +#define SD_LISTEN_PORT 7000 + +#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 +#define SD_OP_READ_OBJ 0x02 +#define SD_OP_WRITE_OBJ 0x03 +#define SD_OP_REMOVE_OBJ 0x04 + +#define SD_OP_NEW_VDI 0x11 +#define SD_OP_LOCK_VDI 0x12 +#define SD_OP_RELEASE_VDI 0x13 +#define SD_OP_GET_VDI_INFO 0x14 +#define SD_OP_READ_VDIS 0x15 +#define SD_OP_FLUSH_VDI 0x16 + +#define SD_FLAG_CMD_WRITE 0x01 +#define SD_FLAG_CMD_COW 0x02 +#define SD_FLAG_CMD_CACHE 0x04 +#define SD_FLAG_CMD_DIRECT 0x08 /* don't use object cache */ +/* flags above 0x80 are sheepdog-internal */ + +#define SD_RES_SUCCESS 0x00 /* Success */ +#define SD_RES_UNKNOWN 0x01 /* Unknown error */ +#define SD_RES_NO_OBJ 0x02 /* No object found */ +#define SD_RES_EIO 0x03 /* I/O error */ +#define SD_RES_VDI_EXIST 0x04 /* VDI exists already */ +#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ +#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ +#define SD_RES_VDI_LOCKED 0x07 /* VDI is locked */ +#define SD_RES_NO_VDI 0x08 /* No VDI found */ +#define SD_RES_NO_BASE_VDI 0x09 /* No base VDI found 
*/ +#define SD_RES_VDI_READ 0x0A /* Cannot read requested VDI */ +#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested VDI */ +#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base VDI */ +#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base VDI */ +#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ +#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ +#define SD_RES_VDI_NOT_LOCKED 0x10 /* VDI is not locked */ +#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ +#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ +#define SD_RES_FULL_VDI 0x13 /* we already have the maximum VDIs */ +#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ +#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ +#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */ +#define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */ +#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ +#define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */ +#define SD_RES_FORCE_RECOVER 0x1A /* Users should not force recover this cluster */ +#define SD_RES_NO_STORE 0x20 /* No targeted backend store */ +#define SD_RES_NO_SUPPORT 0x21 /* Operation is not supported by backend store */ +#define SD_RES_NODE_IN_RECOVERY 0x22 /* Targeted node is in recovery */ +#define SD_RES_OBJ_RECOVERING 0x23 /* Object is recovering */ +#define SD_RES_KILLED 0x24 /* Node is killed */ +#define SD_RES_OID_EXIST 0x25 /* Object ID exists already */ +#define SD_RES_AGAIN 0x26 /* Ask to try again */ + +/* errors above 0x80 are sheepdog-internal */ + +/* + * Object ID rules + * + * 0 - 19 (20 bits): data object space + * 20 - 31 (12 bits): reserved data object space + * 32 - 55 (24 bits): VDI object space + * 56 - 59 ( 4 bits): reserved VDI object space + * 60 - 63 ( 4 bits): object type indentifier space + */ + +#define VDI_SPACE_SHIFT 32 +#define VDI_BIT (UINT64_C(1) << 63) +#define 
VMSTATE_BIT (UINT64_C(1) << 62) +#define VDI_ATTR_BIT (UINT64_C(1) << 61) +#define MAX_DATA_OBJS (1ULL << 20) +#define MAX_CHILDREN 1024U +#define SD_MAX_VDI_LEN 256U +#define SD_MAX_VDI_TAG_LEN 256U +#define SD_MAX_VDI_ATTR_KEY_LEN 256U +#define SD_MAX_VDI_ATTR_VALUE_LEN 65536U +#define SD_NR_VDIS (1U << 24) +#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) +#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) + +#define SD_INODE_SIZE (sizeof(struct sheepdog_inode)) +#define SD_INODE_HEADER_SIZE (sizeof(struct sheepdog_inode) - \ + sizeof(uint32_t) * MAX_DATA_OBJS) +#define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr)) +#define CURRENT_VDI_ID 0 + +#define STORE_LEN 16 + +struct sd_req { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + union { + struct { + uint64_t oid; + uint64_t cow_oid; + uint32_t copies; + uint32_t tgt_epoch; + uint64_t offset; + } obj; + struct { + uint64_t vdi_size; + uint32_t base_vdi_id; + uint32_t copies; + uint32_t snapid; + } vdi; + uint32_t __pad[8]; + }; +}; + +struct sd_rsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + union { + uint32_t result; + struct { + uint32_t __pad; + uint32_t copies; + uint64_t offset; + } obj; + struct { + uint32_t __pad; + uint32_t rsvd; + uint32_t vdi_id; + uint32_t attr_id; + uint32_t copies; + } vdi; + uint32_t __pad[8]; + }; +}; + +struct sheepdog_inode { + char name[SD_MAX_VDI_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; + uint64_t create_time; + uint64_t snap_ctime; + uint64_t vm_clock_nsec; + uint64_t vdi_size; + uint64_t vm_state_size; + uint16_t copy_policy; + uint8_t nr_copies; + uint8_t block_size_shift; + uint32_t snap_id; + uint32_t vdi_id; + uint32_t parent_vdi_id; + uint32_t child_vdi_id[MAX_CHILDREN]; + uint32_t data_vdi_id[MAX_DATA_OBJS]; +}; + +struct sheepdog_vdi_attr { + char name[SD_MAX_VDI_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; + uint64_t ctime; + 
uint32_t snap_id; + uint32_t value_len; + char key[SD_MAX_VDI_ATTR_KEY_LEN]; + char value[SD_MAX_VDI_ATTR_VALUE_LEN]; +}; + +#define SHA1_LEN 20 + +struct snap_log { + uint32_t epoch; + uint64_t time; + unsigned char sha1[SHA1_LEN]; +}; + +/* + * 64 bit FNV-1a non-zero initial basis + */ +#define FNV1A_64_INIT ((uint64_t) 0xcbf29ce484222325ULL) + +/* + * 64 bit Fowler/Noll/Vo FNV-1a hash code + */ +static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval) +{ + unsigned char *bp = (unsigned char *) buf; + unsigned char *be = bp + len; + while (bp < be) { + hval ^= (uint64_t) *bp++; + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + return hval; +} + +static inline uint64_t hash_64(uint64_t val, unsigned int bits) +{ + uint64_t hash = fnv_64a_buf(&val, sizeof(uint64_t), FNV1A_64_INIT); + + return hash & ((1 << bits) - 1); +} + +static inline bool is_data_obj_writeable(const struct sheepdog_inode *inode, + int idx) +{ + return inode->vdi_id == inode->data_vdi_id[idx]; +} + +static inline bool is_vdi_obj(uint64_t oid) +{ + return !!(oid & VDI_BIT); +} + +static inline bool is_vmstate_obj(uint64_t oid) +{ + return !!(oid & VMSTATE_BIT); +} + +static inline bool is_vdi_attr_obj(uint64_t oid) +{ + return !!(oid & VDI_ATTR_BIT); +} + +static inline bool is_data_obj(uint64_t oid) +{ + return !is_vdi_obj(oid) && !is_vmstate_obj(oid) && + !is_vdi_attr_obj(oid); +} + +static inline size_t get_objsize(uint64_t oid) +{ + if (is_vdi_obj(oid)) + return SD_INODE_SIZE; + + if (is_vdi_attr_obj(oid)) + return SD_ATTR_OBJ_SIZE; + + return SD_DATA_OBJ_SIZE; +} + +static inline uint64_t data_oid_to_idx(uint64_t oid) +{ + return oid & (MAX_DATA_OBJS - 1); +} + +static inline uint64_t vid_to_vdi_oid(uint32_t vid) +{ + return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); +} + +static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) +{ + return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline 
uint32_t oid_to_vid(uint64_t oid) +{ + return (~VDI_BIT & oid) >> VDI_SPACE_SHIFT; +} + +static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid) +{ + return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid; +} + +static inline uint32_t attr_oid_to_vid(uint64_t oid) +{ + return (~VDI_ATTR_BIT & oid) >> VDI_SPACE_SHIFT; +} + +#endif -- 1.7.11.7 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog