block/meson.build | 1 + block/vitastor.c | 1022 +++++++++++++++++++++++ hw/display/qxl.c | 11 + hw/timer/i8254_common.c | 2 +- meson.build | 22 + meson_options.txt | 2 + qapi/block-core.json | 37 +- scripts/ci/org.centos/stream/8/x86_64/configure | 3 +- scripts/meson-buildoptions.sh | 3 + target/i386/kvm/kvm.c | 2 + 10 files changed, 1102 insertions(+), 3 deletions(-) diff --git a/block/meson.build b/block/meson.build index 529fc172c6..d542dc0609 100644 --- a/block/meson.build +++ b/block/meson.build @@ -110,6 +110,7 @@ foreach m : [ [libnfs, 'nfs', files('nfs.c')], [libssh, 'ssh', files('ssh.c')], [rbd, 'rbd', files('rbd.c')], + [vitastor, 'vitastor', files('vitastor.c')], ] if m[0].found() module_ss = ss.source_set() diff --git a/block/vitastor.c b/block/vitastor.c new file mode 100644 index 0000000000..0024c5d335 --- /dev/null +++ b/block/vitastor.c @@ -0,0 +1,1022 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details) + +// QEMU block driver + +#ifdef VITASTOR_SOURCE_TREE +#define BUILD_DSO +#define _GNU_SOURCE +#endif +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#if QEMU_VERSION_MAJOR >= 8 +#include "block/block-io.h" +#endif +#include "block/block_int.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" +#include "qemu/uri.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qemu/option.h" + +#if QEMU_VERSION_MAJOR >= 3 +#include "qemu/units.h" +#include "block/qdict.h" +#include "qemu/cutils.h" +#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 10 +#include "qemu/cutils.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" +#else +#include "qapi/qmp/qint.h" +#define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val))) +#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value))) +#define qobject_unref QDECREF +#endif +#if 
QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 || QEMU_VERSION_MAJOR > 4 +#include "sysemu/replay.h" +#else +#include "sysemu/sysemu.h" +#endif + +#include "vitastor_c.h" + +#ifdef VITASTOR_SOURCE_TREE +void qemu_module_dummy(void) +{ +} + +void DSO_STAMP_FUN(void) +{ +} +#endif + +typedef struct VitastorFdData VitastorFdData; + +typedef struct VitastorClient +{ + void *proxy; + int uring_eventfd; + + void *watch; + char *config_path; + char *etcd_host; + char *etcd_prefix; + char *image; + int skip_parents; + uint64_t inode; + uint64_t pool; + uint64_t size; + long readonly; + int use_rdma; + char *rdma_device; + int rdma_port_num; + int rdma_gid_index; + int rdma_mtu; + QemuMutex mutex; + AioContext *ctx; + VitastorFdData **fds; + int fd_count, fd_alloc; + int bh_uring_scheduled; + + uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len; + uint32_t last_bitmap_granularity; + uint8_t *last_bitmap; +} VitastorClient; + +typedef struct VitastorFdData +{ + VitastorClient *cli; + int fd; + IOHandler *fd_read, *fd_write; + void *opaque; +} VitastorFdData; + +typedef struct VitastorRPC +{ + BlockDriverState *bs; + Coroutine *co; + QEMUIOVector *iov; + long ret; + int complete; + uint64_t inode, offset, len; + uint32_t bitmap_granularity; + uint8_t *bitmap; +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8 + QEMUBH *bh; +#endif +} VitastorRPC; + +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8 +typedef struct VitastorBH +{ + VitastorClient *cli; + QEMUBH *bh; +} VitastorBH; +#endif + +static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task); +static void vitastor_co_generic_cb(void *opaque, long retval); +static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version); +static void vitastor_close(BlockDriverState *bs); + +static char *qemu_vitastor_next_tok(char *src, char delim, char **p) +{ + char *end; + *p = NULL; + for (end = src; *end; ++end) + { + if (*end == delim) + break; + if (*end == '\\' && end[1] != 
'\0') + end++; + } + if (*end == delim) + { + *p = end + 1; + *end = '\0'; + } + return src; +} + +static void qemu_vitastor_unescape(char *src) +{ + char *p; + for (p = src; *src; ++src, ++p) + { + if (*src == '\\' && src[1] != '\0') + src++; + *p = *src; + } + *p = '\0'; +} + +// vitastor[:key=value]* +// vitastor[:etcd_host=127.0.0.1]:inode=1:pool=1[:rdma_gid_index=3] +// vitastor:config_path=/etc/vitastor/vitastor.conf:image=testimg +static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp) +{ + const char *start; + char *p, *buf; + + if (!strstart(filename, "vitastor:", &start)) + { + error_setg(errp, "File name must start with 'vitastor:'"); + return; + } + + buf = g_strdup(start); + p = buf; + + // The following are all key/value pairs + while (p) + { + int i; + char *name, *value; + name = qemu_vitastor_next_tok(p, '=', &p); + if (!p) + { + error_setg(errp, "conf option %s has no value", name); + break; + } + for (i = 0; i < strlen(name); i++) + if (name[i] == '_') + name[i] = '-'; + qemu_vitastor_unescape(name); + value = qemu_vitastor_next_tok(p, ':', &p); + qemu_vitastor_unescape(value); + if (!strcmp(name, "inode") || + !strcmp(name, "pool") || + !strcmp(name, "size") || + !strcmp(name, "skip-parents") || + !strcmp(name, "use-rdma") || + !strcmp(name, "rdma-port_num") || + !strcmp(name, "rdma-gid-index") || + !strcmp(name, "rdma-mtu")) + { + unsigned long long num_val; +#if QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1 + if (parse_uint_full(value, &num_val, 0)) +#else + if (parse_uint_full(value, 0, &num_val)) +#endif + { + error_setg(errp, "Illegal %s: %s", name, value); + goto out; + } + qdict_put_int(options, name, num_val); + } + else + { + qdict_put_str(options, name, value); + } + } + if (!qdict_get_try_str(options, "image")) + { + if (!qdict_get_try_int(options, "inode", 0)) + { + error_setg(errp, "one of image (name) and inode (number) must be specified"); + goto out; + } + if 
(!(qdict_get_try_int(options, "inode", 0) >> (64-POOL_ID_BITS)) && + !qdict_get_try_int(options, "pool", 0)) + { + error_setg(errp, "pool number must be specified or included in the inode number"); + goto out; + } + if (!qdict_get_try_int(options, "size", 0)) + { + error_setg(errp, "size must be specified when inode number is used instead of image name"); + goto out; + } + } + +out: + g_free(buf); + return; +} + +#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 +static void vitastor_uring_handler(void *opaque) +{ + VitastorClient *client = (VitastorClient*)opaque; + qemu_mutex_lock(&client->mutex); + client->bh_uring_scheduled = 0; + vitastor_c_uring_handle_events(client->proxy); + qemu_mutex_unlock(&client->mutex); +} + +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8 +static void vitastor_bh_uring_handler(void *opaque) +{ + VitastorBH *vbh = opaque; + vitastor_uring_handler(vbh->cli); /* same handler the oneshot-BH branches schedule */ + qemu_bh_delete(vbh->bh); + free(vbh); +} +#endif + +static void vitastor_schedule_uring_handler(VitastorClient *client) +{ + void *opaque = client; + if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled) + { + client->bh_uring_scheduled = 1; +#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 + replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque); +#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 + aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque); +#else + VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH)); + vbh->cli = client; +#if QEMU_VERSION_MAJOR >= 2 + vbh->bh = aio_bh_new(client->ctx, vitastor_bh_uring_handler, vbh); /* same AioContext as the branches above; no task is in scope here */ +#else + vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh); +#endif + qemu_bh_schedule(vbh->bh); +#endif + } +} +#else +static void vitastor_schedule_uring_handler(VitastorClient *client) +{ +} +#endif + +static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task) +{ + BlockDriverState 
*bs = task->bs; + VitastorClient *client = bs->opaque; + task->co = qemu_coroutine_self(); + + qemu_mutex_lock(&client->mutex); + vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task); + vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task->complete) + { + qemu_coroutine_yield(); + } +} + +static void vitastor_aio_fd_read(void *fddv) +{ + VitastorFdData *fdd = (VitastorFdData*)fddv; + qemu_mutex_lock(&fdd->cli->mutex); + fdd->fd_read(fdd->opaque); + vitastor_schedule_uring_handler(fdd->cli); + qemu_mutex_unlock(&fdd->cli->mutex); +} + +static void vitastor_aio_fd_write(void *fddv) +{ + VitastorFdData *fdd = (VitastorFdData*)fddv; + qemu_mutex_lock(&fdd->cli->mutex); + fdd->fd_write(fdd->opaque); + vitastor_schedule_uring_handler(fdd->cli); + qemu_mutex_unlock(&fdd->cli->mutex); +} + +static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque) +{ + aio_set_fd_handler(ctx, fd, +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3 && (QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1) + 0 /*is_external*/, +#endif + fd_read, + fd_write, +#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1 + NULL /*io_flush*/, +#endif +#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3 + NULL /*io_poll*/, +#endif +#if QEMU_VERSION_MAJOR >= 7 + NULL /*io_poll_ready*/, +#endif + opaque); +} + +static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque) +{ + VitastorClient *client = (VitastorClient*)vcli; + VitastorFdData *fdd = NULL; + int i; + for (i = 0; i < client->fd_count; i++) + { + if (client->fds[i]->fd == fd) + { + if (fd_read || fd_write) + { + fdd = client->fds[i]; + fdd->opaque = opaque; + fdd->fd_read = fd_read; + fdd->fd_write = fd_write; + } + else + 
{ + for (int j = i+1; j < client->fd_count; j++) + client->fds[j-1] = client->fds[j]; + client->fd_count--; + } + break; + } + } + if ((fd_read || fd_write) && !fdd) + { + fdd = (VitastorFdData*)malloc(sizeof(VitastorFdData)); + fdd->cli = client; + fdd->fd = fd; + fdd->fd_read = fd_read; + fdd->fd_write = fd_write; + fdd->opaque = opaque; + if (client->fd_count >= client->fd_alloc) + { + client->fd_alloc = client->fd_alloc*2; + if (client->fd_alloc < 16) + client->fd_alloc = 16; + client->fds = (VitastorFdData**)realloc(client->fds, sizeof(VitastorFdData*) * client->fd_alloc); + } + client->fds[client->fd_count++] = fdd; + } + universal_aio_set_fd_handler( + client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd + ); +} + +static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) +{ + VitastorRPC task; + VitastorClient *client = bs->opaque; + void *image = NULL; + int64_t ret = 0; + qemu_mutex_init(&client->mutex); + client->config_path = g_strdup(qdict_get_try_str(options, "config-path")); + // FIXME: Rename to etcd_address + client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host")); + client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix")); + client->skip_parents = qdict_get_try_int(options, "skip-parents", 0); + client->use_rdma = qdict_get_try_int(options, "use-rdma", -1); + client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device")); + client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0); + client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0); + client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0); + client->ctx = bdrv_get_aio_context(bs); +#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 + client->proxy = vitastor_c_create_qemu_uring( + vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix, + client->use_rdma, client->rdma_device, 
client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0 + ); + if (!client->proxy) + { + fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno)); + client->uring_eventfd = -1; +#endif + client->proxy = vitastor_c_create_qemu( + vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix, + client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0 + ); +#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2 + } + else + { + client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy); + if (client->uring_eventfd < 0) + { + fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno)); + error_setg(errp, "failed to create io_uring eventfd"); + vitastor_close(bs); + return -1; + } + universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client); + } +#endif + image = client->image = g_strdup(qdict_get_try_str(options, "image")); + client->readonly = (flags & BDRV_O_RDWR) ? 
0 : 1; /* read-only unless BDRV_O_RDWR was requested */ + // Get image metadata (size and readonly flag) or just wait until the client is ready + if (!image) + client->image = (char*)"x"; + task.complete = 0; + task.bs = bs; + if (qemu_in_coroutine()) + { + vitastor_co_get_metadata(&task); + } + else + { +#if QEMU_VERSION_MAJOR >= 8 + aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); +#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3 + bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); +#else + qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task)); +#endif + BDRV_POLL_WHILE(bs, !task.complete); + } + client->image = image; + if (client->image) + { + client->watch = (void*)task.ret; + client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch); + client->size = vitastor_c_inode_get_size(client->watch); + if (!vitastor_c_inode_get_num(client->watch)) + { + error_setg(errp, "image does not exist"); + vitastor_close(bs); + return -1; + } + if (!client->size) + { + client->size = qdict_get_try_int(options, "size", 0); + } + } + else + { + client->watch = NULL; + client->inode = qdict_get_try_int(options, "inode", 0); + client->pool = qdict_get_try_int(options, "pool", 0); + if (client->pool) + { + client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS)); + } + client->size = qdict_get_try_int(options, "size", 0); + vitastor_c_close_watch(client->proxy, (void*)task.ret); + } + if (!client->size) + { + error_setg(errp, "image size not specified"); + vitastor_close(bs); + return -1; + } + bs->total_sectors = client->size / BDRV_SECTOR_SIZE; + //client->aio_context = bdrv_get_aio_context(bs); + qdict_del(options, "use-rdma"); + qdict_del(options, "rdma-mtu"); + qdict_del(options, "rdma-gid-index"); + qdict_del(options, "rdma-port-num"); + qdict_del(options, 
"rdma-device"); + qdict_del(options, "config-path"); + qdict_del(options, "etcd-host"); + qdict_del(options, "etcd-prefix"); + qdict_del(options, "image"); + qdict_del(options, "inode"); + qdict_del(options, "pool"); + qdict_del(options, "size"); + qdict_del(options, "skip-parents"); + return ret; +} + +static void vitastor_close(BlockDriverState *bs) +{ + VitastorClient *client = bs->opaque; + vitastor_c_destroy(client->proxy); + if (client->fds) + { + free(client->fds); + client->fds = NULL; + client->fd_alloc = client->fd_count = 0; + } + qemu_mutex_destroy(&client->mutex); + if (client->config_path) + g_free(client->config_path); + if (client->etcd_host) + g_free(client->etcd_host); + if (client->etcd_prefix) + g_free(client->etcd_prefix); + if (client->image) + g_free(client->image); + free(client->last_bitmap); + client->last_bitmap = NULL; +} + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2 +static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + bsz->phys = 4096; + bsz->log = 512; + return 0; +} +#endif + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12 +static int coroutine_fn vitastor_co_create_opts( +#if QEMU_VERSION_MAJOR >= 4 + BlockDriver *drv, +#endif + const char *url, QemuOpts *opts, Error **errp) +{ + QDict *options; + int ret; + + options = qdict_new(); + vitastor_parse_filename(url, options, errp); + if (*errp) + { + ret = -1; + goto out; + } + + // inodes don't require creation in Vitastor. 
FIXME: They will when there will be some metadata + + ret = 0; +out: + qobject_unref(options); + return ret; +} +#endif + +#if QEMU_VERSION_MAJOR >= 3 +static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset, +#if QEMU_VERSION_MAJOR >= 4 + bool exact, +#endif + PreallocMode prealloc, +#if QEMU_VERSION_MAJOR >= 5 && QEMU_VERSION_MINOR >= 1 || QEMU_VERSION_MAJOR > 5 || defined RHEL_BDRV_CO_TRUNCATE_FLAGS + BdrvRequestFlags flags, +#endif + Error **errp) +{ + VitastorClient *client = bs->opaque; + + if (prealloc != PREALLOC_MODE_OFF) + { + error_setg(errp, "Unsupported preallocation mode '%s'", PreallocMode_str(prealloc)); + return -ENOTSUP; + } + + // TODO: Resize inode to bytes +#if QEMU_VERSION_MAJOR >= 4 + client->size = exact || client->size < offset ? offset : client->size; +#else + client->size = offset; +#endif + + return 0; +} +#endif + +static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + bdi->cluster_size = 4096; + return 0; +} + +static int64_t vitastor_getlength(BlockDriverState *bs) +{ + VitastorClient *client = bs->opaque; + return client->size; +} + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0 +static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp) +#else +static int vitastor_refresh_limits(BlockDriverState *bs) +#endif +{ + bs->bl.request_alignment = 4096; +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 3 + bs->bl.min_mem_alignment = 4096; +#endif + bs->bl.opt_mem_alignment = 4096; +#if QEMU_VERSION_MAJOR < 2 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR == 0 + return 0; +#endif +} + +//static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs) +//{ +// return 0; +//} + +static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task) +{ + *task = (VitastorRPC) { + .co = qemu_coroutine_self(), + .bs = bs, + }; +} + +static void vitastor_co_generic_bh_cb(void *opaque) +{ + VitastorRPC *task = 
opaque; + task->complete = 1; + if (qemu_coroutine_self() != task->co) + { +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8 + aio_co_wake(task->co); +#else +#if QEMU_VERSION_MAJOR == 2 + qemu_bh_delete(task->bh); +#endif + qemu_coroutine_enter(task->co, NULL); + qemu_aio_release(task); +#endif + } +} + +static void vitastor_co_generic_cb(void *opaque, long retval) +{ + VitastorRPC *task = opaque; + task->ret = retval; +#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 + replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); +#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 + aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); +#elif QEMU_VERSION_MAJOR >= 2 + task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); + qemu_bh_schedule(task->bh); +#else + task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque); + qemu_bh_schedule(task->bh); +#endif +} + +static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version) +{ + vitastor_co_generic_cb(opaque, retval); +} + +static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, +#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2 + int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags +#else + uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags +#endif +) +{ + VitastorClient *client = bs->opaque; + VitastorRPC task; + vitastor_co_init_task(bs, &task); + task.iov = iov; + + uint64_t inode = client->watch ? 
vitastor_c_inode_get_num(client->watch) : client->inode; + qemu_mutex_lock(&client->mutex); + vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task); + vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) + { + qemu_coroutine_yield(); + } + + return task.ret; +} + +static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, +#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2 + int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags +#else + uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags +#endif +) +{ + VitastorClient *client = bs->opaque; + VitastorRPC task; + vitastor_co_init_task(bs, &task); + task.iov = iov; + + if (client->last_bitmap) + { + // Invalidate last bitmap on write + free(client->last_bitmap); + client->last_bitmap = NULL; + } + + uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode; + qemu_mutex_lock(&client->mutex); + vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task); + vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) + { + qemu_coroutine_yield(); + } + + return task.ret; +} + +#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1 +#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 +static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap) +{ + VitastorRPC *task = opaque; + VitastorClient *client = task->bs->opaque; + task->ret = retval; + if (retval >= 0) + { + task->bitmap = bitmap; + if (client->last_bitmap_inode == task->inode && + client->last_bitmap_offset == task->offset && + client->last_bitmap_len == task->len) + { + free(client->last_bitmap); + client->last_bitmap = bitmap; + } + } +#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR 
>= 2 + replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); +#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8 + aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); +#elif QEMU_VERSION_MAJOR >= 2 + task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque); + qemu_bh_schedule(task->bh); +#else + task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque); + qemu_bh_schedule(task->bh); +#endif +} + +static int coroutine_fn vitastor_co_block_status( + BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, BlockDriverState **file) +{ + // Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID + // Not allocated => return 0 + // Error => return -errno + // Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs` + VitastorRPC task; + VitastorClient *client = bs->opaque; + uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode; + uint8_t bit = 0; + if (client->last_bitmap && client->last_bitmap_inode == inode && + client->last_bitmap_offset <= offset && + client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? 
offset+1 : offset+bytes)) + { + // Use the previously read bitmap + task.bitmap_granularity = client->last_bitmap_granularity; + task.offset = client->last_bitmap_offset; + task.len = client->last_bitmap_len; + task.bitmap = client->last_bitmap; + } + else + { + // Read bitmap from this position, rounding to full inode PG blocks + uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode); + if (!block_size) + return -EAGAIN; + // Init coroutine + vitastor_co_init_task(bs, &task); + free(client->last_bitmap); + task.inode = client->last_bitmap_inode = inode; + task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode); + task.offset = client->last_bitmap_offset = offset / block_size * block_size; + task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset; + task.bitmap = client->last_bitmap = NULL; + qemu_mutex_lock(&client->mutex); + vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task); + vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + while (!task.complete) + { + qemu_coroutine_yield(); + } + if (task.ret < 0) + { + // Error + return task.ret; + } + } + if (want_zero) + { + // Get precise mapping with all holes + uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity; + uint64_t bmp_len = task.len / task.bitmap_granularity; + uint64_t bmp_end = bmp_pos+1; + bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1; + while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit) + { + bmp_end++; + } + *pnum = (bmp_end-bmp_pos) * task.bitmap_granularity; + } + else + { + // Get larger allocated extents, possibly with false positives + uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity; + uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos; + while 
(bmp_pos < bmp_end) + { + if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8) + { + bit = bit || task.bitmap[bmp_pos >> 3]; + bmp_pos += 8; + } + else + { + bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1); + bmp_pos++; + } + } + *pnum = bytes; + } + if (bit) + { + *map = offset; + *file = bs; + } + return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0); +} +#endif +#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12 +// QEMU 1.7-2.11 +static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) +{ + int64_t map = 0; + int64_t pnumbytes = 0; + int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, file); /* file is already a BlockDriverState** */ + *pnum = pnumbytes/BDRV_SECTOR_SIZE; + return r; +} +#endif +#endif + +#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 ) +static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) +{ + return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0); +} + +static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) +{ + return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0); +} +#endif + +static int coroutine_fn vitastor_co_flush(BlockDriverState *bs) +{ + VitastorClient *client = bs->opaque; + VitastorRPC task; + vitastor_co_init_task(bs, &task); + + qemu_mutex_lock(&client->mutex); + vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task); + vitastor_schedule_uring_handler(client); + qemu_mutex_unlock(&client->mutex); + + while (!task.complete) + { + qemu_coroutine_yield(); + } + + return task.ret; +} + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0 +static QemuOptsList 
vitastor_create_opts = { + .name = "vitastor-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; +#else +static QEMUOptionParameter vitastor_create_opts[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; +#endif + +#if QEMU_VERSION_MAJOR >= 4 +static const char *vitastor_strong_runtime_opts[] = { + "inode", + "pool", + "config-path", + "etcd-host", + "etcd-prefix", + + NULL +}; +#endif + +static BlockDriver bdrv_vitastor = { + .format_name = "vitastor", + .protocol_name = "vitastor", + + .instance_size = sizeof(VitastorClient), + .bdrv_parse_filename = vitastor_parse_filename, + + .bdrv_has_zero_init = bdrv_has_zero_init_1, +#if QEMU_VERSION_MAJOR >= 8 + .bdrv_co_get_info = vitastor_get_info, + .bdrv_co_getlength = vitastor_getlength, +#else + .bdrv_get_info = vitastor_get_info, + .bdrv_getlength = vitastor_getlength, +#endif +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2 + .bdrv_probe_blocksizes = vitastor_probe_blocksizes, +#endif + .bdrv_refresh_limits = vitastor_refresh_limits, + + // FIXME: Implement it along with per-inode statistics + //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size, + + .bdrv_file_open = vitastor_file_open, + .bdrv_close = vitastor_close, + + // Option list for the create operation +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0 + .create_opts = &vitastor_create_opts, +#else + .create_options = vitastor_create_opts, +#endif + + // For qmp_blockdev_create(), used by the qemu monitor / QAPI + // Requires patching QAPI IDL, thus unimplemented + //.bdrv_co_create = vitastor_co_create, + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12 + // For bdrv_create(), used by qemu-img + .bdrv_co_create_opts = 
vitastor_co_create_opts, +#endif + +#if QEMU_VERSION_MAJOR >= 3 + .bdrv_co_truncate = vitastor_co_truncate, +#endif + +#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1 +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12 + // For snapshot export + .bdrv_co_block_status = vitastor_co_block_status, +#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12 + .bdrv_co_get_block_status = vitastor_co_get_block_status, +#endif +#endif + +#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 + .bdrv_co_preadv = vitastor_co_preadv, + .bdrv_co_pwritev = vitastor_co_pwritev, +#else + .bdrv_co_readv = vitastor_co_readv, + .bdrv_co_writev = vitastor_co_writev, +#endif + + .bdrv_co_flush_to_disk = vitastor_co_flush, + +#if QEMU_VERSION_MAJOR >= 4 + .strong_runtime_opts = vitastor_strong_runtime_opts, +#endif +}; + +static void vitastor_block_init(void) +{ + bdrv_register(&bdrv_vitastor); +} + +block_init(vitastor_block_init); diff --git a/hw/display/qxl.c b/hw/display/qxl.c index f1c0eb7dfc..c8e4280d8e 100644 --- a/hw/display/qxl.c +++ b/hw/display/qxl.c @@ -35,6 +35,17 @@ #include "qxl.h" +/* + * SPICE defines memory barriers only for x86 arches + * reuse definitions from qemu/atomic.h on arm + */ +#if defined(__arm__) +#if defined(spice_mb) +#undef spice_mb +#define spice_mb() smp_mb() +#endif +#endif + #undef SPICE_RING_CONS_ITEM #define SPICE_RING_CONS_ITEM(qxl, r, ret) { \ uint32_t cons = (r)->cons & SPICE_RING_INDEX_MASK(r); \ diff --git a/hw/timer/i8254_common.c b/hw/timer/i8254_common.c index e4093e2904..726d42020c 100644 --- a/hw/timer/i8254_common.c +++ b/hw/timer/i8254_common.c @@ -231,7 +231,7 @@ static const VMStateDescription vmstate_pit_common = { .pre_save = pit_dispatch_pre_save, .post_load = pit_dispatch_post_load, .fields = (VMStateField[]) { - VMSTATE_UINT32_V(channels[0].irq_disabled, PITCommonState, 3), + 
VMSTATE_UINT32(channels[0].irq_disabled, PITCommonState), VMSTATE_STRUCT_ARRAY(channels, PITCommonState, 3, 2, vmstate_pit_channel, PITChannelState), VMSTATE_INT64(channels[0].next_transition_time, diff --git a/meson.build b/meson.build index 98e68ef0b1..19e287c71f 100644 --- a/meson.build +++ b/meson.build @@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block endif endif +vitastor = not_found +if not get_option('vitastor').auto() or have_block + libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'], + required: get_option('vitastor')) + if libvitastor_client.found() + if cc.links(''' + #include <vitastor_c.h> + int main(void) { + vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + return 0; + }''', dependencies: libvitastor_client) + vitastor = declare_dependency(dependencies: libvitastor_client) + elif get_option('vitastor').enabled() + error('could not link libvitastor_client') + else + warning('could not link libvitastor_client, disabling') + endif + endif +endif + glusterfs = not_found glusterfs_ftruncate_has_stat = false glusterfs_iocb_has_stat = false @@ -2118,6 +2138,7 @@ if numa.found() endif config_host_data.set('CONFIG_OPENGL', opengl.found()) config_host_data.set('CONFIG_RBD', rbd.found()) +config_host_data.set('CONFIG_VITASTOR', vitastor.found()) config_host_data.set('CONFIG_RDMA', rdma.found()) config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack')) config_host_data.set('CONFIG_SDL', sdl.found()) @@ -4284,6 +4305,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? 
false : fdt_opt} summary_info += {'libcap-ng support': libcap_ng} summary_info += {'bpf support': libbpf} summary_info += {'rbd support': rbd} +summary_info += {'vitastor support': vitastor} summary_info += {'smartcard support': cacard} summary_info += {'U2F support': u2f} summary_info += {'libusb': libusb} diff --git a/meson_options.txt b/meson_options.txt index aaea5ddd77..880f5771f9 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -184,6 +184,8 @@ option('lzo', type : 'feature', value : 'auto', description: 'lzo compression support') option('rbd', type : 'feature', value : 'auto', description: 'Ceph block device driver') +option('vitastor', type : 'feature', value : 'auto', + description: 'Vitastor block device driver') option('opengl', type : 'feature', value : 'auto', description: 'OpenGL support') option('rdma', type : 'feature', value : 'auto', diff --git a/qapi/block-core.json b/qapi/block-core.json index 2b1d493d6e..90673fdbdc 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3146,7 +3146,7 @@ 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, - 'ssh', 'throttle', 'vdi', 'vhdx', + 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' }, { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' }, { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' }, @@ -4196,6 +4196,28 @@ '*key-secret': 'str', '*server': ['InetSocketAddressBase'] } } +## +# @BlockdevOptionsVitastor: +# +# Driver specific block device options for vitastor +# +# @image: Image name +# @inode: Inode number +# @pool: Pool ID +# @size: Desired image size in bytes +# @config-path: Path to Vitastor configuration +# @etcd-host: etcd connection address(es) +# @etcd-prefix: etcd key/value prefix +## +{ 'struct': 'BlockdevOptionsVitastor', + 'data': { '*inode': 'uint64', + '*pool': 'uint64', + '*size': 'uint64', + '*image': 'str', + '*config-path': 
'str', + '*etcd-host': 'str', + '*etcd-prefix': 'str' } } + ## # @ReplicationMode: # @@ -4654,6 +4676,7 @@ 'throttle': 'BlockdevOptionsThrottle', 'vdi': 'BlockdevOptionsGenericFormat', 'vhdx': 'BlockdevOptionsGenericFormat', + 'vitastor': 'BlockdevOptionsVitastor', 'virtio-blk-vfio-pci': { 'type': 'BlockdevOptionsVirtioBlkVfioPci', 'if': 'CONFIG_BLKIO' }, @@ -5089,6 +5112,17 @@ '*cluster-size' : 'size', '*encrypt' : 'RbdEncryptionCreateOptions' } } +## +# @BlockdevCreateOptionsVitastor: +# +# Driver specific image creation options for Vitastor. +# +# @size: Size of the virtual disk in bytes +## +{ 'struct': 'BlockdevCreateOptionsVitastor', + 'data': { 'location': 'BlockdevOptionsVitastor', + 'size': 'size' } } + ## # @BlockdevVmdkSubformat: # @@ -5311,6 +5345,7 @@ 'ssh': 'BlockdevCreateOptionsSsh', 'vdi': 'BlockdevCreateOptionsVdi', 'vhdx': 'BlockdevCreateOptionsVhdx', + 'vitastor': 'BlockdevCreateOptionsVitastor', 'vmdk': 'BlockdevCreateOptionsVmdk', 'vpc': 'BlockdevCreateOptionsVpc' } } diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure index d02b09a4b9..f0b5fbfef3 100755 --- a/scripts/ci/org.centos/stream/8/x86_64/configure +++ b/scripts/ci/org.centos/stream/8/x86_64/configure @@ -30,7 +30,7 @@ --with-suffix="qemu-kvm" \ --firmwarepath=/usr/share/qemu-firmware \ --target-list="x86_64-softmmu" \ ---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \ +--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \ --audio-drv-list="" \ --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \ --with-coroutine=ucontext \ @@ -176,6 +176,7 @@ --enable-opengl \ --enable-pie \ --enable-rbd \ +--enable-vitastor \ --enable-rdma \ --enable-seccomp \ --enable-snappy \ diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 9da3fe299b..3b9018dae1 100644 
--- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -186,6 +186,7 @@ meson_options_help() { printf "%s\n" ' vhost-user-blk-server' printf "%s\n" ' build vhost-user-blk server' printf "%s\n" ' vhost-vdpa vhost-vdpa kernel backend support' + printf "%s\n" ' vitastor Vitastor block device driver' printf "%s\n" ' virglrenderer virgl rendering support' printf "%s\n" ' virtfs virtio-9p support' printf "%s\n" ' virtfs-proxy-helper' @@ -491,6 +492,8 @@ _meson_option_parse() { --disable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=disabled ;; --enable-vhost-vdpa) printf "%s" -Dvhost_vdpa=enabled ;; --disable-vhost-vdpa) printf "%s" -Dvhost_vdpa=disabled ;; + --enable-vitastor) printf "%s" -Dvitastor=enabled ;; + --disable-vitastor) printf "%s" -Dvitastor=disabled ;; --enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;; --disable-virglrenderer) printf "%s" -Dvirglrenderer=disabled ;; --enable-virtfs) printf "%s" -Dvirtfs=enabled ;; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index ebfaf3d24c..effe55bf2e 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -2751,6 +2751,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) return ret; } } +#ifdef __x86_64__ if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) { bool r; @@ -2770,6 +2771,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) exit(1); } } +#endif return 0; }