diff options
54 files changed, 1984 insertions, 1017 deletions
@@ -437,6 +437,33 @@ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) return 0; } +/** + * Set open flags for a given cache mode + * + * Return 0 on success, -1 if the cache mode was invalid. + */ +int bdrv_parse_cache_flags(const char *mode, int *flags) +{ + *flags &= ~BDRV_O_CACHE_MASK; + + if (!strcmp(mode, "off") || !strcmp(mode, "none")) { + *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB; + } else if (!strcmp(mode, "directsync")) { + *flags |= BDRV_O_NOCACHE; + } else if (!strcmp(mode, "writeback")) { + *flags |= BDRV_O_CACHE_WB; + } else if (!strcmp(mode, "unsafe")) { + *flags |= BDRV_O_CACHE_WB; + *flags |= BDRV_O_NO_FLUSH; + } else if (!strcmp(mode, "writethrough")) { + /* this is the default */ + } else { + return -1; + } + + return 0; +} + /* * Common part for opening disk images and files */ @@ -1163,8 +1190,8 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return ret; } - /* No flush needed for cache=writethrough, it uses O_DSYNC */ - if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) { + /* No flush needed for cache modes that use O_DSYNC */ + if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) { bdrv_flush(bs); } @@ -1888,11 +1915,19 @@ static void bdrv_stats_iter(QObject *data, void *opaque) " wr_bytes=%" PRId64 " rd_operations=%" PRId64 " wr_operations=%" PRId64 + " flush_operations=%" PRId64 + " wr_total_time_ns=%" PRId64 + " rd_total_time_ns=%" PRId64 + " flush_total_time_ns=%" PRId64 "\n", qdict_get_int(qdict, "rd_bytes"), qdict_get_int(qdict, "wr_bytes"), qdict_get_int(qdict, "rd_operations"), - qdict_get_int(qdict, "wr_operations")); + qdict_get_int(qdict, "wr_operations"), + qdict_get_int(qdict, "flush_operations"), + qdict_get_int(qdict, "wr_total_time_ns"), + qdict_get_int(qdict, "rd_total_time_ns"), + qdict_get_int(qdict, "flush_total_time_ns")); } void bdrv_stats_print(Monitor *mon, const QObject *data) @@ -1910,12 +1945,22 @@ static QObject* bdrv_info_stats_bs(BlockDriverState *bs) "'wr_bytes': %" PRId64 "," "'rd_operations': %" PRId64 "," "'wr_operations': %" PRId64 "," - "'wr_highest_offset': %" PRId64 + "'wr_highest_offset': %" PRId64 "," + "'flush_operations': %" PRId64 "," + "'wr_total_time_ns': %" PRId64 "," + "'rd_total_time_ns': %" PRId64 "," + "'flush_total_time_ns': %" PRId64 "} }", - bs->rd_bytes, bs->wr_bytes, - bs->rd_ops, bs->wr_ops, + bs->nr_bytes[BDRV_ACCT_READ], + bs->nr_bytes[BDRV_ACCT_WRITE], + bs->nr_ops[BDRV_ACCT_READ], + bs->nr_ops[BDRV_ACCT_WRITE], bs->wr_highest_sector * - (uint64_t)BDRV_SECTOR_SIZE); + (uint64_t)BDRV_SECTOR_SIZE, + bs->nr_ops[BDRV_ACCT_FLUSH], + bs->total_time_ns[BDRV_ACCT_WRITE], + bs->total_time_ns[BDRV_ACCT_READ], + bs->total_time_ns[BDRV_ACCT_FLUSH]); dict = qobject_to_qdict(res); if (*bs->device_name) { @@ -2229,7 +2274,6 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) return buf; } - /**************************************************************/ /* async I/Os */ @@ -2238,7 +2282,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, BlockDriverCompletionFunc *cb, void *opaque) { BlockDriver *drv = bs->drv; - BlockDriverAIOCB *ret; trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); @@ -2247,16 +2290,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; - ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, - cb, opaque); - - if (ret) { - /* Update stats even though technically transfer has not happened. */ - bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; - bs->rd_ops++; - } - - return ret; + return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + cb, opaque); } typedef struct BlockCompleteData { @@ -2323,9 +2358,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, cb, opaque); if (ret) { - /* Update stats even though technically transfer has not happened. */ - bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE; - bs->wr_ops ++; if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { bs->wr_highest_sector = sector_num + nb_sectors - 1; } @@ -3133,6 +3165,27 @@ int bdrv_in_use(BlockDriverState *bs) return bs->in_use; } +void +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes, + enum BlockAcctType type) +{ + assert(type < BDRV_MAX_IOTYPE); + + cookie->bytes = bytes; + cookie->start_time_ns = get_clock(); + cookie->type = type; +} + +void +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie) +{ + assert(cookie->type < BDRV_MAX_IOTYPE); + + bs->nr_bytes[cookie->type] += cookie->bytes; + bs->nr_ops[cookie->type]++; + bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns; +} + int bdrv_img_create(const char *filename, const char *fmt, const char *base_filename, const char *base_fmt, char *options, uint64_t img_size, int flags) @@ -69,6 +69,7 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options); BlockDriverState *bdrv_new(const char *device_name); void bdrv_make_anon(BlockDriverState *bs); void bdrv_delete(BlockDriverState *bs); +int bdrv_parse_cache_flags(const char *mode, int *flags); int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags); int bdrv_open(BlockDriverState *bs, const char *filename, int flags, BlockDriver *drv); @@ -254,6 +255,23 @@ int64_t bdrv_get_dirty_count(BlockDriverState *bs); void bdrv_set_in_use(BlockDriverState *bs, int in_use); int bdrv_in_use(BlockDriverState *bs); +enum BlockAcctType { + BDRV_ACCT_READ, + BDRV_ACCT_WRITE, + BDRV_ACCT_FLUSH, + BDRV_MAX_IOTYPE, +}; + +typedef struct BlockAcctCookie { + int64_t bytes; + int64_t start_time_ns; + enum BlockAcctType type; +} BlockAcctCookie; + +void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, + int64_t bytes, enum BlockAcctType type); +void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie); + typedef enum { BLKDBG_L1_UPDATE, @@ -306,3 +324,4 @@ typedef enum { void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event); #endif + diff --git a/block/curl.c b/block/curl.c index 5c157bc609..f3f61cc8a1 100644 --- a/block/curl.c +++ b/block/curl.c @@ -229,6 +229,23 @@ static void curl_multi_do(void *arg) { CURLState *state = NULL; curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state); + + /* ACBs for successful messages get completed in curl_read_cb */ + if (msg->data.result != CURLE_OK) { + int i; + for (i = 0; i < CURL_NUM_ACB; i++) { + CURLAIOCB *acb = state->acb[i]; + + if (acb == NULL) { + continue; + } + + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_release(acb); + state->acb[i] = NULL; + } + } + curl_clean_state(state); break; } @@ -277,7 +294,8 @@ static CURLState *curl_init_state(BDRVCURLState *s) curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); - + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + #ifdef DEBUG_VERBOSE curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); #endif diff --git a/block/qcow.c b/block/qcow.c index e155d3c002..c8bfecc1cb 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -159,6 +159,8 @@ static int qcow_open(BlockDriverState *bs, int flags) goto fail; bs->backing_file[len] = '\0'; } + + qemu_co_mutex_init(&s->lock); return 0; fail: @@ -190,24 +192,6 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) return -1; if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) return -1; -#if 0 - /* test */ - { - uint8_t in[16]; - uint8_t out[16]; - uint8_t tmp[16]; - for(i=0;i<16;i++) - in[i] = i; - AES_encrypt(in, tmp, &s->aes_encrypt_key); - AES_decrypt(tmp, out, &s->aes_decrypt_key); - for(i = 0; i < 16; i++) - printf(" %02x", tmp[i]); - printf("\n"); - for(i = 0; i < 16; i++) - printf(" %02x", out[i]); - printf("\n"); - } -#endif return 0; } @@ -441,296 +425,178 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -#if 0 - -static int qcow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; - int ret, index_in_cluster, n; + int index_in_cluster; + int ret = 0, n; uint64_t cluster_offset; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); - while (nb_sectors > 0) { - cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + while (nb_sectors != 0) { + /* prepare next request */ + cluster_offset = get_cluster_offset(bs, sector_num << 9, + 0, 0, 0, 0); index_in_cluster = sector_num & (s->cluster_sectors - 1); n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) + if (n > nb_sectors) { n = nb_sectors; + } + if (!cluster_offset) { if (bs->backing_hd) { /* read from the base image */ - ret = bdrv_read(bs->backing_hd, sector_num, buf, n); - if (ret < 0) - return -1; + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } } else { + /* Note: in this case, no need to wait */ memset(buf, 0, 512 * n); } } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - if (decompress_cluster(bs, cluster_offset) < 0) - return -1; - memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(bs, cluster_offset) < 0) { + goto fail; + } + memcpy(buf, + s->cluster_cache + index_in_cluster * 512, 512 * n); } else { - ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512); - if (ret != n * 512) - return -1; + if ((cluster_offset & 511) != 0) { + goto fail; + } + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } if (s->crypt_method) { - encrypt_sectors(s, sector_num, buf, buf, n, 0, + encrypt_sectors(s, sector_num, buf, buf, + n, 0, &s->aes_decrypt_key); } } + ret = 0; + nb_sectors -= n; sector_num += n; buf += n * 512; } - return 0; -} -#endif -typedef struct QCowAIOCB { - BlockDriverAIOCB common; - int64_t sector_num; - QEMUIOVector *qiov; - uint8_t *buf; - void *orig_buf; - int nb_sectors; - int n; - uint64_t cluster_offset; - uint8_t *cluster_data; - struct iovec hd_iov; - bool is_write; - QEMUBH *bh; - QEMUIOVector hd_qiov; - BlockDriverAIOCB *hd_aiocb; -} QCowAIOCB; - -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ - QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); - if (acb->hd_aiocb) - bdrv_aio_cancel(acb->hd_aiocb); - qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { - .aiocb_size = sizeof(QCowAIOCB), - .cancel = qcow_aio_cancel, -}; - -static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int is_write) -{ - QCowAIOCB *acb; - - acb = qemu_aio_get(&qcow_aio_pool, bs, NULL, NULL); - if (!acb) - return NULL; - acb->hd_aiocb = NULL; - acb->sector_num = sector_num; - acb->qiov = qiov; - acb->is_write = is_write; +done: + qemu_co_mutex_unlock(&s->lock); if (qiov->niov > 1) { - acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size); - if (is_write) - qemu_iovec_to_buffer(qiov, acb->buf); - } else { - acb->buf = (uint8_t *)qiov->iov->iov_base; + qemu_iovec_from_buffer(qiov, orig_buf, qiov->size); + qemu_vfree(orig_buf); } - acb->nb_sectors = nb_sectors; - acb->n = 0; - acb->cluster_offset = 0; - return acb; + + return ret; + +fail: + ret = -EIO; + goto done; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { - QCowAIOCB *acb = opaque; - BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster; - int ret; + uint64_t cluster_offset; + const uint8_t *src_buf; + int ret = 0, n; + uint8_t *cluster_data = NULL; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; - acb->hd_aiocb = NULL; + s->cluster_cache_offset = -1; /* disable compressed cache */ - redo: - /* post process the read buffer */ - if (!acb->cluster_offset) { - /* nothing to do */ - } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* nothing to do */ + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buffer(qiov, buf); } else { - if (s->crypt_method) { - encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf, - acb->n, 0, - &s->aes_decrypt_key); - } + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; } - acb->nb_sectors -= acb->n; - acb->sector_num += acb->n; - acb->buf += acb->n * 512; + qemu_co_mutex_lock(&s->lock); - if (acb->nb_sectors == 0) { - /* request completed */ - return 0; - } + while (nb_sectors != 0) { - /* prepare next AIO request */ - acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, - 0, 0, 0, 0); - index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - acb->n = s->cluster_sectors - index_in_cluster; - if (acb->n > acb->nb_sectors) - acb->n = acb->nb_sectors; - - if (!acb->cluster_offset) { - if (bs->backing_hd) { - /* read from the base image */ - acb->hd_iov.iov_base = (void *)acb->buf; - acb->hd_iov.iov_len = acb->n * 512; - qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, acb->sector_num, - acb->n, &acb->hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - return -EIO; - } - } else { - /* Note: in this case, no need to wait */ - memset(acb->buf, 0, 512 * acb->n); - goto redo; + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; } - } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* add AIO support for compressed blocks ? */ - if (decompress_cluster(bs, acb->cluster_offset) < 0) { - return -EIO; + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + break; } - memcpy(acb->buf, - s->cluster_cache + index_in_cluster * 512, 512 * acb->n); - goto redo; - } else { - if ((acb->cluster_offset & 511) != 0) { - return -EIO; + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = g_malloc0(s->cluster_size); + } + encrypt_sectors(s, sector_num, cluster_data, buf, + n, 1, &s->aes_encrypt_key); + src_buf = cluster_data; + } else { + src_buf = buf; } - acb->hd_iov.iov_base = (void *)acb->buf; - acb->hd_iov.iov_len = acb->n * 512; - qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + + hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, - (acb->cluster_offset >> 9) + index_in_cluster, - acb->n, &acb->hd_qiov); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); qemu_co_mutex_lock(&s->lock); if (ret < 0) { - return ret; + break; } - } - - return 1; -} - -static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - QCowAIOCB *acb; - int ret; - - acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); - - qemu_co_mutex_lock(&s->lock); - do { - ret = qcow_aio_read_cb(acb); - } while (ret > 0); - qemu_co_mutex_unlock(&s->lock); + ret = 0; - if (acb->qiov->niov > 1) { - qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); - qemu_vfree(acb->orig_buf); - } - qemu_aio_release(acb); - - return ret; -} - -static int qcow_aio_write_cb(void *opaque) -{ - QCowAIOCB *acb = opaque; - BlockDriverState *bs = acb->common.bs; - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - uint64_t cluster_offset; - const uint8_t *src_buf; - int ret; - - acb->hd_aiocb = NULL; - - acb->nb_sectors -= acb->n; - acb->sector_num += acb->n; - acb->buf += acb->n * 512; - - if (acb->nb_sectors == 0) { - /* request completed */ - return 0; - } - - index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - acb->n = s->cluster_sectors - index_in_cluster; - if (acb->n > acb->nb_sectors) - acb->n = acb->nb_sectors; - cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0, - index_in_cluster, - index_in_cluster + acb->n); - if (!cluster_offset || (cluster_offset & 511) != 0) { - return -EIO; - } - if (s->crypt_method) { - if (!acb->cluster_data) { - acb->cluster_data = g_malloc0(s->cluster_size); - } - encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, - acb->n, 1, &s->aes_encrypt_key); - src_buf = acb->cluster_data; - } else { - src_buf = acb->buf; - } - - acb->hd_iov.iov_base = (void *)src_buf; - acb->hd_iov.iov_len = acb->n * 512; - qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file, - (cluster_offset >> 9) + index_in_cluster, - acb->n, &acb->hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - return ret; + nb_sectors -= n; + sector_num += n; + buf += n * 512; } - return 1; -} - -static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - QCowAIOCB *acb; - int ret; - - s->cluster_cache_offset = -1; /* disable compressed cache */ - - acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); - - qemu_co_mutex_lock(&s->lock); - do { - ret = qcow_aio_write_cb(acb); - } while (ret > 0); qemu_co_mutex_unlock(&s->lock); - if (acb->qiov->niov > 1) { - qemu_vfree(acb->orig_buf); + if (qiov->niov > 1) { + qemu_vfree(orig_buf); } - qemu_aio_release(acb); + free(cluster_data); return ret; } diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 9269ddaefd..e06be64876 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -53,7 +53,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) } #ifdef DEBUG_ALLOC2 - printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size); + fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size); #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; @@ -381,10 +381,10 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, * For a given offset of the disk image, find the cluster offset in * qcow2 file. The offset is stored in *cluster_offset. * - * on entry, *num is the number of contiguous clusters we'd like to + * on entry, *num is the number of contiguous sectors we'd like to * access following offset. * - * on exit, *num is the number of contiguous clusters we can read. + * on exit, *num is the number of contiguous sectors we can read. * * Return 0, if the offset is found * Return -errno, otherwise. diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 2a915be57a..9605367777 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -422,7 +422,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int ret; #ifdef DEBUG_ALLOC2 - printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", offset, length, addend); #endif if (length < 0) { @@ -556,7 +556,7 @@ retry: } } #ifdef DEBUG_ALLOC2 - printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", + fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", size, (s->free_cluster_index - nb_clusters) << s->cluster_bits); #endif @@ -680,24 +680,6 @@ void qcow2_free_any_clusters(BlockDriverState *bs, -void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, - int64_t size) -{ - int refcount; - int64_t start, last, cluster_offset; - uint16_t *p; - - start = offset & ~(s->cluster_size - 1); - last = (offset + size - 1) & ~(s->cluster_size - 1); - for(cluster_offset = start; cluster_offset <= last; - cluster_offset += s->cluster_size) { - p = &s->refcount_block[cluster_offset >> s->cluster_bits]; - refcount = be16_to_cpu(*p); - refcount++; - *p = cpu_to_be16(refcount); - } -} - /* update the refcounts of snapshots and the copied flag */ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 3bd2a30d35..3e6bf8b6f3 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -303,7 +303,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (qcow2_write_snapshots(bs) < 0) goto fail; #ifdef DEBUG_ALLOC - qcow2_check_refcounts(bs); + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result); + } #endif return 0; fail: @@ -353,7 +356,10 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; #ifdef DEBUG_ALLOC - qcow2_check_refcounts(bs); + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result); + } #endif return 0; fail: @@ -390,7 +396,10 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) return ret; } #ifdef DEBUG_ALLOC - qcow2_check_refcounts(bs); + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result); + } #endif return 0; } diff --git a/block/qcow2.c b/block/qcow2.c index bfff6cd963..b725d68b1d 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -87,6 +87,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, while (offset < end_offset) { #ifdef DEBUG_EXT + BDRVQcowState *s = bs->opaque; /* Sanity check */ if (offset > s->cluster_size) printf("qcow2_read_extension: suspicious offset %lu\n", offset); @@ -280,7 +281,10 @@ static int qcow2_open(BlockDriverState *bs, int flags) qemu_co_mutex_init(&s->lock); #ifdef DEBUG_ALLOC - qcow2_check_refcounts(bs); + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result); + } #endif return ret; @@ -372,201 +376,127 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -typedef struct QCowAIOCB { - BlockDriverAIOCB common; - int64_t sector_num; - QEMUIOVector *qiov; - int remaining_sectors; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t bytes_done; - uint64_t cluster_offset; - uint8_t *cluster_data; - bool is_write; - QEMUIOVector hd_qiov; - QEMUBH *bh; - QCowL2Meta l2meta; - QLIST_ENTRY(QCowAIOCB) next_depend; -} QCowAIOCB; - -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ - QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); - qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { - .aiocb_size = sizeof(QCowAIOCB), - .cancel = qcow2_aio_cancel, -}; - -/* - * Returns 0 when the request is completed successfully, 1 when there is still - * a part left to do and -errno in error cases. - */ -static int qcow2_aio_read_cb(QCowAIOCB *acb) +static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) { - BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster, n1; int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset = 0; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + uint8_t *cluster_data = NULL; - /* post process the read buffer */ - if (!acb->cluster_offset) { - /* nothing to do */ - } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* nothing to do */ - } else { - if (s->crypt_method) { - qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, - acb->cluster_data, acb->cur_nr_sectors, 0, &s->aes_decrypt_key); - qemu_iovec_reset(&acb->hd_qiov); - qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done, - acb->cur_nr_sectors * 512); - qemu_iovec_from_buffer(&acb->hd_qiov, acb->cluster_data, - 512 * acb->cur_nr_sectors); - } - } + qemu_iovec_init(&hd_qiov, qiov->niov); - acb->remaining_sectors -= acb->cur_nr_sectors; - acb->sector_num += acb->cur_nr_sectors; - acb->bytes_done += acb->cur_nr_sectors * 512; + qemu_co_mutex_lock(&s->lock); - if (acb->remaining_sectors == 0) { - /* request completed */ - return 0; - } + while (remaining_sectors != 0) { - /* prepare next AIO request */ - acb->cur_nr_sectors = acb->remaining_sectors; - if (s->crypt_method) { - acb->cur_nr_sectors = MIN(acb->cur_nr_sectors, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - } + /* prepare next request */ + cur_nr_sectors = remaining_sectors; + if (s->crypt_method) { + cur_nr_sectors = MIN(cur_nr_sectors, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + } - ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9, - &acb->cur_nr_sectors, &acb->cluster_offset); - if (ret < 0) { - return ret; - } + ret = qcow2_get_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset); + if (ret < 0) { + goto fail; + } - index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - - qemu_iovec_reset(&acb->hd_qiov); - qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done, - acb->cur_nr_sectors * 512); - - if (!acb->cluster_offset) { - - if (bs->backing_hd) { - /* read from the base image */ - n1 = qcow2_backing_read1(bs->backing_hd, &acb->hd_qiov, - acb->sector_num, acb->cur_nr_sectors); - if (n1 > 0) { - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, acb->sector_num, - n1, &acb->hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - return ret; + index_in_cluster = sector_num & (s->cluster_sectors - 1); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_copy(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + if (!cluster_offset) { + + if (bs->backing_hd) { + /* read from the base image */ + n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, + sector_num, cur_nr_sectors); + if (n1 > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n1, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } } + } else { + /* Note: in this case, no need to wait */ + qemu_iovec_memset(&hd_qiov, 0, 512 * cur_nr_sectors); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + ret = qcow2_decompress_cluster(bs, cluster_offset); + if (ret < 0) { + goto fail; } - return 1; - } else { - /* Note: in this case, no need to wait */ - qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors); - return 1; - } - } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* add AIO support for compressed blocks ? */ - ret = qcow2_decompress_cluster(bs, acb->cluster_offset); - if (ret < 0) { - return ret; - } - qemu_iovec_from_buffer(&acb->hd_qiov, - s->cluster_cache + index_in_cluster * 512, - 512 * acb->cur_nr_sectors); + qemu_iovec_from_buffer(&hd_qiov, + s->cluster_cache + index_in_cluster * 512, + 512 * cur_nr_sectors); + } else { + if ((cluster_offset & 511) != 0) { + ret = -EIO; + goto fail; + } - return 1; - } else { - if ((acb->cluster_offset & 511) != 0) { - return -EIO; - } + if (s->crypt_method) { + /* + * For encrypted images, read everything into a temporary + * contiguous buffer on which the AES functions can work. + */ + if (!cluster_data) { + cluster_data = + g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + } - if (s->crypt_method) { - /* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ - if (!acb->cluster_data) { - acb->cluster_data = - g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + assert(cur_nr_sectors <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); } - assert(acb->cur_nr_sectors <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - qemu_iovec_reset(&acb->hd_qiov); - qemu_iovec_add(&acb->hd_qiov, acb->cluster_data, - 512 * acb->cur_nr_sectors); + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + if (s->crypt_method) { + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_copy(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + qemu_iovec_from_buffer(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); + } } - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, - (acb->cluster_offset >> 9) + index_in_cluster, - acb->cur_nr_sectors, &acb->hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - return ret; - } + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; } + ret = 0; - return 1; -} - -static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque, int is_write) -{ - QCowAIOCB *acb; - - acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque); - if (!acb) - return NULL; - acb->sector_num = sector_num; - acb->qiov = qiov; - acb->is_write = is_write; - - qemu_iovec_init(&acb->hd_qiov, qiov->niov); - - acb->bytes_done = 0; - acb->remaining_sectors = nb_sectors; - acb->cur_nr_sectors = 0; - acb->cluster_offset = 0; - acb->l2meta.nb_clusters = 0; - qemu_co_queue_init(&acb->l2meta.dependent_requests); - return acb; -} - -static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - QCowAIOCB *acb; - int ret; - - acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0); - - qemu_co_mutex_lock(&s->lock); - do { - ret = qcow2_aio_read_cb(acb); - } while (ret > 0); +fail: qemu_co_mutex_unlock(&s->lock); - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); + qemu_iovec_destroy(&hd_qiov); + g_free(cluster_data); return ret; } @@ -586,104 +516,100 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) } } -/* - * Returns 0 when the request is completed successfully, 1 when there is still - * a part left to do and -errno in error cases. - */ -static int qcow2_aio_write_cb(QCowAIOCB *acb) +static int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int remaining_sectors, + QEMUIOVector *qiov) { - BlockDriverState *bs = acb->common.bs; BDRVQcowState *s = bs->opaque; int index_in_cluster; int n_end; int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + QCowL2Meta l2meta; + uint64_t cluster_offset; + QEMUIOVector hd_qiov; + uint64_t bytes_done = 0; + uint8_t *cluster_data = NULL; - ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta); + l2meta.nb_clusters = 0; + qemu_co_queue_init(&l2meta.dependent_requests); - run_dependent_requests(s, &acb->l2meta); + qemu_iovec_init(&hd_qiov, qiov->niov); - if (ret < 0) { - return ret; - } + s->cluster_cache_offset = -1; /* disable compressed cache */ - acb->remaining_sectors -= acb->cur_nr_sectors; - acb->sector_num += acb->cur_nr_sectors; - acb->bytes_done += acb->cur_nr_sectors * 512; + qemu_co_mutex_lock(&s->lock); - if (acb->remaining_sectors == 0) { - /* request completed */ - return 0; - } + while (remaining_sectors != 0) { - index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + acb->remaining_sectors; - if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + remaining_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + } - ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9, - index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta); - if (ret < 0) { - return ret; - } + ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, + index_in_cluster, n_end, &cur_nr_sectors, &l2meta); + if (ret < 0) { + goto fail; + } - acb->cluster_offset = acb->l2meta.cluster_offset; - assert((acb->cluster_offset & 511) == 0); + cluster_offset = l2meta.cluster_offset; + assert((cluster_offset & 511) == 0); - qemu_iovec_reset(&acb->hd_qiov); - qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done, - acb->cur_nr_sectors * 512); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_copy(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); - if (s->crypt_method) { - if (!acb->cluster_data) { - acb->cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * - s->cluster_size); - } + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); + } - assert(acb->hd_qiov.size <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); - qemu_iovec_to_buffer(&acb->hd_qiov, acb->cluster_data); + assert(hd_qiov.size <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_iovec_to_buffer(&hd_qiov, cluster_data); - qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, - acb->cluster_data, acb->cur_nr_sectors, 1, &s->aes_encrypt_key); + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); - qemu_iovec_reset(&acb->hd_qiov); - qemu_iovec_add(&acb->hd_qiov, acb->cluster_data, - acb->cur_nr_sectors * 512); - } + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + cur_nr_sectors * 512); + } - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file, - (acb->cluster_offset >> 9) + index_in_cluster, - acb->cur_nr_sectors, &acb->hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - return ret; - } + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } - return 1; -} + ret = qcow2_alloc_cluster_link_l2(bs, &l2meta); -static int qcow2_co_writev(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - QCowAIOCB *acb; - int ret; + run_dependent_requests(s, &l2meta); - acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1); - s->cluster_cache_offset = -1; /* disable compressed cache */ + if (ret < 0) { + goto fail; + } - qemu_co_mutex_lock(&s->lock); - do { - ret = qcow2_aio_write_cb(acb); - } while (ret > 0); + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + ret = 0; + +fail: qemu_co_mutex_unlock(&s->lock); - qemu_iovec_destroy(&acb->hd_qiov); - qemu_aio_release(acb); + qemu_iovec_destroy(&hd_qiov); + g_free(cluster_data); return ret; } diff --git a/block/qcow2.h b/block/qcow2.h index de23abe1a4..c8ca3bc574 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -189,8 +189,6 @@ void qcow2_free_clusters(BlockDriverState *bs, void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t cluster_offset, int nb_clusters); -void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, - int64_t size); int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend); diff --git a/block/sheepdog.c b/block/sheepdog.c index 57b6e1aad7..c1f6e07ec1 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -274,7 +274,7 @@ struct SheepdogAIOCB { int ret; enum AIOCBState aiocb_type; - QEMUBH *bh; + Coroutine *coroutine; void (*aio_done_func)(SheepdogAIOCB *); int canceled; @@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState { char *port; int fd; + CoMutex lock; + Coroutine *co_send; + Coroutine *co_recv; + uint32_t aioreq_seq_num; QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head; } BDRVSheepdogState; @@ -346,19 +350,16 @@ static const char * sd_strerror(int err) /* * Sheepdog I/O handling: * - * 1. In the sd_aio_readv/writev, read/write requests are added to the - * QEMU Bottom Halves. - * - * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O - * requests to the server and link the requests to the - * outstanding_list in the BDRVSheepdogState. we exits the - * function without waiting for receiving the response. + * 1. In sd_co_rw_vector, we send the I/O requests to the server and + * link the requests to the outstanding_list in the + * BDRVSheepdogState. The function exits without waiting for + * receiving the response. * - * 3. We receive the response in aio_read_response, the fd handler to + * 2. We receive the response in aio_read_response, the fd handler to * the sheepdog connection. If metadata update is needed, we send * the write request to the vdi object in sd_write_done, the write - * completion function. The AIOCB callback is not called until all - * the requests belonging to the AIOCB are finished. + * completion function. We switch back to sd_co_readv/writev after + * all the requests belonging to the AIOCB are finished. */ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, @@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void sd_finish_aiocb(SheepdogAIOCB *acb) { if (!acb->canceled) { - acb->common.cb(acb->common.opaque, acb->ret); + qemu_coroutine_enter(acb->coroutine, NULL); } qemu_aio_release(acb); } @@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb) * Sheepdog cannot cancel the requests which are already sent to * the servers, so we just complete the request with -EIO here. */ - acb->common.cb(acb->common.opaque, -EIO); + acb->ret = -EIO; + qemu_coroutine_enter(acb->coroutine, NULL); acb->canceled = 1; } @@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->aio_done_func = NULL; acb->canceled = 0; - acb->bh = NULL; + acb->coroutine = qemu_coroutine_self(); acb->ret = 0; QLIST_INIT(&acb->aioreq_head); return acb; } -static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb) -{ - if (acb->bh) { - error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type); - return -EIO; - } - - acb->bh = qemu_bh_new(cb, acb); - qemu_bh_schedule(acb->bh); - return 0; -} - #ifdef _WIN32 struct msghdr { @@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len, again: ret = do_send_recv(sockfd, iov, len, iov_offset, write); if (ret < 0) { - if (errno == EINTR || errno == EAGAIN) { + if (errno == EINTR) { + goto again; + } + if (errno == EAGAIN) { + if (qemu_in_coroutine()) { + qemu_coroutine_yield(); + } goto again; } error_report("failed to recv a rsp, %s", strerror(errno)); @@ -793,14 +789,14 @@ static void aio_read_response(void *opaque) unsigned long idx; if (QLIST_EMPTY(&s->outstanding_aio_head)) { - return; + goto out; } /* read a header */ ret = do_read(fd, &rsp, sizeof(rsp)); if (ret) { error_report("failed to get the header, %s", strerror(errno)); - return; + goto out; } /* find the right aio_req from the outstanding_aio list */ @@ -811,7 +807,7 @@ static void aio_read_response(void *opaque) } if (!aio_req) { error_report("cannot find aio_req %x", rsp.id); - return; + goto out; } acb = aio_req->aiocb; @@ -847,7 +843,7 @@ static void aio_read_response(void *opaque) aio_req->iov_offset); if (ret) { error_report("failed to get the data, %s", strerror(errno)); - return; + goto out; } break; } @@ -861,10 +857,30 @@ static void aio_read_response(void *opaque) if (!rest) { /* * We've finished all requests which belong to the AIOCB, so - * we can call the callback now. + * we can switch back to sd_co_readv/writev now. */ acb->aio_done_func(acb); } +out: + s->co_recv = NULL; +} + +static void co_read_response(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + if (!s->co_recv) { + s->co_recv = qemu_coroutine_create(aio_read_response); + } + + qemu_coroutine_enter(s->co_recv, opaque); +} + +static void co_write_request(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + qemu_coroutine_enter(s->co_send, NULL); } static int aio_flush_request(void *opaque) @@ -924,7 +940,7 @@ static int get_sheep_fd(BDRVSheepdogState *s) return -1; } - qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request, + qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, NULL, s); return fd; } @@ -1091,6 +1107,10 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, hdr.id = aio_req->id; + qemu_co_mutex_lock(&s->lock); + s->co_send = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, + aio_flush_request, NULL, s); set_cork(s->fd, 1); /* send a header */ @@ -1109,6 +1129,9 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, } set_cork(s->fd, 0); + qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, + aio_flush_request, NULL, s); + qemu_co_mutex_unlock(&s->lock); return 0; } @@ -1225,6 +1248,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE; strncpy(s->name, vdi, sizeof(s->name)); + qemu_co_mutex_init(&s->lock); g_free(buf); return 0; out: @@ -1491,7 +1515,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) /* * This function is called after writing data objects. If we need to * update metadata, this sends a write request to the vdi object. - * Otherwise, this calls the AIOCB callback. + * Otherwise, this switches back to sd_co_readv/writev. */ static void sd_write_done(SheepdogAIOCB *acb) { @@ -1587,8 +1611,11 @@ out: * waiting the response. The responses are received in the * `aio_read_response' function which is called from the main loop as * a fd handler. + * + * Returns 1 when we need to wait a response, 0 when there is no sent + * request and -errno in error cases. */ -static void sd_readv_writev_bh_cb(void *p) +static int sd_co_rw_vector(void *p) { SheepdogAIOCB *acb = p; int ret = 0; @@ -1600,9 +1627,6 @@ static void sd_readv_writev_bh_cb(void *p) SheepdogInode *inode = &s->inode; AIOReq *aio_req; - qemu_bh_delete(acb->bh); - acb->bh = NULL; - if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) { /* * In the case we open the snapshot VDI, Sheepdog creates the @@ -1684,42 +1708,47 @@ static void sd_readv_writev_bh_cb(void *p) } out: if (QLIST_EMPTY(&acb->aioreq_head)) { - sd_finish_aiocb(acb); + return acb->ret; } + return 1; } -static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static int sd_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; + int ret; if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { /* TODO: shouldn't block here */ if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) { - return NULL; + return -EIO; } bs->total_sectors = sector_num + nb_sectors; } - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); acb->aio_done_func = sd_write_done; acb->aiocb_type = AIOCB_WRITE_UDATA; - sd_schedule_bh(sd_readv_writev_bh_cb, acb); - return &acb->common; + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; } -static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) +static int sd_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; - int i; + int i, ret; - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); acb->aiocb_type = AIOCB_READ_UDATA; acb->aio_done_func = sd_finish_aiocb; @@ -1731,8 +1760,15 @@ static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num, memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len); } - sd_schedule_bh(sd_readv_writev_bh_cb, acb); - return &acb->common; + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; } static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) @@ -2062,8 +2098,8 @@ BlockDriver bdrv_sheepdog = { .bdrv_getlength = sd_getlength, .bdrv_truncate = sd_truncate, - .bdrv_aio_readv = sd_aio_readv, - .bdrv_aio_writev = sd_aio_writev, + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, diff --git a/block_int.h b/block_int.h index f6d02b38a7..8a72b804b2 100644 --- a/block_int.h +++ b/block_int.h @@ -28,6 +28,7 @@ #include "qemu-option.h" #include "qemu-queue.h" #include "qemu-coroutine.h" +#include "qemu-timer.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 @@ -184,10 +185,9 @@ struct BlockDriverState { void *sync_aiocb; /* I/O stats (display with "info blockstats"). */ - uint64_t rd_bytes; - uint64_t wr_bytes; - uint64_t rd_ops; - uint64_t wr_ops; + uint64_t nr_bytes[BDRV_MAX_IOTYPE]; + uint64_t nr_ops[BDRV_MAX_IOTYPE]; + uint64_t total_time_ns[BDRV_MAX_IOTYPE]; uint64_t wr_highest_sector; /* Whether the disk can expand beyond total_sectors */ diff --git a/blockdev.c b/blockdev.c index d272659ab2..2602591bf6 100644 --- a/blockdev.c +++ b/blockdev.c @@ -321,18 +321,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) } if ((buf = qemu_opt_get(opts, "cache")) != NULL) { - if (!strcmp(buf, "off") || !strcmp(buf, "none")) { - bdrv_flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB; - } else if (!strcmp(buf, "writeback")) { - bdrv_flags |= BDRV_O_CACHE_WB; - } else if (!strcmp(buf, "unsafe")) { - bdrv_flags |= BDRV_O_CACHE_WB; - bdrv_flags |= BDRV_O_NO_FLUSH; - } else if (!strcmp(buf, "writethrough")) { - /* this is the default */ - } else { - error_report("invalid cache option"); - return NULL; + if (bdrv_parse_cache_flags(buf, &bdrv_flags) != 0) { + error_report("invalid cache option"); + return NULL; } } diff --git a/hw/flash.h b/hw/flash.h index 140ae39801..270be5e127 100644 --- a/hw/flash.h +++ b/hw/flash.h @@ -36,12 +36,7 @@ uint32_t nand_getbuswidth(DeviceState *dev); #define NAND_MFR_MICRON 0x2c /* onenand.c */ -void onenand_base_update(void *opaque, target_phys_addr_t new); -void onenand_base_unmap(void *opaque); -void *onenand_init(BlockDriverState *bdrv, - uint16_t man_id, uint16_t dev_id, uint16_t ver_id, - int regshift, qemu_irq irq); -void *onenand_raw_otp(void *opaque); +void *onenand_raw_otp(DeviceState *onenand_device); /* ecc.c */ typedef struct { diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 29521babf7..f4fa1545bd 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret) DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n", ncq_tfs->tag); + bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct); qemu_sglist_destroy(&ncq_tfs->sglist); ncq_tfs->used = 0; } @@ -756,6 +757,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, ncq_tfs->is_read = 1; DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba); + + bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct, + (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE, + BDRV_ACCT_READ); ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->sglist, ncq_tfs->lba, ncq_cb, ncq_tfs); @@ -766,6 +771,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, ncq_tfs->is_read = 0; DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba); + + bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct, + (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE, + BDRV_ACCT_WRITE); ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->sglist, ncq_tfs->lba, ncq_cb, ncq_tfs); diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index e456193b2b..832539c23c 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -258,6 +258,7 @@ typedef struct NCQTransferState { AHCIDevice *drive; BlockDriverAIOCB *aiocb; QEMUSGList sglist; + BlockAcctCookie acct; int is_read; uint16_t sector_count; uint64_t lba; diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c index fe2fb0b806..c552320122 100644 --- a/hw/ide/atapi.c +++ b/hw/ide/atapi.c @@ -104,17 +104,20 @@ static void cd_data_to_raw(uint8_t *buf, int lba) memset(buf, 0, 288); } -static int cd_read_sector(BlockDriverState *bs, int lba, uint8_t *buf, - int sector_size) +static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size) { int ret; switch(sector_size) { case 2048: - ret = bdrv_read(bs, (int64_t)lba << 2, buf, 4); + bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); + ret = bdrv_read(s->bs, (int64_t)lba << 2, buf, 4); + bdrv_acct_done(s->bs, &s->acct); break; case 2352: - ret = bdrv_read(bs, (int64_t)lba << 2, buf + 16, 4); + bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); + ret = bdrv_read(s->bs, (int64_t)lba << 2, buf + 16, 4); + bdrv_acct_done(s->bs, &s->acct); if (ret < 0) return ret; cd_data_to_raw(buf, lba); @@ -181,7 +184,7 @@ void ide_atapi_cmd_reply_end(IDEState *s) } else { /* see if a new sector must be read */ if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) { - ret = cd_read_sector(s->bs, s->lba, s->io_buffer, s->cd_sector_size); + ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size); if (ret < 0) { ide_transfer_stop(s); ide_atapi_io_error(s, ret); @@ -250,6 +253,7 @@ static void ide_atapi_cmd_reply(IDEState *s, int size, int max_size) s->io_buffer_index = 0; if (s->atapi_dma) { + bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ); s->status = READY_STAT | SEEK_STAT | DRQ_STAT; s->bus->dma->ops->start_dma(s->bus->dma, s, ide_atapi_cmd_read_dma_cb); @@ -322,10 +326,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) s->status = READY_STAT | SEEK_STAT; s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; ide_set_irq(s->bus); - eot: - s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT); - ide_set_inactive(s); - return; + goto eot; } s->io_buffer_index = 0; @@ -343,9 +344,11 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) #ifdef DEBUG_AIO printf("aio_read_cd: lba=%u n=%d\n", s->lba, n); #endif + s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset); s->bus->dma->iov.iov_len = n * 4 * 512; qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1); + s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2, &s->bus->dma->qiov, n * 4, ide_atapi_cmd_read_dma_cb, s); @@ -355,6 +358,12 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) ASC_MEDIUM_NOT_PRESENT); goto eot; } + + return; +eot: + bdrv_acct_done(s->bs, &s->acct); + s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT); + ide_set_inactive(s); } /* start a CD-CDROM read command with DMA */ @@ -368,6 +377,8 @@ static void ide_atapi_cmd_read_dma(IDEState *s, int lba, int nb_sectors, s->io_buffer_size = 0; s->cd_sector_size = sector_size; + bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ); + /* XXX: check if BUSY_STAT should be set */ s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT; s->bus->dma->ops->start_dma(s->bus->dma, s, diff --git a/hw/ide/core.c b/hw/ide/core.c index d145b19b0c..40abc1edd2 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s) #endif if (n > s->req_nb_sectors) n = s->req_nb_sectors; + + bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); ret = bdrv_read(s->bs, sector_num, s->io_buffer, n); + bdrv_acct_done(s->bs, &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ)) @@ -610,7 +613,10 @@ handle_rw_error: return; eot: - ide_set_inactive(s); + if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { + bdrv_acct_done(s->bs, &s->acct); + } + ide_set_inactive(s); } static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd) @@ -619,6 +625,20 @@ static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd) s->io_buffer_index = 0; s->io_buffer_size = 0; s->dma_cmd = dma_cmd; + + switch (dma_cmd) { + case IDE_DMA_READ: + bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE, + BDRV_ACCT_READ); + break; + case IDE_DMA_WRITE: + bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE, + BDRV_ACCT_WRITE); + break; + default: + break; + } + s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb); } @@ -641,7 +661,10 @@ void ide_sector_write(IDEState *s) n = s->nsector; if (n > s->req_nb_sectors) n = s->req_nb_sectors; + + bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); ret = bdrv_write(s->bs, sector_num, s->io_buffer, n); + bdrv_acct_done(s->bs, &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY)) @@ -685,6 +708,7 @@ static void ide_flush_cb(void *opaque, int ret) } } + bdrv_acct_done(s->bs, &s->acct); s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); } @@ -698,6 +722,7 @@ void ide_flush_cache(IDEState *s) return; } + bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH); acb = bdrv_aio_flush(s->bs, ide_flush_cb, s); if (acb == NULL) { ide_flush_cb(s, -EIO); diff --git a/hw/ide/internal.h b/hw/ide/internal.h index 02e805f070..7f5ef8de1d 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -440,6 +440,7 @@ struct IDEState { int lba; int cd_sector_size; int atapi_dma; /* true if dma is requested for the packet cmd */ + BlockAcctCookie acct; /* ATA DMA state */ int io_buffer_size; QEMUSGList sg; diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 44fb3fef60..fdf5d75082 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) m->aiocb = NULL; qemu_sglist_destroy(&s->sg); ide_atapi_io_error(s, ret); - io->dma_end(opaque); - return; + goto done; } if (s->io_buffer_size > 0) { @@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) ide_atapi_cmd_ok(s); if (io->len == 0) { - io->dma_end(opaque); - return; + goto done; } /* launch next transfer */ @@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) /* Note: media not present is the most likely case */ ide_atapi_cmd_error(s, SENSE_NOT_READY, ASC_MEDIUM_NOT_PRESENT); - io->dma_end(opaque); - return; + goto done; } + return; + +done: + bdrv_acct_done(s->bs, &s->acct); + io->dma_end(opaque); + return; } static void pmac_ide_transfer_cb(void *opaque, int ret) @@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) m->aiocb = NULL; qemu_sglist_destroy(&s->sg); ide_dma_error(s); - io->dma_end(io); - return; + goto done; } sector_num = ide_get_sector(s); @@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) } /* end of DMA ? */ - if (io->len == 0) { - io->dma_end(io); - return; + goto done; } /* launch next transfer */ @@ -163,6 +163,12 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) if (!m->aiocb) pmac_ide_transfer_cb(io, -1); + return; +done: + if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { + bdrv_acct_done(s->bs, &s->acct); + } + io->dma_end(io); } static void pmac_ide_transfer(DBDMA_io *io) @@ -172,10 +178,22 @@ static void pmac_ide_transfer(DBDMA_io *io) s->io_buffer_size = 0; if (s->drive_kind == IDE_CD) { + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ); pmac_ide_atapi_transfer_cb(io, 0); return; } + switch (s->dma_cmd) { + case IDE_DMA_READ: + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ); + break; + case IDE_DMA_WRITE: + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE); + break; + default: + break; + } + pmac_ide_transfer_cb(io, 0); } diff --git a/hw/nseries.c b/hw/nseries.c index f7aae7a59e..af287dd6dc 100644 --- a/hw/nseries.c +++ b/hw/nseries.c @@ -32,7 +32,7 @@ #include "bt.h" #include "loader.h" #include "blockdev.h" -#include "tusb6010.h" +#include "sysbus.h" /* Nokia N8x0 support */ struct n800_s { @@ -49,10 +49,10 @@ struct n800_s { int keymap[0x80]; DeviceState *kbd; - TUSBState *usb; + DeviceState *usb; void *retu; void *tahvo; - void *nand; + DeviceState *nand; }; /* GPIO pins */ @@ -167,13 +167,21 @@ static void n8x0_nand_setup(struct n800_s *s) char *otp_region; DriveInfo *dinfo; - dinfo = drive_get(IF_MTD, 0, 0); + s->nand = qdev_create(NULL, "onenand"); + qdev_prop_set_uint16(s->nand, "manufacturer_id", NAND_MFR_SAMSUNG); /* Either 0x40 or 0x48 are OK for the device ID */ - s->nand = onenand_init(dinfo ? dinfo->bdrv : 0, - NAND_MFR_SAMSUNG, 0x48, 0, 1, - qdev_get_gpio_in(s->cpu->gpio, N8X0_ONENAND_GPIO)); - omap_gpmc_attach(s->cpu->gpmc, N8X0_ONENAND_CS, 0, onenand_base_update, - onenand_base_unmap, s->nand); + qdev_prop_set_uint16(s->nand, "device_id", 0x48); + qdev_prop_set_uint16(s->nand, "version_id", 0); + qdev_prop_set_int32(s->nand, "shift", 1); + dinfo = drive_get(IF_MTD, 0, 0); + if (dinfo && dinfo->bdrv) { + qdev_prop_set_drive_nofail(s->nand, "drive", dinfo->bdrv); + } + qdev_init_nofail(s->nand); + sysbus_connect_irq(sysbus_from_qdev(s->nand), 0, + qdev_get_gpio_in(s->cpu->gpio, N8X0_ONENAND_GPIO)); + omap_gpmc_attach(s->cpu->gpmc, N8X0_ONENAND_CS, + sysbus_mmio_get_region(sysbus_from_qdev(s->nand), 0)); otp_region = onenand_raw_otp(s->nand); memcpy(otp_region + 0x000, n8x0_cal_wlan_mac, sizeof(n8x0_cal_wlan_mac)); @@ -756,27 +764,21 @@ static void n8x0_uart_setup(struct n800_s *s) omap_uart_attach(s->cpu->uart[BT_UART], radio); } -static void n8x0_usb_power_cb(void *opaque, int line, int level) -{ - struct n800_s *s = opaque; - - tusb6010_power(s->usb, level); -} - static void n8x0_usb_setup(struct n800_s *s) { - qemu_irq tusb_irq = qdev_get_gpio_in(s->cpu->gpio, N8X0_TUSB_INT_GPIO); - qemu_irq tusb_pwr = qemu_allocate_irqs(n8x0_usb_power_cb, s, 1)[0]; - TUSBState *tusb = tusb6010_init(tusb_irq); - + SysBusDevice *dev; + s->usb = qdev_create(NULL, "tusb6010"); + dev = sysbus_from_qdev(s->usb); + qdev_init_nofail(s->usb); + sysbus_connect_irq(dev, 0, + qdev_get_gpio_in(s->cpu->gpio, N8X0_TUSB_INT_GPIO)); /* Using the NOR interface */ omap_gpmc_attach(s->cpu->gpmc, N8X0_USB_ASYNC_CS, - tusb6010_async_io(tusb), NULL, NULL, tusb); + sysbus_mmio_get_region(dev, 0)); omap_gpmc_attach(s->cpu->gpmc, N8X0_USB_SYNC_CS, - tusb6010_sync_io(tusb), NULL, NULL, tusb); - - s->usb = tusb; - qdev_connect_gpio_out(s->cpu->gpio, N8X0_TUSB_ENABLE_GPIO, tusb_pwr); + sysbus_mmio_get_region(dev, 1)); + qdev_connect_gpio_out(s->cpu->gpio, N8X0_TUSB_ENABLE_GPIO, + qdev_get_gpio_in(s->usb, 0)); /* tusb_pwr */ } /* Setup done before the main bootloader starts by some early setup code @@ -118,11 +118,12 @@ void omap_sdrc_reset(struct omap_sdrc_s *s); /* OMAP2 general purpose memory controller */ struct omap_gpmc_s; -struct omap_gpmc_s *omap_gpmc_init(target_phys_addr_t base, qemu_irq irq); +struct omap_gpmc_s *omap_gpmc_init(struct omap_mpu_state_s *mpu, + target_phys_addr_t base, + qemu_irq irq, qemu_irq drq); void omap_gpmc_reset(struct omap_gpmc_s *s); -void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem, - void (*base_upd)(void *opaque, target_phys_addr_t new), - void (*unmap)(void *opaque), void *opaque); +void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem); +void omap_gpmc_attach_nand(struct omap_gpmc_s *s, int cs, DeviceState *nand); /* * Common IRQ numbers for level 1 interrupt handler @@ -788,6 +789,7 @@ i2c_bus *omap_i2c_bus(struct omap_i2c_s *s); # define cpu_is_omap2420(cpu) (cpu->mpu_model == omap2420) # define cpu_is_omap2430(cpu) (cpu->mpu_model == omap2430) # define cpu_is_omap3430(cpu) (cpu->mpu_model == omap3430) +# define cpu_is_omap3630(cpu) (cpu->mpu_model == omap3630) # define cpu_is_omap15xx(cpu) \ (cpu_is_omap310(cpu) || cpu_is_omap1510(cpu)) @@ -799,7 +801,8 @@ i2c_bus *omap_i2c_bus(struct omap_i2c_s *s); # define cpu_class_omap1(cpu) \ (cpu_is_omap15xx(cpu) || cpu_is_omap16xx(cpu)) # define cpu_class_omap2(cpu) cpu_is_omap24xx(cpu) -# define cpu_class_omap3(cpu) cpu_is_omap3430(cpu) +# define cpu_class_omap3(cpu) \ + (cpu_is_omap3430(cpu) || cpu_is_omap3630(cpu)) struct omap_mpu_state_s { enum omap_mpu_model { @@ -813,6 +816,7 @@ struct omap_mpu_state_s { omap2423, omap2430, omap3430, + omap3630, } mpu_model; CPUState *env; diff --git a/hw/omap2.c b/hw/omap2.c index 7e5820a97b..ca088d9f53 100644 --- a/hw/omap2.c +++ b/hw/omap2.c @@ -2402,7 +2402,8 @@ struct omap_mpu_state_s *omap2420_mpu_init(unsigned long sdram_size, sysbus_mmio_map(busdev, 4, omap_l4_region_base(ta, 5)); s->sdrc = omap_sdrc_init(0x68009000); - s->gpmc = omap_gpmc_init(0x6800a000, s->irq[0][OMAP_INT_24XX_GPMC_IRQ]); + s->gpmc = omap_gpmc_init(s, 0x6800a000, s->irq[0][OMAP_INT_24XX_GPMC_IRQ], + s->drq[OMAP24XX_DMA_GPMC]); dinfo = drive_get(IF_SD, 0, 0); if (!dinfo) { diff --git a/hw/omap_gpmc.c b/hw/omap_gpmc.c index 673dddd237..02f0c52107 100644 --- a/hw/omap_gpmc.c +++ b/hw/omap_gpmc.c @@ -27,82 +27,410 @@ /* General-Purpose Memory Controller */ struct omap_gpmc_s { qemu_irq irq; + qemu_irq drq; MemoryRegion iomem; + int accept_256; + uint8_t revision; uint8_t sysconfig; uint16_t irqst; uint16_t irqen; + uint16_t lastirq; uint16_t timeout; uint16_t config; - uint32_t prefconfig[2]; - int prefcontrol; - int preffifo; - int prefcount; struct omap_gpmc_cs_file_s { uint32_t config[7]; - target_phys_addr_t base; - size_t size; MemoryRegion *iomem; MemoryRegion container; - void (*base_update)(void *opaque, target_phys_addr_t new); - void (*unmap)(void *opaque); - void *opaque; + MemoryRegion nandiomem; + DeviceState *dev; } cs_file[8]; int ecc_cs; int ecc_ptr; uint32_t ecc_cfg; ECCState ecc[9]; + struct prefetch { + uint32_t config1; /* GPMC_PREFETCH_CONFIG1 */ + uint32_t transfercount; /* GPMC_PREFETCH_CONFIG2:TRANSFERCOUNT */ + int startengine; /* GPMC_PREFETCH_CONTROL:STARTENGINE */ + int fifopointer; /* GPMC_PREFETCH_STATUS:FIFOPOINTER */ + int count; /* GPMC_PREFETCH_STATUS:COUNTVALUE */ + MemoryRegion iomem; + uint8_t fifo[64]; + } prefetch; }; +#define OMAP_GPMC_8BIT 0 +#define OMAP_GPMC_16BIT 1 +#define OMAP_GPMC_NOR 0 +#define OMAP_GPMC_NAND 2 + +static int omap_gpmc_devtype(struct omap_gpmc_cs_file_s *f) +{ + return (f->config[0] >> 10) & 3; +} + +static int omap_gpmc_devsize(struct omap_gpmc_cs_file_s *f) +{ + /* devsize field is really 2 bits but we ignore the high + * bit to ensure consistent behaviour if the guest sets + * it (values 2 and 3 are reserved in the TRM) + */ + return (f->config[0] >> 12) & 1; +} + +/* Extract the chip-select value from the prefetch config1 register */ +static int prefetch_cs(uint32_t config1) +{ + return (config1 >> 24) & 7; +} + +static int prefetch_threshold(uint32_t config1) +{ + return (config1 >> 8) & 0x7f; +} + static void omap_gpmc_int_update(struct omap_gpmc_s *s) { - qemu_set_irq(s->irq, s->irqen & s->irqst); + /* The TRM is a bit unclear, but it seems to say that + * the TERMINALCOUNTSTATUS bit is set only on the + * transition when the prefetch engine goes from + * active to inactive, whereas the FIFOEVENTSTATUS + * bit is held high as long as the fifo has at + * least THRESHOLD bytes available. + * So we do the latter here, but TERMINALCOUNTSTATUS + * is set elsewhere. + */ + if (s->prefetch.fifopointer >= prefetch_threshold(s->prefetch.config1)) { + s->irqst |= 1; + } + if ((s->irqen & s->irqst) != s->lastirq) { + s->lastirq = s->irqen & s->irqst; + qemu_set_irq(s->irq, s->lastirq); + } } -static void omap_gpmc_cs_map(struct omap_gpmc_cs_file_s *f, int base, int mask) +static void omap_gpmc_dma_update(struct omap_gpmc_s *s, int value) { - /* TODO: check for overlapping regions and report access errors */ - if ((mask != 0x8 && mask != 0xc && mask != 0xe && mask != 0xf) || - (base < 0 || base >= 0x40) || - (base & 0x0f & ~mask)) { - fprintf(stderr, "%s: wrong cs address mapping/decoding!\n", - __FUNCTION__); + if (s->prefetch.config1 & 4) { + qemu_set_irq(s->drq, value); + } +} + +/* Access functions for when a NAND-like device is mapped into memory: + * all addresses in the region behave like accesses to the relevant + * GPMC_NAND_DATA_i register (which is actually implemented to call these) + */ +static uint64_t omap_nand_read(void *opaque, target_phys_addr_t addr, + unsigned size) +{ + struct omap_gpmc_cs_file_s *f = (struct omap_gpmc_cs_file_s *)opaque; + uint64_t v; + nand_setpins(f->dev, 0, 0, 0, 1, 0); + switch (omap_gpmc_devsize(f)) { + case OMAP_GPMC_8BIT: + v = nand_getio(f->dev); + if (size == 1) { + return v; + } + v |= (nand_getio(f->dev) << 8); + if (size == 2) { + return v; + } + v |= (nand_getio(f->dev) << 16); + v |= (nand_getio(f->dev) << 24); + return v; + case OMAP_GPMC_16BIT: + v = nand_getio(f->dev); + if (size == 1) { + /* 8 bit read from 16 bit device : probably a guest bug */ + return v & 0xff; + } + if (size == 2) { + return v; + } + v |= (nand_getio(f->dev) << 16); + return v; + default: + abort(); + } +} + +static void omap_nand_setio(DeviceState *dev, uint64_t value, + int nandsize, int size) +{ + /* Write the specified value to the NAND device, respecting + * both size of the NAND device and size of the write access. + */ + switch (nandsize) { + case OMAP_GPMC_8BIT: + switch (size) { + case 1: + nand_setio(dev, value & 0xff); + break; + case 2: + nand_setio(dev, value & 0xff); + nand_setio(dev, (value >> 8) & 0xff); + break; + case 4: + default: + nand_setio(dev, value & 0xff); + nand_setio(dev, (value >> 8) & 0xff); + nand_setio(dev, (value >> 16) & 0xff); + nand_setio(dev, (value >> 24) & 0xff); + break; + } + case OMAP_GPMC_16BIT: + switch (size) { + case 1: + /* writing to a 16bit device with 8bit access is probably a guest + * bug; pass the value through anyway. + */ + case 2: + nand_setio(dev, value & 0xffff); + break; + case 4: + default: + nand_setio(dev, value & 0xffff); + nand_setio(dev, (value >> 16) & 0xffff); + break; + } + } +} + +static void omap_nand_write(void *opaque, target_phys_addr_t addr, + uint64_t value, unsigned size) +{ + struct omap_gpmc_cs_file_s *f = (struct omap_gpmc_cs_file_s *)opaque; + nand_setpins(f->dev, 0, 0, 0, 1, 0); + omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size); +} + +static const MemoryRegionOps omap_nand_ops = { + .read = omap_nand_read, + .write = omap_nand_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void fill_prefetch_fifo(struct omap_gpmc_s *s) +{ + /* Fill the prefetch FIFO by reading data from NAND. + * We do this synchronously, unlike the hardware which + * will do this asynchronously. We refill when the + * FIFO has THRESHOLD bytes free, and we always refill + * as much data as possible starting at the top end + * of the FIFO. + * (We have to refill at THRESHOLD rather than waiting + * for the FIFO to empty to allow for the case where + * the FIFO size isn't an exact multiple of THRESHOLD + * and we're doing DMA transfers.) + * This means we never need to handle wrap-around in + * the fifo-reading code, and the next byte of data + * to read is always fifo[63 - fifopointer]. + */ + int fptr; + int cs = prefetch_cs(s->prefetch.config1); + int is16bit = (((s->cs_file[cs].config[0] >> 12) & 3) != 0); + int bytes; + /* Don't believe the bit of the OMAP TRM that says that COUNTVALUE + * and TRANSFERCOUNT are in units of 16 bit words for 16 bit NAND. + * Instead believe the bit that says it is always a byte count. + */ + bytes = 64 - s->prefetch.fifopointer; + if (bytes > s->prefetch.count) { + bytes = s->prefetch.count; + } + s->prefetch.count -= bytes; + s->prefetch.fifopointer += bytes; + fptr = 64 - s->prefetch.fifopointer; + /* Move the existing data in the FIFO so it sits just + * before what we're about to read in + */ + while (fptr < (64 - bytes)) { + s->prefetch.fifo[fptr] = s->prefetch.fifo[fptr + bytes]; + fptr++; + } + while (fptr < 64) { + if (is16bit) { + uint32_t v = omap_nand_read(&s->cs_file[cs], 0, 2); + s->prefetch.fifo[fptr++] = v & 0xff; + s->prefetch.fifo[fptr++] = (v >> 8) & 0xff; + } else { + s->prefetch.fifo[fptr++] = omap_nand_read(&s->cs_file[cs], 0, 1); + } + } + if (s->prefetch.startengine && (s->prefetch.count == 0)) { + /* This was the final transfer: raise TERMINALCOUNTSTATUS */ + s->irqst |= 2; + s->prefetch.startengine = 0; + } + /* If there are any bytes in the FIFO at this point then + * we must raise a DMA request (either this is a final part + * transfer, or we filled the FIFO in which case we certainly + * have THRESHOLD bytes available) + */ + if (s->prefetch.fifopointer != 0) { + omap_gpmc_dma_update(s, 1); + } + omap_gpmc_int_update(s); +} + +/* Access functions for a NAND-like device when the prefetch/postwrite + * engine is enabled -- all addresses in the region behave alike: + * data is read or written to the FIFO. + */ +static uint64_t omap_gpmc_prefetch_read(void *opaque, target_phys_addr_t addr, + unsigned size) +{ + struct omap_gpmc_s *s = (struct omap_gpmc_s *) opaque; + uint32_t data; + if (s->prefetch.config1 & 1) { + /* The TRM doesn't define the behaviour if you read from the + * FIFO when the prefetch engine is in write mode. We choose + * to always return zero. + */ + return 0; + } + /* Note that trying to read an empty fifo repeats the last byte */ + if (s->prefetch.fifopointer) { + s->prefetch.fifopointer--; + } + data = s->prefetch.fifo[63 - s->prefetch.fifopointer]; + if (s->prefetch.fifopointer == + (64 - prefetch_threshold(s->prefetch.config1))) { + /* We've drained THRESHOLD bytes now. So deassert the + * DMA request, then refill the FIFO (which will probably + * assert it again.) + */ + omap_gpmc_dma_update(s, 0); + fill_prefetch_fifo(s); + } + omap_gpmc_int_update(s); + return data; +} + +static void omap_gpmc_prefetch_write(void *opaque, target_phys_addr_t addr, + uint64_t value, unsigned size) +{ + struct omap_gpmc_s *s = (struct omap_gpmc_s *) opaque; + int cs = prefetch_cs(s->prefetch.config1); + if ((s->prefetch.config1 & 1) == 0) { + /* The TRM doesn't define the behaviour of writing to the + * FIFO when the prefetch engine is in read mode. We + * choose to ignore the write. + */ + return; + } + if (s->prefetch.count == 0) { + /* The TRM doesn't define the behaviour of writing to the + * FIFO if the transfer is complete. We choose to ignore. + */ return; } + /* The only reason we do any data buffering in postwrite + * mode is if we are talking to a 16 bit NAND device, in + * which case we need to buffer the first byte of the + * 16 bit word until the other byte arrives. + */ + int is16bit = (((s->cs_file[cs].config[0] >> 12) & 3) != 0); + if (is16bit) { + /* fifopointer alternates between 64 (waiting for first + * byte of word) and 63 (waiting for second byte) + */ + if (s->prefetch.fifopointer == 64) { + s->prefetch.fifo[0] = value; + s->prefetch.fifopointer--; + } else { + value = (value << 8) | s->prefetch.fifo[0]; + omap_nand_write(&s->cs_file[cs], 0, value, 2); + s->prefetch.count--; + s->prefetch.fifopointer = 64; + } + } else { + /* Just write the byte : fifopointer remains 64 at all times */ + omap_nand_write(&s->cs_file[cs], 0, value, 1); + s->prefetch.count--; + } + if (s->prefetch.count == 0) { + /* Final transfer: raise TERMINALCOUNTSTATUS */ + s->irqst |= 2; + s->prefetch.startengine = 0; + } + omap_gpmc_int_update(s); +} + +static const MemoryRegionOps omap_prefetch_ops = { + .read = omap_gpmc_prefetch_read, + .write = omap_gpmc_prefetch_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl.min_access_size = 1, + .impl.max_access_size = 1, +}; - if (!f->opaque) +static MemoryRegion *omap_gpmc_cs_memregion(struct omap_gpmc_s *s, int cs) +{ + /* Return the MemoryRegion* to map/unmap for this chipselect */ + struct omap_gpmc_cs_file_s *f = &s->cs_file[cs]; + if (omap_gpmc_devtype(f) == OMAP_GPMC_NOR) { + return f->iomem; + } + if ((s->prefetch.config1 & 0x80) && + (prefetch_cs(s->prefetch.config1) == cs)) { + /* The prefetch engine is enabled for this CS: map the FIFO */ + return &s->prefetch.iomem; + } + return &f->nandiomem; +} + +static void omap_gpmc_cs_map(struct omap_gpmc_s *s, int cs) +{ + struct omap_gpmc_cs_file_s *f = &s->cs_file[cs]; + uint32_t mask = (f->config[6] >> 8) & 0xf; + uint32_t base = f->config[6] & 0x3f; + uint32_t size; + + if (!f->iomem && !f->dev) { + return; + } + + if (!(f->config[6] & (1 << 6))) { + /* Do nothing unless CSVALID */ return; + } - f->base = base << 24; - f->size = (0x0fffffff & ~(mask << 24)) + 1; + /* TODO: check for overlapping regions and report access errors */ + if (mask != 0x8 && mask != 0xc && mask != 0xe && mask != 0xf + && !(s->accept_256 && !mask)) { + fprintf(stderr, "%s: invalid chip-select mask address (0x%x)\n", + __func__, mask); + } + + base <<= 24; + size = (0x0fffffff & ~(mask << 24)) + 1; /* TODO: rather than setting the size of the mapping (which should be * constant), the mask should cause wrapping of the address space, so * that the same memory becomes accessible at every <i>size</i> bytes * starting from <i>base</i>. */ - if (f->iomem) { - memory_region_init(&f->container, "omap-gpmc-file", f->size); - memory_region_add_subregion(&f->container, 0, f->iomem); - memory_region_add_subregion(get_system_memory(), f->base, - &f->container); - } - - if (f->base_update) - f->base_update(f->opaque, f->base); + memory_region_init(&f->container, "omap-gpmc-file", size); + memory_region_add_subregion(&f->container, 0, + omap_gpmc_cs_memregion(s, cs)); + memory_region_add_subregion(get_system_memory(), base, + &f->container); } -static void omap_gpmc_cs_unmap(struct omap_gpmc_cs_file_s *f) +static void omap_gpmc_cs_unmap(struct omap_gpmc_s *s, int cs) { - if (f->size) { - if (f->unmap) - f->unmap(f->opaque); - if (f->iomem) { - memory_region_del_subregion(get_system_memory(), &f->container); - memory_region_del_subregion(&f->container, f->iomem); - memory_region_destroy(&f->container); - } - f->base = 0; - f->size = 0; + struct omap_gpmc_cs_file_s *f = &s->cs_file[cs]; + if (!(f->config[6] & (1 << 6))) { + /* Do nothing unless CSVALID */ + return; + } + if (!f->iomem && !f->dev) { + return; } + memory_region_del_subregion(get_system_memory(), &f->container); + memory_region_del_subregion(&f->container, omap_gpmc_cs_memregion(s, cs)); + memory_region_destroy(&f->container); } void omap_gpmc_reset(struct omap_gpmc_s *s) @@ -115,25 +443,32 @@ void omap_gpmc_reset(struct omap_gpmc_s *s) omap_gpmc_int_update(s); s->timeout = 0; s->config = 0xa00; - s->prefconfig[0] = 0x00004000; - s->prefconfig[1] = 0x00000000; - s->prefcontrol = 0; - s->preffifo = 0; - s->prefcount = 0; + s->prefetch.config1 = 0x00004000; + s->prefetch.transfercount = 0x00000000; + s->prefetch.startengine = 0; + s->prefetch.fifopointer = 0; + s->prefetch.count = 0; for (i = 0; i < 8; i ++) { - if (s->cs_file[i].config[6] & (1 << 6)) /* CSVALID */ - omap_gpmc_cs_unmap(s->cs_file + i); - s->cs_file[i].config[0] = i ? 1 << 12 : 0; + omap_gpmc_cs_unmap(s, i); s->cs_file[i].config[1] = 0x101001; s->cs_file[i].config[2] = 0x020201; s->cs_file[i].config[3] = 0x10031003; s->cs_file[i].config[4] = 0x10f1111; s->cs_file[i].config[5] = 0; s->cs_file[i].config[6] = 0xf00 | (i ? 0 : 1 << 6); - if (s->cs_file[i].config[6] & (1 << 6)) /* CSVALID */ - omap_gpmc_cs_map(&s->cs_file[i], - s->cs_file[i].config[6] & 0x1f, /* MASKADDR */ - (s->cs_file[i].config[6] >> 8 & 0xf)); /* BASEADDR */ + + s->cs_file[i].config[6] = 0xf00; + /* In theory we could probe attached devices for some CFG1 + * bits here, but we just retain them across resets as they + * were set initially by omap_gpmc_attach(). + */ + if (i == 0) { + s->cs_file[i].config[0] &= 0x00433e00; + s->cs_file[i].config[6] |= 1 << 6; /* CSVALID */ + omap_gpmc_cs_map(s, i); + } else { + s->cs_file[i].config[0] &= 0x00403c00; + } } s->ecc_cs = 0; s->ecc_ptr = 0; @@ -142,6 +477,24 @@ void omap_gpmc_reset(struct omap_gpmc_s *s) ecc_reset(&s->ecc[i]); } +static int gpmc_wordaccess_only(target_phys_addr_t addr) +{ + /* Return true if the register offset is to a register that + * only permits word width accesses. + * Non-word accesses are only OK for GPMC_NAND_DATA/ADDRESS/COMMAND + * for any chipselect. + */ + if (addr >= 0x60 && addr <= 0x1d4) { + int cs = (addr - 0x60) / 0x30; + addr -= cs * 0x30; + if (addr >= 0x7c && addr < 0x88) { + /* GPMC_NAND_COMMAND, GPMC_NAND_ADDRESS, GPMC_NAND_DATA */ + return 0; + } + } + return 1; +} + static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr, unsigned size) { @@ -149,13 +502,13 @@ static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr, int cs; struct omap_gpmc_cs_file_s *f; - if (size != 4) { + if (size != 4 && gpmc_wordaccess_only(addr)) { return omap_badwidth_read32(opaque, addr); } switch (addr) { case 0x000: /* GPMC_REVISION */ - return 0x20; + return s->revision; case 0x010: /* GPMC_SYSCONFIG */ return s->sysconfig; @@ -187,36 +540,39 @@ static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr, addr -= cs * 0x30; f = s->cs_file + cs; switch (addr) { - case 0x60: /* GPMC_CONFIG1 */ - return f->config[0]; - case 0x64: /* GPMC_CONFIG2 */ - return f->config[1]; - case 0x68: /* GPMC_CONFIG3 */ - return f->config[2]; - case 0x6c: /* GPMC_CONFIG4 */ - return f->config[3]; - case 0x70: /* GPMC_CONFIG5 */ - return f->config[4]; - case 0x74: /* GPMC_CONFIG6 */ - return f->config[5]; - case 0x78: /* GPMC_CONFIG7 */ - return f->config[6]; - case 0x84: /* GPMC_NAND_DATA */ - return 0; + case 0x60: /* GPMC_CONFIG1 */ + return f->config[0]; + case 0x64: /* GPMC_CONFIG2 */ + return f->config[1]; + case 0x68: /* GPMC_CONFIG3 */ + return f->config[2]; + case 0x6c: /* GPMC_CONFIG4 */ + return f->config[3]; + case 0x70: /* GPMC_CONFIG5 */ + return f->config[4]; + case 0x74: /* GPMC_CONFIG6 */ + return f->config[5]; + case 0x78: /* GPMC_CONFIG7 */ + return f->config[6]; + case 0x84 ... 0x87: /* GPMC_NAND_DATA */ + if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) { + return omap_nand_read(f, 0, size); + } + return 0; } break; case 0x1e0: /* GPMC_PREFETCH_CONFIG1 */ - return s->prefconfig[0]; + return s->prefetch.config1; case 0x1e4: /* GPMC_PREFETCH_CONFIG2 */ - return s->prefconfig[1]; + return s->prefetch.transfercount; case 0x1ec: /* GPMC_PREFETCH_CONTROL */ - return s->prefcontrol; + return s->prefetch.startengine; case 0x1f0: /* GPMC_PREFETCH_STATUS */ - return (s->preffifo << 24) | - ((s->preffifo > - ((s->prefconfig[0] >> 8) & 0x7f) ? 1 : 0) << 16) | - s->prefcount; + return (s->prefetch.fifopointer << 24) | + ((s->prefetch.fifopointer >= + ((s->prefetch.config1 >> 8) & 0x7f) ? 1 : 0) << 16) | + s->prefetch.count; case 0x1f4: /* GPMC_ECC_CONFIG */ return s->ecc_cs; @@ -251,7 +607,7 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr, int cs; struct omap_gpmc_cs_file_s *f; - if (size != 4) { + if (size != 4 && gpmc_wordaccess_only(addr)) { return omap_badwidth_write32(opaque, addr, value); } @@ -276,7 +632,7 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr, break; case 0x018: /* GPMC_IRQSTATUS */ - s->irqen = ~value; + s->irqen &= ~value; omap_gpmc_int_update(s); break; @@ -302,62 +658,109 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr, addr -= cs * 0x30; f = s->cs_file + cs; switch (addr) { - case 0x60: /* GPMC_CONFIG1 */ - f->config[0] = value & 0xffef3e13; - break; - case 0x64: /* GPMC_CONFIG2 */ - f->config[1] = value & 0x001f1f8f; - break; - case 0x68: /* GPMC_CONFIG3 */ - f->config[2] = value & 0x001f1f8f; - break; - case 0x6c: /* GPMC_CONFIG4 */ - f->config[3] = value & 0x1f8f1f8f; - break; - case 0x70: /* GPMC_CONFIG5 */ - f->config[4] = value & 0x0f1f1f1f; - break; - case 0x74: /* GPMC_CONFIG6 */ - f->config[5] = value & 0x00000fcf; - break; - case 0x78: /* GPMC_CONFIG7 */ - if ((f->config[6] ^ value) & 0xf7f) { - if (f->config[6] & (1 << 6)) /* CSVALID */ - omap_gpmc_cs_unmap(f); - if (value & (1 << 6)) /* CSVALID */ - omap_gpmc_cs_map(f, value & 0x1f, /* MASKADDR */ - (value >> 8 & 0xf)); /* BASEADDR */ - } + case 0x60: /* GPMC_CONFIG1 */ + f->config[0] = value & 0xffef3e13; + break; + case 0x64: /* GPMC_CONFIG2 */ + f->config[1] = value & 0x001f1f8f; + break; + case 0x68: /* GPMC_CONFIG3 */ + f->config[2] = value & 0x001f1f8f; + break; + case 0x6c: /* GPMC_CONFIG4 */ + f->config[3] = value & 0x1f8f1f8f; + break; + case 0x70: /* GPMC_CONFIG5 */ + f->config[4] = value & 0x0f1f1f1f; + break; + case 0x74: /* GPMC_CONFIG6 */ + f->config[5] = value & 0x00000fcf; + break; + case 0x78: /* GPMC_CONFIG7 */ + if ((f->config[6] ^ value) & 0xf7f) { + omap_gpmc_cs_unmap(s, cs); f->config[6] = value & 0x00000f7f; - break; - case 0x7c: /* GPMC_NAND_COMMAND */ - case 0x80: /* GPMC_NAND_ADDRESS */ - case 0x84: /* GPMC_NAND_DATA */ - break; - - default: - goto bad_reg; + omap_gpmc_cs_map(s, cs); + } + break; + case 0x7c ... 0x7f: /* GPMC_NAND_COMMAND */ + if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) { + nand_setpins(f->dev, 1, 0, 0, 1, 0); /* CLE */ + omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size); + } + break; + case 0x80 ... 0x83: /* GPMC_NAND_ADDRESS */ + if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) { + nand_setpins(f->dev, 0, 1, 0, 1, 0); /* ALE */ + omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size); + } + break; + case 0x84 ... 0x87: /* GPMC_NAND_DATA */ + if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) { + omap_nand_write(f, 0, value, size); + } + break; + default: + goto bad_reg; } break; case 0x1e0: /* GPMC_PREFETCH_CONFIG1 */ - s->prefconfig[0] = value & 0x7f8f7fbf; - /* TODO: update interrupts, fifos, dmas */ + if (!s->prefetch.startengine) { + uint32_t oldconfig1 = s->prefetch.config1; + uint32_t changed; + s->prefetch.config1 = value & 0x7f8f7fbf; + changed = oldconfig1 ^ s->prefetch.config1; + if (changed & (0x80 | 0x7000000)) { + /* Turning the engine on or off, or mapping it somewhere else. + * cs_map() and cs_unmap() check the prefetch config and + * overall CSVALID bits, so it is sufficient to unmap-and-map + * both the old cs and the new one. + */ + int oldcs = prefetch_cs(oldconfig1); + int newcs = prefetch_cs(s->prefetch.config1); + omap_gpmc_cs_unmap(s, oldcs); + omap_gpmc_cs_map(s, oldcs); + if (newcs != oldcs) { + omap_gpmc_cs_unmap(s, newcs); + omap_gpmc_cs_map(s, newcs); + } + } + } break; case 0x1e4: /* GPMC_PREFETCH_CONFIG2 */ - s->prefconfig[1] = value & 0x3fff; + if (!s->prefetch.startengine) { + s->prefetch.transfercount = value & 0x3fff; + } break; case 0x1ec: /* GPMC_PREFETCH_CONTROL */ - s->prefcontrol = value & 1; - if (s->prefcontrol) { - if (s->prefconfig[0] & 1) - s->preffifo = 0x40; - else - s->preffifo = 0x00; + if (s->prefetch.startengine != (value & 1)) { + s->prefetch.startengine = value & 1; + if (s->prefetch.startengine) { + /* Prefetch engine start */ + s->prefetch.count = s->prefetch.transfercount; + if (s->prefetch.config1 & 1) { + /* Write */ + s->prefetch.fifopointer = 64; + } else { + /* Read */ + s->prefetch.fifopointer = 0; + fill_prefetch_fifo(s); + } + } else { + /* Prefetch engine forcibly stopped. The TRM + * doesn't define the behaviour if you do this. + * We clear the prefetch count, which means that + * we permit no more writes, and don't read any + * more data from NAND. The CPU can still drain + * the FIFO of unread data. + */ + s->prefetch.count = 0; + } + omap_gpmc_int_update(s); } - /* TODO: start */ break; case 0x1f4: /* GPMC_ECC_CONFIG */ @@ -394,24 +797,47 @@ static const MemoryRegionOps omap_gpmc_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; -struct omap_gpmc_s *omap_gpmc_init(target_phys_addr_t base, qemu_irq irq) +struct omap_gpmc_s *omap_gpmc_init(struct omap_mpu_state_s *mpu, + target_phys_addr_t base, + qemu_irq irq, qemu_irq drq) { + int cs; struct omap_gpmc_s *s = (struct omap_gpmc_s *) g_malloc0(sizeof(struct omap_gpmc_s)); - omap_gpmc_reset(s); - memory_region_init_io(&s->iomem, &omap_gpmc_ops, s, "omap-gpmc", 0x1000); memory_region_add_subregion(get_system_memory(), base, &s->iomem); + s->irq = irq; + s->drq = drq; + s->accept_256 = cpu_is_omap3630(mpu); + s->revision = cpu_class_omap3(mpu) ? 0x50 : 0x20; + s->lastirq = 0; + omap_gpmc_reset(s); + + /* We have to register a different IO memory handler for each + * chip select region in case a NAND device is mapped there. We + * make the region the worst-case size of 256MB and rely on the + * container memory region in cs_map to chop it down to the actual + * guest-requested size. + */ + for (cs = 0; cs < 8; cs++) { + memory_region_init_io(&s->cs_file[cs].nandiomem, + &omap_nand_ops, + &s->cs_file[cs], + "omap-nand", + 256 * 1024 * 1024); + } + + memory_region_init_io(&s->prefetch.iomem, &omap_prefetch_ops, s, + "omap-gpmc-prefetch", 256 * 1024 * 1024); return s; } -void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem, - void (*base_upd)(void *opaque, target_phys_addr_t new), - void (*unmap)(void *opaque), void *opaque) +void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem) { struct omap_gpmc_cs_file_s *f; + assert(iomem); if (cs < 0 || cs >= 8) { fprintf(stderr, "%s: bad chip-select %i\n", __FUNCTION__, cs); @@ -419,12 +845,29 @@ void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem, } f = &s->cs_file[cs]; + omap_gpmc_cs_unmap(s, cs); + f->config[0] &= ~(0xf << 10); f->iomem = iomem; - f->base_update = base_upd; - f->unmap = unmap; - f->opaque = opaque; + omap_gpmc_cs_map(s, cs); +} - if (f->config[6] & (1 << 6)) /* CSVALID */ - omap_gpmc_cs_map(f, f->config[6] & 0x1f, /* MASKADDR */ - (f->config[6] >> 8 & 0xf)); /* BASEADDR */ +void omap_gpmc_attach_nand(struct omap_gpmc_s *s, int cs, DeviceState *nand) +{ + struct omap_gpmc_cs_file_s *f; + assert(nand); + + if (cs < 0 || cs >= 8) { + fprintf(stderr, "%s: bad chip-select %i\n", __func__, cs); + exit(-1); + } + f = &s->cs_file[cs]; + + omap_gpmc_cs_unmap(s, cs); + f->config[0] &= ~(0xf << 10); + f->config[0] |= (OMAP_GPMC_NAND << 10); + f->dev = nand; + if (nand_getbuswidth(f->dev) == 16) { + f->config[0] |= OMAP_GPMC_16BIT << 12; + } + omap_gpmc_cs_map(s, cs); } diff --git a/hw/onenand.c b/hw/onenand.c index 00276a03cb..6f68f70698 100644 --- a/hw/onenand.c +++ b/hw/onenand.c @@ -25,6 +25,7 @@ #include "blockdev.h" #include "memory.h" #include "exec-memory.h" +#include "sysbus.h" /* 11 for 2kB-page OneNAND ("2nd generation") and 10 for 1kB-page chips */ #define PAGE_SHIFT 11 @@ -33,6 +34,7 @@ #define BLOCK_SHIFT (PAGE_SHIFT + 6) typedef struct { + SysBusDevice busdev; struct { uint16_t man; uint16_t dev; @@ -49,6 +51,7 @@ typedef struct { uint8_t *current; MemoryRegion ram; MemoryRegion mapped_ram; + uint8_t current_direction; uint8_t *boot[2]; uint8_t *data[2][2]; MemoryRegion iomem; @@ -120,27 +123,72 @@ static void onenand_mem_setup(OneNANDState *s) 1); } -void onenand_base_update(void *opaque, target_phys_addr_t new) +static void onenand_intr_update(OneNANDState *s) { - OneNANDState *s = (OneNANDState *) opaque; - - s->base = new; - - memory_region_add_subregion(get_system_memory(), s->base, &s->container); + qemu_set_irq(s->intr, ((s->intstatus >> 15) ^ (~s->config[0] >> 6)) & 1); } -void onenand_base_unmap(void *opaque) +static void onenand_pre_save(void *opaque) { - OneNANDState *s = (OneNANDState *) opaque; - - memory_region_del_subregion(get_system_memory(), &s->container); + OneNANDState *s = opaque; + if (s->current == s->otp) { + s->current_direction = 1; + } else if (s->current == s->image) { + s->current_direction = 2; + } else { + s->current_direction = 0; + } } -static void onenand_intr_update(OneNANDState *s) +static int onenand_post_load(void *opaque, int version_id) { - qemu_set_irq(s->intr, ((s->intstatus >> 15) ^ (~s->config[0] >> 6)) & 1); + OneNANDState *s = opaque; + switch (s->current_direction) { + case 0: + break; + case 1: + s->current = s->otp; + break; + case 2: + s->current = s->image; + break; + default: + return -1; + } + onenand_intr_update(s); + return 0; } +static const VMStateDescription vmstate_onenand = { + .name = "onenand", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .pre_save = onenand_pre_save, + .post_load = onenand_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT8(current_direction, OneNANDState), + VMSTATE_INT32(cycle, OneNANDState), + VMSTATE_INT32(otpmode, OneNANDState), + VMSTATE_UINT16_ARRAY(addr, OneNANDState, 8), + VMSTATE_UINT16_ARRAY(unladdr, OneNANDState, 8), + VMSTATE_INT32(bufaddr, OneNANDState), + VMSTATE_INT32(count, OneNANDState), + VMSTATE_UINT16(command, OneNANDState), + VMSTATE_UINT16_ARRAY(config, OneNANDState, 2), + VMSTATE_UINT16(status, OneNANDState), + VMSTATE_UINT16(intstatus, OneNANDState), + VMSTATE_UINT16(wpstatus, OneNANDState), + VMSTATE_INT32(secs_cur, OneNANDState), + VMSTATE_PARTIAL_VBUFFER(blockwp, OneNANDState, blocks), + VMSTATE_UINT8(ecc.cp, OneNANDState), + VMSTATE_UINT16_ARRAY(ecc.lp, OneNANDState, 2), + VMSTATE_UINT16(ecc.count, OneNANDState), + VMSTATE_BUFFER_UNSAFE(otp, OneNANDState, 0, ((64 + 2) << PAGE_SHIFT)), + VMSTATE_END_OF_LIST() + } +}; + /* Hot reset (Reset OneNAND command) or warm reset (RP pin low) */ static void onenand_reset(OneNANDState *s, int cold) { @@ -167,11 +215,17 @@ static void onenand_reset(OneNANDState *s, int cold) /* Lock the whole flash */ memset(s->blockwp, ONEN_LOCK_LOCKED, s->blocks); - if (s->bdrv && bdrv_read(s->bdrv, 0, s->boot[0], 8) < 0) - hw_error("%s: Loading the BootRAM failed.\n", __FUNCTION__); + if (s->bdrv_cur && bdrv_read(s->bdrv_cur, 0, s->boot[0], 8) < 0) { + hw_error("%s: Loading the BootRAM failed.\n", __func__); + } } } +static void onenand_system_reset(DeviceState *dev) +{ + onenand_reset(FROM_SYSBUS(OneNANDState, sysbus_from_qdev(dev)), 1); +} + static inline int onenand_load_main(OneNANDState *s, int sec, int secn, void *dest) { @@ -191,8 +245,8 @@ static inline int onenand_prog_main(OneNANDState *s, int sec, int secn, int result = 0; if (secn > 0) { - uint32_t size = (uint32_t) secn * 512; - const uint8_t *sp = (const uint8_t *) src; + uint32_t size = (uint32_t)secn * 512; + const uint8_t *sp = (const uint8_t *)src; uint8_t *dp = 0; if (s->bdrv_cur) { dp = g_malloc(size); @@ -203,7 +257,7 @@ static inline int onenand_prog_main(OneNANDState *s, int sec, int secn, if (sec + secn > s->secs_cur) { result = 1; } else { - dp = (uint8_t *) s->current + (sec << 9); + dp = (uint8_t *)s->current + (sec << 9); } } if (!result) { @@ -245,13 +299,13 @@ static inline int onenand_prog_spare(OneNANDState *s, int sec, int secn, { int result = 0; if (secn > 0) { - const uint8_t *sp = (const uint8_t *) src; + const uint8_t *sp = (const uint8_t *)src; uint8_t *dp = 0, *dpp = 0; if (s->bdrv_cur) { dp = g_malloc(512); if (!dp || bdrv_read(s->bdrv_cur, - s->secs_cur + (sec >> 5), - dp, 1) < 0) { + s->secs_cur + (sec >> 5), + dp, 1) < 0) { result = 1; } else { dpp = dp + ((sec & 31) << 4); @@ -270,7 +324,7 @@ static inline int onenand_prog_spare(OneNANDState *s, int sec, int secn, } if (s->bdrv_cur) { result = bdrv_write(s->bdrv_cur, s->secs_cur + (sec >> 5), - dp, 1) < 0; + dp, 1) < 0; } } if (dp) { @@ -326,7 +380,7 @@ fail: return 1; } -static void onenand_command(OneNANDState *s, int cmd) +static void onenand_command(OneNANDState *s) { int b; int sec; @@ -346,7 +400,7 @@ static void onenand_command(OneNANDState *s, int cmd) s->data[(s->bufaddr >> 2) & 1][1] : s->boot[1]; \ buf += (s->bufaddr & 3) << 4; - switch (cmd) { + switch (s->command) { case 0x00: /* Load single/multiple sector data unit into buffer */ SETADDR(ONEN_BUF_BLOCK, ONEN_BUF_PAGE) @@ -527,7 +581,7 @@ static void onenand_command(OneNANDState *s, int cmd) s->status |= ONEN_ERR_CMD; s->intstatus |= ONEN_INT; fprintf(stderr, "%s: unknown OneNAND command %x\n", - __FUNCTION__, cmd); + __func__, s->command); } onenand_intr_update(s); @@ -659,7 +713,7 @@ static void onenand_write(void *opaque, target_phys_addr_t addr, if (s->intstatus & (1 << 15)) break; s->command = value; - onenand_command(s, s->command); + onenand_command(s); break; case 0xf221: /* System Configuration 1 */ s->config[0] = value; @@ -700,30 +754,25 @@ static const MemoryRegionOps onenand_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; -void *onenand_init(BlockDriverState *bdrv, - uint16_t man_id, uint16_t dev_id, uint16_t ver_id, - int regshift, qemu_irq irq) +static int onenand_initfn(SysBusDevice *dev) { - OneNANDState *s = (OneNANDState *) g_malloc0(sizeof(*s)); - uint32_t size = 1 << (24 + ((dev_id >> 4) & 7)); + OneNANDState *s = (OneNANDState *)dev; + uint32_t size = 1 << (24 + ((s->id.dev >> 4) & 7)); void *ram; - - s->shift = regshift; - s->intr = irq; + s->base = (target_phys_addr_t)-1; s->rdy = NULL; - s->id.man = man_id; - s->id.dev = dev_id; - s->id.ver = ver_id; s->blocks = size >> BLOCK_SHIFT; s->secs = size >> 9; s->blockwp = g_malloc(s->blocks); - s->density_mask = (dev_id & 0x08) ? (1 << (6 + ((dev_id >> 4) & 7))) : 0; + s->density_mask = (s->id.dev & 0x08) + ? (1 << (6 + ((s->id.dev >> 4) & 7))) : 0; memory_region_init_io(&s->iomem, &onenand_ops, s, "onenand", 0x10000 << s->shift); - s->bdrv = bdrv; if (!s->bdrv) { s->image = memset(g_malloc(size + (size >> 5)), - 0xff, size + (size >> 5)); + 0xff, size + (size >> 5)); + } else { + s->bdrv_cur = s->bdrv; } s->otp = memset(g_malloc((64 + 2) << PAGE_SHIFT), 0xff, (64 + 2) << PAGE_SHIFT); @@ -736,15 +785,40 @@ void *onenand_init(BlockDriverState *bdrv, s->data[1][0] = ram + ((0x0200 + (1 << (PAGE_SHIFT - 1))) << s->shift); s->data[1][1] = ram + ((0x8010 + (1 << (PAGE_SHIFT - 6))) << s->shift); onenand_mem_setup(s); + sysbus_init_irq(dev, &s->intr); + sysbus_init_mmio_region(dev, &s->container); + vmstate_register(&dev->qdev, + ((s->shift & 0x7f) << 24) + | ((s->id.man & 0xff) << 16) + | ((s->id.dev & 0xff) << 8) + | (s->id.ver & 0xff), + &vmstate_onenand, s); + return 0; +} - onenand_reset(s, 1); +static SysBusDeviceInfo onenand_info = { + .init = onenand_initfn, + .qdev.name = "onenand", + .qdev.size = sizeof(OneNANDState), + .qdev.reset = onenand_system_reset, + .qdev.props = (Property[]) { + DEFINE_PROP_UINT16("manufacturer_id", OneNANDState, id.man, 0), + DEFINE_PROP_UINT16("device_id", OneNANDState, id.dev, 0), + DEFINE_PROP_UINT16("version_id", OneNANDState, id.ver, 0), + DEFINE_PROP_INT32("shift", OneNANDState, shift, 0), + DEFINE_PROP_DRIVE("drive", OneNANDState, bdrv), + DEFINE_PROP_END_OF_LIST() + } +}; - return s; +static void onenand_register_device(void) +{ + sysbus_register_withprop(&onenand_info); } -void *onenand_raw_otp(void *opaque) +void *onenand_raw_otp(DeviceState *onenand_device) { - OneNANDState *s = (OneNANDState *) opaque; - - return s->otp; + return FROM_SYSBUS(OneNANDState, sysbus_from_qdev(onenand_device))->otp; } + +device_init(onenand_register_device) @@ -1811,6 +1811,25 @@ static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id, return next; } +static uint8_t pci_find_capability_at_offset(PCIDevice *pdev, uint8_t offset) +{ + uint8_t next, prev, found = 0; + + if (!(pdev->used[offset])) { + return 0; + } + + assert(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST); + + for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]); + prev = next + PCI_CAP_LIST_NEXT) { + if (next <= offset && next > found) { + found = next; + } + } + return found; +} + /* Patch the PCI vendor and device ids in a PCI rom image if necessary. This is needed for an option rom which is used for more than one device. */ static void pci_patch_ids(PCIDevice *pdev, uint8_t *ptr, int size) @@ -1952,11 +1971,30 @@ int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t offset, uint8_t size) { uint8_t *config; + int i, overlapping_cap; + if (!offset) { offset = pci_find_space(pdev, size); if (!offset) { return -ENOSPC; } + } else { + /* Verify that capabilities don't overlap. Note: device assignment + * depends on this check to verify that the device is not broken. + * Should never trigger for emulated devices, but it's helpful + * for debugging these. */ + for (i = offset; i < offset + size; i++) { + overlapping_cap = pci_find_capability_at_offset(pdev, i); + if (overlapping_cap) { + fprintf(stderr, "ERROR: %04x:%02x:%02x.%x " + "Attempt to add PCI capability %x at offset " + "%x overlaps existing capability %x at offset %x\n", + pci_find_domain(pdev->bus), pci_bus_num(pdev->bus), + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), + cap_id, offset, overlapping_cap, i); + return -EINVAL; + } + } } config = pdev->config + offset; @@ -175,6 +175,14 @@ static void hotplug_event_notify(PCIDevice *dev) } } +static void hotplug_event_clear(PCIDevice *dev) +{ + hotplug_event_update_event_status(dev); + if (!msix_enabled(dev) && !msi_enabled(dev) && !dev->exp.hpev_notified) { + qemu_set_irq(dev->irq[dev->exp.hpev_intx], 0); + } +} + /* * A PCI Express Hot-Plug Event has occurred, so update slot status register * and notify OS of the event if necessary. @@ -320,6 +328,10 @@ void pcie_cap_slot_write_config(PCIDevice *dev, uint8_t *exp_cap = dev->config + pos; uint16_t sltsta = pci_get_word(exp_cap + PCI_EXP_SLTSTA); + if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) { + hotplug_event_clear(dev); + } + if (!ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) { return; } diff --git a/hw/pcie_aer.c b/hw/pcie_aer.c index 2ae65ec807..62c06eafd6 100644 --- a/hw/pcie_aer.c +++ b/hw/pcie_aer.c @@ -415,7 +415,7 @@ static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err) int i; assert(err->status); - assert(err->status & (err->status - 1)); + assert(!(err->status & (err->status - 1))); errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP); errcap |= PCI_ERR_CAP_FEP(first_bit); @@ -495,7 +495,7 @@ static int pcie_aer_record_error(PCIDevice *dev, int fep = PCI_ERR_CAP_FEP(errcap); assert(err->status); - assert(err->status & (err->status - 1)); + assert(!(err->status & (err->status - 1))); if (errcap & PCI_ERR_CAP_MHRE && (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1U << fep))) { @@ -979,20 +979,21 @@ int do_pcie_aer_inejct_error(Monitor *mon, if (pcie_aer_parse_error_string(error_name, &error_status, &correctable)) { char *e = NULL; error_status = strtoul(error_name, &e, 0); - correctable = !!qdict_get_int(qdict, "correctable"); + correctable = qdict_get_try_bool(qdict, "correctable", 0); if (!e || *e != '\0') { monitor_printf(mon, "invalid error status value. \"%s\"", error_name); return -EINVAL; } } + err.status = error_status; err.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn; err.flags = 0; if (correctable) { err.flags |= PCIE_AER_ERR_IS_CORRECTABLE; } - if (qdict_get_int(qdict, "advisory_non_fatal")) { + if (qdict_get_try_bool(qdict, "advisory_non_fatal", 0)) { err.flags |= PCIE_AER_ERR_MAYBE_ADVISORY; } if (qdict_haskey(qdict, "header0")) { diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index d94b1eb53c..3cc830ff95 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -57,6 +57,7 @@ typedef struct SCSIDiskReq { struct iovec iov; QEMUIOVector qiov; uint32_t status; + BlockAcctCookie acct; } SCSIDiskReq; struct SCSIDiskState @@ -107,10 +108,13 @@ static void scsi_cancel_io(SCSIRequest *req) static void scsi_read_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); int n; r->req.aiocb = NULL; + bdrv_acct_done(s->bs, &r->acct); + if (ret) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_READ)) { return; @@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *req) r->iov.iov_len = n * 512; qemu_iovec_init_external(&r->qiov, &r->iov, 1); + + bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n, scsi_read_complete, r); if (r->req.aiocb == NULL) { @@ -207,11 +213,14 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type) static void scsi_write_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); uint32_t len; uint32_t n; r->req.aiocb = NULL; + bdrv_acct_done(s->bs, &r->acct); + if (ret) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_WRITE)) { return; @@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest *req) n = r->iov.iov_len / 512; if (n) { qemu_iovec_init_external(&r->qiov, &r->iov, 1); + + bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE); r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n, scsi_write_complete, r); if (r->req.aiocb == NULL) { @@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) buflen = 8; break; case SYNCHRONIZE_CACHE: + { + BlockAcctCookie acct; + + bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH); ret = bdrv_flush(s->bs); + bdrv_acct_done(s->bs, &acct); if (ret < 0) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) { return -1; } } break; + } case GET_CONFIGURATION: memset(outbuf, 0, 8); /* ??? This should probably return much more information. For now diff --git a/hw/sh_pci.c b/hw/sh_pci.c index 76061bb756..36f39300d5 100644 --- a/hw/sh_pci.c +++ b/hw/sh_pci.c @@ -150,7 +150,7 @@ static int sh_pci_init_device(SysBusDevice *dev) PCI_DEVFN(0, 0), 4); memory_region_init_io(&s->memconfig_p4, &sh_pci_reg_ops, s, "sh_pci", 0x224); - memory_region_init_alias(&s->memconfig_a7, "sh_pci.2", &s->memconfig_a7, + memory_region_init_alias(&s->memconfig_a7, "sh_pci.2", &s->memconfig_p4, 0, 0x224); isa_mmio_setup(&s->isa, 0x40000); sysbus_init_mmio_cb2(dev, sh_pci_map, sh_pci_unmap); diff --git a/hw/sysbus.c b/hw/sysbus.c index f39768b6a2..c365d39d24 100644 --- a/hw/sysbus.c +++ b/hw/sysbus.c @@ -131,6 +131,11 @@ void sysbus_init_mmio_region(SysBusDevice *dev, MemoryRegion *memory) dev->mmio[n].memory = memory; } +MemoryRegion *sysbus_mmio_get_region(SysBusDevice *dev, int n) +{ + return dev->mmio[n].memory; +} + void sysbus_init_ioports(SysBusDevice *dev, pio_addr_t ioport, pio_addr_t size) { pio_addr_t i; diff --git a/hw/sysbus.h b/hw/sysbus.h index b87c6c5aab..aa3d383277 100644 --- a/hw/sysbus.h +++ b/hw/sysbus.h @@ -50,6 +50,7 @@ void sysbus_init_mmio(SysBusDevice *dev, target_phys_addr_t size, void sysbus_init_mmio_cb2(SysBusDevice *dev, mmio_mapfunc cb, mmio_mapfunc unmap); void sysbus_init_mmio_region(SysBusDevice *dev, MemoryRegion *memory); +MemoryRegion *sysbus_mmio_get_region(SysBusDevice *dev, int n); void sysbus_init_irq(SysBusDevice *dev, qemu_irq *p); void sysbus_pass_irq(SysBusDevice *dev, SysBusDevice *target); void sysbus_init_ioports(SysBusDevice *dev, pio_addr_t ioport, pio_addr_t size); diff --git a/hw/tusb6010.c b/hw/tusb6010.c index b2bf35934d..de6ffc6133 100644 --- a/hw/tusb6010.c +++ b/hw/tusb6010.c @@ -23,9 +23,11 @@ #include "usb.h" #include "omap.h" #include "irq.h" -#include "tusb6010.h" +#include "devices.h" +#include "sysbus.h" -struct TUSBState { +typedef struct TUSBState { + SysBusDevice busdev; MemoryRegion iomem[2]; qemu_irq irq; MUSBState *musb; @@ -59,7 +61,7 @@ struct TUSBState { uint32_t pullup[2]; uint32_t control_config; uint32_t otg_timer_val; -}; +} TUSBState; #define TUSB_DEVCLOCK 60000000 /* 60 MHz */ @@ -234,16 +236,6 @@ struct TUSBState { #define TUSB_EP_CONFIG_XFR_SIZE(v) ((v) & 0x7fffffff) #define TUSB_PROD_TEST_RESET_VAL 0xa596 -MemoryRegion *tusb6010_sync_io(TUSBState *s) -{ - return &s->iomem[0]; -} - -MemoryRegion *tusb6010_async_io(TUSBState *s) -{ - return &s->iomem[1]; -} - static void tusb_intr_update(TUSBState *s) { if (s->control_config & TUSB_INT_CTRL_CONF_INT_POLARITY) @@ -723,9 +715,33 @@ static void tusb_musb_core_intr(void *opaque, int source, int level) } } -TUSBState *tusb6010_init(qemu_irq intr) +static void tusb6010_power(TUSBState *s, int on) { - TUSBState *s = g_malloc0(sizeof(*s)); + if (!on) { + s->power = 0; + } else if (!s->power && on) { + s->power = 1; + /* Pull the interrupt down after TUSB6010 comes up. */ + s->intr_ok = 0; + tusb_intr_update(s); + qemu_mod_timer(s->pwr_timer, + qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 2); + } +} + +static void tusb6010_irq(void *opaque, int source, int level) +{ + if (source) { + tusb_musb_core_intr(opaque, source - 1, level); + } else { + tusb6010_power(opaque, level); + } +} + +static void tusb6010_reset(DeviceState *dev) +{ + TUSBState *s = FROM_SYSBUS(TUSBState, sysbus_from_qdev(dev)); + int i; s->test_reset = TUSB_PROD_TEST_RESET_VAL; s->host_mode = 0; @@ -735,28 +751,59 @@ TUSBState *tusb6010_init(qemu_irq intr) s->mask = 0xffffffff; s->intr = 0x00000000; s->otg_timer_val = 0; - memory_region_init_io(&s->iomem[1], &tusb_async_ops, s, "tusb-async", - UINT32_MAX); - s->irq = intr; + s->scratch = 0; + s->prcm_config = 0; + s->prcm_mngmt = 0; + s->intr_ok = 0; + s->usbip_intr = 0; + s->usbip_mask = 0; + s->gpio_intr = 0; + s->gpio_mask = 0; + s->gpio_config = 0; + s->dma_intr = 0; + s->dma_mask = 0; + s->dma_map = 0; + s->dma_config = 0; + s->ep0_config = 0; + s->wkup_mask = 0; + s->pullup[0] = s->pullup[1] = 0; + s->control_config = 0; + for (i = 0; i < 15; i++) { + s->rx_config[i] = s->tx_config[i] = 0; + } +} + +static int tusb6010_init(SysBusDevice *dev) +{ + TUSBState *s = FROM_SYSBUS(TUSBState, dev); + qemu_irq *musb_irqs; + int i; s->otg_timer = qemu_new_timer_ns(vm_clock, tusb_otg_tick, s); s->pwr_timer = qemu_new_timer_ns(vm_clock, tusb_power_tick, s); - s->musb = musb_init(qemu_allocate_irqs(tusb_musb_core_intr, s, - __musb_irq_max)); - - return s; + memory_region_init_io(&s->iomem[1], &tusb_async_ops, s, "tusb-async", + UINT32_MAX); + sysbus_init_mmio_region(dev, &s->iomem[0]); + sysbus_init_mmio_region(dev, &s->iomem[1]); + sysbus_init_irq(dev, &s->irq); + qdev_init_gpio_in(&dev->qdev, tusb6010_irq, __musb_irq_max + 1); + musb_irqs = g_new0(qemu_irq, __musb_irq_max); + for (i = 0; i < __musb_irq_max; i++) { + musb_irqs[i] = qdev_get_gpio_in(&dev->qdev, i + 1); + } + s->musb = musb_init(musb_irqs); + return 0; } -void tusb6010_power(TUSBState *s, int on) -{ - if (!on) - s->power = 0; - else if (!s->power && on) { - s->power = 1; +static SysBusDeviceInfo tusb6010_info = { + .init = tusb6010_init, + .qdev.name = "tusb6010", + .qdev.size = sizeof(TUSBState), + .qdev.reset = tusb6010_reset, +}; - /* Pull the interrupt down after TUSB6010 comes up. */ - s->intr_ok = 0; - tusb_intr_update(s); - qemu_mod_timer(s->pwr_timer, - qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 2); - } +static void tusb6010_register_device(void) +{ + sysbus_register_withprop(&tusb6010_info); } + +device_init(tusb6010_register_device) diff --git a/hw/tusb6010.h b/hw/tusb6010.h deleted file mode 100644 index b85ee86215..0000000000 --- a/hw/tusb6010.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * tusb6010 interfaces - * - * Copyright 2011 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Avi Kivity <avi@redhat.com> - * - * Derived from hw/devices.h. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#ifndef TUSB6010_H -#define TUSB6010_H - -#include "targphys.h" -#include "memory.h" - -typedef struct TUSBState TUSBState; -TUSBState *tusb6010_init(qemu_irq intr); -MemoryRegion *tusb6010_sync_io(TUSBState *s); -MemoryRegion *tusb6010_async_io(TUSBState *s); -void tusb6010_power(TUSBState *s, int on); - -#endif diff --git a/hw/vhost.c b/hw/vhost.c index 18860678ba..0870cb7d85 100644 --- a/hw/vhost.c +++ b/hw/vhost.c @@ -515,11 +515,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, }; struct VirtQueue *vvq = virtio_get_queue(vdev, idx); - if (!vdev->binding->set_host_notifier) { - fprintf(stderr, "binding does not support host notifiers\n"); - return -ENOSYS; - } - vq->num = state.num = virtio_queue_get_num(vdev, idx); r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); if (r) { @@ -567,12 +562,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, r = -errno; goto fail_alloc; } - r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, true); - if (r < 0) { - fprintf(stderr, "Error binding host notifier: %d\n", -r); - goto fail_host_notifier; - } - file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); if (r) { @@ -591,8 +580,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, fail_call: fail_kick: - vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false); -fail_host_notifier: fail_alloc: cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), 0, 0); @@ -618,12 +605,6 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev, .index = idx, }; int r; - r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false); - if (r < 0) { - fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r); - fflush(stderr); - } - assert (r >= 0); r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state); if (r < 0) { fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); @@ -697,6 +678,60 @@ bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev) hdev->force; } +/* Stop processing guest IO notifications in qemu. + * Start processing them in vhost in kernel. + */ +int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i, r; + if (!vdev->binding->set_host_notifier) { + fprintf(stderr, "binding does not support host notifiers\n"); + r = -ENOSYS; + goto fail; + } + + for (i = 0; i < hdev->nvqs; ++i) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, true); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r); + goto fail_vq; + } + } + + return 0; +fail_vq: + while (--i >= 0) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r); + fflush(stderr); + } + assert (r >= 0); + } +fail: + return r; +} + +/* Stop processing guest IO notifications in vhost. + * Start processing them in qemu. + * This might actually run the qemu handlers right away, + * so virtio in qemu must be completely setup when this is called. + */ +void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int i, r; + + for (i = 0; i < hdev->nvqs; ++i) { + r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false); + if (r < 0) { + fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r); + fflush(stderr); + } + assert (r >= 0); + } +} + +/* Host notifiers must be enabled at this point. */ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) { int i, r; @@ -762,6 +797,7 @@ fail: return r; } +/* Host notifiers must be enabled at this point. */ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) { int i, r; diff --git a/hw/vhost.h b/hw/vhost.h index c8c595a147..c9452f0732 100644 --- a/hw/vhost.h +++ b/hw/vhost.h @@ -46,5 +46,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev); bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev); int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev); +int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev); +void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev); #endif diff --git a/hw/vhost_net.c b/hw/vhost_net.c index a55981200d..950a6b8d99 100644 --- a/hw/vhost_net.c +++ b/hw/vhost_net.c @@ -139,16 +139,22 @@ int vhost_net_start(struct vhost_net *net, { struct vhost_vring_file file = { }; int r; + + net->dev.nvqs = 2; + net->dev.vqs = net->vqs; + + r = vhost_dev_enable_notifiers(&net->dev, dev); + if (r < 0) { + goto fail_notifiers; + } if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr_mrg_rxbuf)); } - net->dev.nvqs = 2; - net->dev.vqs = net->vqs; r = vhost_dev_start(&net->dev, dev); if (r < 0) { - return r; + goto fail_start; } net->vc->info->poll(net->vc, false); @@ -173,6 +179,9 @@ fail: if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr)); } +fail_start: + vhost_dev_disable_notifiers(&net->dev, dev); +fail_notifiers: return r; } @@ -190,6 +199,7 @@ void vhost_net_stop(struct vhost_net *net, if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr)); } + vhost_dev_disable_notifiers(&net->dev, dev); } void vhost_net_cleanup(struct vhost_net *net) diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index dad8c0a6a2..2a8ccd0aa9 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq struct virtio_scsi_inhdr *scsi; QEMUIOVector qiov; struct VirtIOBlockReq *next; + BlockAcctCookie acct; } VirtIOBlockReq; static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) @@ -58,8 +59,6 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) stb_p(&req->in->status, status); virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in)); virtio_notify(&s->vdev, s->vq); - - g_free(req); } static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, @@ -81,6 +80,8 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, vm_stop(VMSTOP_DISKFULL); } else { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + bdrv_acct_done(s->bs, &req->acct); + g_free(req); bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read); } @@ -100,6 +101,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret) } virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); } static void virtio_blk_flush_complete(void *opaque, int ret) @@ -113,6 +116,8 @@ static void virtio_blk_flush_complete(void *opaque, int ret) } virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); } static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s) @@ -155,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) */ if (req->elem.out_num < 2 || req->elem.in_num < 3) { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + g_free(req); return; } @@ -163,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) */ if (req->elem.out_num > 2 && req->elem.in_num > 3) { virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); + g_free(req); return; } @@ -229,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) stl_p(&req->scsi->data_len, hdr.dxfer_len); virtio_blk_req_complete(req, status); + g_free(req); } #else static void virtio_blk_handle_scsi(VirtIOBlockReq *req) { virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); + g_free(req); } #endif /* __linux__ */ @@ -266,6 +275,8 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) { BlockDriverAIOCB *acb; + bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH); + /* * Make sure all outstanding writes are posted to the backing device. */ @@ -284,6 +295,8 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) sector = ldq_p(&req->out->sector); + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE); + trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512); if (sector & req->dev->sector_mask) { @@ -317,6 +330,8 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req) sector = ldq_p(&req->out->sector); + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ); + if (sector & req->dev->sector_mask) { virtio_blk_rw_complete(req, -EIO); return; @@ -370,6 +385,7 @@ static void virtio_blk_handle_request(VirtIOBlockReq *req, s->serial ? s->serial : "", MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES)); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + g_free(req); } else if (type & VIRTIO_BLK_T_OUT) { qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1], req->elem.out_num - 1); diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 31f91514f2..bd5c66916b 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -79,6 +79,7 @@ struct ioreq { struct XenBlkDev *blkdev; QLIST_ENTRY(ioreq) list; + BlockAcctCookie acct; }; struct XenBlkDev { @@ -401,6 +402,7 @@ static void qemu_aio_complete(void *opaque, int ret) ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY; ioreq_unmap(ioreq); ioreq_finish(ioreq); + bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct); qemu_bh_schedule(ioreq->blkdev->bh); } @@ -419,6 +421,7 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) switch (ioreq->req.operation) { case BLKIF_OP_READ: + bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ); ioreq->aio_inflight++; bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, @@ -429,6 +432,8 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) if (!ioreq->req.nr_segments) { break; } + + bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE); ioreq->aio_inflight++; bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, diff --git a/iohandler.c b/iohandler.c index 4deae1e6ab..5ef66fb6e8 100644 --- a/iohandler.c +++ b/iohandler.c @@ -80,12 +80,67 @@ int qemu_set_fd_handler2(int fd, return 0; } +typedef struct IOTrampoline +{ + GIOChannel *chan; + IOHandler *fd_read; + IOHandler *fd_write; + void *opaque; + guint tag; +} IOTrampoline; + +static gboolean fd_trampoline(GIOChannel *chan, GIOCondition cond, gpointer opaque) +{ + IOTrampoline *tramp = opaque; + + if (tramp->opaque == NULL) { + return FALSE; + } + + if ((cond & G_IO_IN) && tramp->fd_read) { + tramp->fd_read(tramp->opaque); + } + + if ((cond & G_IO_OUT) && tramp->fd_write) { + tramp->fd_write(tramp->opaque); + } + + return TRUE; +} + int qemu_set_fd_handler(int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque) { - return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque); + static IOTrampoline fd_trampolines[FD_SETSIZE]; + IOTrampoline *tramp = &fd_trampolines[fd]; + + if (tramp->tag != 0) { + g_io_channel_unref(tramp->chan); + g_source_remove(tramp->tag); + } + + if (opaque) { + GIOCondition cond = 0; + + tramp->fd_read = fd_read; + tramp->fd_write = fd_write; + tramp->opaque = opaque; + + if (fd_read) { + cond |= G_IO_IN | G_IO_ERR; + } + + if (fd_write) { + cond |= G_IO_OUT | G_IO_ERR; + } + + tramp->chan = g_io_channel_unix_new(fd); + tramp->tag = g_io_add_watch(tramp->chan, cond, fd_trampoline, tramp); + } + + return 0; } void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds) @@ -304,7 +304,7 @@ static void as_memory_range_add(AddressSpace *as, FlatRange *fr) } if (!fr->readable) { - phys_offset &= TARGET_PAGE_MASK; + phys_offset &= ~TARGET_PAGE_MASK & ~IO_MEM_ROMD; } cpu_register_physical_memory_log(fr->addr.start, @@ -962,11 +962,14 @@ void memory_region_init_alias(MemoryRegion *mr, void memory_region_init_rom_device(MemoryRegion *mr, const MemoryRegionOps *ops, + void *opaque, DeviceState *dev, const char *name, uint64_t size) { memory_region_init(mr, name, size); + mr->ops = ops; + mr->opaque = opaque; mr->terminates = true; mr->destructor = memory_region_destructor_rom_device; mr->ram_addr = qemu_ram_alloc(dev, name, size); @@ -1060,7 +1063,7 @@ void *memory_region_get_ram_ptr(MemoryRegion *mr) assert(mr->terminates); - return qemu_get_ram_ptr(mr->ram_addr); + return qemu_get_ram_ptr(mr->ram_addr & TARGET_PAGE_MASK); } static void memory_region_update_coalesced_range(MemoryRegion *mr) @@ -235,6 +235,7 @@ void memory_region_init_alias(MemoryRegion *mr, */ void memory_region_init_rom_device(MemoryRegion *mr, const MemoryRegionOps *ops, + void *opaque, DeviceState *dev, /* FIXME: layering violation */ const char *name, uint64_t size); @@ -1189,7 +1189,6 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d { const char *protocol = qdict_get_str(qdict, "protocol"); const char *fdname = qdict_get_str(qdict, "fdname"); - int skipauth = qdict_get_try_bool(qdict, "skipauth", 0); CharDriverState *s; if (strcmp(protocol, "spice") == 0) { @@ -1203,6 +1202,7 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d #ifdef CONFIG_VNC } else if (strcmp(protocol, "vnc") == 0) { int fd = monitor_get_fd(mon, fdname); + int skipauth = qdict_get_try_bool(qdict, "skipauth", 0); vnc_display_add_client(NULL, fd, skipauth); return 0; #endif diff --git a/posix-aio-compat.c b/posix-aio-compat.c index babb0940dd..3193dbf83c 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -30,6 +30,7 @@ #include "block/raw-posix-aio.h" +static void do_spawn_thread(void); struct qemu_paiocb { BlockDriverAIOCB common; @@ -64,6 +65,9 @@ static pthread_attr_t attr; static int max_threads = 64; static int cur_threads = 0; static int idle_threads = 0; +static int new_threads = 0; /* backlog of threads we need to create */ +static int pending_threads = 0; /* threads created but not running yet */ +static QEMUBH *new_thread_bh; static QTAILQ_HEAD(, qemu_paiocb) request_list; #ifdef CONFIG_PREADV @@ -311,6 +315,11 @@ static void *aio_thread(void *unused) pid = getpid(); + mutex_lock(&lock); + pending_threads--; + mutex_unlock(&lock); + do_spawn_thread(); + while (1) { struct qemu_paiocb *aiocb; ssize_t ret = 0; @@ -381,11 +390,20 @@ static void *aio_thread(void *unused) return NULL; } -static void spawn_thread(void) +static void do_spawn_thread(void) { sigset_t set, oldset; - cur_threads++; + mutex_lock(&lock); + if (!new_threads) { + mutex_unlock(&lock); + return; + } + + new_threads--; + pending_threads++; + + mutex_unlock(&lock); /* block all signals */ if (sigfillset(&set)) die("sigfillset"); @@ -396,6 +414,27 @@ static void spawn_thread(void) if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore"); } +static void spawn_thread_bh_fn(void *opaque) +{ + do_spawn_thread(); +} + +static void spawn_thread(void) +{ + cur_threads++; + new_threads++; + /* If there are threads being created, they will spawn new workers, so + * we don't spend time creating many threads in a loop holding a mutex or + * starving the current vcpu. + * + * If there are no idle threads, ask the main thread to create one, so we + * inherit the correct affinity instead of the vcpu affinity. + */ + if (!pending_threads) { + qemu_bh_schedule(new_thread_bh); + } +} + static void qemu_paio_submit(struct qemu_paiocb *aiocb) { aiocb->ret = -EINPROGRESS; @@ -665,6 +704,7 @@ int paio_init(void) die2(ret, "pthread_attr_setdetachstate"); QTAILQ_INIT(&request_list); + new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL); posix_aio_state = s; return 0; diff --git a/qemu-config.c b/qemu-config.c index 1eb6b9a709..139e07768b 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -55,7 +55,8 @@ static QemuOptsList qemu_drive_opts = { },{ .name = "cache", .type = QEMU_OPT_STRING, - .help = "host cache usage (none, writeback, writethrough, unsafe)", + .help = "host cache usage (none, writeback, writethrough, " + "directsync, unsafe)", },{ .name = "aio", .type = QEMU_OPT_STRING, diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index a80f437c59..2a385a3bb8 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -115,3 +115,47 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) trace_qemu_co_mutex_unlock_return(mutex, self); } + +void qemu_co_rwlock_init(CoRwlock *lock) +{ + memset(lock, 0, sizeof(*lock)); + qemu_co_queue_init(&lock->queue); +} + +void qemu_co_rwlock_rdlock(CoRwlock *lock) +{ + while (lock->writer) { + qemu_co_queue_wait(&lock->queue); + } + lock->reader++; +} + +void qemu_co_rwlock_unlock(CoRwlock *lock) +{ + assert(qemu_in_coroutine()); + if (lock->writer) { + lock->writer = false; + while (!qemu_co_queue_empty(&lock->queue)) { + /* + * Wakeup every body. This will include some + * writers too. + */ + qemu_co_queue_next(&lock->queue); + } + } else { + lock->reader--; + assert(lock->reader >= 0); + /* Wakeup only one waiting writer */ + if (!lock->reader) { + qemu_co_queue_next(&lock->queue); + } + } +} + +void qemu_co_rwlock_wrlock(CoRwlock *lock) +{ + while (lock->writer || lock->reader) { + qemu_co_queue_wait(&lock->queue); + } + lock->writer = true; +} diff --git a/qemu-coroutine.h b/qemu-coroutine.h index 2f2fd95552..b8fc4f4332 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -156,4 +156,36 @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); */ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); +typedef struct CoRwlock { + bool writer; + int reader; + CoQueue queue; +} CoRwlock; + +/** + * Initialises a CoRwlock. This must be called before any other operation + * is used on the CoRwlock + */ +void qemu_co_rwlock_init(CoRwlock *lock); + +/** + * Read locks the CoRwlock. If the lock cannot be taken immediately because + * of a parallel writer, control is transferred to the caller of the current + * coroutine. + */ +void qemu_co_rwlock_rdlock(CoRwlock *lock); + +/** + * Write Locks the mutex. If the lock cannot be taken immediately because + * of a parallel reader, control is transferred to the caller of the current + * coroutine. + */ +void qemu_co_rwlock_wrlock(CoRwlock *lock); + +/** + * Unlocks the read/write lock and schedules the next coroutine that was + * waiting for this lock to be run. + */ +void qemu_co_rwlock_unlock(CoRwlock *lock); + #endif /* QEMU_COROUTINE_H */ diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 1299e83ef2..4be00a5edd 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -28,9 +28,9 @@ STEXI ETEXI DEF("convert", img_convert, - "convert [-c] [-p] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] filename [filename2 [...]] output_filename") + "convert [-c] [-p] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] [-S sparse_size] filename [filename2 [...]] output_filename") STEXI -@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] @var{filename} [@var{filename2} [...]] @var{output_filename} +@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename} ETEXI DEF("info", img_info, diff --git a/qemu-img.c b/qemu-img.c index 95f3219571..6a3973163f 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -66,7 +66,8 @@ static void help(void) " 'filename' is a disk image filename\n" " 'fmt' is the disk image format. It is guessed automatically in most cases\n" " 'cache' is the cache mode used to write the output disk image, the valid\n" - " options are: 'none', 'writeback' (default), 'writethrough' and 'unsafe'\n" + " options are: 'none', 'writeback' (default), 'writethrough', 'directsync'\n" + " and 'unsafe'\n" " 'size' is the disk image size in bytes. Optional suffixes\n" " 'k' or 'K' (kilobyte, 1024), 'M' (megabyte, 1024k), 'G' (gigabyte, 1024M)\n" " and T (terabyte, 1024G) are supported. 'b' is ignored.\n" @@ -81,6 +82,8 @@ static void help(void) " rebasing in this case (useful for renaming the backing file)\n" " '-h' with or without a command shows this help and lists the supported formats\n" " '-p' show progress of command (only certain commands)\n" + " '-S' indicates the consecutive number of bytes that must contain only zeros\n" + " for qemu-img to create a sparse image during conversion\n" "\n" "Parameters to snapshot subcommand:\n" " 'snapshot' is the name of the snapshot to create, apply or delete\n" @@ -183,27 +186,6 @@ static int read_password(char *buf, int buf_size) } #endif -static int set_cache_flag(const char *mode, int *flags) -{ - *flags &= ~BDRV_O_CACHE_MASK; - - if (!strcmp(mode, "none") || !strcmp(mode, "off")) { - *flags |= BDRV_O_CACHE_WB; - *flags |= BDRV_O_NOCACHE; - } else if (!strcmp(mode, "writeback")) { - *flags |= BDRV_O_CACHE_WB; - } else if (!strcmp(mode, "unsafe")) { - *flags |= BDRV_O_CACHE_WB; - *flags |= BDRV_O_NO_FLUSH; - } else if (!strcmp(mode, "writethrough")) { - /* this is the default */ - } else { - return -1; - } - - return 0; -} - static int print_block_option_help(const char *filename, const char *fmt) { BlockDriver *drv, *proto_drv; @@ -495,7 +477,7 @@ static int img_commit(int argc, char **argv) filename = argv[optind++]; flags = BDRV_O_RDWR; - ret = set_cache_flag(cache, &flags); + ret = bdrv_parse_cache_flags(cache, &flags); if (ret < 0) { error_report("Invalid cache option: %s", cache); return -1; @@ -591,6 +573,48 @@ static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum) } /* + * Like is_allocated_sectors, but if the buffer starts with a used sector, + * up to 'min' consecutive sectors containing zeros are ignored. This avoids + * breaking up write requests for only small sparse areas. + */ +static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum, + int min) +{ + int ret; + int num_checked, num_used; + + if (n < min) { + min = n; + } + + ret = is_allocated_sectors(buf, n, pnum); + if (!ret) { + return ret; + } + + num_used = *pnum; + buf += BDRV_SECTOR_SIZE * *pnum; + n -= *pnum; + num_checked = num_used; + + while (n > 0) { + ret = is_allocated_sectors(buf, n, pnum); + + buf += BDRV_SECTOR_SIZE * *pnum; + n -= *pnum; + num_checked += *pnum; + if (ret) { + num_used = num_checked; + } else if (*pnum >= min) { + break; + } + } + + *pnum = num_used; + return 1; +} + +/* * Compares two buffers sector by sector. Returns 0 if the first sector of both * buffers matches, non-zero otherwise. * @@ -640,6 +664,7 @@ static int img_convert(int argc, char **argv) char *options = NULL; const char *snapshot_name = NULL; float local_progress; + int min_sparse = 8; /* Need at least 4k of zeros for sparse detection */ fmt = NULL; out_fmt = "raw"; @@ -647,7 +672,7 @@ static int img_convert(int argc, char **argv) out_baseimg = NULL; compress = 0; for(;;) { - c = getopt(argc, argv, "f:O:B:s:hce6o:pt:"); + c = getopt(argc, argv, "f:O:B:s:hce6o:pS:t:"); if (c == -1) { break; } @@ -682,6 +707,18 @@ static int img_convert(int argc, char **argv) case 's': snapshot_name = optarg; break; + case 'S': + { + int64_t sval; + sval = strtosz_suffix(optarg, NULL, STRTOSZ_DEFSUFFIX_B); + if (sval < 0) { + error_report("Invalid minimum zero buffer size for sparse output specified"); + return 1; + } + + min_sparse = sval / BDRV_SECTOR_SIZE; + break; + } case 'p': progress = 1; break; @@ -819,7 +856,7 @@ static int img_convert(int argc, char **argv) } flags = BDRV_O_RDWR; - ret = set_cache_flag(cache, &flags); + ret = bdrv_parse_cache_flags(cache, &flags); if (ret < 0) { error_report("Invalid cache option: %s", cache); return -1; @@ -834,7 +871,7 @@ static int img_convert(int argc, char **argv) bs_i = 0; bs_offset = 0; bdrv_get_geometry(bs[0], &bs_sectors); - buf = g_malloc(IO_BUF_SIZE); + buf = qemu_blockalign(out_bs, IO_BUF_SIZE); if (compress) { ret = bdrv_get_info(out_bs, &bdi); @@ -890,7 +927,8 @@ static int img_convert(int argc, char **argv) ret = bdrv_read(bs[bs_i], bs_num, buf2, nlow); if (ret < 0) { - error_report("error while reading"); + error_report("error while reading sector %" PRId64 ": %s", + bs_num, strerror(-ret)); goto out; } @@ -908,8 +946,8 @@ static int img_convert(int argc, char **argv) ret = bdrv_write_compressed(out_bs, sector_num, buf, cluster_sectors); if (ret != 0) { - error_report("error while compressing sector %" PRId64, - sector_num); + error_report("error while compressing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); goto out; } } @@ -972,7 +1010,8 @@ static int img_convert(int argc, char **argv) ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n); if (ret < 0) { - error_report("error while reading"); + error_report("error while reading sector %" PRId64 ": %s", + sector_num - bs_offset, strerror(-ret)); goto out; } /* NOTE: at the same time we convert, we do not write zero @@ -988,10 +1027,11 @@ static int img_convert(int argc, char **argv) sectors that are entirely 0, since whatever data was already there is garbage, not 0s. */ if (!has_zero_init || out_baseimg || - is_allocated_sectors(buf1, n, &n1)) { + is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { ret = bdrv_write(out_bs, sector_num, buf1, n1); if (ret < 0) { - error_report("error while writing"); + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); goto out; } } @@ -1006,7 +1046,7 @@ out: qemu_progress_end(); free_option_parameters(create_options); free_option_parameters(param); - g_free(buf); + qemu_vfree(buf); if (out_bs) { bdrv_delete(out_bs); } @@ -1291,7 +1331,7 @@ static int img_rebase(int argc, char **argv) qemu_progress_print(0, 100); flags = BDRV_O_RDWR | (unsafe ? BDRV_O_NO_BACKING : 0); - ret = set_cache_flag(cache, &flags); + ret = bdrv_parse_cache_flags(cache, &flags); if (ret < 0) { error_report("Invalid cache option: %s", cache); return -1; @@ -1373,8 +1413,8 @@ static int img_rebase(int argc, char **argv) uint8_t * buf_new; float local_progress; - buf_old = g_malloc(IO_BUF_SIZE); - buf_new = g_malloc(IO_BUF_SIZE); + buf_old = qemu_blockalign(bs, IO_BUF_SIZE); + buf_new = qemu_blockalign(bs, IO_BUF_SIZE); bdrv_get_geometry(bs, &num_sectors); @@ -1430,8 +1470,8 @@ static int img_rebase(int argc, char **argv) qemu_progress_print(local_progress, 100); } - g_free(buf_old); - g_free(buf_new); + qemu_vfree(buf_old); + qemu_vfree(buf_new); } /* diff --git a/qemu-img.texi b/qemu-img.texi index 495a1b6695..70fa321dff 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -40,6 +40,11 @@ indicates that target image must be compressed (qcow format only) with or without a command shows help and lists the supported formats @item -p display progress bar (convert and rebase commands only) +@item -S @var{size} +indicates the consecutive number of bytes that must contain only zeros +for qemu-img to create a sparse image during conversion. This value is rounded +down to the nearest 512 bytes. You may use the common size suffixes like +@code{k} for kilobytes. @end table Parameters to snapshot subcommand: @@ -86,7 +91,7 @@ it doesn't need to be specified separately in this case. Commit the changes recorded in @var{filename} in its base image. -@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] @var{filename} [@var{filename2} [...]] @var{output_filename} +@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename} Convert the disk image @var{filename} or a snapshot @var{snapshot_name} to disk image @var{output_filename} using format @var{output_fmt}. It can be optionally compressed (@code{-c} diff --git a/qemu-options.hx b/qemu-options.hx index d86815dc04..35d95d1d89 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -133,7 +133,7 @@ ETEXI DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" " [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n" - " [,cache=writethrough|writeback|none|unsafe][,format=f]\n" + " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" " [,readonly=on|off]\n" " use 'file' as a drive image\n", QEMU_ARCH_ALL) @@ -164,7 +164,7 @@ These options have the same definition as they have in @option{-hdachs}. @item snapshot=@var{snapshot} @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}). @item cache=@var{cache} -@var{cache} is "none", "writeback", "unsafe", or "writethrough" and controls how the host cache is used to access block data. +@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" and controls how the host cache is used to access block data. @item aio=@var{aio} @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO. @item format=@var{format} @@ -199,6 +199,10 @@ The host page cache can be avoided entirely with @option{cache=none}. This will attempt to do disk IO directly to the guests memory. QEMU may still perform an internal copy of the data. +The host page cache can be avoided while only sending write notifications to +the guest when the data has been reported as written by the storage subsystem +using @option{cache=directsync}. + Some block drivers perform badly with @option{cache=writethrough}, most notably, qcow2. If performance is more important than correctness, @option{cache=writeback} should be used with qcow2. diff --git a/qmp-commands.hx b/qmp-commands.hx index 03f67da198..27cc66ebc9 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1201,6 +1201,10 @@ Each json-object contain the following: - "wr_bytes": bytes written (json-int) - "rd_operations": read operations (json-int) - "wr_operations": write operations (json-int) + - "flush_operations": cache flush operations (json-int) + - "wr_total_time_ns": total time spend on writes in nano-seconds (json-int) + - "rd_total_time_ns": total time spend on reads in nano-seconds (json-int) + - "flush_total_time_ns": total time spend on cache flushes in nano-seconds (json-int) - "wr_highest_offset": Highest offset of a sector written since the BlockDriverState has been opened (json-int) - "parent": Contains recursively the statistics of the underlying @@ -1222,6 +1226,10 @@ Example: "wr_operations":751, "rd_bytes":122567168, "rd_operations":36772 + "wr_total_times_ns":313253456 + "rd_total_times_ns":3465673657 + "flush_total_times_ns":49653 + "flush_operations":61, } }, "stats":{ @@ -1230,6 +1238,10 @@ Example: "wr_operations":692, "rd_bytes":122739200, "rd_operations":36604 + "flush_operations":51, + "wr_total_times_ns":313253456 + "rd_total_times_ns":3465673657 + "flush_total_times_ns":49653 } }, { @@ -1240,6 +1252,10 @@ Example: "wr_operations":0, "rd_bytes":0, "rd_operations":0 + "flush_operations":0, + "wr_total_times_ns":0 + "rd_total_times_ns":0 + "flush_total_times_ns":0 } }, { @@ -1250,6 +1266,10 @@ Example: "wr_operations":0, "rd_bytes":0, "rd_operations":0 + "flush_operations":0, + "wr_total_times_ns":0 + "rd_total_times_ns":0 + "flush_total_times_ns":0 } }, { @@ -1260,6 +1280,10 @@ Example: "wr_operations":0, "rd_bytes":0, "rd_operations":0 + "flush_operations":0, + "wr_total_times_ns":0 + "rd_total_times_ns":0 + "flush_total_times_ns":0 } } ] @@ -31,7 +31,7 @@ endif %.o: %.m $(call quiet-command,$(CC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<," OBJC $(TARGET_DIR)$@") -LINK = $(call quiet-command,$(CC) $(QEMU_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(1) $(LIBS)," LINK $(TARGET_DIR)$@") +LINK = $(call quiet-command,$(CC) $(QEMU_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(sort $(1)) $(LIBS)," LINK $(TARGET_DIR)$@") %$(EXESUF): %.o $(call LINK,$^) diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index 4462647ef1..87cc1177f6 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -525,14 +525,14 @@ static void *qemu_st_helpers[4] = { static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) { - int addr_reg, data_reg, data_reg2, r0, r1, rbase, mem_index, s_bits, bswap; + int addr_reg, data_reg, data_reg2, r0, r1, rbase, bswap; #ifdef CONFIG_SOFTMMU - int r2; + int mem_index, s_bits, r2; void *label1_ptr, *label2_ptr; -#endif #if TARGET_LONG_BITS == 64 int addr_reg2; #endif +#endif data_reg = *args++; if (opc == 3) @@ -540,13 +540,13 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) else data_reg2 = 0; addr_reg = *args++; + +#ifdef CONFIG_SOFTMMU #if TARGET_LONG_BITS == 64 addr_reg2 = *args++; #endif mem_index = *args; s_bits = opc & 3; - -#ifdef CONFIG_SOFTMMU r0 = 3; r1 = 4; r2 = 0; @@ -722,14 +722,14 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) { - int addr_reg, r0, r1, data_reg, data_reg2, mem_index, bswap, rbase; + int addr_reg, r0, r1, data_reg, data_reg2, bswap, rbase; #ifdef CONFIG_SOFTMMU - int r2, ir; + int mem_index, r2, ir; void *label1_ptr, *label2_ptr; -#endif #if TARGET_LONG_BITS == 64 int addr_reg2; #endif +#endif data_reg = *args++; if (opc == 3) @@ -737,12 +737,12 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) else data_reg2 = 0; addr_reg = *args++; + +#ifdef CONFIG_SOFTMMU #if TARGET_LONG_BITS == 64 addr_reg2 = *args++; #endif mem_index = *args; - -#ifdef CONFIG_SOFTMMU r0 = 3; r1 = 4; r2 = 0; @@ -111,6 +111,8 @@ int main(int argc, char **argv) #define main qemu_main #endif /* CONFIG_COCOA */ +#include <glib.h> + #include "hw/hw.h" #include "hw/boards.h" #include "hw/usb.h" @@ -1321,6 +1323,75 @@ void qemu_system_vmstop_request(int reason) qemu_notify_event(); } +static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */ +static int n_poll_fds; +static int max_priority; + +static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds, + fd_set *xfds, struct timeval *tv) +{ + GMainContext *context = g_main_context_default(); + int i; + int timeout = 0, cur_timeout; + + g_main_context_prepare(context, &max_priority); + + n_poll_fds = g_main_context_query(context, max_priority, &timeout, + poll_fds, ARRAY_SIZE(poll_fds)); + g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds)); + + for (i = 0; i < n_poll_fds; i++) { + GPollFD *p = &poll_fds[i]; + + if ((p->events & G_IO_IN)) { + FD_SET(p->fd, rfds); + *max_fd = MAX(*max_fd, p->fd); + } + if ((p->events & G_IO_OUT)) { + FD_SET(p->fd, wfds); + *max_fd = MAX(*max_fd, p->fd); + } + if ((p->events & G_IO_ERR)) { + FD_SET(p->fd, xfds); + *max_fd = MAX(*max_fd, p->fd); + } + } + + cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000); + if (timeout >= 0 && timeout < cur_timeout) { + tv->tv_sec = timeout / 1000; + tv->tv_usec = (timeout % 1000) * 1000; + } +} + +static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds, + bool err) +{ + GMainContext *context = g_main_context_default(); + + if (!err) { + int i; + + for (i = 0; i < n_poll_fds; i++) { + GPollFD *p = &poll_fds[i]; + + if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) { + p->revents |= G_IO_IN; + } + if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) { + p->revents |= G_IO_OUT; + } + if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) { + p->revents |= G_IO_ERR; + } + } + } + + if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) { + g_main_context_dispatch(context); + } +} + int main_loop_wait(int nonblocking) { fd_set rfds, wfds, xfds; @@ -1346,8 +1417,10 @@ int main_loop_wait(int nonblocking) FD_ZERO(&rfds); FD_ZERO(&wfds); FD_ZERO(&xfds); + qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds); slirp_select_fill(&nfds, &rfds, &wfds, &xfds); + glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv); if (timeout > 0) { qemu_mutex_unlock_iothread(); @@ -1361,6 +1434,7 @@ int main_loop_wait(int nonblocking) qemu_iohandler_poll(&rfds, &wfds, &xfds, ret); slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0)); + glib_select_poll(&rfds, &wfds, &xfds, (ret < 0)); qemu_run_all_timers(); |