54 files changed, 1984 insertions, 1017 deletions
diff --git a/block.c b/block.c
index a8a013a6d2..03a21d88de 100644
--- a/block.c
+++ b/block.c
@@ -437,6 +437,33 @@ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
     return 0;
 }
 
+/**
+ * Set open flags for a given cache mode
+ *
+ * Return 0 on success, -1 if the cache mode was invalid.
+ */
+int bdrv_parse_cache_flags(const char *mode, int *flags)
+{
+    *flags &= ~BDRV_O_CACHE_MASK;
+
+    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
+        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
+    } else if (!strcmp(mode, "directsync")) {
+        *flags |= BDRV_O_NOCACHE;
+    } else if (!strcmp(mode, "writeback")) {
+        *flags |= BDRV_O_CACHE_WB;
+    } else if (!strcmp(mode, "unsafe")) {
+        *flags |= BDRV_O_CACHE_WB;
+        *flags |= BDRV_O_NO_FLUSH;
+    } else if (!strcmp(mode, "writethrough")) {
+        /* this is the default */
+    } else {
+        return -1;
+    }
+
+    return 0;
+}
+
 /*
  * Common part for opening disk images and files
  */
@@ -1163,8 +1190,8 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
         return ret;
     }
 
-    /* No flush needed for cache=writethrough, it uses O_DSYNC */
-    if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) {
+    /* No flush needed for cache modes that use O_DSYNC */
+    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
         bdrv_flush(bs);
     }
 
@@ -1888,11 +1915,19 @@ static void bdrv_stats_iter(QObject *data, void *opaque)
                         " wr_bytes=%" PRId64
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
+                        " flush_operations=%" PRId64
+                        " wr_total_time_ns=%" PRId64
+                        " rd_total_time_ns=%" PRId64
+                        " flush_total_time_ns=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
-                        qdict_get_int(qdict, "wr_operations"));
+                        qdict_get_int(qdict, "wr_operations"),
+                        qdict_get_int(qdict, "flush_operations"),
+                        qdict_get_int(qdict, "wr_total_time_ns"),
+                        qdict_get_int(qdict, "rd_total_time_ns"),
+                        qdict_get_int(qdict, "flush_total_time_ns"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1910,12 +1945,22 @@ static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
                              "'wr_bytes': %" PRId64 ","
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
-                             "'wr_highest_offset': %" PRId64
+                             "'wr_highest_offset': %" PRId64 ","
+                             "'flush_operations': %" PRId64 ","
+                             "'wr_total_time_ns': %" PRId64 ","
+                             "'rd_total_time_ns': %" PRId64 ","
+                             "'flush_total_time_ns': %" PRId64
                              "} }",
-                             bs->rd_bytes, bs->wr_bytes,
-                             bs->rd_ops, bs->wr_ops,
+                             bs->nr_bytes[BDRV_ACCT_READ],
+                             bs->nr_bytes[BDRV_ACCT_WRITE],
+                             bs->nr_ops[BDRV_ACCT_READ],
+                             bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
-                             (uint64_t)BDRV_SECTOR_SIZE);
+                             (uint64_t)BDRV_SECTOR_SIZE,
+                             bs->nr_ops[BDRV_ACCT_FLUSH],
+                             bs->total_time_ns[BDRV_ACCT_WRITE],
+                             bs->total_time_ns[BDRV_ACCT_READ],
+                             bs->total_time_ns[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2229,7 +2274,6 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
     return buf;
 }
 
-
 /**************************************************************/
 /* async I/Os */
 
@@ -2238,7 +2282,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
-    BlockDriverAIOCB *ret;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
@@ -2247,16 +2290,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return NULL;
 
-    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque);
-
-    if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->rd_ops++;
-    }
-
-    return ret;
+    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+                               cb, opaque);
 }
 
 typedef struct BlockCompleteData {
@@ -2323,9 +2358,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                cb, opaque);
 
     if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->wr_ops ++;
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
@@ -3133,6 +3165,27 @@ int bdrv_in_use(BlockDriverState *bs)
     return bs->in_use;
 }
 
+void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+        enum BlockAcctType type)
+{
+    assert(type < BDRV_MAX_IOTYPE);
+
+    cookie->bytes = bytes;
+    cookie->start_time_ns = get_clock();
+    cookie->type = type;
+}
+
+void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BDRV_MAX_IOTYPE);
+
+    bs->nr_bytes[cookie->type] += cookie->bytes;
+    bs->nr_ops[cookie->type]++;
+    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
+}
+
 int bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags)
diff --git a/block.h b/block.h
index a3bfaafef0..3ac0b944eb 100644
--- a/block.h
+++ b/block.h
@@ -69,6 +69,7 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options);
 BlockDriverState *bdrv_new(const char *device_name);
 void bdrv_make_anon(BlockDriverState *bs);
 void bdrv_delete(BlockDriverState *bs);
+int bdrv_parse_cache_flags(const char *mode, int *flags);
 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags);
 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
               BlockDriver *drv);
@@ -254,6 +255,23 @@ int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 void bdrv_set_in_use(BlockDriverState *bs, int in_use);
 int bdrv_in_use(BlockDriverState *bs);
 
+enum BlockAcctType {
+    BDRV_ACCT_READ,
+    BDRV_ACCT_WRITE,
+    BDRV_ACCT_FLUSH,
+    BDRV_MAX_IOTYPE,
+};
+
+typedef struct BlockAcctCookie {
+    int64_t bytes;
+    int64_t start_time_ns;
+    enum BlockAcctType type;
+} BlockAcctCookie;
+
+void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
+        int64_t bytes, enum BlockAcctType type);
+void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie);
+
 typedef enum {
     BLKDBG_L1_UPDATE,
 
@@ -306,3 +324,4 @@ typedef enum {
 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event);
 
 #endif
+
diff --git a/block/curl.c b/block/curl.c
index 5c157bc609..f3f61cc8a1 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -229,6 +229,23 @@ static void curl_multi_do(void *arg)
             {
                 CURLState *state = NULL;
                 curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state);
+
+                /* ACBs for successful messages get completed in curl_read_cb */
+                if (msg->data.result != CURLE_OK) {
+                    int i;
+                    for (i = 0; i < CURL_NUM_ACB; i++) {
+                        CURLAIOCB *acb = state->acb[i];
+
+                        if (acb == NULL) {
+                            continue;
+                        }
+
+                        acb->common.cb(acb->common.opaque, -EIO);
+                        qemu_aio_release(acb);
+                        state->acb[i] = NULL;
+                    }
+                }
+
                 curl_clean_state(state);
                 break;
             }
@@ -277,7 +294,8 @@ static CURLState *curl_init_state(BDRVCURLState *s)
     curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1);
     curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1);
     curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg);
-    
+    curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1);
+
 #ifdef DEBUG_VERBOSE
     curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1);
 #endif
diff --git a/block/qcow.c b/block/qcow.c
index e155d3c002..c8bfecc1cb 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -159,6 +159,8 @@ static int qcow_open(BlockDriverState *bs, int flags)
             goto fail;
         bs->backing_file[len] = '\0';
     }
+
+    qemu_co_mutex_init(&s->lock);
     return 0;
 
  fail:
@@ -190,24 +192,6 @@ static int qcow_set_key(BlockDriverState *bs, const char *key)
         return -1;
     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
         return -1;
-#if 0
-    /* test */
-    {
-        uint8_t in[16];
-        uint8_t out[16];
-        uint8_t tmp[16];
-        for(i=0;i<16;i++)
-            in[i] = i;
-        AES_encrypt(in, tmp, &s->aes_encrypt_key);
-        AES_decrypt(tmp, out, &s->aes_decrypt_key);
-        for(i = 0; i < 16; i++)
-            printf(" %02x", tmp[i]);
-        printf("\n");
-        for(i = 0; i < 16; i++)
-            printf(" %02x", out[i]);
-        printf("\n");
-    }
-#endif
     return 0;
 }
 
@@ -441,296 +425,178 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
     return 0;
 }
 
-#if 0
-
-static int qcow_read(BlockDriverState *bs, int64_t sector_num,
-                     uint8_t *buf, int nb_sectors)
+static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, index_in_cluster, n;
+    int index_in_cluster;
+    int ret = 0, n;
     uint64_t cluster_offset;
+    struct iovec hd_iov;
+    QEMUIOVector hd_qiov;
+    uint8_t *buf;
+    void *orig_buf;
+
+    if (qiov->niov > 1) {
+        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+    } else {
+        orig_buf = NULL;
+        buf = (uint8_t *)qiov->iov->iov_base;
+    }
+
+    qemu_co_mutex_lock(&s->lock);
 
-    while (nb_sectors > 0) {
-        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+    while (nb_sectors != 0) {
+        /* prepare next request */
+        cluster_offset = get_cluster_offset(bs, sector_num << 9,
+                                                 0, 0, 0, 0);
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
         n = s->cluster_sectors - index_in_cluster;
-        if (n > nb_sectors)
+        if (n > nb_sectors) {
             n = nb_sectors;
+        }
+
         if (!cluster_offset) {
             if (bs->backing_hd) {
                 /* read from the base image */
-                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
-                if (ret < 0)
-                    return -1;
+                hd_iov.iov_base = (void *)buf;
+                hd_iov.iov_len = n * 512;
+                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
+                }
             } else {
+                /* Note: in this case, no need to wait */
                 memset(buf, 0, 512 * n);
             }
         } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-            if (decompress_cluster(bs, cluster_offset) < 0)
-                return -1;
-            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
+            /* add AIO support for compressed blocks ? */
+            if (decompress_cluster(bs, cluster_offset) < 0) {
+                goto fail;
+            }
+            memcpy(buf,
+                   s->cluster_cache + index_in_cluster * 512, 512 * n);
         } else {
-            ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512);
-            if (ret != n * 512)
-                return -1;
+            if ((cluster_offset & 511) != 0) {
+                goto fail;
+            }
+            hd_iov.iov_base = (void *)buf;
+            hd_iov.iov_len = n * 512;
+            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(bs->file,
+                                (cluster_offset >> 9) + index_in_cluster,
+                                n, &hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                break;
+            }
             if (s->crypt_method) {
-                encrypt_sectors(s, sector_num, buf, buf, n, 0,
+                encrypt_sectors(s, sector_num, buf, buf,
+                                n, 0,
                                 &s->aes_decrypt_key);
             }
         }
+        ret = 0;
+
         nb_sectors -= n;
         sector_num += n;
         buf += n * 512;
     }
-    return 0;
-}
-#endif
 
-typedef struct QCowAIOCB {
-    BlockDriverAIOCB common;
-    int64_t sector_num;
-    QEMUIOVector *qiov;
-    uint8_t *buf;
-    void *orig_buf;
-    int nb_sectors;
-    int n;
-    uint64_t cluster_offset;
-    uint8_t *cluster_data;
-    struct iovec hd_iov;
-    bool is_write;
-    QEMUBH *bh;
-    QEMUIOVector hd_qiov;
-    BlockDriverAIOCB *hd_aiocb;
-} QCowAIOCB;
-
-static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-    if (acb->hd_aiocb)
-        bdrv_aio_cancel(acb->hd_aiocb);
-    qemu_aio_release(acb);
-}
-
-static AIOPool qcow_aio_pool = {
-    .aiocb_size         = sizeof(QCowAIOCB),
-    .cancel             = qcow_aio_cancel,
-};
-
-static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        int is_write)
-{
-    QCowAIOCB *acb;
-
-    acb = qemu_aio_get(&qcow_aio_pool, bs, NULL, NULL);
-    if (!acb)
-        return NULL;
-    acb->hd_aiocb = NULL;
-    acb->sector_num = sector_num;
-    acb->qiov = qiov;
-    acb->is_write = is_write;
+done:
+    qemu_co_mutex_unlock(&s->lock);
 
     if (qiov->niov > 1) {
-        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
-        if (is_write)
-            qemu_iovec_to_buffer(qiov, acb->buf);
-    } else {
-        acb->buf = (uint8_t *)qiov->iov->iov_base;
+        qemu_iovec_from_buffer(qiov, orig_buf, qiov->size);
+        qemu_vfree(orig_buf);
     }
-    acb->nb_sectors = nb_sectors;
-    acb->n = 0;
-    acb->cluster_offset = 0;
-    return acb;
+
+    return ret;
+
+fail:
+    ret = -EIO;
+    goto done;
 }
 
-static int qcow_aio_read_cb(void *opaque)
+static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov)
 {
-    QCowAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
-    int ret;
+    uint64_t cluster_offset;
+    const uint8_t *src_buf;
+    int ret = 0, n;
+    uint8_t *cluster_data = NULL;
+    struct iovec hd_iov;
+    QEMUIOVector hd_qiov;
+    uint8_t *buf;
+    void *orig_buf;
 
-    acb->hd_aiocb = NULL;
+    s->cluster_cache_offset = -1; /* disable compressed cache */
 
- redo:
-    /* post process the read buffer */
-    if (!acb->cluster_offset) {
-        /* nothing to do */
-    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        /* nothing to do */
+    if (qiov->niov > 1) {
+        buf = orig_buf = qemu_blockalign(bs, qiov->size);
+        qemu_iovec_to_buffer(qiov, buf);
     } else {
-        if (s->crypt_method) {
-            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
-                            acb->n, 0,
-                            &s->aes_decrypt_key);
-        }
+        orig_buf = NULL;
+        buf = (uint8_t *)qiov->iov->iov_base;
     }
 
-    acb->nb_sectors -= acb->n;
-    acb->sector_num += acb->n;
-    acb->buf += acb->n * 512;
+    qemu_co_mutex_lock(&s->lock);
 
-    if (acb->nb_sectors == 0) {
-        /* request completed */
-        return 0;
-    }
+    while (nb_sectors != 0) {
 
-    /* prepare next AIO request */
-    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
-                                             0, 0, 0, 0);
-    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
-    acb->n = s->cluster_sectors - index_in_cluster;
-    if (acb->n > acb->nb_sectors)
-        acb->n = acb->nb_sectors;
-
-    if (!acb->cluster_offset) {
-        if (bs->backing_hd) {
-            /* read from the base image */
-            acb->hd_iov.iov_base = (void *)acb->buf;
-            acb->hd_iov.iov_len = acb->n * 512;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            qemu_co_mutex_unlock(&s->lock);
-            ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
-                                acb->n, &acb->hd_qiov);
-            qemu_co_mutex_lock(&s->lock);
-            if (ret < 0) {
-                return -EIO;
-            }
-        } else {
-            /* Note: in this case, no need to wait */
-            memset(acb->buf, 0, 512 * acb->n);
-            goto redo;
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors) {
+            n = nb_sectors;
         }
-    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        /* add AIO support for compressed blocks ? */
-        if (decompress_cluster(bs, acb->cluster_offset) < 0) {
-            return -EIO;
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+                                            index_in_cluster,
+                                            index_in_cluster + n);
+        if (!cluster_offset || (cluster_offset & 511) != 0) {
+            ret = -EIO;
+            break;
         }
-        memcpy(acb->buf,
-               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
-        goto redo;
-    } else {
-        if ((acb->cluster_offset & 511) != 0) {
-            return -EIO;
+        if (s->crypt_method) {
+            if (!cluster_data) {
+                cluster_data = g_malloc0(s->cluster_size);
+            }
+            encrypt_sectors(s, sector_num, cluster_data, buf,
+                            n, 1, &s->aes_encrypt_key);
+            src_buf = cluster_data;
+        } else {
+            src_buf = buf;
         }
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = acb->n * 512;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+
+        hd_iov.iov_base = (void *)src_buf;
+        hd_iov.iov_len = n * 512;
+        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
         qemu_co_mutex_unlock(&s->lock);
-        ret = bdrv_co_readv(bs->file,
-                            (acb->cluster_offset >> 9) + index_in_cluster,
-                            acb->n, &acb->hd_qiov);
+        ret = bdrv_co_writev(bs->file,
+                             (cluster_offset >> 9) + index_in_cluster,
+                             n, &hd_qiov);
         qemu_co_mutex_lock(&s->lock);
         if (ret < 0) {
-            return ret;
+            break;
         }
-    }
-
-    return 1;
-}
-
-static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov)
-{
-    BDRVQcowState *s = bs->opaque;
-    QCowAIOCB *acb;
-    int ret;
-
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
-
-    qemu_co_mutex_lock(&s->lock);
-    do {
-        ret = qcow_aio_read_cb(acb);
-    } while (ret > 0);
-    qemu_co_mutex_unlock(&s->lock);
+        ret = 0;
 
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
-    qemu_aio_release(acb);
-
-    return ret;
-}
-
-static int qcow_aio_write_cb(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
-    BDRVQcowState *s = bs->opaque;
-    int index_in_cluster;
-    uint64_t cluster_offset;
-    const uint8_t *src_buf;
-    int ret;
-
-    acb->hd_aiocb = NULL;
-
-    acb->nb_sectors -= acb->n;
-    acb->sector_num += acb->n;
-    acb->buf += acb->n * 512;
-
-    if (acb->nb_sectors == 0) {
-        /* request completed */
-        return 0;
-    }
-
-    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
-    acb->n = s->cluster_sectors - index_in_cluster;
-    if (acb->n > acb->nb_sectors)
-        acb->n = acb->nb_sectors;
-    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
-                                        index_in_cluster,
-                                        index_in_cluster + acb->n);
-    if (!cluster_offset || (cluster_offset & 511) != 0) {
-        return -EIO;
-    }
-    if (s->crypt_method) {
-        if (!acb->cluster_data) {
-            acb->cluster_data = g_malloc0(s->cluster_size);
-        }
-        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
-                        acb->n, 1, &s->aes_encrypt_key);
-        src_buf = acb->cluster_data;
-    } else {
-        src_buf = acb->buf;
-    }
-
-    acb->hd_iov.iov_base = (void *)src_buf;
-    acb->hd_iov.iov_len = acb->n * 512;
-    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-    qemu_co_mutex_unlock(&s->lock);
-    ret = bdrv_co_writev(bs->file,
-                         (cluster_offset >> 9) + index_in_cluster,
-                         acb->n, &acb->hd_qiov);
-    qemu_co_mutex_lock(&s->lock);
-    if (ret < 0) {
-        return ret;
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
     }
-    return 1;
-}
-
-static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov)
-{
-    BDRVQcowState *s = bs->opaque;
-    QCowAIOCB *acb;
-    int ret;
-
-    s->cluster_cache_offset = -1; /* disable compressed cache */
-
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
-
-    qemu_co_mutex_lock(&s->lock);
-    do {
-        ret = qcow_aio_write_cb(acb);
-    } while (ret > 0);
     qemu_co_mutex_unlock(&s->lock);
 
-    if (acb->qiov->niov > 1) {
-        qemu_vfree(acb->orig_buf);
+    if (qiov->niov > 1) {
+        qemu_vfree(orig_buf);
     }
-    qemu_aio_release(acb);
+    free(cluster_data);
 
     return ret;
 }
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 9269ddaefd..e06be64876 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -53,7 +53,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size)
     }
 
 #ifdef DEBUG_ALLOC2
-    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+    fprintf(stderr, "grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
 #endif
 
     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
@@ -381,10 +381,10 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
  * For a given offset of the disk image, find the cluster offset in
  * qcow2 file. The offset is stored in *cluster_offset.
  *
- * on entry, *num is the number of contiguous clusters we'd like to
+ * on entry, *num is the number of contiguous sectors we'd like to
  * access following offset.
  *
- * on exit, *num is the number of contiguous clusters we can read.
+ * on exit, *num is the number of contiguous sectors we can read.
  *
  * Return 0, if the offset is found
  * Return -errno, otherwise.
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 2a915be57a..9605367777 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -422,7 +422,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     int ret;
 
 #ifdef DEBUG_ALLOC2
-    printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
+    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
            offset, length, addend);
 #endif
     if (length < 0) {
@@ -556,7 +556,7 @@ retry:
         }
     }
 #ifdef DEBUG_ALLOC2
-    printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
+    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
             size,
             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 #endif
@@ -680,24 +680,6 @@ void qcow2_free_any_clusters(BlockDriverState *bs,
 
 
 
-void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
-    int64_t size)
-{
-    int refcount;
-    int64_t start, last, cluster_offset;
-    uint16_t *p;
-
-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + size - 1)  & ~(s->cluster_size - 1);
-    for(cluster_offset = start; cluster_offset <= last;
-        cluster_offset += s->cluster_size) {
-        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
-        refcount = be16_to_cpu(*p);
-        refcount++;
-        *p = cpu_to_be16(refcount);
-    }
-}
-
 /* update the refcounts of snapshots and the copied flag */
 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 3bd2a30d35..3e6bf8b6f3 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -303,7 +303,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     if (qcow2_write_snapshots(bs) < 0)
         goto fail;
 #ifdef DEBUG_ALLOC
-    qcow2_check_refcounts(bs);
+    {
+      BdrvCheckResult result = {0};
+      qcow2_check_refcounts(bs, &result);
+    }
 #endif
     return 0;
  fail:
@@ -353,7 +356,10 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
         goto fail;
 
 #ifdef DEBUG_ALLOC
-    qcow2_check_refcounts(bs);
+    {
+        BdrvCheckResult result = {0};
+        qcow2_check_refcounts(bs, &result);
+    }
 #endif
     return 0;
  fail:
@@ -390,7 +396,10 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
         return ret;
     }
 #ifdef DEBUG_ALLOC
-    qcow2_check_refcounts(bs);
+    {
+        BdrvCheckResult result = {0};
+        qcow2_check_refcounts(bs, &result);
+    }
 #endif
     return 0;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index bfff6cd963..b725d68b1d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -87,6 +87,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
     while (offset < end_offset) {
 
 #ifdef DEBUG_EXT
+        BDRVQcowState *s = bs->opaque;
         /* Sanity check */
         if (offset > s->cluster_size)
             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
@@ -280,7 +281,10 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     qemu_co_mutex_init(&s->lock);
 
 #ifdef DEBUG_ALLOC
-    qcow2_check_refcounts(bs);
+    {
+        BdrvCheckResult result = {0};
+        qcow2_check_refcounts(bs, &result);
+    }
 #endif
     return ret;
 
@@ -372,201 +376,127 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
     return n1;
 }
 
-typedef struct QCowAIOCB {
-    BlockDriverAIOCB common;
-    int64_t sector_num;
-    QEMUIOVector *qiov;
-    int remaining_sectors;
-    int cur_nr_sectors;	/* number of sectors in current iteration */
-    uint64_t bytes_done;
-    uint64_t cluster_offset;
-    uint8_t *cluster_data;
-    bool is_write;
-    QEMUIOVector hd_qiov;
-    QEMUBH *bh;
-    QCowL2Meta l2meta;
-    QLIST_ENTRY(QCowAIOCB) next_depend;
-} QCowAIOCB;
-
-static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-    qemu_aio_release(acb);
-}
-
-static AIOPool qcow2_aio_pool = {
-    .aiocb_size         = sizeof(QCowAIOCB),
-    .cancel             = qcow2_aio_cancel,
-};
-
-/*
- * Returns 0 when the request is completed successfully, 1 when there is still
- * a part left to do and -errno in error cases.
- */
-static int qcow2_aio_read_cb(QCowAIOCB *acb)
+static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
 {
-    BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster, n1;
     int ret;
+    int cur_nr_sectors; /* number of sectors in current iteration */
+    uint64_t cluster_offset = 0;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    uint8_t *cluster_data = NULL;
 
-    /* post process the read buffer */
-    if (!acb->cluster_offset) {
-        /* nothing to do */
-    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        /* nothing to do */
-    } else {
-        if (s->crypt_method) {
-            qcow2_encrypt_sectors(s, acb->sector_num,  acb->cluster_data,
-                acb->cluster_data, acb->cur_nr_sectors, 0, &s->aes_decrypt_key);
-            qemu_iovec_reset(&acb->hd_qiov);
-            qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
-                acb->cur_nr_sectors * 512);
-            qemu_iovec_from_buffer(&acb->hd_qiov, acb->cluster_data,
-                512 * acb->cur_nr_sectors);
-        }
-    }
+    qemu_iovec_init(&hd_qiov, qiov->niov);
 
-    acb->remaining_sectors -= acb->cur_nr_sectors;
-    acb->sector_num += acb->cur_nr_sectors;
-    acb->bytes_done += acb->cur_nr_sectors * 512;
+    qemu_co_mutex_lock(&s->lock);
 
-    if (acb->remaining_sectors == 0) {
-        /* request completed */
-        return 0;
-    }
+    while (remaining_sectors != 0) {
 
-    /* prepare next AIO request */
-    acb->cur_nr_sectors = acb->remaining_sectors;
-    if (s->crypt_method) {
-        acb->cur_nr_sectors = MIN(acb->cur_nr_sectors,
-            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
-    }
+        /* prepare next request */
+        cur_nr_sectors = remaining_sectors;
+        if (s->crypt_method) {
+            cur_nr_sectors = MIN(cur_nr_sectors,
+                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+        }
 
-    ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
-        &acb->cur_nr_sectors, &acb->cluster_offset);
-    if (ret < 0) {
-        return ret;
-    }
+        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
+            &cur_nr_sectors, &cluster_offset);
+        if (ret < 0) {
+            goto fail;
+        }
 
-    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
-
-    qemu_iovec_reset(&acb->hd_qiov);
-    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
-        acb->cur_nr_sectors * 512);
-
-    if (!acb->cluster_offset) {
-
-        if (bs->backing_hd) {
-            /* read from the base image */
-            n1 = qcow2_backing_read1(bs->backing_hd, &acb->hd_qiov,
-                acb->sector_num, acb->cur_nr_sectors);
-            if (n1 > 0) {
-                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                qemu_co_mutex_unlock(&s->lock);
-                ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
-                                    n1, &acb->hd_qiov);
-                qemu_co_mutex_lock(&s->lock);
-                if (ret < 0) {
-                    return ret;
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+
+        qemu_iovec_reset(&hd_qiov);
+        qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+            cur_nr_sectors * 512);
+
+        if (!cluster_offset) {
+
+            if (bs->backing_hd) {
+                /* read from the base image */
+                n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
+                    sector_num, cur_nr_sectors);
+                if (n1 > 0) {
+                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                    qemu_co_mutex_unlock(&s->lock);
+                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                        n1, &hd_qiov);
+                    qemu_co_mutex_lock(&s->lock);
+                    if (ret < 0) {
+                        goto fail;
+                    }
                 }
+            } else {
+                /* Note: in this case, no need to wait */
+                qemu_iovec_memset(&hd_qiov, 0, 512 * cur_nr_sectors);
+            }
+        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            /* add AIO support for compressed blocks ? */
+            ret = qcow2_decompress_cluster(bs, cluster_offset);
+            if (ret < 0) {
+                goto fail;
             }
-            return 1;
-        } else {
-            /* Note: in this case, no need to wait */
-            qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
-            return 1;
-        }
-    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        /* add AIO support for compressed blocks ? */
-        ret = qcow2_decompress_cluster(bs, acb->cluster_offset);
-        if (ret < 0) {
-            return ret;
-        }
 
-        qemu_iovec_from_buffer(&acb->hd_qiov,
-            s->cluster_cache + index_in_cluster * 512,
-            512 * acb->cur_nr_sectors);
+            qemu_iovec_from_buffer(&hd_qiov,
+                s->cluster_cache + index_in_cluster * 512,
+                512 * cur_nr_sectors);
+        } else {
+            if ((cluster_offset & 511) != 0) {
+                ret = -EIO;
+                goto fail;
+            }
 
-        return 1;
-    } else {
-        if ((acb->cluster_offset & 511) != 0) {
-            return -EIO;
-        }
+            if (s->crypt_method) {
+                /*
+                 * For encrypted images, read everything into a temporary
+                 * contiguous buffer on which the AES functions can work.
+                 */
+                if (!cluster_data) {
+                    cluster_data =
+                        g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                }
 
-        if (s->crypt_method) {
-            /*
-             * For encrypted images, read everything into a temporary
-             * contiguous buffer on which the AES functions can work.
-             */
-            if (!acb->cluster_data) {
-                acb->cluster_data =
-                    g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+                assert(cur_nr_sectors <=
+                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_add(&hd_qiov, cluster_data,
+                    512 * cur_nr_sectors);
             }
 
-            assert(acb->cur_nr_sectors <=
-                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
-            qemu_iovec_reset(&acb->hd_qiov);
-            qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
-                512 * acb->cur_nr_sectors);
+            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(bs->file,
+                                (cluster_offset >> 9) + index_in_cluster,
+                                cur_nr_sectors, &hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                goto fail;
+            }
+            if (s->crypt_method) {
+                qcow2_encrypt_sectors(s, sector_num,  cluster_data,
+                    cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                    cur_nr_sectors * 512);
+                qemu_iovec_from_buffer(&hd_qiov, cluster_data,
+                    512 * cur_nr_sectors);
+            }
         }
 
-        BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-        qemu_co_mutex_unlock(&s->lock);
-        ret = bdrv_co_readv(bs->file,
-                            (acb->cluster_offset >> 9) + index_in_cluster,
-                            acb->cur_nr_sectors, &acb->hd_qiov);
-        qemu_co_mutex_lock(&s->lock);
-        if (ret < 0) {
-            return ret;
-        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * 512;
     }
+    ret = 0;
 
-    return 1;
-}
-
-static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
-                                  QEMUIOVector *qiov, int nb_sectors,
-                                  BlockDriverCompletionFunc *cb,
-                                  void *opaque, int is_write)
-{
-    QCowAIOCB *acb;
-
-    acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque);
-    if (!acb)
-        return NULL;
-    acb->sector_num = sector_num;
-    acb->qiov = qiov;
-    acb->is_write = is_write;
-
-    qemu_iovec_init(&acb->hd_qiov, qiov->niov);
-
-    acb->bytes_done = 0;
-    acb->remaining_sectors = nb_sectors;
-    acb->cur_nr_sectors = 0;
-    acb->cluster_offset = 0;
-    acb->l2meta.nb_clusters = 0;
-    qemu_co_queue_init(&acb->l2meta.dependent_requests);
-    return acb;
-}
-
-static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov)
-{
-    BDRVQcowState *s = bs->opaque;
-    QCowAIOCB *acb;
-    int ret;
-
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0);
-
-    qemu_co_mutex_lock(&s->lock);
-    do {
-        ret = qcow2_aio_read_cb(acb);
-    } while (ret > 0);
+fail:
     qemu_co_mutex_unlock(&s->lock);
 
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    qemu_iovec_destroy(&hd_qiov);
+    g_free(cluster_data);
 
     return ret;
 }
@@ -586,104 +516,100 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
     }
 }
 
-/*
- * Returns 0 when the request is completed successfully, 1 when there is still
- * a part left to do and -errno in error cases.
- */
-static int qcow2_aio_write_cb(QCowAIOCB *acb)
+static int qcow2_co_writev(BlockDriverState *bs,
+                           int64_t sector_num,
+                           int remaining_sectors,
+                           QEMUIOVector *qiov)
 {
-    BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
     int n_end;
     int ret;
+    int cur_nr_sectors; /* number of sectors in current iteration */
+    QCowL2Meta l2meta;
+    uint64_t cluster_offset;
+    QEMUIOVector hd_qiov;
+    uint64_t bytes_done = 0;
+    uint8_t *cluster_data = NULL;
 
-    ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
+    l2meta.nb_clusters = 0;
+    qemu_co_queue_init(&l2meta.dependent_requests);
 
-    run_dependent_requests(s, &acb->l2meta);
+    qemu_iovec_init(&hd_qiov, qiov->niov);
 
-    if (ret < 0) {
-        return ret;
-    }
+    s->cluster_cache_offset = -1; /* disable compressed cache */
 
-    acb->remaining_sectors -= acb->cur_nr_sectors;
-    acb->sector_num += acb->cur_nr_sectors;
-    acb->bytes_done += acb->cur_nr_sectors * 512;
+    qemu_co_mutex_lock(&s->lock);
 
-    if (acb->remaining_sectors == 0) {
-        /* request completed */
-        return 0;
-    }
+    while (remaining_sectors != 0) {
 
-    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
-    n_end = index_in_cluster + acb->remaining_sectors;
-    if (s->crypt_method &&
-        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
-        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n_end = index_in_cluster + remaining_sectors;
+        if (s->crypt_method &&
+            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
+            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+        }
 
-    ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
-        index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta);
-    if (ret < 0) {
-        return ret;
-    }
+        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
+            index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+        if (ret < 0) {
+            goto fail;
+        }
 
-    acb->cluster_offset = acb->l2meta.cluster_offset;
-    assert((acb->cluster_offset & 511) == 0);
+        cluster_offset = l2meta.cluster_offset;
+        assert((cluster_offset & 511) == 0);
 
-    qemu_iovec_reset(&acb->hd_qiov);
-    qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
-        acb->cur_nr_sectors * 512);
+        qemu_iovec_reset(&hd_qiov);
+        qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+            cur_nr_sectors * 512);
 
-    if (s->crypt_method) {
-        if (!acb->cluster_data) {
-            acb->cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS *
-                                             s->cluster_size);
-        }
+        if (s->crypt_method) {
+            if (!cluster_data) {
+                cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS *
+                                                 s->cluster_size);
+            }
 
-        assert(acb->hd_qiov.size <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
-        qemu_iovec_to_buffer(&acb->hd_qiov, acb->cluster_data);
+            assert(hd_qiov.size <=
+                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+            qemu_iovec_to_buffer(&hd_qiov, cluster_data);
 
-        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data,
-            acb->cluster_data, acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
+            qcow2_encrypt_sectors(s, sector_num, cluster_data,
+                cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
 
-        qemu_iovec_reset(&acb->hd_qiov);
-        qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
-            acb->cur_nr_sectors * 512);
-    }
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_add(&hd_qiov, cluster_data,
+                cur_nr_sectors * 512);
+        }
 
-    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    qemu_co_mutex_unlock(&s->lock);
-    ret = bdrv_co_writev(bs->file,
-                         (acb->cluster_offset >> 9) + index_in_cluster,
-                         acb->cur_nr_sectors, &acb->hd_qiov);
-    qemu_co_mutex_lock(&s->lock);
-    if (ret < 0) {
-        return ret;
-    }
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_writev(bs->file,
+                             (cluster_offset >> 9) + index_in_cluster,
+                             cur_nr_sectors, &hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            goto fail;
+        }
 
-    return 1;
-}
+        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
 
-static int qcow2_co_writev(BlockDriverState *bs,
-                           int64_t sector_num,
-                           int nb_sectors,
-                           QEMUIOVector *qiov)
-{
-    BDRVQcowState *s = bs->opaque;
-    QCowAIOCB *acb;
-    int ret;
+        run_dependent_requests(s, &l2meta);
 
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1);
-    s->cluster_cache_offset = -1; /* disable compressed cache */
+        if (ret < 0) {
+            goto fail;
+        }
 
-    qemu_co_mutex_lock(&s->lock);
-    do {
-        ret = qcow2_aio_write_cb(acb);
-    } while (ret > 0);
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * 512;
+    }
+    ret = 0;
+
+fail:
     qemu_co_mutex_unlock(&s->lock);
 
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    qemu_iovec_destroy(&hd_qiov);
+    g_free(cluster_data);
 
     return ret;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index de23abe1a4..c8ca3bc574 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -189,8 +189,6 @@ void qcow2_free_clusters(BlockDriverState *bs,
 void qcow2_free_any_clusters(BlockDriverState *bs,
     uint64_t cluster_offset, int nb_clusters);
 
-void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
-    int64_t size);
 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend);
 
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 57b6e1aad7..c1f6e07ec1 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -274,7 +274,7 @@ struct SheepdogAIOCB {
     int ret;
     enum AIOCBState aiocb_type;
 
-    QEMUBH *bh;
+    Coroutine *coroutine;
     void (*aio_done_func)(SheepdogAIOCB *);
 
     int canceled;
@@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
     char *port;
     int fd;
 
+    CoMutex lock;
+    Coroutine *co_send;
+    Coroutine *co_recv;
+
     uint32_t aioreq_seq_num;
     QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
 } BDRVSheepdogState;
@@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
 /*
  * Sheepdog I/O handling:
  *
- * 1. In the sd_aio_readv/writev, read/write requests are added to the
- *    QEMU Bottom Halves.
- *
- * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
- *    requests to the server and link the requests to the
- *    outstanding_list in the BDRVSheepdogState.  we exits the
- *    function without waiting for receiving the response.
+ * 1. In sd_co_rw_vector, we send the I/O requests to the server and
+ *    link the requests to the outstanding_list in the
+ *    BDRVSheepdogState.  The function exits without waiting for
+ *    receiving the response.
  *
- * 3. We receive the response in aio_read_response, the fd handler to
+ * 2. We receive the response in aio_read_response, the fd handler to
  *    the sheepdog connection.  If metadata update is needed, we send
  *    the write request to the vdi object in sd_write_done, the write
- *    completion function.  The AIOCB callback is not called until all
- *    the requests belonging to the AIOCB are finished.
+ *    completion function.  We switch back to sd_co_readv/writev after
+ *    all the requests belonging to the AIOCB are finished.
  */
 
 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
 static void sd_finish_aiocb(SheepdogAIOCB *acb)
 {
     if (!acb->canceled) {
-        acb->common.cb(acb->common.opaque, acb->ret);
+        qemu_coroutine_enter(acb->coroutine, NULL);
     }
     qemu_aio_release(acb);
 }
@@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
      * Sheepdog cannot cancel the requests which are already sent to
      * the servers, so we just complete the request with -EIO here.
      */
-    acb->common.cb(acb->common.opaque, -EIO);
+    acb->ret = -EIO;
+    qemu_coroutine_enter(acb->coroutine, NULL);
     acb->canceled = 1;
 }
 
@@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
 
     acb->aio_done_func = NULL;
     acb->canceled = 0;
-    acb->bh = NULL;
+    acb->coroutine = qemu_coroutine_self();
     acb->ret = 0;
     QLIST_INIT(&acb->aioreq_head);
     return acb;
 }
 
-static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
-{
-    if (acb->bh) {
-        error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
-        return -EIO;
-    }
-
-    acb->bh = qemu_bh_new(cb, acb);
-    qemu_bh_schedule(acb->bh);
-    return 0;
-}
-
 #ifdef _WIN32
 
 struct msghdr {
@@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len,
 again:
     ret = do_send_recv(sockfd, iov, len, iov_offset, write);
     if (ret < 0) {
-        if (errno == EINTR || errno == EAGAIN) {
+        if (errno == EINTR) {
+            goto again;
+        }
+        if (errno == EAGAIN) {
+            if (qemu_in_coroutine()) {
+                qemu_coroutine_yield();
+            }
             goto again;
         }
         error_report("failed to recv a rsp, %s", strerror(errno));
@@ -793,14 +789,14 @@ static void aio_read_response(void *opaque)
     unsigned long idx;
 
     if (QLIST_EMPTY(&s->outstanding_aio_head)) {
-        return;
+        goto out;
     }
 
     /* read a header */
     ret = do_read(fd, &rsp, sizeof(rsp));
     if (ret) {
         error_report("failed to get the header, %s", strerror(errno));
-        return;
+        goto out;
     }
 
     /* find the right aio_req from the outstanding_aio list */
@@ -811,7 +807,7 @@ static void aio_read_response(void *opaque)
     }
     if (!aio_req) {
         error_report("cannot find aio_req %x", rsp.id);
-        return;
+        goto out;
     }
 
     acb = aio_req->aiocb;
@@ -847,7 +843,7 @@ static void aio_read_response(void *opaque)
                        aio_req->iov_offset);
         if (ret) {
             error_report("failed to get the data, %s", strerror(errno));
-            return;
+            goto out;
         }
         break;
     }
@@ -861,10 +857,30 @@ static void aio_read_response(void *opaque)
     if (!rest) {
         /*
          * We've finished all requests which belong to the AIOCB, so
-         * we can call the callback now.
+         * we can switch back to sd_co_readv/writev now.
          */
         acb->aio_done_func(acb);
     }
+out:
+    s->co_recv = NULL;
+}
+
+static void co_read_response(void *opaque)
+{
+    BDRVSheepdogState *s = opaque;
+
+    if (!s->co_recv) {
+        s->co_recv = qemu_coroutine_create(aio_read_response);
+    }
+
+    qemu_coroutine_enter(s->co_recv, opaque);
+}
+
+static void co_write_request(void *opaque)
+{
+    BDRVSheepdogState *s = opaque;
+
+    qemu_coroutine_enter(s->co_send, NULL);
 }
 
 static int aio_flush_request(void *opaque)
@@ -924,7 +940,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
         return -1;
     }
 
-    qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
+    qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request,
                             NULL, s);
     return fd;
 }
@@ -1091,6 +1107,10 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
     hdr.id = aio_req->id;
 
+    qemu_co_mutex_lock(&s->lock);
+    s->co_send = qemu_coroutine_self();
+    qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
+                            aio_flush_request, NULL, s);
     set_cork(s->fd, 1);
 
     /* send a header */
@@ -1109,6 +1129,9 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
     }
 
     set_cork(s->fd, 0);
+    qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
+                            aio_flush_request, NULL, s);
+    qemu_co_mutex_unlock(&s->lock);
 
     return 0;
 }
@@ -1225,6 +1248,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
 
     bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
     strncpy(s->name, vdi, sizeof(s->name));
+    qemu_co_mutex_init(&s->lock);
     g_free(buf);
     return 0;
 out:
@@ -1491,7 +1515,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
 /*
  * This function is called after writing data objects.  If we need to
  * update metadata, this sends a write request to the vdi object.
- * Otherwise, this calls the AIOCB callback.
+ * Otherwise, this switches back to sd_co_readv/writev.
  */
 static void sd_write_done(SheepdogAIOCB *acb)
 {
@@ -1587,8 +1611,11 @@ out:
  * waiting the response.  The responses are received in the
  * `aio_read_response' function which is called from the main loop as
  * a fd handler.
+ *
+ * Returns 1 when we need to wait a response, 0 when there is no sent
+ * request and -errno in error cases.
  */
-static void sd_readv_writev_bh_cb(void *p)
+static int sd_co_rw_vector(void *p)
 {
     SheepdogAIOCB *acb = p;
     int ret = 0;
@@ -1600,9 +1627,6 @@ static void sd_readv_writev_bh_cb(void *p)
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
 
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
     if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
         /*
          * In the case we open the snapshot VDI, Sheepdog creates the
@@ -1684,42 +1708,47 @@ static void sd_readv_writev_bh_cb(void *p)
     }
 out:
     if (QLIST_EMPTY(&acb->aioreq_head)) {
-        sd_finish_aiocb(acb);
+        return acb->ret;
     }
+    return 1;
 }
 
-static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
-                                       QEMUIOVector *qiov, int nb_sectors,
-                                       BlockDriverCompletionFunc *cb,
-                                       void *opaque)
+static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+                        int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;
+    int ret;
 
     if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
         /* TODO: shouldn't block here */
         if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
-            return NULL;
+            return -EIO;
         }
         bs->total_sectors = sector_num + nb_sectors;
     }
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
+    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
     acb->aio_done_func = sd_write_done;
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
-    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
-    return &acb->common;
+    ret = sd_co_rw_vector(acb);
+    if (ret <= 0) {
+        qemu_aio_release(acb);
+        return ret;
+    }
+
+    qemu_coroutine_yield();
+
+    return acb->ret;
 }
 
-static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
-                                      QEMUIOVector *qiov, int nb_sectors,
-                                      BlockDriverCompletionFunc *cb,
-                                      void *opaque)
+static int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+                       int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;
-    int i;
+    int i, ret;
 
-    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
+    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
     acb->aiocb_type = AIOCB_READ_UDATA;
     acb->aio_done_func = sd_finish_aiocb;
 
@@ -1731,8 +1760,15 @@ static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
         memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
     }
 
-    sd_schedule_bh(sd_readv_writev_bh_cb, acb);
-    return &acb->common;
+    ret = sd_co_rw_vector(acb);
+    if (ret <= 0) {
+        qemu_aio_release(acb);
+        return ret;
+    }
+
+    qemu_coroutine_yield();
+
+    return acb->ret;
 }
 
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -2062,8 +2098,8 @@ BlockDriver bdrv_sheepdog = {
     .bdrv_getlength = sd_getlength,
     .bdrv_truncate  = sd_truncate,
 
-    .bdrv_aio_readv     = sd_aio_readv,
-    .bdrv_aio_writev    = sd_aio_writev,
+    .bdrv_co_readv  = sd_co_readv,
+    .bdrv_co_writev = sd_co_writev,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
diff --git a/block_int.h b/block_int.h
index f6d02b38a7..8a72b804b2 100644
--- a/block_int.h
+++ b/block_int.h
@@ -28,6 +28,7 @@
 #include "qemu-option.h"
 #include "qemu-queue.h"
 #include "qemu-coroutine.h"
+#include "qemu-timer.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
@@ -184,10 +185,9 @@ struct BlockDriverState {
     void *sync_aiocb;
 
     /* I/O stats (display with "info blockstats"). */
-    uint64_t rd_bytes;
-    uint64_t wr_bytes;
-    uint64_t rd_ops;
-    uint64_t wr_ops;
+    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
+    uint64_t nr_ops[BDRV_MAX_IOTYPE];
+    uint64_t total_time_ns[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
diff --git a/blockdev.c b/blockdev.c
index d272659ab2..2602591bf6 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -321,18 +321,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     }
 
     if ((buf = qemu_opt_get(opts, "cache")) != NULL) {
-        if (!strcmp(buf, "off") || !strcmp(buf, "none")) {
-            bdrv_flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
-        } else if (!strcmp(buf, "writeback")) {
-            bdrv_flags |= BDRV_O_CACHE_WB;
-        } else if (!strcmp(buf, "unsafe")) {
-            bdrv_flags |= BDRV_O_CACHE_WB;
-            bdrv_flags |= BDRV_O_NO_FLUSH;
-        } else if (!strcmp(buf, "writethrough")) {
-            /* this is the default */
-        } else {
-           error_report("invalid cache option");
-           return NULL;
+        if (bdrv_parse_cache_flags(buf, &bdrv_flags) != 0) {
+            error_report("invalid cache option");
+            return NULL;
         }
     }
 
diff --git a/hw/flash.h b/hw/flash.h
index 140ae39801..270be5e127 100644
--- a/hw/flash.h
+++ b/hw/flash.h
@@ -36,12 +36,7 @@ uint32_t nand_getbuswidth(DeviceState *dev);
 #define NAND_MFR_MICRON		0x2c
 
 /* onenand.c */
-void onenand_base_update(void *opaque, target_phys_addr_t new);
-void onenand_base_unmap(void *opaque);
-void *onenand_init(BlockDriverState *bdrv,
-                uint16_t man_id, uint16_t dev_id, uint16_t ver_id,
-                int regshift, qemu_irq irq);
-void *onenand_raw_otp(void *opaque);
+void *onenand_raw_otp(DeviceState *onenand_device);
 
 /* ecc.c */
 typedef struct {
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 29521babf7..f4fa1545bd 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret)
     DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
             ncq_tfs->tag);
 
+    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
     qemu_sglist_destroy(&ncq_tfs->sglist);
     ncq_tfs->used = 0;
 }
@@ -756,6 +757,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis,
             ncq_tfs->is_read = 1;
 
             DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_READ);
             ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
                                            &ncq_tfs->sglist, ncq_tfs->lba,
                                            ncq_cb, ncq_tfs);
@@ -766,6 +771,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis,
             ncq_tfs->is_read = 0;
 
             DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_WRITE);
             ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
                                             &ncq_tfs->sglist, ncq_tfs->lba,
                                             ncq_cb, ncq_tfs);
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index e456193b2b..832539c23c 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -258,6 +258,7 @@ typedef struct NCQTransferState {
     AHCIDevice *drive;
     BlockDriverAIOCB *aiocb;
     QEMUSGList sglist;
+    BlockAcctCookie acct;
     int is_read;
     uint16_t sector_count;
     uint64_t lba;
diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index fe2fb0b806..c552320122 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -104,17 +104,20 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
     memset(buf, 0, 288);
 }
 
-static int cd_read_sector(BlockDriverState *bs, int lba, uint8_t *buf,
-                           int sector_size)
+static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
 {
     int ret;
 
     switch(sector_size) {
     case 2048:
-        ret = bdrv_read(bs, (int64_t)lba << 2, buf, 4);
+        bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
+        ret = bdrv_read(s->bs, (int64_t)lba << 2, buf, 4);
+        bdrv_acct_done(s->bs, &s->acct);
         break;
     case 2352:
-        ret = bdrv_read(bs, (int64_t)lba << 2, buf + 16, 4);
+        bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
+        ret = bdrv_read(s->bs, (int64_t)lba << 2, buf + 16, 4);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret < 0)
             return ret;
         cd_data_to_raw(buf, lba);
@@ -181,7 +184,7 @@ void ide_atapi_cmd_reply_end(IDEState *s)
     } else {
         /* see if a new sector must be read */
         if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
-            ret = cd_read_sector(s->bs, s->lba, s->io_buffer, s->cd_sector_size);
+            ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
             if (ret < 0) {
                 ide_transfer_stop(s);
                 ide_atapi_io_error(s, ret);
@@ -250,6 +253,7 @@ static void ide_atapi_cmd_reply(IDEState *s, int size, int max_size)
     s->io_buffer_index = 0;
 
     if (s->atapi_dma) {
+        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
         s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
         s->bus->dma->ops->start_dma(s->bus->dma, s,
                                    ide_atapi_cmd_read_dma_cb);
@@ -322,10 +326,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
         s->status = READY_STAT | SEEK_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
         ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
+        goto eot;
     }
 
     s->io_buffer_index = 0;
@@ -343,9 +344,11 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
 #ifdef DEBUG_AIO
     printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
 #endif
+
     s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
     s->bus->dma->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
+
     s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
                                        &s->bus->dma->qiov, n * 4,
                                        ide_atapi_cmd_read_dma_cb, s);
@@ -355,6 +358,12 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
                             ASC_MEDIUM_NOT_PRESENT);
         goto eot;
     }
+
+    return;
+eot:
+    bdrv_acct_done(s->bs, &s->acct);
+    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
+    ide_set_inactive(s);
 }
 
 /* start a CD-CDROM read command with DMA */
@@ -368,6 +377,8 @@ static void ide_atapi_cmd_read_dma(IDEState *s, int lba, int nb_sectors,
     s->io_buffer_size = 0;
     s->cd_sector_size = sector_size;
 
+    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
+
     /* XXX: check if BUSY_STAT should be set */
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->bus->dma->ops->start_dma(s->bus->dma, s,
diff --git a/hw/ide/core.c b/hw/ide/core.c
index d145b19b0c..40abc1edd2 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s)
 #endif
         if (n > s->req_nb_sectors)
             n = s->req_nb_sectors;
+
+        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
         ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret != 0) {
             if (ide_handle_rw_error(s, -ret,
                 BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
@@ -610,7 +613,10 @@ handle_rw_error:
     return;
 
 eot:
-   ide_set_inactive(s);
+    if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) {
+        bdrv_acct_done(s->bs, &s->acct);
+    }
+    ide_set_inactive(s);
 }
 
 static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
@@ -619,6 +625,20 @@ static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
     s->dma_cmd = dma_cmd;
+
+    switch (dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
 }
 
@@ -641,7 +661,10 @@ void ide_sector_write(IDEState *s)
     n = s->nsector;
     if (n > s->req_nb_sectors)
         n = s->req_nb_sectors;
+
+    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+    bdrv_acct_done(s->bs, &s->acct);
 
     if (ret != 0) {
         if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
@@ -685,6 +708,7 @@ static void ide_flush_cb(void *opaque, int ret)
         }
     }
 
+    bdrv_acct_done(s->bs, &s->acct);
     s->status = READY_STAT | SEEK_STAT;
     ide_set_irq(s->bus);
 }
@@ -698,6 +722,7 @@ void ide_flush_cache(IDEState *s)
         return;
     }
 
+    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
     acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
     if (acb == NULL) {
         ide_flush_cb(s, -EIO);
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 02e805f070..7f5ef8de1d 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -440,6 +440,7 @@ struct IDEState {
     int lba;
     int cd_sector_size;
     int atapi_dma; /* true if dma is requested for the packet cmd */
+    BlockAcctCookie acct;
     /* ATA DMA state */
     int io_buffer_size;
     QEMUSGList sg;
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index 44fb3fef60..fdf5d75082 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
         ide_atapi_io_error(s, ret);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     if (s->io_buffer_size > 0) {
@@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
         ide_atapi_cmd_ok(s);
 
     if (io->len == 0) {
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
         /* Note: media not present is the most likely case */
         ide_atapi_cmd_error(s, SENSE_NOT_READY,
                             ASC_MEDIUM_NOT_PRESENT);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
+    return;
+
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(opaque);
+    return;
 }
 
 static void pmac_ide_transfer_cb(void *opaque, int ret)
@@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
 	ide_dma_error(s);
-        io->dma_end(io);
-        return;
+        goto done;
     }
 
     sector_num = ide_get_sector(s);
@@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
     }
 
     /* end of DMA ? */
-
     if (io->len == 0) {
-        io->dma_end(io);
-	return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -163,6 +163,12 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
 
     if (!m->aiocb)
         pmac_ide_transfer_cb(io, -1);
+    return;
+done:
+    if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) {
+        bdrv_acct_done(s->bs, &s->acct);
+    }
+    io->dma_end(io);
 }
 
 static void pmac_ide_transfer(DBDMA_io *io)
@@ -172,10 +178,22 @@ static void pmac_ide_transfer(DBDMA_io *io)
 
     s->io_buffer_size = 0;
     if (s->drive_kind == IDE_CD) {
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
         pmac_ide_atapi_transfer_cb(io, 0);
         return;
     }
 
+    switch (s->dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     pmac_ide_transfer_cb(io, 0);
 }
 
diff --git a/hw/nseries.c b/hw/nseries.c
index f7aae7a59e..af287dd6dc 100644
--- a/hw/nseries.c
+++ b/hw/nseries.c
@@ -32,7 +32,7 @@
 #include "bt.h"
 #include "loader.h"
 #include "blockdev.h"
-#include "tusb6010.h"
+#include "sysbus.h"
 
 /* Nokia N8x0 support */
 struct n800_s {
@@ -49,10 +49,10 @@ struct n800_s {
     int keymap[0x80];
     DeviceState *kbd;
 
-    TUSBState *usb;
+    DeviceState *usb;
     void *retu;
     void *tahvo;
-    void *nand;
+    DeviceState *nand;
 };
 
 /* GPIO pins */
@@ -167,13 +167,21 @@ static void n8x0_nand_setup(struct n800_s *s)
     char *otp_region;
     DriveInfo *dinfo;
 
-    dinfo = drive_get(IF_MTD, 0, 0);
+    s->nand = qdev_create(NULL, "onenand");
+    qdev_prop_set_uint16(s->nand, "manufacturer_id", NAND_MFR_SAMSUNG);
     /* Either 0x40 or 0x48 are OK for the device ID */
-    s->nand = onenand_init(dinfo ? dinfo->bdrv : 0,
-                    NAND_MFR_SAMSUNG, 0x48, 0, 1,
-                    qdev_get_gpio_in(s->cpu->gpio, N8X0_ONENAND_GPIO));
-    omap_gpmc_attach(s->cpu->gpmc, N8X0_ONENAND_CS, 0, onenand_base_update,
-                    onenand_base_unmap, s->nand);
+    qdev_prop_set_uint16(s->nand, "device_id", 0x48);
+    qdev_prop_set_uint16(s->nand, "version_id", 0);
+    qdev_prop_set_int32(s->nand, "shift", 1);
+    dinfo = drive_get(IF_MTD, 0, 0);
+    if (dinfo && dinfo->bdrv) {
+        qdev_prop_set_drive_nofail(s->nand, "drive", dinfo->bdrv);
+    }
+    qdev_init_nofail(s->nand);
+    sysbus_connect_irq(sysbus_from_qdev(s->nand), 0,
+                       qdev_get_gpio_in(s->cpu->gpio, N8X0_ONENAND_GPIO));
+    omap_gpmc_attach(s->cpu->gpmc, N8X0_ONENAND_CS,
+                     sysbus_mmio_get_region(sysbus_from_qdev(s->nand), 0));
     otp_region = onenand_raw_otp(s->nand);
 
     memcpy(otp_region + 0x000, n8x0_cal_wlan_mac, sizeof(n8x0_cal_wlan_mac));
@@ -756,27 +764,21 @@ static void n8x0_uart_setup(struct n800_s *s)
     omap_uart_attach(s->cpu->uart[BT_UART], radio);
 }
 
-static void n8x0_usb_power_cb(void *opaque, int line, int level)
-{
-    struct n800_s *s = opaque;
-
-    tusb6010_power(s->usb, level);
-}
-
 static void n8x0_usb_setup(struct n800_s *s)
 {
-    qemu_irq tusb_irq = qdev_get_gpio_in(s->cpu->gpio, N8X0_TUSB_INT_GPIO);
-    qemu_irq tusb_pwr = qemu_allocate_irqs(n8x0_usb_power_cb, s, 1)[0];
-    TUSBState *tusb = tusb6010_init(tusb_irq);
-
+    SysBusDevice *dev;
+    s->usb = qdev_create(NULL, "tusb6010");
+    dev = sysbus_from_qdev(s->usb);
+    qdev_init_nofail(s->usb);
+    sysbus_connect_irq(dev, 0,
+                       qdev_get_gpio_in(s->cpu->gpio, N8X0_TUSB_INT_GPIO));
     /* Using the NOR interface */
     omap_gpmc_attach(s->cpu->gpmc, N8X0_USB_ASYNC_CS,
-                    tusb6010_async_io(tusb), NULL, NULL, tusb);
+                     sysbus_mmio_get_region(dev, 0));
     omap_gpmc_attach(s->cpu->gpmc, N8X0_USB_SYNC_CS,
-                    tusb6010_sync_io(tusb), NULL, NULL, tusb);
-
-    s->usb = tusb;
-    qdev_connect_gpio_out(s->cpu->gpio, N8X0_TUSB_ENABLE_GPIO, tusb_pwr);
+                     sysbus_mmio_get_region(dev, 1));
+    qdev_connect_gpio_out(s->cpu->gpio, N8X0_TUSB_ENABLE_GPIO,
+                          qdev_get_gpio_in(s->usb, 0)); /* tusb_pwr */
 }
 
 /* Setup done before the main bootloader starts by some early setup code
diff --git a/hw/omap.h b/hw/omap.h
index db101c61f3..d9ab006ae0 100644
--- a/hw/omap.h
+++ b/hw/omap.h
@@ -118,11 +118,12 @@ void omap_sdrc_reset(struct omap_sdrc_s *s);
 
 /* OMAP2 general purpose memory controller */
 struct omap_gpmc_s;
-struct omap_gpmc_s *omap_gpmc_init(target_phys_addr_t base, qemu_irq irq);
+struct omap_gpmc_s *omap_gpmc_init(struct omap_mpu_state_s *mpu,
+                                   target_phys_addr_t base,
+                                   qemu_irq irq, qemu_irq drq);
 void omap_gpmc_reset(struct omap_gpmc_s *s);
-void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem,
-                void (*base_upd)(void *opaque, target_phys_addr_t new),
-                void (*unmap)(void *opaque), void *opaque);
+void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem);
+void omap_gpmc_attach_nand(struct omap_gpmc_s *s, int cs, DeviceState *nand);
 
 /*
  * Common IRQ numbers for level 1 interrupt handler
@@ -788,6 +789,7 @@ i2c_bus *omap_i2c_bus(struct omap_i2c_s *s);
 # define cpu_is_omap2420(cpu)		(cpu->mpu_model == omap2420)
 # define cpu_is_omap2430(cpu)		(cpu->mpu_model == omap2430)
 # define cpu_is_omap3430(cpu)		(cpu->mpu_model == omap3430)
+# define cpu_is_omap3630(cpu)           (cpu->mpu_model == omap3630)
 
 # define cpu_is_omap15xx(cpu)		\
         (cpu_is_omap310(cpu) || cpu_is_omap1510(cpu))
@@ -799,7 +801,8 @@ i2c_bus *omap_i2c_bus(struct omap_i2c_s *s);
 # define cpu_class_omap1(cpu)		\
         (cpu_is_omap15xx(cpu) || cpu_is_omap16xx(cpu))
 # define cpu_class_omap2(cpu)		cpu_is_omap24xx(cpu)
-# define cpu_class_omap3(cpu)		cpu_is_omap3430(cpu)
+# define cpu_class_omap3(cpu) \
+        (cpu_is_omap3430(cpu) || cpu_is_omap3630(cpu))
 
 struct omap_mpu_state_s {
     enum omap_mpu_model {
@@ -813,6 +816,7 @@ struct omap_mpu_state_s {
         omap2423,
         omap2430,
         omap3430,
+        omap3630,
     } mpu_model;
 
     CPUState *env;
diff --git a/hw/omap2.c b/hw/omap2.c
index 7e5820a97b..ca088d9f53 100644
--- a/hw/omap2.c
+++ b/hw/omap2.c
@@ -2402,7 +2402,8 @@ struct omap_mpu_state_s *omap2420_mpu_init(unsigned long sdram_size,
     sysbus_mmio_map(busdev, 4, omap_l4_region_base(ta, 5));
 
     s->sdrc = omap_sdrc_init(0x68009000);
-    s->gpmc = omap_gpmc_init(0x6800a000, s->irq[0][OMAP_INT_24XX_GPMC_IRQ]);
+    s->gpmc = omap_gpmc_init(s, 0x6800a000, s->irq[0][OMAP_INT_24XX_GPMC_IRQ],
+                             s->drq[OMAP24XX_DMA_GPMC]);
 
     dinfo = drive_get(IF_SD, 0, 0);
     if (!dinfo) {
diff --git a/hw/omap_gpmc.c b/hw/omap_gpmc.c
index 673dddd237..02f0c52107 100644
--- a/hw/omap_gpmc.c
+++ b/hw/omap_gpmc.c
@@ -27,82 +27,410 @@
 /* General-Purpose Memory Controller */
 struct omap_gpmc_s {
     qemu_irq irq;
+    qemu_irq drq;
     MemoryRegion iomem;
+    int accept_256;
 
+    uint8_t revision;
     uint8_t sysconfig;
     uint16_t irqst;
     uint16_t irqen;
+    uint16_t lastirq;
     uint16_t timeout;
     uint16_t config;
-    uint32_t prefconfig[2];
-    int prefcontrol;
-    int preffifo;
-    int prefcount;
     struct omap_gpmc_cs_file_s {
         uint32_t config[7];
-        target_phys_addr_t base;
-        size_t size;
         MemoryRegion *iomem;
         MemoryRegion container;
-        void (*base_update)(void *opaque, target_phys_addr_t new);
-        void (*unmap)(void *opaque);
-        void *opaque;
+        MemoryRegion nandiomem;
+        DeviceState *dev;
     } cs_file[8];
     int ecc_cs;
     int ecc_ptr;
     uint32_t ecc_cfg;
     ECCState ecc[9];
+    struct prefetch {
+        uint32_t config1; /* GPMC_PREFETCH_CONFIG1 */
+        uint32_t transfercount; /* GPMC_PREFETCH_CONFIG2:TRANSFERCOUNT */
+        int startengine; /* GPMC_PREFETCH_CONTROL:STARTENGINE */
+        int fifopointer; /* GPMC_PREFETCH_STATUS:FIFOPOINTER */
+        int count; /* GPMC_PREFETCH_STATUS:COUNTVALUE */
+        MemoryRegion iomem;
+        uint8_t fifo[64];
+    } prefetch;
 };
 
+#define OMAP_GPMC_8BIT 0
+#define OMAP_GPMC_16BIT 1
+#define OMAP_GPMC_NOR 0
+#define OMAP_GPMC_NAND 2
+
+static int omap_gpmc_devtype(struct omap_gpmc_cs_file_s *f)
+{
+    return (f->config[0] >> 10) & 3;
+}
+
+static int omap_gpmc_devsize(struct omap_gpmc_cs_file_s *f)
+{
+    /* devsize field is really 2 bits but we ignore the high
+     * bit to ensure consistent behaviour if the guest sets
+     * it (values 2 and 3 are reserved in the TRM)
+     */
+    return (f->config[0] >> 12) & 1;
+}
+
+/* Extract the chip-select value from the prefetch config1 register */
+static int prefetch_cs(uint32_t config1)
+{
+    return (config1 >> 24) & 7;
+}
+
+static int prefetch_threshold(uint32_t config1)
+{
+    return (config1 >> 8) & 0x7f;
+}
+
 static void omap_gpmc_int_update(struct omap_gpmc_s *s)
 {
-    qemu_set_irq(s->irq, s->irqen & s->irqst);
+    /* The TRM is a bit unclear, but it seems to say that
+     * the TERMINALCOUNTSTATUS bit is set only on the
+     * transition when the prefetch engine goes from
+     * active to inactive, whereas the FIFOEVENTSTATUS
+     * bit is held high as long as the fifo has at
+     * least THRESHOLD bytes available.
+     * So we do the latter here, but TERMINALCOUNTSTATUS
+     * is set elsewhere.
+     */
+    if (s->prefetch.fifopointer >= prefetch_threshold(s->prefetch.config1)) {
+        s->irqst |= 1;
+    }
+    if ((s->irqen & s->irqst) != s->lastirq) {
+        s->lastirq = s->irqen & s->irqst;
+        qemu_set_irq(s->irq, s->lastirq);
+    }
 }
 
-static void omap_gpmc_cs_map(struct omap_gpmc_cs_file_s *f, int base, int mask)
+static void omap_gpmc_dma_update(struct omap_gpmc_s *s, int value)
 {
-    /* TODO: check for overlapping regions and report access errors */
-    if ((mask != 0x8 && mask != 0xc && mask != 0xe && mask != 0xf) ||
-                    (base < 0 || base >= 0x40) ||
-                    (base & 0x0f & ~mask)) {
-        fprintf(stderr, "%s: wrong cs address mapping/decoding!\n",
-                        __FUNCTION__);
+    if (s->prefetch.config1 & 4) {
+        qemu_set_irq(s->drq, value);
+    }
+}
+
+/* Access functions for when a NAND-like device is mapped into memory:
+ * all addresses in the region behave like accesses to the relevant
+ * GPMC_NAND_DATA_i register (which is actually implemented to call these)
+ */
+static uint64_t omap_nand_read(void *opaque, target_phys_addr_t addr,
+                               unsigned size)
+{
+    struct omap_gpmc_cs_file_s *f = (struct omap_gpmc_cs_file_s *)opaque;
+    uint64_t v;
+    nand_setpins(f->dev, 0, 0, 0, 1, 0);
+    switch (omap_gpmc_devsize(f)) {
+    case OMAP_GPMC_8BIT:
+        v = nand_getio(f->dev);
+        if (size == 1) {
+            return v;
+        }
+        v |= (nand_getio(f->dev) << 8);
+        if (size == 2) {
+            return v;
+        }
+        v |= (nand_getio(f->dev) << 16);
+        v |= (nand_getio(f->dev) << 24);
+        return v;
+    case OMAP_GPMC_16BIT:
+        v = nand_getio(f->dev);
+        if (size == 1) {
+            /* 8 bit read from 16 bit device : probably a guest bug */
+            return v & 0xff;
+        }
+        if (size == 2) {
+            return v;
+        }
+        v |= (nand_getio(f->dev) << 16);
+        return v;
+    default:
+        abort();
+    }
+}
+
+static void omap_nand_setio(DeviceState *dev, uint64_t value,
+                            int nandsize, int size)
+{
+    /* Write the specified value to the NAND device, respecting
+     * both size of the NAND device and size of the write access.
+     */
+    switch (nandsize) {
+    case OMAP_GPMC_8BIT:
+        switch (size) {
+        case 1:
+            nand_setio(dev, value & 0xff);
+            break;
+        case 2:
+            nand_setio(dev, value & 0xff);
+            nand_setio(dev, (value >> 8) & 0xff);
+            break;
+        case 4:
+        default:
+            nand_setio(dev, value & 0xff);
+            nand_setio(dev, (value >> 8) & 0xff);
+            nand_setio(dev, (value >> 16) & 0xff);
+            nand_setio(dev, (value >> 24) & 0xff);
+            break;
+        }
+    case OMAP_GPMC_16BIT:
+        switch (size) {
+        case 1:
+            /* writing to a 16bit device with 8bit access is probably a guest
+             * bug; pass the value through anyway.
+             */
+        case 2:
+            nand_setio(dev, value & 0xffff);
+            break;
+        case 4:
+        default:
+            nand_setio(dev, value & 0xffff);
+            nand_setio(dev, (value >> 16) & 0xffff);
+            break;
+        }
+    }
+}
+
+static void omap_nand_write(void *opaque, target_phys_addr_t addr,
+                            uint64_t value, unsigned size)
+{
+    struct omap_gpmc_cs_file_s *f = (struct omap_gpmc_cs_file_s *)opaque;
+    nand_setpins(f->dev, 0, 0, 0, 1, 0);
+    omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size);
+}
+
+static const MemoryRegionOps omap_nand_ops = {
+    .read = omap_nand_read,
+    .write = omap_nand_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void fill_prefetch_fifo(struct omap_gpmc_s *s)
+{
+    /* Fill the prefetch FIFO by reading data from NAND.
+     * We do this synchronously, unlike the hardware which
+     * will do this asynchronously. We refill when the
+     * FIFO has THRESHOLD bytes free, and we always refill
+     * as much data as possible starting at the top end
+     * of the FIFO.
+     * (We have to refill at THRESHOLD rather than waiting
+     * for the FIFO to empty to allow for the case where
+     * the FIFO size isn't an exact multiple of THRESHOLD
+     * and we're doing DMA transfers.)
+     * This means we never need to handle wrap-around in
+     * the fifo-reading code, and the next byte of data
+     * to read is always fifo[63 - fifopointer].
+     */
+    int fptr;
+    int cs = prefetch_cs(s->prefetch.config1);
+    int is16bit = (((s->cs_file[cs].config[0] >> 12) & 3) != 0);
+    int bytes;
+    /* Don't believe the bit of the OMAP TRM that says that COUNTVALUE
+     * and TRANSFERCOUNT are in units of 16 bit words for 16 bit NAND.
+     * Instead believe the bit that says it is always a byte count.
+     */
+    bytes = 64 - s->prefetch.fifopointer;
+    if (bytes > s->prefetch.count) {
+        bytes = s->prefetch.count;
+    }
+    s->prefetch.count -= bytes;
+    s->prefetch.fifopointer += bytes;
+    fptr = 64 - s->prefetch.fifopointer;
+    /* Move the existing data in the FIFO so it sits just
+     * before what we're about to read in
+     */
+    while (fptr < (64 - bytes)) {
+        s->prefetch.fifo[fptr] = s->prefetch.fifo[fptr + bytes];
+        fptr++;
+    }
+    while (fptr < 64) {
+        if (is16bit) {
+            uint32_t v = omap_nand_read(&s->cs_file[cs], 0, 2);
+            s->prefetch.fifo[fptr++] = v & 0xff;
+            s->prefetch.fifo[fptr++] = (v >> 8) & 0xff;
+        } else {
+            s->prefetch.fifo[fptr++] = omap_nand_read(&s->cs_file[cs], 0, 1);
+        }
+    }
+    if (s->prefetch.startengine && (s->prefetch.count == 0)) {
+        /* This was the final transfer: raise TERMINALCOUNTSTATUS */
+        s->irqst |= 2;
+        s->prefetch.startengine = 0;
+    }
+    /* If there are any bytes in the FIFO at this point then
+     * we must raise a DMA request (either this is a final part
+     * transfer, or we filled the FIFO in which case we certainly
+     * have THRESHOLD bytes available)
+     */
+    if (s->prefetch.fifopointer != 0) {
+        omap_gpmc_dma_update(s, 1);
+    }
+    omap_gpmc_int_update(s);
+}
+
+/* Access functions for a NAND-like device when the prefetch/postwrite
+ * engine is enabled -- all addresses in the region behave alike:
+ * data is read or written to the FIFO.
+ */
+static uint64_t omap_gpmc_prefetch_read(void *opaque, target_phys_addr_t addr,
+                                        unsigned size)
+{
+    struct omap_gpmc_s *s = (struct omap_gpmc_s *) opaque;
+    uint32_t data;
+    if (s->prefetch.config1 & 1) {
+        /* The TRM doesn't define the behaviour if you read from the
+         * FIFO when the prefetch engine is in write mode. We choose
+         * to always return zero.
+         */
+        return 0;
+    }
+    /* Note that trying to read an empty fifo repeats the last byte */
+    if (s->prefetch.fifopointer) {
+        s->prefetch.fifopointer--;
+    }
+    data = s->prefetch.fifo[63 - s->prefetch.fifopointer];
+    if (s->prefetch.fifopointer ==
+        (64 - prefetch_threshold(s->prefetch.config1))) {
+        /* We've drained THRESHOLD bytes now. So deassert the
+         * DMA request, then refill the FIFO (which will probably
+         * assert it again.)
+         */
+        omap_gpmc_dma_update(s, 0);
+        fill_prefetch_fifo(s);
+    }
+    omap_gpmc_int_update(s);
+    return data;
+}
+
+static void omap_gpmc_prefetch_write(void *opaque, target_phys_addr_t addr,
+                                     uint64_t value, unsigned size)
+{
+    struct omap_gpmc_s *s = (struct omap_gpmc_s *) opaque;
+    int cs = prefetch_cs(s->prefetch.config1);
+    if ((s->prefetch.config1 & 1) == 0) {
+        /* The TRM doesn't define the behaviour of writing to the
+         * FIFO when the prefetch engine is in read mode. We
+         * choose to ignore the write.
+         */
+        return;
+    }
+    if (s->prefetch.count == 0) {
+        /* The TRM doesn't define the behaviour of writing to the
+         * FIFO if the transfer is complete. We choose to ignore.
+         */
         return;
     }
+    /* The only reason we do any data buffering in postwrite
+     * mode is if we are talking to a 16 bit NAND device, in
+     * which case we need to buffer the first byte of the
+     * 16 bit word until the other byte arrives.
+     */
+    int is16bit = (((s->cs_file[cs].config[0] >> 12) & 3) != 0);
+    if (is16bit) {
+        /* fifopointer alternates between 64 (waiting for first
+         * byte of word) and 63 (waiting for second byte)
+         */
+        if (s->prefetch.fifopointer == 64) {
+            s->prefetch.fifo[0] = value;
+            s->prefetch.fifopointer--;
+        } else {
+            value = (value << 8) | s->prefetch.fifo[0];
+            omap_nand_write(&s->cs_file[cs], 0, value, 2);
+            s->prefetch.count--;
+            s->prefetch.fifopointer = 64;
+        }
+    } else {
+        /* Just write the byte : fifopointer remains 64 at all times */
+        omap_nand_write(&s->cs_file[cs], 0, value, 1);
+        s->prefetch.count--;
+    }
+    if (s->prefetch.count == 0) {
+        /* Final transfer: raise TERMINALCOUNTSTATUS */
+        s->irqst |= 2;
+        s->prefetch.startengine = 0;
+    }
+    omap_gpmc_int_update(s);
+}
+
+static const MemoryRegionOps omap_prefetch_ops = {
+    .read = omap_gpmc_prefetch_read,
+    .write = omap_gpmc_prefetch_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl.min_access_size = 1,
+    .impl.max_access_size = 1,
+};
 
-    if (!f->opaque)
+static MemoryRegion *omap_gpmc_cs_memregion(struct omap_gpmc_s *s, int cs)
+{
+    /* Return the MemoryRegion* to map/unmap for this chipselect */
+    struct omap_gpmc_cs_file_s *f = &s->cs_file[cs];
+    if (omap_gpmc_devtype(f) == OMAP_GPMC_NOR) {
+        return f->iomem;
+    }
+    if ((s->prefetch.config1 & 0x80) &&
+        (prefetch_cs(s->prefetch.config1) == cs)) {
+        /* The prefetch engine is enabled for this CS: map the FIFO */
+        return &s->prefetch.iomem;
+    }
+    return &f->nandiomem;
+}
+
+static void omap_gpmc_cs_map(struct omap_gpmc_s *s, int cs)
+{
+    struct omap_gpmc_cs_file_s *f = &s->cs_file[cs];
+    uint32_t mask = (f->config[6] >> 8) & 0xf;
+    uint32_t base = f->config[6] & 0x3f;
+    uint32_t size;
+
+    if (!f->iomem && !f->dev) {
+        return;
+    }
+
+    if (!(f->config[6] & (1 << 6))) {
+        /* Do nothing unless CSVALID */
         return;
+    }
 
-    f->base = base << 24;
-    f->size = (0x0fffffff & ~(mask << 24)) + 1;
+    /* TODO: check for overlapping regions and report access errors */
+    if (mask != 0x8 && mask != 0xc && mask != 0xe && mask != 0xf
+         && !(s->accept_256 && !mask)) {
+        fprintf(stderr, "%s: invalid chip-select mask address (0x%x)\n",
+                 __func__, mask);
+    }
+
+    base <<= 24;
+    size = (0x0fffffff & ~(mask << 24)) + 1;
     /* TODO: rather than setting the size of the mapping (which should be
      * constant), the mask should cause wrapping of the address space, so
      * that the same memory becomes accessible at every <i>size</i> bytes
      * starting from <i>base</i>.  */
-    if (f->iomem) {
-        memory_region_init(&f->container, "omap-gpmc-file", f->size);
-        memory_region_add_subregion(&f->container, 0, f->iomem);
-        memory_region_add_subregion(get_system_memory(), f->base,
-                                    &f->container);
-    }
-
-    if (f->base_update)
-        f->base_update(f->opaque, f->base);
+    memory_region_init(&f->container, "omap-gpmc-file", size);
+    memory_region_add_subregion(&f->container, 0,
+                                omap_gpmc_cs_memregion(s, cs));
+    memory_region_add_subregion(get_system_memory(), base,
+                                &f->container);
 }
 
-static void omap_gpmc_cs_unmap(struct omap_gpmc_cs_file_s *f)
+static void omap_gpmc_cs_unmap(struct omap_gpmc_s *s, int cs)
 {
-    if (f->size) {
-        if (f->unmap)
-            f->unmap(f->opaque);
-        if (f->iomem) {
-            memory_region_del_subregion(get_system_memory(), &f->container);
-            memory_region_del_subregion(&f->container, f->iomem);
-            memory_region_destroy(&f->container);
-        }
-        f->base = 0;
-        f->size = 0;
+    struct omap_gpmc_cs_file_s *f = &s->cs_file[cs];
+    if (!(f->config[6] & (1 << 6))) {
+        /* Do nothing unless CSVALID */
+        return;
+    }
+    if (!f->iomem && !f->dev) {
+        return;
     }
+    memory_region_del_subregion(get_system_memory(), &f->container);
+    memory_region_del_subregion(&f->container, omap_gpmc_cs_memregion(s, cs));
+    memory_region_destroy(&f->container);
 }
 
 void omap_gpmc_reset(struct omap_gpmc_s *s)
@@ -115,25 +443,32 @@ void omap_gpmc_reset(struct omap_gpmc_s *s)
     omap_gpmc_int_update(s);
     s->timeout = 0;
     s->config = 0xa00;
-    s->prefconfig[0] = 0x00004000;
-    s->prefconfig[1] = 0x00000000;
-    s->prefcontrol = 0;
-    s->preffifo = 0;
-    s->prefcount = 0;
+    s->prefetch.config1 = 0x00004000;
+    s->prefetch.transfercount = 0x00000000;
+    s->prefetch.startengine = 0;
+    s->prefetch.fifopointer = 0;
+    s->prefetch.count = 0;
     for (i = 0; i < 8; i ++) {
-        if (s->cs_file[i].config[6] & (1 << 6))			/* CSVALID */
-            omap_gpmc_cs_unmap(s->cs_file + i);
-        s->cs_file[i].config[0] = i ? 1 << 12 : 0;
+        omap_gpmc_cs_unmap(s, i);
         s->cs_file[i].config[1] = 0x101001;
         s->cs_file[i].config[2] = 0x020201;
         s->cs_file[i].config[3] = 0x10031003;
         s->cs_file[i].config[4] = 0x10f1111;
         s->cs_file[i].config[5] = 0;
         s->cs_file[i].config[6] = 0xf00 | (i ? 0 : 1 << 6);
-        if (s->cs_file[i].config[6] & (1 << 6))			/* CSVALID */
-            omap_gpmc_cs_map(&s->cs_file[i],
-                            s->cs_file[i].config[6] & 0x1f,	/* MASKADDR */
-                        (s->cs_file[i].config[6] >> 8 & 0xf));	/* BASEADDR */
+
+        s->cs_file[i].config[6] = 0xf00;
+        /* In theory we could probe attached devices for some CFG1
+         * bits here, but we just retain them across resets as they
+         * were set initially by omap_gpmc_attach().
+         */
+        if (i == 0) {
+            s->cs_file[i].config[0] &= 0x00433e00;
+            s->cs_file[i].config[6] |= 1 << 6; /* CSVALID */
+            omap_gpmc_cs_map(s, i);
+        } else {
+            s->cs_file[i].config[0] &= 0x00403c00;
+        }
     }
     s->ecc_cs = 0;
     s->ecc_ptr = 0;
@@ -142,6 +477,24 @@ void omap_gpmc_reset(struct omap_gpmc_s *s)
         ecc_reset(&s->ecc[i]);
 }
 
+static int gpmc_wordaccess_only(target_phys_addr_t addr)
+{
+    /* Return true if the register offset is to a register that
+     * only permits word width accesses.
+     * Non-word accesses are only OK for GPMC_NAND_DATA/ADDRESS/COMMAND
+     * for any chipselect.
+     */
+    if (addr >= 0x60 && addr <= 0x1d4) {
+        int cs = (addr - 0x60) / 0x30;
+        addr -= cs * 0x30;
+        if (addr >= 0x7c && addr < 0x88) {
+            /* GPMC_NAND_COMMAND, GPMC_NAND_ADDRESS, GPMC_NAND_DATA */
+            return 0;
+        }
+    }
+    return 1;
+}
+
 static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr,
                                unsigned size)
 {
@@ -149,13 +502,13 @@ static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr,
     int cs;
     struct omap_gpmc_cs_file_s *f;
 
-    if (size != 4) {
+    if (size != 4 && gpmc_wordaccess_only(addr)) {
         return omap_badwidth_read32(opaque, addr);
     }
 
     switch (addr) {
     case 0x000:	/* GPMC_REVISION */
-        return 0x20;
+        return s->revision;
 
     case 0x010:	/* GPMC_SYSCONFIG */
         return s->sysconfig;
@@ -187,36 +540,39 @@ static uint64_t omap_gpmc_read(void *opaque, target_phys_addr_t addr,
         addr -= cs * 0x30;
         f = s->cs_file + cs;
         switch (addr) {
-            case 0x60:	/* GPMC_CONFIG1 */
-                return f->config[0];
-            case 0x64:	/* GPMC_CONFIG2 */
-                return f->config[1];
-            case 0x68:	/* GPMC_CONFIG3 */
-                return f->config[2];
-            case 0x6c:	/* GPMC_CONFIG4 */
-                return f->config[3];
-            case 0x70:	/* GPMC_CONFIG5 */
-                return f->config[4];
-            case 0x74:	/* GPMC_CONFIG6 */
-                return f->config[5];
-            case 0x78:	/* GPMC_CONFIG7 */
-                return f->config[6];
-            case 0x84:	/* GPMC_NAND_DATA */
-                return 0;
+        case 0x60:      /* GPMC_CONFIG1 */
+            return f->config[0];
+        case 0x64:      /* GPMC_CONFIG2 */
+            return f->config[1];
+        case 0x68:      /* GPMC_CONFIG3 */
+            return f->config[2];
+        case 0x6c:      /* GPMC_CONFIG4 */
+            return f->config[3];
+        case 0x70:      /* GPMC_CONFIG5 */
+            return f->config[4];
+        case 0x74:      /* GPMC_CONFIG6 */
+            return f->config[5];
+        case 0x78:      /* GPMC_CONFIG7 */
+            return f->config[6];
+        case 0x84 ... 0x87: /* GPMC_NAND_DATA */
+            if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) {
+                return omap_nand_read(f, 0, size);
+            }
+            return 0;
         }
         break;
 
     case 0x1e0:	/* GPMC_PREFETCH_CONFIG1 */
-        return s->prefconfig[0];
+        return s->prefetch.config1;
     case 0x1e4:	/* GPMC_PREFETCH_CONFIG2 */
-        return s->prefconfig[1];
+        return s->prefetch.transfercount;
     case 0x1ec:	/* GPMC_PREFETCH_CONTROL */
-        return s->prefcontrol;
+        return s->prefetch.startengine;
     case 0x1f0:	/* GPMC_PREFETCH_STATUS */
-        return (s->preffifo << 24) |
-                ((s->preffifo >
-                  ((s->prefconfig[0] >> 8) & 0x7f) ? 1 : 0) << 16) |
-                s->prefcount;
+        return (s->prefetch.fifopointer << 24) |
+                ((s->prefetch.fifopointer >=
+                  ((s->prefetch.config1 >> 8) & 0x7f) ? 1 : 0) << 16) |
+                s->prefetch.count;
 
     case 0x1f4:	/* GPMC_ECC_CONFIG */
         return s->ecc_cs;
@@ -251,7 +607,7 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr,
     int cs;
     struct omap_gpmc_cs_file_s *f;
 
-    if (size != 4) {
+    if (size != 4 && gpmc_wordaccess_only(addr)) {
         return omap_badwidth_write32(opaque, addr, value);
     }
 
@@ -276,7 +632,7 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr,
         break;
 
     case 0x018:	/* GPMC_IRQSTATUS */
-        s->irqen = ~value;
+        s->irqen &= ~value;
         omap_gpmc_int_update(s);
         break;
 
@@ -302,62 +658,109 @@ static void omap_gpmc_write(void *opaque, target_phys_addr_t addr,
         addr -= cs * 0x30;
         f = s->cs_file + cs;
         switch (addr) {
-            case 0x60:	/* GPMC_CONFIG1 */
-                f->config[0] = value & 0xffef3e13;
-                break;
-            case 0x64:	/* GPMC_CONFIG2 */
-                f->config[1] = value & 0x001f1f8f;
-                break;
-            case 0x68:	/* GPMC_CONFIG3 */
-                f->config[2] = value & 0x001f1f8f;
-                break;
-            case 0x6c:	/* GPMC_CONFIG4 */
-                f->config[3] = value & 0x1f8f1f8f;
-                break;
-            case 0x70:	/* GPMC_CONFIG5 */
-                f->config[4] = value & 0x0f1f1f1f;
-                break;
-            case 0x74:	/* GPMC_CONFIG6 */
-                f->config[5] = value & 0x00000fcf;
-                break;
-            case 0x78:	/* GPMC_CONFIG7 */
-                if ((f->config[6] ^ value) & 0xf7f) {
-                    if (f->config[6] & (1 << 6))		/* CSVALID */
-                        omap_gpmc_cs_unmap(f);
-                    if (value & (1 << 6))			/* CSVALID */
-                        omap_gpmc_cs_map(f, value & 0x1f,	/* MASKADDR */
-                                        (value >> 8 & 0xf));	/* BASEADDR */
-                }
+        case 0x60:      /* GPMC_CONFIG1 */
+            f->config[0] = value & 0xffef3e13;
+            break;
+        case 0x64:      /* GPMC_CONFIG2 */
+            f->config[1] = value & 0x001f1f8f;
+            break;
+        case 0x68:      /* GPMC_CONFIG3 */
+            f->config[2] = value & 0x001f1f8f;
+            break;
+        case 0x6c:      /* GPMC_CONFIG4 */
+            f->config[3] = value & 0x1f8f1f8f;
+            break;
+        case 0x70:      /* GPMC_CONFIG5 */
+            f->config[4] = value & 0x0f1f1f1f;
+            break;
+        case 0x74:      /* GPMC_CONFIG6 */
+            f->config[5] = value & 0x00000fcf;
+            break;
+        case 0x78:      /* GPMC_CONFIG7 */
+            if ((f->config[6] ^ value) & 0xf7f) {
+                omap_gpmc_cs_unmap(s, cs);
                 f->config[6] = value & 0x00000f7f;
-                break;
-            case 0x7c:	/* GPMC_NAND_COMMAND */
-            case 0x80:	/* GPMC_NAND_ADDRESS */
-            case 0x84:	/* GPMC_NAND_DATA */
-                break;
-
-            default:
-                goto bad_reg;
+                omap_gpmc_cs_map(s, cs);
+            }
+            break;
+        case 0x7c ... 0x7f: /* GPMC_NAND_COMMAND */
+            if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) {
+                nand_setpins(f->dev, 1, 0, 0, 1, 0); /* CLE */
+                omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size);
+            }
+            break;
+        case 0x80 ... 0x83: /* GPMC_NAND_ADDRESS */
+            if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) {
+                nand_setpins(f->dev, 0, 1, 0, 1, 0); /* ALE */
+                omap_nand_setio(f->dev, value, omap_gpmc_devsize(f), size);
+            }
+            break;
+        case 0x84 ... 0x87: /* GPMC_NAND_DATA */
+            if (omap_gpmc_devtype(f) == OMAP_GPMC_NAND) {
+                omap_nand_write(f, 0, value, size);
+            }
+            break;
+        default:
+            goto bad_reg;
         }
         break;
 
     case 0x1e0:	/* GPMC_PREFETCH_CONFIG1 */
-        s->prefconfig[0] = value & 0x7f8f7fbf;
-        /* TODO: update interrupts, fifos, dmas */
+        if (!s->prefetch.startengine) {
+            uint32_t oldconfig1 = s->prefetch.config1;
+            uint32_t changed;
+            s->prefetch.config1 = value & 0x7f8f7fbf;
+            changed = oldconfig1 ^ s->prefetch.config1;
+            if (changed & (0x80 | 0x7000000)) {
+                /* Turning the engine on or off, or mapping it somewhere else.
+                 * cs_map() and cs_unmap() check the prefetch config and
+                 * overall CSVALID bits, so it is sufficient to unmap-and-map
+                 * both the old cs and the new one.
+                 */
+                int oldcs = prefetch_cs(oldconfig1);
+                int newcs = prefetch_cs(s->prefetch.config1);
+                omap_gpmc_cs_unmap(s, oldcs);
+                omap_gpmc_cs_map(s, oldcs);
+                if (newcs != oldcs) {
+                    omap_gpmc_cs_unmap(s, newcs);
+                    omap_gpmc_cs_map(s, newcs);
+                }
+            }
+        }
         break;
 
     case 0x1e4:	/* GPMC_PREFETCH_CONFIG2 */
-        s->prefconfig[1] = value & 0x3fff;
+        if (!s->prefetch.startengine) {
+            s->prefetch.transfercount = value & 0x3fff;
+        }
         break;
 
     case 0x1ec:	/* GPMC_PREFETCH_CONTROL */
-        s->prefcontrol = value & 1;
-        if (s->prefcontrol) {
-            if (s->prefconfig[0] & 1)
-                s->preffifo = 0x40;
-            else
-                s->preffifo = 0x00;
+        if (s->prefetch.startengine != (value & 1)) {
+            s->prefetch.startengine = value & 1;
+            if (s->prefetch.startengine) {
+                /* Prefetch engine start */
+                s->prefetch.count = s->prefetch.transfercount;
+                if (s->prefetch.config1 & 1) {
+                    /* Write */
+                    s->prefetch.fifopointer = 64;
+                } else {
+                    /* Read */
+                    s->prefetch.fifopointer = 0;
+                    fill_prefetch_fifo(s);
+                }
+            } else {
+                /* Prefetch engine forcibly stopped. The TRM
+                 * doesn't define the behaviour if you do this.
+                 * We clear the prefetch count, which means that
+                 * we permit no more writes, and don't read any
+                 * more data from NAND. The CPU can still drain
+                 * the FIFO of unread data.
+                 */
+                s->prefetch.count = 0;
+            }
+            omap_gpmc_int_update(s);
         }
-        /* TODO: start */
         break;
 
     case 0x1f4:	/* GPMC_ECC_CONFIG */
@@ -394,24 +797,47 @@ static const MemoryRegionOps omap_gpmc_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-struct omap_gpmc_s *omap_gpmc_init(target_phys_addr_t base, qemu_irq irq)
+struct omap_gpmc_s *omap_gpmc_init(struct omap_mpu_state_s *mpu,
+                                   target_phys_addr_t base,
+                                   qemu_irq irq, qemu_irq drq)
 {
+    int cs;
     struct omap_gpmc_s *s = (struct omap_gpmc_s *)
             g_malloc0(sizeof(struct omap_gpmc_s));
 
-    omap_gpmc_reset(s);
-
     memory_region_init_io(&s->iomem, &omap_gpmc_ops, s, "omap-gpmc", 0x1000);
     memory_region_add_subregion(get_system_memory(), base, &s->iomem);
 
+    s->irq = irq;
+    s->drq = drq;
+    s->accept_256 = cpu_is_omap3630(mpu);
+    s->revision = cpu_class_omap3(mpu) ? 0x50 : 0x20;
+    s->lastirq = 0;
+    omap_gpmc_reset(s);
+
+    /* We have to register a different IO memory handler for each
+     * chip select region in case a NAND device is mapped there. We
+     * make the region the worst-case size of 256MB and rely on the
+     * container memory region in cs_map to chop it down to the actual
+     * guest-requested size.
+     */
+    for (cs = 0; cs < 8; cs++) {
+        memory_region_init_io(&s->cs_file[cs].nandiomem,
+                              &omap_nand_ops,
+                              &s->cs_file[cs],
+                              "omap-nand",
+                              256 * 1024 * 1024);
+    }
+
+    memory_region_init_io(&s->prefetch.iomem, &omap_prefetch_ops, s,
+                          "omap-gpmc-prefetch", 256 * 1024 * 1024);
     return s;
 }
 
-void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem,
-                void (*base_upd)(void *opaque, target_phys_addr_t new),
-                void (*unmap)(void *opaque), void *opaque)
+void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem)
 {
     struct omap_gpmc_cs_file_s *f;
+    assert(iomem);
 
     if (cs < 0 || cs >= 8) {
         fprintf(stderr, "%s: bad chip-select %i\n", __FUNCTION__, cs);
@@ -419,12 +845,29 @@ void omap_gpmc_attach(struct omap_gpmc_s *s, int cs, MemoryRegion *iomem,
     }
     f = &s->cs_file[cs];
 
+    omap_gpmc_cs_unmap(s, cs);
+    f->config[0] &= ~(0xf << 10);
     f->iomem = iomem;
-    f->base_update = base_upd;
-    f->unmap = unmap;
-    f->opaque = opaque;
+    omap_gpmc_cs_map(s, cs);
+}
 
-    if (f->config[6] & (1 << 6))				/* CSVALID */
-        omap_gpmc_cs_map(f, f->config[6] & 0x1f,		/* MASKADDR */
-                        (f->config[6] >> 8 & 0xf));		/* BASEADDR */
+void omap_gpmc_attach_nand(struct omap_gpmc_s *s, int cs, DeviceState *nand)
+{
+    struct omap_gpmc_cs_file_s *f;
+    assert(nand);
+
+    if (cs < 0 || cs >= 8) {
+        fprintf(stderr, "%s: bad chip-select %i\n", __func__, cs);
+        exit(-1);
+    }
+    f = &s->cs_file[cs];
+
+    omap_gpmc_cs_unmap(s, cs);
+    f->config[0] &= ~(0xf << 10);
+    f->config[0] |= (OMAP_GPMC_NAND << 10);
+    f->dev = nand;
+    if (nand_getbuswidth(f->dev) == 16) {
+        f->config[0] |= OMAP_GPMC_16BIT << 12;
+    }
+    omap_gpmc_cs_map(s, cs);
 }
diff --git a/hw/onenand.c b/hw/onenand.c
index 00276a03cb..6f68f70698 100644
--- a/hw/onenand.c
+++ b/hw/onenand.c
@@ -25,6 +25,7 @@
 #include "blockdev.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "sysbus.h"
 
 /* 11 for 2kB-page OneNAND ("2nd generation") and 10 for 1kB-page chips */
 #define PAGE_SHIFT	11
@@ -33,6 +34,7 @@
 #define BLOCK_SHIFT	(PAGE_SHIFT + 6)
 
 typedef struct {
+    SysBusDevice busdev;
     struct {
         uint16_t man;
         uint16_t dev;
@@ -49,6 +51,7 @@ typedef struct {
     uint8_t *current;
     MemoryRegion ram;
     MemoryRegion mapped_ram;
+    uint8_t current_direction;
     uint8_t *boot[2];
     uint8_t *data[2][2];
     MemoryRegion iomem;
@@ -120,27 +123,72 @@ static void onenand_mem_setup(OneNANDState *s)
                                         1);
 }
 
-void onenand_base_update(void *opaque, target_phys_addr_t new)
+static void onenand_intr_update(OneNANDState *s)
 {
-    OneNANDState *s = (OneNANDState *) opaque;
-
-    s->base = new;
-
-    memory_region_add_subregion(get_system_memory(), s->base, &s->container);
+    qemu_set_irq(s->intr, ((s->intstatus >> 15) ^ (~s->config[0] >> 6)) & 1);
 }
 
-void onenand_base_unmap(void *opaque)
+static void onenand_pre_save(void *opaque)
 {
-    OneNANDState *s = (OneNANDState *) opaque;
-
-    memory_region_del_subregion(get_system_memory(), &s->container);
+    OneNANDState *s = opaque;
+    if (s->current == s->otp) {
+        s->current_direction = 1;
+    } else if (s->current == s->image) {
+        s->current_direction = 2;
+    } else {
+        s->current_direction = 0;
+    }
 }
 
-static void onenand_intr_update(OneNANDState *s)
+static int onenand_post_load(void *opaque, int version_id)
 {
-    qemu_set_irq(s->intr, ((s->intstatus >> 15) ^ (~s->config[0] >> 6)) & 1);
+    OneNANDState *s = opaque;
+    switch (s->current_direction) {
+    case 0:
+        break;
+    case 1:
+        s->current = s->otp;
+        break;
+    case 2:
+        s->current = s->image;
+        break;
+    default:
+        return -1;
+    }
+    onenand_intr_update(s);
+    return 0;
 }
 
+static const VMStateDescription vmstate_onenand = {
+    .name = "onenand",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .pre_save = onenand_pre_save,
+    .post_load = onenand_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(current_direction, OneNANDState),
+        VMSTATE_INT32(cycle, OneNANDState),
+        VMSTATE_INT32(otpmode, OneNANDState),
+        VMSTATE_UINT16_ARRAY(addr, OneNANDState, 8),
+        VMSTATE_UINT16_ARRAY(unladdr, OneNANDState, 8),
+        VMSTATE_INT32(bufaddr, OneNANDState),
+        VMSTATE_INT32(count, OneNANDState),
+        VMSTATE_UINT16(command, OneNANDState),
+        VMSTATE_UINT16_ARRAY(config, OneNANDState, 2),
+        VMSTATE_UINT16(status, OneNANDState),
+        VMSTATE_UINT16(intstatus, OneNANDState),
+        VMSTATE_UINT16(wpstatus, OneNANDState),
+        VMSTATE_INT32(secs_cur, OneNANDState),
+        VMSTATE_PARTIAL_VBUFFER(blockwp, OneNANDState, blocks),
+        VMSTATE_UINT8(ecc.cp, OneNANDState),
+        VMSTATE_UINT16_ARRAY(ecc.lp, OneNANDState, 2),
+        VMSTATE_UINT16(ecc.count, OneNANDState),
+        VMSTATE_BUFFER_UNSAFE(otp, OneNANDState, 0, ((64 + 2) << PAGE_SHIFT)),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 /* Hot reset (Reset OneNAND command) or warm reset (RP pin low) */
 static void onenand_reset(OneNANDState *s, int cold)
 {
@@ -167,11 +215,17 @@ static void onenand_reset(OneNANDState *s, int cold)
         /* Lock the whole flash */
         memset(s->blockwp, ONEN_LOCK_LOCKED, s->blocks);
 
-        if (s->bdrv && bdrv_read(s->bdrv, 0, s->boot[0], 8) < 0)
-            hw_error("%s: Loading the BootRAM failed.\n", __FUNCTION__);
+        if (s->bdrv_cur && bdrv_read(s->bdrv_cur, 0, s->boot[0], 8) < 0) {
+            hw_error("%s: Loading the BootRAM failed.\n", __func__);
+        }
     }
 }
 
+static void onenand_system_reset(DeviceState *dev)
+{
+    onenand_reset(FROM_SYSBUS(OneNANDState, sysbus_from_qdev(dev)), 1);
+}
+
 static inline int onenand_load_main(OneNANDState *s, int sec, int secn,
                 void *dest)
 {
@@ -191,8 +245,8 @@ static inline int onenand_prog_main(OneNANDState *s, int sec, int secn,
     int result = 0;
 
     if (secn > 0) {
-        uint32_t size = (uint32_t) secn * 512;
-        const uint8_t *sp = (const uint8_t *) src;
+        uint32_t size = (uint32_t)secn * 512;
+        const uint8_t *sp = (const uint8_t *)src;
         uint8_t *dp = 0;
         if (s->bdrv_cur) {
             dp = g_malloc(size);
@@ -203,7 +257,7 @@ static inline int onenand_prog_main(OneNANDState *s, int sec, int secn,
             if (sec + secn > s->secs_cur) {
                 result = 1;
             } else {
-                dp = (uint8_t *) s->current + (sec << 9);
+                dp = (uint8_t *)s->current + (sec << 9);
             }
         }
         if (!result) {
@@ -245,13 +299,13 @@ static inline int onenand_prog_spare(OneNANDState *s, int sec, int secn,
 {
     int result = 0;
     if (secn > 0) {
-        const uint8_t *sp = (const uint8_t *) src;
+        const uint8_t *sp = (const uint8_t *)src;
         uint8_t *dp = 0, *dpp = 0;
         if (s->bdrv_cur) {
             dp = g_malloc(512);
             if (!dp || bdrv_read(s->bdrv_cur,
-                                s->secs_cur + (sec >> 5),
-                                dp, 1) < 0) {
+                                 s->secs_cur + (sec >> 5),
+                                 dp, 1) < 0) {
                 result = 1;
             } else {
                 dpp = dp + ((sec & 31) << 4);
@@ -270,7 +324,7 @@ static inline int onenand_prog_spare(OneNANDState *s, int sec, int secn,
             }
             if (s->bdrv_cur) {
                 result = bdrv_write(s->bdrv_cur, s->secs_cur + (sec >> 5),
-                                dp, 1) < 0;
+                                    dp, 1) < 0;
             }
         }
         if (dp) {
@@ -326,7 +380,7 @@ fail:
     return 1;
 }
 
-static void onenand_command(OneNANDState *s, int cmd)
+static void onenand_command(OneNANDState *s)
 {
     int b;
     int sec;
@@ -346,7 +400,7 @@ static void onenand_command(OneNANDState *s, int cmd)
             s->data[(s->bufaddr >> 2) & 1][1] : s->boot[1];	\
     buf += (s->bufaddr & 3) << 4;
 
-    switch (cmd) {
+    switch (s->command) {
     case 0x00:	/* Load single/multiple sector data unit into buffer */
         SETADDR(ONEN_BUF_BLOCK, ONEN_BUF_PAGE)
 
@@ -527,7 +581,7 @@ static void onenand_command(OneNANDState *s, int cmd)
         s->status |= ONEN_ERR_CMD;
         s->intstatus |= ONEN_INT;
         fprintf(stderr, "%s: unknown OneNAND command %x\n",
-                        __FUNCTION__, cmd);
+                        __func__, s->command);
     }
 
     onenand_intr_update(s);
@@ -659,7 +713,7 @@ static void onenand_write(void *opaque, target_phys_addr_t addr,
         if (s->intstatus & (1 << 15))
             break;
         s->command = value;
-        onenand_command(s, s->command);
+        onenand_command(s);
         break;
     case 0xf221:	/* System Configuration 1 */
         s->config[0] = value;
@@ -700,30 +754,25 @@ static const MemoryRegionOps onenand_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-void *onenand_init(BlockDriverState *bdrv,
-                uint16_t man_id, uint16_t dev_id, uint16_t ver_id,
-                int regshift, qemu_irq irq)
+static int onenand_initfn(SysBusDevice *dev)
 {
-    OneNANDState *s = (OneNANDState *) g_malloc0(sizeof(*s));
-    uint32_t size = 1 << (24 + ((dev_id >> 4) & 7));
+    OneNANDState *s = (OneNANDState *)dev;
+    uint32_t size = 1 << (24 + ((s->id.dev >> 4) & 7));
     void *ram;
-
-    s->shift = regshift;
-    s->intr = irq;
+    s->base = (target_phys_addr_t)-1;
     s->rdy = NULL;
-    s->id.man = man_id;
-    s->id.dev = dev_id;
-    s->id.ver = ver_id;
     s->blocks = size >> BLOCK_SHIFT;
     s->secs = size >> 9;
     s->blockwp = g_malloc(s->blocks);
-    s->density_mask = (dev_id & 0x08) ? (1 << (6 + ((dev_id >> 4) & 7))) : 0;
+    s->density_mask = (s->id.dev & 0x08)
+        ? (1 << (6 + ((s->id.dev >> 4) & 7))) : 0;
     memory_region_init_io(&s->iomem, &onenand_ops, s, "onenand",
                           0x10000 << s->shift);
-    s->bdrv = bdrv;
     if (!s->bdrv) {
         s->image = memset(g_malloc(size + (size >> 5)),
-                        0xff, size + (size >> 5));
+                          0xff, size + (size >> 5));
+    } else {
+        s->bdrv_cur = s->bdrv;
     }
     s->otp = memset(g_malloc((64 + 2) << PAGE_SHIFT),
                     0xff, (64 + 2) << PAGE_SHIFT);
@@ -736,15 +785,40 @@ void *onenand_init(BlockDriverState *bdrv,
     s->data[1][0] = ram + ((0x0200 + (1 << (PAGE_SHIFT - 1))) << s->shift);
     s->data[1][1] = ram + ((0x8010 + (1 << (PAGE_SHIFT - 6))) << s->shift);
     onenand_mem_setup(s);
+    sysbus_init_irq(dev, &s->intr);
+    sysbus_init_mmio_region(dev, &s->container);
+    vmstate_register(&dev->qdev,
+                     ((s->shift & 0x7f) << 24)
+                     | ((s->id.man & 0xff) << 16)
+                     | ((s->id.dev & 0xff) << 8)
+                     | (s->id.ver & 0xff),
+                     &vmstate_onenand, s);
+    return 0;
+}
 
-    onenand_reset(s, 1);
+static SysBusDeviceInfo onenand_info = {
+    .init = onenand_initfn,
+    .qdev.name = "onenand",
+    .qdev.size = sizeof(OneNANDState),
+    .qdev.reset = onenand_system_reset,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_UINT16("manufacturer_id", OneNANDState, id.man, 0),
+        DEFINE_PROP_UINT16("device_id", OneNANDState, id.dev, 0),
+        DEFINE_PROP_UINT16("version_id", OneNANDState, id.ver, 0),
+        DEFINE_PROP_INT32("shift", OneNANDState, shift, 0),
+        DEFINE_PROP_DRIVE("drive", OneNANDState, bdrv),
+        DEFINE_PROP_END_OF_LIST()
+    }
+};
 
-    return s;
+static void onenand_register_device(void)
+{
+    sysbus_register_withprop(&onenand_info);
 }
 
-void *onenand_raw_otp(void *opaque)
+void *onenand_raw_otp(DeviceState *onenand_device)
 {
-    OneNANDState *s = (OneNANDState *) opaque;
-
-    return s->otp;
+    return FROM_SYSBUS(OneNANDState, sysbus_from_qdev(onenand_device))->otp;
 }
+
+device_init(onenand_register_device)
diff --git a/hw/pci.c b/hw/pci.c
index 6124790f01..57ff7b1098 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1811,6 +1811,25 @@ static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id,
     return next;
 }
 
+static uint8_t pci_find_capability_at_offset(PCIDevice *pdev, uint8_t offset)
+{
+    uint8_t next, prev, found = 0;
+
+    if (!(pdev->used[offset])) {
+        return 0;
+    }
+
+    assert(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST);
+
+    for (prev = PCI_CAPABILITY_LIST; (next = pdev->config[prev]);
+         prev = next + PCI_CAP_LIST_NEXT) {
+        if (next <= offset && next > found) {
+            found = next;
+        }
+    }
+    return found;
+}
+
 /* Patch the PCI vendor and device ids in a PCI rom image if necessary.
    This is needed for an option rom which is used for more than one device. */
 static void pci_patch_ids(PCIDevice *pdev, uint8_t *ptr, int size)
@@ -1952,11 +1971,30 @@ int pci_add_capability(PCIDevice *pdev, uint8_t cap_id,
                        uint8_t offset, uint8_t size)
 {
     uint8_t *config;
+    int i, overlapping_cap;
+
     if (!offset) {
         offset = pci_find_space(pdev, size);
         if (!offset) {
             return -ENOSPC;
         }
+    } else {
+        /* Verify that capabilities don't overlap.  Note: device assignment
+         * depends on this check to verify that the device is not broken.
+         * Should never trigger for emulated devices, but it's helpful
+         * for debugging these. */
+        for (i = offset; i < offset + size; i++) {
+            overlapping_cap = pci_find_capability_at_offset(pdev, i);
+            if (overlapping_cap) {
+                fprintf(stderr, "ERROR: %04x:%02x:%02x.%x "
+                        "Attempt to add PCI capability %x at offset "
+                        "%x overlaps existing capability %x at offset %x\n",
+                        pci_find_domain(pdev->bus), pci_bus_num(pdev->bus),
+                        PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
+                        cap_id, offset, overlapping_cap, i);
+                return -EINVAL;
+            }
+        }
     }
 
     config = pdev->config + offset;
diff --git a/hw/pcie.c b/hw/pcie.c
index 39607bf31a..5c9eb2f0ac 100644
--- a/hw/pcie.c
+++ b/hw/pcie.c
@@ -175,6 +175,14 @@ static void hotplug_event_notify(PCIDevice *dev)
     }
 }
 
+static void hotplug_event_clear(PCIDevice *dev)
+{
+    hotplug_event_update_event_status(dev);
+    if (!msix_enabled(dev) && !msi_enabled(dev) && !dev->exp.hpev_notified) {
+        qemu_set_irq(dev->irq[dev->exp.hpev_intx], 0);
+    }
+}
+
 /*
  * A PCI Express Hot-Plug Event has occurred, so update slot status register
  * and notify OS of the event if necessary.
@@ -320,6 +328,10 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
     uint8_t *exp_cap = dev->config + pos;
     uint16_t sltsta = pci_get_word(exp_cap + PCI_EXP_SLTSTA);
 
+    if (ranges_overlap(addr, len, pos + PCI_EXP_SLTSTA, 2)) {
+        hotplug_event_clear(dev);
+    }
+
     if (!ranges_overlap(addr, len, pos + PCI_EXP_SLTCTL, 2)) {
         return;
     }
diff --git a/hw/pcie_aer.c b/hw/pcie_aer.c
index 2ae65ec807..62c06eafd6 100644
--- a/hw/pcie_aer.c
+++ b/hw/pcie_aer.c
@@ -415,7 +415,7 @@ static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err)
     int i;
 
     assert(err->status);
-    assert(err->status & (err->status - 1));
+    assert(!(err->status & (err->status - 1)));
 
     errcap &= ~(PCI_ERR_CAP_FEP_MASK | PCI_ERR_CAP_TLP);
     errcap |= PCI_ERR_CAP_FEP(first_bit);
@@ -495,7 +495,7 @@ static int pcie_aer_record_error(PCIDevice *dev,
     int fep = PCI_ERR_CAP_FEP(errcap);
 
     assert(err->status);
-    assert(err->status & (err->status - 1));
+    assert(!(err->status & (err->status - 1)));
 
     if (errcap & PCI_ERR_CAP_MHRE &&
         (pci_get_long(aer_cap + PCI_ERR_UNCOR_STATUS) & (1U << fep))) {
@@ -979,20 +979,21 @@ int do_pcie_aer_inejct_error(Monitor *mon,
     if (pcie_aer_parse_error_string(error_name, &error_status, &correctable)) {
         char *e = NULL;
         error_status = strtoul(error_name, &e, 0);
-        correctable = !!qdict_get_int(qdict, "correctable");
+        correctable = qdict_get_try_bool(qdict, "correctable", 0);
         if (!e || *e != '\0') {
             monitor_printf(mon, "invalid error status value. \"%s\"",
                            error_name);
             return -EINVAL;
         }
     }
+    err.status = error_status;
     err.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn;
 
     err.flags = 0;
     if (correctable) {
         err.flags |= PCIE_AER_ERR_IS_CORRECTABLE;
     }
-    if (qdict_get_int(qdict, "advisory_non_fatal")) {
+    if (qdict_get_try_bool(qdict, "advisory_non_fatal", 0)) {
         err.flags |= PCIE_AER_ERR_MAYBE_ADVISORY;
     }
     if (qdict_haskey(qdict, "header0")) {
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index d94b1eb53c..3cc830ff95 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -57,6 +57,7 @@ typedef struct SCSIDiskReq {
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
+    BlockAcctCookie acct;
 } SCSIDiskReq;
 
 struct SCSIDiskState
@@ -107,10 +108,13 @@ static void scsi_cancel_io(SCSIRequest *req)
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     int n;
 
     r->req.aiocb = NULL;
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     if (ret) {
         if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_READ)) {
             return;
@@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *req)
 
     r->iov.iov_len = n * 512;
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
                               scsi_read_complete, r);
     if (r->req.aiocb == NULL) {
@@ -207,11 +213,14 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type)
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t len;
     uint32_t n;
 
     r->req.aiocb = NULL;
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     if (ret) {
         if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_WRITE)) {
             return;
@@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest *req)
     n = r->iov.iov_len / 512;
     if (n) {
         qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
         r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
                                    scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
@@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         buflen = 8;
         break;
     case SYNCHRONIZE_CACHE:
+    {
+        BlockAcctCookie acct;
+
+        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
         ret = bdrv_flush(s->bs);
+        bdrv_acct_done(s->bs, &acct);
         if (ret < 0) {
             if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
                 return -1;
             }
         }
         break;
+    }
     case GET_CONFIGURATION:
         memset(outbuf, 0, 8);
         /* ??? This should probably return much more information.  For now
diff --git a/hw/sh_pci.c b/hw/sh_pci.c
index 76061bb756..36f39300d5 100644
--- a/hw/sh_pci.c
+++ b/hw/sh_pci.c
@@ -150,7 +150,7 @@ static int sh_pci_init_device(SysBusDevice *dev)
                               PCI_DEVFN(0, 0), 4);
     memory_region_init_io(&s->memconfig_p4, &sh_pci_reg_ops, s,
                           "sh_pci", 0x224);
-    memory_region_init_alias(&s->memconfig_a7, "sh_pci.2", &s->memconfig_a7,
+    memory_region_init_alias(&s->memconfig_a7, "sh_pci.2", &s->memconfig_p4,
                              0, 0x224);
     isa_mmio_setup(&s->isa, 0x40000);
     sysbus_init_mmio_cb2(dev, sh_pci_map, sh_pci_unmap);
diff --git a/hw/sysbus.c b/hw/sysbus.c
index f39768b6a2..c365d39d24 100644
--- a/hw/sysbus.c
+++ b/hw/sysbus.c
@@ -131,6 +131,11 @@ void sysbus_init_mmio_region(SysBusDevice *dev, MemoryRegion *memory)
     dev->mmio[n].memory = memory;
 }
 
+MemoryRegion *sysbus_mmio_get_region(SysBusDevice *dev, int n)
+{
+    return dev->mmio[n].memory;
+}
+
 void sysbus_init_ioports(SysBusDevice *dev, pio_addr_t ioport, pio_addr_t size)
 {
     pio_addr_t i;
diff --git a/hw/sysbus.h b/hw/sysbus.h
index b87c6c5aab..aa3d383277 100644
--- a/hw/sysbus.h
+++ b/hw/sysbus.h
@@ -50,6 +50,7 @@ void sysbus_init_mmio(SysBusDevice *dev, target_phys_addr_t size,
 void sysbus_init_mmio_cb2(SysBusDevice *dev,
                           mmio_mapfunc cb, mmio_mapfunc unmap);
 void sysbus_init_mmio_region(SysBusDevice *dev, MemoryRegion *memory);
+MemoryRegion *sysbus_mmio_get_region(SysBusDevice *dev, int n);
 void sysbus_init_irq(SysBusDevice *dev, qemu_irq *p);
 void sysbus_pass_irq(SysBusDevice *dev, SysBusDevice *target);
 void sysbus_init_ioports(SysBusDevice *dev, pio_addr_t ioport, pio_addr_t size);
diff --git a/hw/tusb6010.c b/hw/tusb6010.c
index b2bf35934d..de6ffc6133 100644
--- a/hw/tusb6010.c
+++ b/hw/tusb6010.c
@@ -23,9 +23,11 @@
 #include "usb.h"
 #include "omap.h"
 #include "irq.h"
-#include "tusb6010.h"
+#include "devices.h"
+#include "sysbus.h"
 
-struct TUSBState {
+typedef struct TUSBState {
+    SysBusDevice busdev;
     MemoryRegion iomem[2];
     qemu_irq irq;
     MUSBState *musb;
@@ -59,7 +61,7 @@ struct TUSBState {
     uint32_t pullup[2];
     uint32_t control_config;
     uint32_t otg_timer_val;
-};
+} TUSBState;
 
 #define TUSB_DEVCLOCK			60000000	/* 60 MHz */
 
@@ -234,16 +236,6 @@ struct TUSBState {
 #define TUSB_EP_CONFIG_XFR_SIZE(v)	((v) & 0x7fffffff)
 #define TUSB_PROD_TEST_RESET_VAL	0xa596
 
-MemoryRegion *tusb6010_sync_io(TUSBState *s)
-{
-    return &s->iomem[0];
-}
-
-MemoryRegion *tusb6010_async_io(TUSBState *s)
-{
-    return &s->iomem[1];
-}
-
 static void tusb_intr_update(TUSBState *s)
 {
     if (s->control_config & TUSB_INT_CTRL_CONF_INT_POLARITY)
@@ -723,9 +715,33 @@ static void tusb_musb_core_intr(void *opaque, int source, int level)
     }
 }
 
-TUSBState *tusb6010_init(qemu_irq intr)
+static void tusb6010_power(TUSBState *s, int on)
 {
-    TUSBState *s = g_malloc0(sizeof(*s));
+    if (!on) {
+        s->power = 0;
+    } else if (!s->power && on) {
+        s->power = 1;
+        /* Pull the interrupt down after TUSB6010 comes up.  */
+        s->intr_ok = 0;
+        tusb_intr_update(s);
+        qemu_mod_timer(s->pwr_timer,
+                       qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 2);
+    }
+}
+
+static void tusb6010_irq(void *opaque, int source, int level)
+{
+    if (source) {
+        tusb_musb_core_intr(opaque, source - 1, level);
+    } else {
+        tusb6010_power(opaque, level);
+    }
+}
+
+static void tusb6010_reset(DeviceState *dev)
+{
+    TUSBState *s = FROM_SYSBUS(TUSBState, sysbus_from_qdev(dev));
+    int i;
 
     s->test_reset = TUSB_PROD_TEST_RESET_VAL;
     s->host_mode = 0;
@@ -735,28 +751,59 @@ TUSBState *tusb6010_init(qemu_irq intr)
     s->mask = 0xffffffff;
     s->intr = 0x00000000;
     s->otg_timer_val = 0;
-    memory_region_init_io(&s->iomem[1], &tusb_async_ops, s, "tusb-async",
-                          UINT32_MAX);
-    s->irq = intr;
+    s->scratch = 0;
+    s->prcm_config = 0;
+    s->prcm_mngmt = 0;
+    s->intr_ok = 0;
+    s->usbip_intr = 0;
+    s->usbip_mask = 0;
+    s->gpio_intr = 0;
+    s->gpio_mask = 0;
+    s->gpio_config = 0;
+    s->dma_intr = 0;
+    s->dma_mask = 0;
+    s->dma_map = 0;
+    s->dma_config = 0;
+    s->ep0_config = 0;
+    s->wkup_mask = 0;
+    s->pullup[0] = s->pullup[1] = 0;
+    s->control_config = 0;
+    for (i = 0; i < 15; i++) {
+        s->rx_config[i] = s->tx_config[i] = 0;
+    }
+}
+
+static int tusb6010_init(SysBusDevice *dev)
+{
+    TUSBState *s = FROM_SYSBUS(TUSBState, dev);
+    qemu_irq *musb_irqs;
+    int i;
     s->otg_timer = qemu_new_timer_ns(vm_clock, tusb_otg_tick, s);
     s->pwr_timer = qemu_new_timer_ns(vm_clock, tusb_power_tick, s);
-    s->musb = musb_init(qemu_allocate_irqs(tusb_musb_core_intr, s,
-                            __musb_irq_max));
-
-    return s;
+    memory_region_init_io(&s->iomem[1], &tusb_async_ops, s, "tusb-async",
+                          UINT32_MAX);
+    sysbus_init_mmio_region(dev, &s->iomem[0]);
+    sysbus_init_mmio_region(dev, &s->iomem[1]);
+    sysbus_init_irq(dev, &s->irq);
+    qdev_init_gpio_in(&dev->qdev, tusb6010_irq, __musb_irq_max + 1);
+    musb_irqs = g_new0(qemu_irq, __musb_irq_max);
+    for (i = 0; i < __musb_irq_max; i++) {
+        musb_irqs[i] = qdev_get_gpio_in(&dev->qdev, i + 1);
+    }
+    s->musb = musb_init(musb_irqs);
+    return 0;
 }
 
-void tusb6010_power(TUSBState *s, int on)
-{
-    if (!on)
-        s->power = 0;
-    else if (!s->power && on) {
-        s->power = 1;
+static SysBusDeviceInfo tusb6010_info = {
+    .init = tusb6010_init,
+    .qdev.name = "tusb6010",
+    .qdev.size = sizeof(TUSBState),
+    .qdev.reset = tusb6010_reset,
+};
 
-        /* Pull the interrupt down after TUSB6010 comes up.  */
-        s->intr_ok = 0;
-        tusb_intr_update(s);
-        qemu_mod_timer(s->pwr_timer,
-                       qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 2);
-    }
+static void tusb6010_register_device(void)
+{
+    sysbus_register_withprop(&tusb6010_info);
 }
+
+device_init(tusb6010_register_device)
diff --git a/hw/tusb6010.h b/hw/tusb6010.h
deleted file mode 100644
index b85ee86215..0000000000
--- a/hw/tusb6010.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * tusb6010 interfaces
- *
- * Copyright 2011 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- *  Avi Kivity <avi@redhat.com>
- *
- * Derived from hw/devices.h.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#ifndef TUSB6010_H
-#define TUSB6010_H
-
-#include "targphys.h"
-#include "memory.h"
-
-typedef struct TUSBState TUSBState;
-TUSBState *tusb6010_init(qemu_irq intr);
-MemoryRegion *tusb6010_sync_io(TUSBState *s);
-MemoryRegion *tusb6010_async_io(TUSBState *s);
-void tusb6010_power(TUSBState *s, int on);
-
-#endif
diff --git a/hw/vhost.c b/hw/vhost.c
index 18860678ba..0870cb7d85 100644
--- a/hw/vhost.c
+++ b/hw/vhost.c
@@ -515,11 +515,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
     };
     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 
-    if (!vdev->binding->set_host_notifier) {
-        fprintf(stderr, "binding does not support host notifiers\n");
-        return -ENOSYS;
-    }
-
     vq->num = state.num = virtio_queue_get_num(vdev, idx);
     r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
     if (r) {
@@ -567,12 +562,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
         r = -errno;
         goto fail_alloc;
     }
-    r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, true);
-    if (r < 0) {
-        fprintf(stderr, "Error binding host notifier: %d\n", -r);
-        goto fail_host_notifier;
-    }
-
     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
     r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
     if (r) {
@@ -591,8 +580,6 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
 
 fail_call:
 fail_kick:
-    vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false);
-fail_host_notifier:
 fail_alloc:
     cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
                               0, 0);
@@ -618,12 +605,6 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
         .index = idx,
     };
     int r;
-    r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false);
-    if (r < 0) {
-        fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
-        fflush(stderr);
-    }
-    assert (r >= 0);
     r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
     if (r < 0) {
         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
@@ -697,6 +678,60 @@ bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev)
         hdev->force;
 }
 
+/* Stop processing guest IO notifications in qemu.
+ * Start processing them in vhost in kernel.
+ */
+int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i, r;
+    if (!vdev->binding->set_host_notifier) {
+        fprintf(stderr, "binding does not support host notifiers\n");
+        r = -ENOSYS;
+        goto fail;
+    }
+
+    for (i = 0; i < hdev->nvqs; ++i) {
+        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, true);
+        if (r < 0) {
+            fprintf(stderr, "vhost VQ %d notifier binding failed: %d\n", i, -r);
+            goto fail_vq;
+        }
+    }
+
+    return 0;
+fail_vq:
+    while (--i >= 0) {
+        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
+        if (r < 0) {
+            fprintf(stderr, "vhost VQ %d notifier cleanup error: %d\n", i, -r);
+            fflush(stderr);
+        }
+        assert (r >= 0);
+    }
+fail:
+    return r;
+}
+
+/* Stop processing guest IO notifications in vhost.
+ * Start processing them in qemu.
+ * This might actually run the qemu handlers right away,
+ * so virtio in qemu must be completely setup when this is called.
+ */
+void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+    int i, r;
+
+    for (i = 0; i < hdev->nvqs; ++i) {
+        r = vdev->binding->set_host_notifier(vdev->binding_opaque, i, false);
+        if (r < 0) {
+            fprintf(stderr, "vhost VQ %d notifier cleanup failed: %d\n", i, -r);
+            fflush(stderr);
+        }
+        assert (r >= 0);
+    }
+}
+
+/* Host notifiers must be enabled at this point. */
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
     int i, r;
@@ -762,6 +797,7 @@ fail:
     return r;
 }
 
+/* Host notifiers must be enabled at this point. */
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
     int i, r;
diff --git a/hw/vhost.h b/hw/vhost.h
index c8c595a147..c9452f0732 100644
--- a/hw/vhost.h
+++ b/hw/vhost.h
@@ -46,5 +46,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev);
 bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
+void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
 
 #endif
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
index a55981200d..950a6b8d99 100644
--- a/hw/vhost_net.c
+++ b/hw/vhost_net.c
@@ -139,16 +139,22 @@ int vhost_net_start(struct vhost_net *net,
 {
     struct vhost_vring_file file = { };
     int r;
+
+    net->dev.nvqs = 2;
+    net->dev.vqs = net->vqs;
+
+    r = vhost_dev_enable_notifiers(&net->dev, dev);
+    if (r < 0) {
+        goto fail_notifiers;
+    }
     if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
         tap_set_vnet_hdr_len(net->vc,
                              sizeof(struct virtio_net_hdr_mrg_rxbuf));
     }
 
-    net->dev.nvqs = 2;
-    net->dev.vqs = net->vqs;
     r = vhost_dev_start(&net->dev, dev);
     if (r < 0) {
-        return r;
+        goto fail_start;
     }
 
     net->vc->info->poll(net->vc, false);
@@ -173,6 +179,9 @@ fail:
     if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
         tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr));
     }
+fail_start:
+    vhost_dev_disable_notifiers(&net->dev, dev);
+fail_notifiers:
     return r;
 }
 
@@ -190,6 +199,7 @@ void vhost_net_stop(struct vhost_net *net,
     if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) {
         tap_set_vnet_hdr_len(net->vc, sizeof(struct virtio_net_hdr));
     }
+    vhost_dev_disable_notifiers(&net->dev, dev);
 }
 
 void vhost_net_cleanup(struct vhost_net *net)
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index dad8c0a6a2..2a8ccd0aa9 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq
     struct virtio_scsi_inhdr *scsi;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    BlockAcctCookie acct;
 } VirtIOBlockReq;
 
 static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
@@ -58,8 +59,6 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
     stb_p(&req->in->status, status);
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
-
-    g_free(req);
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
@@ -81,6 +80,8 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
         vm_stop(VMSTOP_DISKFULL);
     } else {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        bdrv_acct_done(s->bs, &req->acct);
+        g_free(req);
         bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
     }
 
@@ -100,6 +101,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -113,6 +116,8 @@ static void virtio_blk_flush_complete(void *opaque, int ret)
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
@@ -155,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
      */
     if (req->elem.out_num < 2 || req->elem.in_num < 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        g_free(req);
         return;
     }
 
@@ -163,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
      */
     if (req->elem.out_num > 2 && req->elem.in_num > 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+        g_free(req);
         return;
     }
 
@@ -229,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
     stl_p(&req->scsi->data_len, hdr.dxfer_len);
 
     virtio_blk_req_complete(req, status);
+    g_free(req);
 }
 #else
 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
 {
     virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+    g_free(req);
 }
 #endif /* __linux__ */
 
@@ -266,6 +275,8 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     BlockDriverAIOCB *acb;
 
+    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
+
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
@@ -284,6 +295,8 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
+
     trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
 
     if (sector & req->dev->sector_mask) {
@@ -317,6 +330,8 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req)
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
+
     if (sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
@@ -370,6 +385,7 @@ static void virtio_blk_handle_request(VirtIOBlockReq *req,
                 s->serial ? s->serial : "",
                 MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        g_free(req);
     } else if (type & VIRTIO_BLK_T_OUT) {
         qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
                                  req->elem.out_num - 1);
diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 31f91514f2..bd5c66916b 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -79,6 +79,7 @@ struct ioreq {
 
     struct XenBlkDev    *blkdev;
     QLIST_ENTRY(ioreq)   list;
+    BlockAcctCookie     acct;
 };
 
 struct XenBlkDev {
@@ -401,6 +402,7 @@ static void qemu_aio_complete(void *opaque, int ret)
     ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
     ioreq_unmap(ioreq);
     ioreq_finish(ioreq);
+    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
@@ -419,6 +421,7 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
 
     switch (ioreq->req.operation) {
     case BLKIF_OP_READ:
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
         ioreq->aio_inflight++;
         bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
@@ -429,6 +432,8 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
         if (!ioreq->req.nr_segments) {
             break;
         }
+
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
         ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
diff --git a/iohandler.c b/iohandler.c
index 4deae1e6ab..5ef66fb6e8 100644
--- a/iohandler.c
+++ b/iohandler.c
@@ -80,12 +80,67 @@ int qemu_set_fd_handler2(int fd,
     return 0;
 }
 
+typedef struct IOTrampoline
+{
+    GIOChannel *chan;
+    IOHandler *fd_read;
+    IOHandler *fd_write;
+    void *opaque;
+    guint tag;
+} IOTrampoline;
+
+static gboolean fd_trampoline(GIOChannel *chan, GIOCondition cond, gpointer opaque)
+{
+    IOTrampoline *tramp = opaque;
+
+    if (tramp->opaque == NULL) {
+        return FALSE;
+    }
+
+    if ((cond & G_IO_IN) && tramp->fd_read) {
+        tramp->fd_read(tramp->opaque);
+    }
+
+    if ((cond & G_IO_OUT) && tramp->fd_write) {
+        tramp->fd_write(tramp->opaque);
+    }
+
+    return TRUE;
+}
+
 int qemu_set_fd_handler(int fd,
                         IOHandler *fd_read,
                         IOHandler *fd_write,
                         void *opaque)
 {
-    return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque);
+    static IOTrampoline fd_trampolines[FD_SETSIZE];
+    IOTrampoline *tramp = &fd_trampolines[fd];
+
+    if (tramp->tag != 0) {
+        g_io_channel_unref(tramp->chan);
+        g_source_remove(tramp->tag);
+    }
+
+    if (opaque) {
+        GIOCondition cond = 0;
+
+        tramp->fd_read = fd_read;
+        tramp->fd_write = fd_write;
+        tramp->opaque = opaque;
+
+        if (fd_read) {
+            cond |= G_IO_IN | G_IO_ERR;
+        }
+
+        if (fd_write) {
+            cond |= G_IO_OUT | G_IO_ERR;
+        }
+
+        tramp->chan = g_io_channel_unix_new(fd);
+        tramp->tag = g_io_add_watch(tramp->chan, cond, fd_trampoline, tramp);
+    }
+
+    return 0;
 }
 
 void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds)
diff --git a/memory.c b/memory.c
index 8e9ac460e8..57f0fa44ff 100644
--- a/memory.c
+++ b/memory.c
@@ -304,7 +304,7 @@ static void as_memory_range_add(AddressSpace *as, FlatRange *fr)
     }
 
     if (!fr->readable) {
-        phys_offset &= TARGET_PAGE_MASK;
+        phys_offset &= ~TARGET_PAGE_MASK & ~IO_MEM_ROMD;
     }
 
     cpu_register_physical_memory_log(fr->addr.start,
@@ -962,11 +962,14 @@ void memory_region_init_alias(MemoryRegion *mr,
 
 void memory_region_init_rom_device(MemoryRegion *mr,
                                    const MemoryRegionOps *ops,
+                                   void *opaque,
                                    DeviceState *dev,
                                    const char *name,
                                    uint64_t size)
 {
     memory_region_init(mr, name, size);
+    mr->ops = ops;
+    mr->opaque = opaque;
     mr->terminates = true;
     mr->destructor = memory_region_destructor_rom_device;
     mr->ram_addr = qemu_ram_alloc(dev, name, size);
@@ -1060,7 +1063,7 @@ void *memory_region_get_ram_ptr(MemoryRegion *mr)
 
     assert(mr->terminates);
 
-    return qemu_get_ram_ptr(mr->ram_addr);
+    return qemu_get_ram_ptr(mr->ram_addr & TARGET_PAGE_MASK);
 }
 
 static void memory_region_update_coalesced_range(MemoryRegion *mr)
diff --git a/memory.h b/memory.h
index 0553cc7526..06b83ae76b 100644
--- a/memory.h
+++ b/memory.h
@@ -235,6 +235,7 @@ void memory_region_init_alias(MemoryRegion *mr,
  */
 void memory_region_init_rom_device(MemoryRegion *mr,
                                    const MemoryRegionOps *ops,
+                                   void *opaque,
                                    DeviceState *dev, /* FIXME: layering violation */
                                    const char *name,
                                    uint64_t size);
diff --git a/monitor.c b/monitor.c
index ada51d0c23..04f465aa1c 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1189,7 +1189,6 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d
 {
     const char *protocol  = qdict_get_str(qdict, "protocol");
     const char *fdname = qdict_get_str(qdict, "fdname");
-    int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
     CharDriverState *s;
 
     if (strcmp(protocol, "spice") == 0) {
@@ -1203,6 +1202,7 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d
 #ifdef CONFIG_VNC
     } else if (strcmp(protocol, "vnc") == 0) {
 	int fd = monitor_get_fd(mon, fdname);
+        int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
 	vnc_display_add_client(NULL, fd, skipauth);
 	return 0;
 #endif
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index babb0940dd..3193dbf83c 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -30,6 +30,7 @@
 
 #include "block/raw-posix-aio.h"
 
+static void do_spawn_thread(void);
 
 struct qemu_paiocb {
     BlockDriverAIOCB common;
@@ -64,6 +65,9 @@ static pthread_attr_t attr;
 static int max_threads = 64;
 static int cur_threads = 0;
 static int idle_threads = 0;
+static int new_threads = 0;     /* backlog of threads we need to create */
+static int pending_threads = 0; /* threads created but not running yet */
+static QEMUBH *new_thread_bh;
 static QTAILQ_HEAD(, qemu_paiocb) request_list;
 
 #ifdef CONFIG_PREADV
@@ -311,6 +315,11 @@ static void *aio_thread(void *unused)
 
     pid = getpid();
 
+    mutex_lock(&lock);
+    pending_threads--;
+    mutex_unlock(&lock);
+    do_spawn_thread();
+
     while (1) {
         struct qemu_paiocb *aiocb;
         ssize_t ret = 0;
@@ -381,11 +390,20 @@ static void *aio_thread(void *unused)
     return NULL;
 }
 
-static void spawn_thread(void)
+static void do_spawn_thread(void)
 {
     sigset_t set, oldset;
 
-    cur_threads++;
+    mutex_lock(&lock);
+    if (!new_threads) {
+        mutex_unlock(&lock);
+        return;
+    }
+
+    new_threads--;
+    pending_threads++;
+
+    mutex_unlock(&lock);
 
     /* block all signals */
     if (sigfillset(&set)) die("sigfillset");
@@ -396,6 +414,27 @@ static void spawn_thread(void)
     if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
 }
 
+static void spawn_thread_bh_fn(void *opaque)
+{
+    do_spawn_thread();
+}
+
+static void spawn_thread(void)
+{
+    cur_threads++;
+    new_threads++;
+    /* If there are threads being created, they will spawn new workers, so
+     * we don't spend time creating many threads in a loop holding a mutex or
+     * starving the current vcpu.
+     *
+     * If there are no idle threads, ask the main thread to create one, so we
+     * inherit the correct affinity instead of the vcpu affinity.
+     */
+    if (!pending_threads) {
+        qemu_bh_schedule(new_thread_bh);
+    }
+}
+
 static void qemu_paio_submit(struct qemu_paiocb *aiocb)
 {
     aiocb->ret = -EINPROGRESS;
@@ -665,6 +704,7 @@ int paio_init(void)
         die2(ret, "pthread_attr_setdetachstate");
 
     QTAILQ_INIT(&request_list);
+    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
 
     posix_aio_state = s;
     return 0;
diff --git a/qemu-config.c b/qemu-config.c
index 1eb6b9a709..139e07768b 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -55,7 +55,8 @@ static QemuOptsList qemu_drive_opts = {
         },{
             .name = "cache",
             .type = QEMU_OPT_STRING,
-            .help = "host cache usage (none, writeback, writethrough, unsafe)",
+            .help = "host cache usage (none, writeback, writethrough, "
+                    "directsync, unsafe)",
         },{
             .name = "aio",
             .type = QEMU_OPT_STRING,
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
index a80f437c59..2a385a3bb8 100644
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@@ -115,3 +115,47 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
 
     trace_qemu_co_mutex_unlock_return(mutex, self);
 }
+
+void qemu_co_rwlock_init(CoRwlock *lock)
+{
+    memset(lock, 0, sizeof(*lock));
+    qemu_co_queue_init(&lock->queue);
+}
+
+void qemu_co_rwlock_rdlock(CoRwlock *lock)
+{
+    while (lock->writer) {
+        qemu_co_queue_wait(&lock->queue);
+    }
+    lock->reader++;
+}
+
+void qemu_co_rwlock_unlock(CoRwlock *lock)
+{
+    assert(qemu_in_coroutine());
+    if (lock->writer) {
+        lock->writer = false;
+        while (!qemu_co_queue_empty(&lock->queue)) {
+            /*
+             * Wakeup every body. This will include some
+             * writers too.
+             */
+            qemu_co_queue_next(&lock->queue);
+        }
+    } else {
+        lock->reader--;
+        assert(lock->reader >= 0);
+        /* Wakeup only one waiting writer */
+        if (!lock->reader) {
+            qemu_co_queue_next(&lock->queue);
+        }
+    }
+}
+
+void qemu_co_rwlock_wrlock(CoRwlock *lock)
+{
+    while (lock->writer || lock->reader) {
+        qemu_co_queue_wait(&lock->queue);
+    }
+    lock->writer = true;
+}
diff --git a/qemu-coroutine.h b/qemu-coroutine.h
index 2f2fd95552..b8fc4f4332 100644
--- a/qemu-coroutine.h
+++ b/qemu-coroutine.h
@@ -156,4 +156,36 @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
  */
 void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
+typedef struct CoRwlock {
+    bool writer;
+    int reader;
+    CoQueue queue;
+} CoRwlock;
+
+/**
+ * Initialises a CoRwlock. This must be called before any other operation
+ * is used on the CoRwlock
+ */
+void qemu_co_rwlock_init(CoRwlock *lock);
+
+/**
+ * Read locks the CoRwlock. If the lock cannot be taken immediately because
+ * of a parallel writer, control is transferred to the caller of the current
+ * coroutine.
+ */
+void qemu_co_rwlock_rdlock(CoRwlock *lock);
+
+/**
+ * Write Locks the mutex. If the lock cannot be taken immediately because
+ * of a parallel reader, control is transferred to the caller of the current
+ * coroutine.
+ */
+void qemu_co_rwlock_wrlock(CoRwlock *lock);
+
+/**
+ * Unlocks the read/write lock and schedules the next coroutine that was
+ * waiting for this lock to be run.
+ */
+void qemu_co_rwlock_unlock(CoRwlock *lock);
+
 #endif /* QEMU_COROUTINE_H */
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 1299e83ef2..4be00a5edd 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -28,9 +28,9 @@ STEXI
 ETEXI
 
 DEF("convert", img_convert,
-    "convert [-c] [-p] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] filename [filename2 [...]] output_filename")
+    "convert [-c] [-p] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] [-S sparse_size] filename [filename2 [...]] output_filename")
 STEXI
-@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] @var{filename} [@var{filename2} [...]] @var{output_filename}
+@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
 ETEXI
 
 DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
index 95f3219571..6a3973163f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -66,7 +66,8 @@ static void help(void)
            "  'filename' is a disk image filename\n"
            "  'fmt' is the disk image format. It is guessed automatically in most cases\n"
            "  'cache' is the cache mode used to write the output disk image, the valid\n"
-           "    options are: 'none', 'writeback' (default), 'writethrough' and 'unsafe'\n"
+           "    options are: 'none', 'writeback' (default), 'writethrough', 'directsync'\n"
+           "    and 'unsafe'\n"
            "  'size' is the disk image size in bytes. Optional suffixes\n"
            "    'k' or 'K' (kilobyte, 1024), 'M' (megabyte, 1024k), 'G' (gigabyte, 1024M)\n"
            "    and T (terabyte, 1024G) are supported. 'b' is ignored.\n"
@@ -81,6 +82,8 @@ static void help(void)
            "       rebasing in this case (useful for renaming the backing file)\n"
            "  '-h' with or without a command shows this help and lists the supported formats\n"
            "  '-p' show progress of command (only certain commands)\n"
+           "  '-S' indicates the consecutive number of bytes that must contain only zeros\n"
+           "       for qemu-img to create a sparse image during conversion\n"
            "\n"
            "Parameters to snapshot subcommand:\n"
            "  'snapshot' is the name of the snapshot to create, apply or delete\n"
@@ -183,27 +186,6 @@ static int read_password(char *buf, int buf_size)
 }
 #endif
 
-static int set_cache_flag(const char *mode, int *flags)
-{
-    *flags &= ~BDRV_O_CACHE_MASK;
-
-    if (!strcmp(mode, "none") || !strcmp(mode, "off")) {
-        *flags |= BDRV_O_CACHE_WB;
-        *flags |= BDRV_O_NOCACHE;
-    } else if (!strcmp(mode, "writeback")) {
-        *flags |= BDRV_O_CACHE_WB;
-    } else if (!strcmp(mode, "unsafe")) {
-        *flags |= BDRV_O_CACHE_WB;
-        *flags |= BDRV_O_NO_FLUSH;
-    } else if (!strcmp(mode, "writethrough")) {
-        /* this is the default */
-    } else {
-        return -1;
-    }
-
-    return 0;
-}
-
 static int print_block_option_help(const char *filename, const char *fmt)
 {
     BlockDriver *drv, *proto_drv;
@@ -495,7 +477,7 @@ static int img_commit(int argc, char **argv)
     filename = argv[optind++];
 
     flags = BDRV_O_RDWR;
-    ret = set_cache_flag(cache, &flags);
+    ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
         error_report("Invalid cache option: %s", cache);
         return -1;
@@ -591,6 +573,48 @@ static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum)
 }
 
 /*
+ * Like is_allocated_sectors, but if the buffer starts with a used sector,
+ * up to 'min' consecutive sectors containing zeros are ignored. This avoids
+ * breaking up write requests for only small sparse areas.
+ */
+static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum,
+    int min)
+{
+    int ret;
+    int num_checked, num_used;
+
+    if (n < min) {
+        min = n;
+    }
+
+    ret = is_allocated_sectors(buf, n, pnum);
+    if (!ret) {
+        return ret;
+    }
+
+    num_used = *pnum;
+    buf += BDRV_SECTOR_SIZE * *pnum;
+    n -= *pnum;
+    num_checked = num_used;
+
+    while (n > 0) {
+        ret = is_allocated_sectors(buf, n, pnum);
+
+        buf += BDRV_SECTOR_SIZE * *pnum;
+        n -= *pnum;
+        num_checked += *pnum;
+        if (ret) {
+            num_used = num_checked;
+        } else if (*pnum >= min) {
+            break;
+        }
+    }
+
+    *pnum = num_used;
+    return 1;
+}
+
+/*
  * Compares two buffers sector by sector. Returns 0 if the first sector of both
  * buffers matches, non-zero otherwise.
  *
@@ -640,6 +664,7 @@ static int img_convert(int argc, char **argv)
     char *options = NULL;
     const char *snapshot_name = NULL;
     float local_progress;
+    int min_sparse = 8; /* Need at least 4k of zeros for sparse detection */
 
     fmt = NULL;
     out_fmt = "raw";
@@ -647,7 +672,7 @@ static int img_convert(int argc, char **argv)
     out_baseimg = NULL;
     compress = 0;
     for(;;) {
-        c = getopt(argc, argv, "f:O:B:s:hce6o:pt:");
+        c = getopt(argc, argv, "f:O:B:s:hce6o:pS:t:");
         if (c == -1) {
             break;
         }
@@ -682,6 +707,18 @@ static int img_convert(int argc, char **argv)
         case 's':
             snapshot_name = optarg;
             break;
+        case 'S':
+        {
+            int64_t sval;
+            sval = strtosz_suffix(optarg, NULL, STRTOSZ_DEFSUFFIX_B);
+            if (sval < 0) {
+                error_report("Invalid minimum zero buffer size for sparse output specified");
+                return 1;
+            }
+
+            min_sparse = sval / BDRV_SECTOR_SIZE;
+            break;
+        }
         case 'p':
             progress = 1;
             break;
@@ -819,7 +856,7 @@ static int img_convert(int argc, char **argv)
     }
 
     flags = BDRV_O_RDWR;
-    ret = set_cache_flag(cache, &flags);
+    ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
         error_report("Invalid cache option: %s", cache);
         return -1;
@@ -834,7 +871,7 @@ static int img_convert(int argc, char **argv)
     bs_i = 0;
     bs_offset = 0;
     bdrv_get_geometry(bs[0], &bs_sectors);
-    buf = g_malloc(IO_BUF_SIZE);
+    buf = qemu_blockalign(out_bs, IO_BUF_SIZE);
 
     if (compress) {
         ret = bdrv_get_info(out_bs, &bdi);
@@ -890,7 +927,8 @@ static int img_convert(int argc, char **argv)
 
                 ret = bdrv_read(bs[bs_i], bs_num, buf2, nlow);
                 if (ret < 0) {
-                    error_report("error while reading");
+                    error_report("error while reading sector %" PRId64 ": %s",
+                                 bs_num, strerror(-ret));
                     goto out;
                 }
 
@@ -908,8 +946,8 @@ static int img_convert(int argc, char **argv)
                 ret = bdrv_write_compressed(out_bs, sector_num, buf,
                                             cluster_sectors);
                 if (ret != 0) {
-                    error_report("error while compressing sector %" PRId64,
-                          sector_num);
+                    error_report("error while compressing sector %" PRId64
+                                 ": %s", sector_num, strerror(-ret));
                     goto out;
                 }
             }
@@ -972,7 +1010,8 @@ static int img_convert(int argc, char **argv)
 
             ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
             if (ret < 0) {
-                error_report("error while reading");
+                error_report("error while reading sector %" PRId64 ": %s",
+                             sector_num - bs_offset, strerror(-ret));
                 goto out;
             }
             /* NOTE: at the same time we convert, we do not write zero
@@ -988,10 +1027,11 @@ static int img_convert(int argc, char **argv)
                    sectors that are entirely 0, since whatever data was
                    already there is garbage, not 0s. */
                 if (!has_zero_init || out_baseimg ||
-                    is_allocated_sectors(buf1, n, &n1)) {
+                    is_allocated_sectors_min(buf1, n, &n1, min_sparse)) {
                     ret = bdrv_write(out_bs, sector_num, buf1, n1);
                     if (ret < 0) {
-                        error_report("error while writing");
+                        error_report("error while writing sector %" PRId64
+                                     ": %s", sector_num, strerror(-ret));
                         goto out;
                     }
                 }
@@ -1006,7 +1046,7 @@ out:
     qemu_progress_end();
     free_option_parameters(create_options);
     free_option_parameters(param);
-    g_free(buf);
+    qemu_vfree(buf);
     if (out_bs) {
         bdrv_delete(out_bs);
     }
@@ -1291,7 +1331,7 @@ static int img_rebase(int argc, char **argv)
     qemu_progress_print(0, 100);
 
     flags = BDRV_O_RDWR | (unsafe ? BDRV_O_NO_BACKING : 0);
-    ret = set_cache_flag(cache, &flags);
+    ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
         error_report("Invalid cache option: %s", cache);
         return -1;
@@ -1373,8 +1413,8 @@ static int img_rebase(int argc, char **argv)
         uint8_t * buf_new;
         float local_progress;
 
-        buf_old = g_malloc(IO_BUF_SIZE);
-        buf_new = g_malloc(IO_BUF_SIZE);
+        buf_old = qemu_blockalign(bs, IO_BUF_SIZE);
+        buf_new = qemu_blockalign(bs, IO_BUF_SIZE);
 
         bdrv_get_geometry(bs, &num_sectors);
 
@@ -1430,8 +1470,8 @@ static int img_rebase(int argc, char **argv)
             qemu_progress_print(local_progress, 100);
         }
 
-        g_free(buf_old);
-        g_free(buf_new);
+        qemu_vfree(buf_old);
+        qemu_vfree(buf_new);
     }
 
     /*
diff --git a/qemu-img.texi b/qemu-img.texi
index 495a1b6695..70fa321dff 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -40,6 +40,11 @@ indicates that target image must be compressed (qcow format only)
 with or without a command shows help and lists the supported formats
 @item -p
 display progress bar (convert and rebase commands only)
+@item -S @var{size}
+indicates the consecutive number of bytes that must contain only zeros
+for qemu-img to create a sparse image during conversion. This value is rounded
+down to the nearest 512 bytes. You may use the common size suffixes like
+@code{k} for kilobytes.
 @end table
 
 Parameters to snapshot subcommand:
@@ -86,7 +91,7 @@ it doesn't need to be specified separately in this case.
 
 Commit the changes recorded in @var{filename} in its base image.
 
-@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] @var{filename} [@var{filename2} [...]] @var{output_filename}
+@item convert [-c] [-p] [-f @var{fmt}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
 
 Convert the disk image @var{filename} or a snapshot @var{snapshot_name} to disk image @var{output_filename}
 using format @var{output_fmt}. It can be optionally compressed (@code{-c}
diff --git a/qemu-options.hx b/qemu-options.hx
index d86815dc04..35d95d1d89 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -133,7 +133,7 @@ ETEXI
 DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
-    "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
+    "       [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
     "       [,readonly=on|off]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
@@ -164,7 +164,7 @@ These options have the same definition as they have in @option{-hdachs}.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
 @item cache=@var{cache}
-@var{cache} is "none", "writeback", "unsafe", or "writethrough" and controls how the host cache is used to access block data.
+@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" and controls how the host cache is used to access block data.
 @item aio=@var{aio}
 @var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
 @item format=@var{format}
@@ -199,6 +199,10 @@ The host page cache can be avoided entirely with @option{cache=none}.  This will
 attempt to do disk IO directly to the guests memory.  QEMU may still perform
 an internal copy of the data.
 
+The host page cache can be avoided while only sending write notifications to
+the guest when the data has been reported as written by the storage subsystem
+using @option{cache=directsync}.
+
 Some block drivers perform badly with @option{cache=writethrough}, most notably,
 qcow2.  If performance is more important than correctness,
 @option{cache=writeback} should be used with qcow2.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 03f67da198..27cc66ebc9 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1201,6 +1201,10 @@ Each json-object contain the following:
     - "wr_bytes": bytes written (json-int)
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
+    - "flush_operations": cache flush operations (json-int)
+    - "wr_total_time_ns": total time spend on writes in nano-seconds (json-int)
+    - "rd_total_time_ns": total time spend on reads in nano-seconds (json-int)
+    - "flush_total_time_ns": total time spend on cache flushes in nano-seconds (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1222,6 +1226,10 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "wr_total_times_ns":313253456
+                  "rd_total_times_ns":3465673657
+                  "flush_total_times_ns":49653
+                  "flush_operations":61,
                }
             },
             "stats":{
@@ -1230,6 +1238,10 @@ Example:
                "wr_operations":692,
                "rd_bytes":122739200,
                "rd_operations":36604
+               "flush_operations":51,
+               "wr_total_times_ns":313253456
+               "rd_total_times_ns":3465673657
+               "flush_total_times_ns":49653
             }
          },
          {
@@ -1240,6 +1252,10 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
+               "wr_total_times_ns":0
+               "rd_total_times_ns":0
+               "flush_total_times_ns":0
             }
          },
          {
@@ -1250,6 +1266,10 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
+               "wr_total_times_ns":0
+               "rd_total_times_ns":0
+               "flush_total_times_ns":0
             }
          },
          {
@@ -1260,6 +1280,10 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
+               "wr_total_times_ns":0
+               "rd_total_times_ns":0
+               "flush_total_times_ns":0
             }
          }
       ]
diff --git a/rules.mak b/rules.mak
index 884d421ae4..04a91983ec 100644
--- a/rules.mak
+++ b/rules.mak
@@ -31,7 +31,7 @@ endif
 %.o: %.m
 	$(call quiet-command,$(CC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<,"  OBJC  $(TARGET_DIR)$@")
 
-LINK = $(call quiet-command,$(CC) $(QEMU_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(1) $(LIBS),"  LINK  $(TARGET_DIR)$@")
+LINK = $(call quiet-command,$(CC) $(QEMU_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(sort $(1)) $(LIBS),"  LINK  $(TARGET_DIR)$@")
 
 %$(EXESUF): %.o
 	$(call LINK,$^)
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 4462647ef1..87cc1177f6 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -525,14 +525,14 @@ static void *qemu_st_helpers[4] = {
 
 static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
 {
-    int addr_reg, data_reg, data_reg2, r0, r1, rbase, mem_index, s_bits, bswap;
+    int addr_reg, data_reg, data_reg2, r0, r1, rbase, bswap;
 #ifdef CONFIG_SOFTMMU
-    int r2;
+    int mem_index, s_bits, r2;
     void *label1_ptr, *label2_ptr;
-#endif
 #if TARGET_LONG_BITS == 64
     int addr_reg2;
 #endif
+#endif
 
     data_reg = *args++;
     if (opc == 3)
@@ -540,13 +540,13 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
     else
         data_reg2 = 0;
     addr_reg = *args++;
+
+#ifdef CONFIG_SOFTMMU
 #if TARGET_LONG_BITS == 64
     addr_reg2 = *args++;
 #endif
     mem_index = *args;
     s_bits = opc & 3;
-
-#ifdef CONFIG_SOFTMMU
     r0 = 3;
     r1 = 4;
     r2 = 0;
@@ -722,14 +722,14 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
 
 static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
 {
-    int addr_reg, r0, r1, data_reg, data_reg2, mem_index, bswap, rbase;
+    int addr_reg, r0, r1, data_reg, data_reg2, bswap, rbase;
 #ifdef CONFIG_SOFTMMU
-    int r2, ir;
+    int mem_index, r2, ir;
     void *label1_ptr, *label2_ptr;
-#endif
 #if TARGET_LONG_BITS == 64
     int addr_reg2;
 #endif
+#endif
 
     data_reg = *args++;
     if (opc == 3)
@@ -737,12 +737,12 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
     else
         data_reg2 = 0;
     addr_reg = *args++;
+
+#ifdef CONFIG_SOFTMMU
 #if TARGET_LONG_BITS == 64
     addr_reg2 = *args++;
 #endif
     mem_index = *args;
-
-#ifdef CONFIG_SOFTMMU
     r0 = 3;
     r1 = 4;
     r2 = 0;
diff --git a/vl.c b/vl.c
index 9cd67a3746..a6f4c710d6 100644
--- a/vl.c
+++ b/vl.c
@@ -111,6 +111,8 @@ int main(int argc, char **argv)
 #define main qemu_main
 #endif /* CONFIG_COCOA */
 
+#include <glib.h>
+
 #include "hw/hw.h"
 #include "hw/boards.h"
 #include "hw/usb.h"
@@ -1321,6 +1323,75 @@ void qemu_system_vmstop_request(int reason)
     qemu_notify_event();
 }
 
+static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
+static int n_poll_fds;
+static int max_priority;
+
+static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds,
+                             fd_set *xfds, struct timeval *tv)
+{
+    GMainContext *context = g_main_context_default();
+    int i;
+    int timeout = 0, cur_timeout;
+
+    g_main_context_prepare(context, &max_priority);
+
+    n_poll_fds = g_main_context_query(context, max_priority, &timeout,
+                                      poll_fds, ARRAY_SIZE(poll_fds));
+    g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
+
+    for (i = 0; i < n_poll_fds; i++) {
+        GPollFD *p = &poll_fds[i];
+
+        if ((p->events & G_IO_IN)) {
+            FD_SET(p->fd, rfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_OUT)) {
+            FD_SET(p->fd, wfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_ERR)) {
+            FD_SET(p->fd, xfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+    }
+
+    cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000);
+    if (timeout >= 0 && timeout < cur_timeout) {
+        tv->tv_sec = timeout / 1000;
+        tv->tv_usec = (timeout % 1000) * 1000;
+    }
+}
+
+static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds,
+                             bool err)
+{
+    GMainContext *context = g_main_context_default();
+
+    if (!err) {
+        int i;
+
+        for (i = 0; i < n_poll_fds; i++) {
+            GPollFD *p = &poll_fds[i];
+
+            if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) {
+                p->revents |= G_IO_IN;
+            }
+            if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) {
+                p->revents |= G_IO_OUT;
+            }
+            if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) {
+                p->revents |= G_IO_ERR;
+            }
+        }
+    }
+
+    if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
+        g_main_context_dispatch(context);
+    }
+}
+
 int main_loop_wait(int nonblocking)
 {
     fd_set rfds, wfds, xfds;
@@ -1346,8 +1417,10 @@ int main_loop_wait(int nonblocking)
     FD_ZERO(&rfds);
     FD_ZERO(&wfds);
     FD_ZERO(&xfds);
+
     qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
     slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
+    glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv);
 
     if (timeout > 0) {
         qemu_mutex_unlock_iothread();
@@ -1361,6 +1434,7 @@ int main_loop_wait(int nonblocking)
 
     qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
     slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
+    glib_select_poll(&rfds, &wfds, &xfds, (ret < 0));
 
     qemu_run_all_timers();