Merge remote-tracking branch 'kwolf/for-anthony' into staging
diff --git a/block/nbd.c b/block/nbd.c
index 70edd81..76f04d8 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -48,6 +48,7 @@
 
 typedef struct BDRVNBDState {
     int sock;
+    uint32_t nbdflags;
     off_t size;
     size_t blocksize;
     char *export_name; /* An NBD server may export several devices */
@@ -111,7 +112,6 @@
     int ret;
     off_t size;
     size_t blocksize;
-    uint32_t nbdflags;
 
     if (s->host_spec[0] == '/') {
         sock = unix_socket_outgoing(s->host_spec);
@@ -126,7 +126,7 @@
     }
 
     /* NBD handshake */
-    ret = nbd_receive_negotiate(sock, s->export_name, &nbdflags, &size,
+    ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size,
                                 &blocksize);
     if (ret == -1) {
         logout("Failed to negotiate with the NBD server\n");
diff --git a/block/raw-posix.c b/block/raw-posix.c
index a624f56..305998d 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -839,7 +839,14 @@
 static int raw_flush(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
-    return qemu_fdatasync(s->fd);
+    int ret;
+
+    ret = qemu_fdatasync(s->fd);
+    if (ret < 0) {
+        return -errno;
+    }
+
+    return 0;
 }
 
 #ifdef CONFIG_XFS
diff --git a/block/rbd.c b/block/rbd.c
index 1b78d51..3068c82 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -13,35 +13,33 @@
 
 #include "qemu-common.h"
 #include "qemu-error.h"
-
 #include "block_int.h"
 
 #include <rbd/librbd.h>
 
-
-
 /*
  * When specifying the image filename use:
  *
  * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
  *
- * poolname must be the name of an existing rados pool
+ * poolname must be the name of an existing rados pool.
  *
- * devicename is the basename for all objects used to
- * emulate the raw device.
+ * devicename is the name of the rbd image.
  *
- * Each option given is used to configure rados, and may be
- * any Ceph option, or "conf". The "conf" option specifies
- * a Ceph configuration file to read.
+ * Each option given is used to configure rados, and may be any valid
+ * Ceph option, "id", or "conf".
  *
- * Metadata information (image size, ...) is stored in an
- * object with the name "devicename.rbd".
+ * The "id" option indicates what user we should authenticate as to
+ * the Ceph cluster.  If it is excluded we will use the Ceph default
+ * (normally 'admin').
  *
- * The raw device is split into 4MB sized objects by default.
- * The sequencenumber is encoded in a 12 byte long hex-string,
- * and is attached to the devicename, separated by a dot.
- * e.g. "devicename.1234567890ab"
+ * The "conf" option specifies a Ceph configuration file to read.  If
+ * it is not specified, we will read from the default Ceph locations
+ * (e.g., /etc/ceph/ceph.conf).  To avoid reading _any_ configuration
+ * file, specify conf=/dev/null.
  *
+ * Configuration values containing :, @, or = can be escaped with a
+ * leading "\".
  */
 
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
@@ -104,8 +102,15 @@
     *p = NULL;
 
     if (delim != '\0') {
-        end = strchr(src, delim);
-        if (end) {
+        for (end = src; *end; ++end) {
+            if (*end == delim) {
+                break;
+            }
+            if (*end == '\\' && end[1] != '\0') {
+                end++;
+            }
+        }
+        if (*end == delim) {
             *p = end + 1;
             *end = '\0';
         }
@@ -124,6 +129,19 @@
     return 0;
 }
 
+static void qemu_rbd_unescape(char *src)
+{
+    char *p;
+
+    for (p = src; *src; ++src, ++p) {
+        if (*src == '\\' && src[1] != '\0') {
+            src++;
+        }
+        *p = *src;
+    }
+    *p = '\0';
+}
+
 static int qemu_rbd_parsename(const char *filename,
                               char *pool, int pool_len,
                               char *snap, int snap_len,
@@ -148,6 +166,7 @@
         ret = -EINVAL;
         goto done;
     }
+    qemu_rbd_unescape(pool);
 
     if (strchr(p, '@')) {
         ret = qemu_rbd_next_tok(name, name_len, p, '@', "object name", &p);
@@ -155,9 +174,11 @@
             goto done;
         }
         ret = qemu_rbd_next_tok(snap, snap_len, p, ':', "snap name", &p);
+        qemu_rbd_unescape(snap);
     } else {
         ret = qemu_rbd_next_tok(name, name_len, p, ':', "object name", &p);
     }
+    qemu_rbd_unescape(name);
     if (ret < 0 || !p) {
         goto done;
     }
@@ -213,6 +234,7 @@
         if (ret < 0) {
             break;
         }
+        qemu_rbd_unescape(name);
 
         if (!p) {
             error_report("conf option %s has no value", name);
@@ -225,6 +247,7 @@
         if (ret < 0) {
             break;
         }
+        qemu_rbd_unescape(value);
 
         if (strcmp(name, "conf") == 0) {
             ret = rados_conf_read_file(cluster, value);
@@ -298,11 +321,8 @@
     }
 
     if (strstr(conf, "conf=") == NULL) {
-        if (rados_conf_read_file(cluster, NULL) < 0) {
-            error_report("error reading config file");
-            rados_shutdown(cluster);
-            return -EIO;
-        }
+        /* try default location, but ignore failure */
+        rados_conf_read_file(cluster, NULL);
     }
 
     if (conf[0] != '\0' &&
@@ -441,11 +461,8 @@
     }
 
     if (strstr(conf, "conf=") == NULL) {
-        r = rados_conf_read_file(s->cluster, NULL);
-        if (r < 0) {
-            error_report("error reading config file");
-            goto failed_shutdown;
-        }
+        /* try default location, but ignore failure */
+        rados_conf_read_file(s->cluster, NULL);
     }
 
     if (conf[0] != '\0') {
@@ -688,6 +705,17 @@
     return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
 }
 
+static int qemu_rbd_flush(BlockDriverState *bs)
+{
+#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
+    /* rbd_flush added in 0.1.1 */
+    BDRVRBDState *s = bs->opaque;
+    return rbd_flush(s->image);
+#else
+    return 0;
+#endif
+}
+
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVRBDState *s = bs->opaque;
@@ -823,6 +851,7 @@
     .bdrv_file_open     = qemu_rbd_open,
     .bdrv_close         = qemu_rbd_close,
     .bdrv_create        = qemu_rbd_create,
+    .bdrv_flush         = qemu_rbd_flush,
     .bdrv_get_info      = qemu_rbd_getinfo,
     .create_options     = qemu_rbd_create_options,
     .bdrv_getlength     = qemu_rbd_getlength,
diff --git a/block/vmdk.c b/block/vmdk.c
index 6c8edfc..5d16ec4 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -179,11 +179,16 @@
 {
     int i;
     BDRVVmdkState *s = bs->opaque;
+    VmdkExtent *e;
 
     for (i = 0; i < s->num_extents; i++) {
-        g_free(s->extents[i].l1_table);
-        g_free(s->extents[i].l2_cache);
-        g_free(s->extents[i].l1_backup_table);
+        e = &s->extents[i];
+        g_free(e->l1_table);
+        g_free(e->l2_cache);
+        g_free(e->l1_backup_table);
+        if (e->file != bs->file) {
+            bdrv_delete(e->file);
+        }
     }
     g_free(s->extents);
 }
@@ -619,12 +624,13 @@
     s->desc_offset = 0;
     ret = vmdk_parse_extents(buf, bs, bs->file->filename);
     if (ret) {
+        vmdk_free_extents(bs);
         return ret;
     }
 
     /* try to open parent images, if exist */
     if (vmdk_parent_open(bs)) {
-        g_free(s->extents);
+        vmdk_free_extents(bs);
         return -EINVAL;
     }
     s->parent_cid = vmdk_read_cid(bs, 1);
diff --git a/cpus.c b/cpus.c
index 883c27a..8978779 100644
--- a/cpus.c
+++ b/cpus.c
@@ -380,11 +380,6 @@
     int sigfd;
     sigset_t set;
 
-    /* SIGUSR2 used by posix-aio-compat.c */
-    sigemptyset(&set);
-    sigaddset(&set, SIGUSR2);
-    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
-
     /*
      * SIG_IPI must be blocked in the main thread and must not be caught
      * by sigwait() in the signal thread. Otherwise, the cpu thread will
diff --git a/dma-helpers.c b/dma-helpers.c
index 4610ea0..86d2d0a 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -42,7 +42,8 @@
     BlockDriverAIOCB *acb;
     QEMUSGList *sg;
     uint64_t sector_num;
-    int is_write;
+    bool to_dev;
+    bool in_cancel;
     int sg_cur_index;
     target_phys_addr_t sg_cur_byte;
     QEMUIOVector iov;
@@ -58,7 +59,7 @@
 
     qemu_bh_delete(dbs->bh);
     dbs->bh = NULL;
-    dma_bdrv_cb(opaque, 0);
+    dma_bdrv_cb(dbs, 0);
 }
 
 static void continue_after_map_failure(void *opaque)
@@ -75,9 +76,29 @@
 
     for (i = 0; i < dbs->iov.niov; ++i) {
         cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
-                                  dbs->iov.iov[i].iov_len, !dbs->is_write,
+                                  dbs->iov.iov[i].iov_len, !dbs->to_dev,
                                   dbs->iov.iov[i].iov_len);
     }
+    qemu_iovec_reset(&dbs->iov);
+}
+
+static void dma_complete(DMAAIOCB *dbs, int ret)
+{
+    dma_bdrv_unmap(dbs);
+    if (dbs->common.cb) {
+        dbs->common.cb(dbs->common.opaque, ret);
+    }
+    qemu_iovec_destroy(&dbs->iov);
+    if (dbs->bh) {
+        qemu_bh_delete(dbs->bh);
+        dbs->bh = NULL;
+    }
+    if (!dbs->in_cancel) {
+        /* Requests may complete while dma_aio_cancel is in progress.  In
+         * this case, the AIOCB should not be released because it is still
+         * referenced by dma_aio_cancel.  */
+        qemu_aio_release(dbs);
+    }
 }
 
 static void dma_bdrv_cb(void *opaque, int ret)
@@ -89,19 +110,16 @@
     dbs->acb = NULL;
     dbs->sector_num += dbs->iov.size / 512;
     dma_bdrv_unmap(dbs);
-    qemu_iovec_reset(&dbs->iov);
 
     if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
-        dbs->common.cb(dbs->common.opaque, ret);
-        qemu_iovec_destroy(&dbs->iov);
-        qemu_aio_release(dbs);
+        dma_complete(dbs, ret);
         return;
     }
 
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
+        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->to_dev);
         if (!mem)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
@@ -120,9 +138,7 @@
     dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                             dbs->iov.size / 512, dma_bdrv_cb, dbs);
     if (!dbs->acb) {
-        dma_bdrv_unmap(dbs);
-        qemu_iovec_destroy(&dbs->iov);
-        return;
+        dma_complete(dbs, -EIO);
     }
 }
 
@@ -131,8 +147,14 @@
     DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
 
     if (dbs->acb) {
-        bdrv_aio_cancel(dbs->acb);
+        BlockDriverAIOCB *acb = dbs->acb;
+        dbs->acb = NULL;
+        dbs->in_cancel = true;
+        bdrv_aio_cancel(acb);
+        dbs->in_cancel = false;
     }
+    dbs->common.cb = NULL;
+    dma_complete(dbs, 0);
 }
 
 static AIOPool dma_aio_pool = {
@@ -143,7 +165,7 @@
 BlockDriverAIOCB *dma_bdrv_io(
     BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
     DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-    void *opaque, int is_write)
+    void *opaque, bool to_dev)
 {
     DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
@@ -153,15 +175,11 @@
     dbs->sector_num = sector_num;
     dbs->sg_cur_index = 0;
     dbs->sg_cur_byte = 0;
-    dbs->is_write = is_write;
+    dbs->to_dev = to_dev;
     dbs->io_func = io_func;
     dbs->bh = NULL;
     qemu_iovec_init(&dbs->iov, sg->nsg);
     dma_bdrv_cb(dbs, 0);
-    if (!dbs->acb) {
-        qemu_aio_release(dbs);
-        return NULL;
-    }
     return &dbs->common;
 }
 
@@ -170,12 +188,12 @@
                                 QEMUSGList *sg, uint64_t sector,
                                 void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, 0);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, false);
 }
 
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
                                  void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, 1);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, true);
 }
diff --git a/dma.h b/dma.h
index a6db5ba..2bdc236 100644
--- a/dma.h
+++ b/dma.h
@@ -15,10 +15,13 @@
 #include "hw/hw.h"
 #include "block.h"
 
-typedef struct {
+typedef struct ScatterGatherEntry ScatterGatherEntry;
+
+#if defined(TARGET_PHYS_ADDR_BITS)
+struct ScatterGatherEntry {
     target_phys_addr_t base;
     target_phys_addr_t len;
-} ScatterGatherEntry;
+};
 
 struct QEMUSGList {
     ScatterGatherEntry *sg;
@@ -31,6 +34,7 @@
 void qemu_sglist_add(QEMUSGList *qsg, target_phys_addr_t base,
                      target_phys_addr_t len);
 void qemu_sglist_destroy(QEMUSGList *qsg);
+#endif
 
 typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *iov, int nb_sectors,
@@ -39,7 +43,7 @@
 BlockDriverAIOCB *dma_bdrv_io(BlockDriverState *bs,
                               QEMUSGList *sg, uint64_t sector_num,
                               DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-                              void *opaque, int is_write);
+                              void *opaque, bool to_dev);
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
                                 BlockDriverCompletionFunc *cb, void *opaque);
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index b813236..226230c 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -499,10 +499,7 @@
     ide_bus_reset(&d->port);
     ide_state->ncq_queues = AHCI_MAX_CMDS;
 
-    pr->irq_stat = 0;
-    pr->irq_mask = 0;
     pr->scr_stat = 0;
-    pr->scr_ctl = 0;
     pr->scr_err = 0;
     pr->scr_act = 0;
     d->busy_slot = -1;
@@ -1159,12 +1156,17 @@
 void ahci_reset(void *opaque)
 {
     struct AHCIPCIState *d = opaque;
+    AHCIPortRegs *pr;
     int i;
 
     d->ahci.control_regs.irqstatus = 0;
     d->ahci.control_regs.ghc = 0;
 
     for (i = 0; i < d->ahci.ports; i++) {
+        pr = &d->ahci.dev[i].port_regs;
+        pr->irq_stat = 0;
+        pr->irq_mask = 0;
+        pr->scr_ctl = 0;
         ahci_reset_port(&d->ahci, i);
     }
 }
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 4d5a076..4e76fc7 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -603,7 +603,7 @@
         break;
     case IDE_DMA_TRIM:
         s->bus->dma->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                                         ide_issue_trim, ide_dma_cb, s, 1);
+                                         ide_issue_trim, ide_dma_cb, s, true);
         break;
     }
 
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index c1844cb..37b8239 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -156,7 +156,7 @@
         break;
     case IDE_DMA_TRIM:
         m->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                               ide_issue_trim, pmac_ide_transfer_cb, s, 1);
+                               ide_issue_trim, pmac_ide_transfer_cb, s, true);
         break;
     }
 
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 0248294..aca65a1 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -542,15 +542,15 @@
         break;
     case 1:
     case 2:
-        cmd->xfer = buf[8] | (buf[7] << 8);
+        cmd->xfer = lduw_be_p(&buf[7]);
         cmd->len = 10;
         break;
     case 4:
-        cmd->xfer = buf[13] | (buf[12] << 8) | (buf[11] << 16) | (buf[10] << 24);
+        cmd->xfer = ldl_be_p(&buf[10]);
         cmd->len = 16;
         break;
     case 5:
-        cmd->xfer = buf[9] | (buf[8] << 8) | (buf[7] << 16) | (buf[6] << 24);
+        cmd->xfer = ldl_be_p(&buf[6]);
         cmd->len = 12;
         break;
     default:
@@ -710,23 +710,15 @@
 
     switch (buf[0] >> 5) {
     case 0:
-        lba = (uint64_t) buf[3] | ((uint64_t) buf[2] << 8) |
-              (((uint64_t) buf[1] & 0x1f) << 16);
+        lba = ldl_be_p(&buf[0]) & 0x1fffff;
         break;
     case 1:
     case 2:
-        lba = (uint64_t) buf[5] | ((uint64_t) buf[4] << 8) |
-              ((uint64_t) buf[3] << 16) | ((uint64_t) buf[2] << 24);
+    case 5:
+        lba = ldl_be_p(&buf[2]);
         break;
     case 4:
-        lba = (uint64_t) buf[9] | ((uint64_t) buf[8] << 8) |
-              ((uint64_t) buf[7] << 16) | ((uint64_t) buf[6] << 24) |
-              ((uint64_t) buf[5] << 32) | ((uint64_t) buf[4] << 40) |
-              ((uint64_t) buf[3] << 48) | ((uint64_t) buf[2] << 56);
-        break;
-    case 5:
-        lba = (uint64_t) buf[5] | ((uint64_t) buf[4] << 8) |
-              ((uint64_t) buf[3] << 16) | ((uint64_t) buf[2] << 24);
+        lba = ldq_be_p(&buf[2]);
         break;
     default:
         lba = -1;
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 96c554f..e843f71 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -55,6 +55,7 @@
     /* Both sector and sector_count are in terms of qemu 512 byte blocks.  */
     uint64_t sector;
     uint32_t sector_count;
+    uint32_t buflen;
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
@@ -78,13 +79,15 @@
 };
 
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf);
+static int scsi_disk_emulate_command(SCSIDiskReq *r);
 
 static void scsi_free_request(SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
 
-    qemu_vfree(r->iov.iov_base);
+    if (r->iov.iov_base) {
+        qemu_vfree(r->iov.iov_base);
+    }
 }
 
 /* Helper function for command completion with sense.  */
@@ -108,6 +111,19 @@
     r->req.aiocb = NULL;
 }
 
+static uint32_t scsi_init_iovec(SCSIDiskReq *r)
+{
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+    if (!r->iov.iov_base) {
+        r->buflen = SCSI_DMA_BUF_SIZE;
+        r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+    }
+    r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
+    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+    return r->qiov.size / 512;
+}
+
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
@@ -125,12 +141,12 @@
         }
     }
 
-    DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
+    DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size);
 
-    n = r->iov.iov_len / 512;
+    n = r->qiov.size / 512;
     r->sector += n;
     r->sector_count -= n;
-    scsi_req_data(&r->req, r->iov.iov_len);
+    scsi_req_data(&r->req, r->qiov.size);
 }
 
 static void scsi_flush_complete(void * opaque, int ret)
@@ -181,16 +197,10 @@
         return;
     }
 
-    n = r->sector_count;
-    if (n > SCSI_DMA_BUF_SIZE / 512)
-        n = SCSI_DMA_BUF_SIZE / 512;
-
     if (s->tray_open) {
         scsi_read_complete(r, -ENOMEDIUM);
     }
-    r->iov.iov_len = n * 512;
-    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
-
+    n = scsi_init_iovec(r);
     bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
                               scsi_read_complete, r);
@@ -239,7 +249,6 @@
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
-    uint32_t len;
     uint32_t n;
 
     if (r->req.aiocb != NULL) {
@@ -253,19 +262,15 @@
         }
     }
 
-    n = r->iov.iov_len / 512;
+    n = r->qiov.size / 512;
     r->sector += n;
     r->sector_count -= n;
     if (r->sector_count == 0) {
         scsi_req_complete(&r->req, GOOD);
     } else {
-        len = r->sector_count * 512;
-        if (len > SCSI_DMA_BUF_SIZE) {
-            len = SCSI_DMA_BUF_SIZE;
-        }
-        r->iov.iov_len = len;
-        DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, len);
-        scsi_req_data(&r->req, len);
+        scsi_init_iovec(r);
+        DPRINTF("Write complete tag=0x%x more=%d\n", r->req.tag, r->qiov.size);
+        scsi_req_data(&r->req, r->qiov.size);
     }
 }
 
@@ -284,21 +289,19 @@
         return;
     }
 
-    n = r->iov.iov_len / 512;
+    n = r->qiov.size / 512;
     if (n) {
         if (s->tray_open) {
             scsi_write_complete(r, -ENOMEDIUM);
         }
-        qemu_iovec_init_external(&r->qiov, &r->iov, 1);
-
         bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
         r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
-                                   scsi_write_complete, r);
+                                       scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
             scsi_write_complete(r, -ENOMEM);
         }
     } else {
-        /* Invoke completion routine to fetch data from host.  */
+        /* Called for the first time.  Ask the driver to send us more data.  */
         scsi_write_complete(r, 0);
     }
 }
@@ -329,7 +332,7 @@
                 scsi_write_data(&r->req);
                 break;
             case SCSI_REQ_STATUS_RETRY_FLUSH:
-                ret = scsi_disk_emulate_command(r, r->iov.iov_base);
+                ret = scsi_disk_emulate_command(r);
                 if (ret == 0) {
                     scsi_req_complete(&r->req, GOOD);
                 }
@@ -844,13 +847,31 @@
     return 0;
 }
 
-static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
+static int scsi_disk_emulate_command(SCSIDiskReq *r)
 {
     SCSIRequest *req = &r->req;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     uint64_t nb_sectors;
+    uint8_t *outbuf;
     int buflen = 0;
 
+    if (!r->iov.iov_base) {
+        /*
+         * FIXME: we shouldn't return anything bigger than 4k, but the code
+         * requires the buffer to be as big as req->cmd.xfer in several
+         * places.  So, do not allow CDBs with a very large ALLOCATION
+         * LENGTH.  The real fix would be to modify scsi_read_data and
+         * dma_buf_read, so that they return data beyond the buflen
+         * as all zeros.
+         */
+        if (req->cmd.xfer > 65536) {
+            goto illegal_request;
+        }
+        r->buflen = MAX(4096, req->cmd.xfer);
+        r->iov.iov_base = qemu_blockalign(s->bs, r->buflen);
+    }
+
+    outbuf = r->iov.iov_base;
     switch (req->cmd.buf[0]) {
     case TEST_UNIT_READY:
         if (s->tray_open || !bdrv_is_inserted(s->bs))
@@ -1001,11 +1022,9 @@
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     int32_t len;
     uint8_t command;
-    uint8_t *outbuf;
     int rc;
 
     command = buf[0];
-    outbuf = (uint8_t *)r->iov.iov_base;
     DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", req->lun, req->tag, buf[0]);
 
 #ifdef DEBUG_SCSI
@@ -1034,7 +1053,7 @@
     case GET_CONFIGURATION:
     case SERVICE_ACTION_IN_16:
     case VERIFY_10:
-        rc = scsi_disk_emulate_command(r, outbuf);
+        rc = scsi_disk_emulate_command(r);
         if (rc < 0) {
             return 0;
         }
@@ -1285,11 +1304,8 @@
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, d);
     SCSIRequest *req;
-    SCSIDiskReq *r;
 
     req = scsi_req_alloc(&scsi_disk_reqops, &s->qdev, tag, lun, hba_private);
-    r = DO_UPCAST(SCSIDiskReq, req, req);
-    r->iov.iov_base = qemu_blockalign(s->bs, SCSI_DMA_BUF_SIZE);
     return req;
 }
 
diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 5ce01af..8f6b70d 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -244,12 +244,6 @@
 static void scsi_req_fixup(SCSIRequest *req)
 {
     switch(req->cmd.buf[0]) {
-    case WRITE_10:
-        req->cmd.buf[1] &= ~0x08;	/* disable FUA */
-        break;
-    case READ_10:
-        req->cmd.buf[1] &= ~0x08;	/* disable FUA */
-        break;
     case REWIND:
     case START_STOP:
         if (req->dev->type == TYPE_TAPE) {
diff --git a/linux-aio.c b/linux-aio.c
index 5265a02..bffa6cd 100644
--- a/linux-aio.c
+++ b/linux-aio.c
@@ -68,15 +68,6 @@
     qemu_aio_release(laiocb);
 }
 
-/*
- * All requests are directly processed when they complete, so there's nothing
- * left to do during qemu_aio_wait().
- */
-static int qemu_laio_process_requests(void *opaque)
-{
-    return 0;
-}
-
 static void qemu_laio_completion_cb(void *opaque)
 {
     struct qemu_laio_state *s = opaque;
@@ -215,7 +206,7 @@
         goto out_close_efd;
 
     qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb, NULL,
-        qemu_laio_flush_cb, qemu_laio_process_requests, s);
+        qemu_laio_flush_cb, NULL, s);
 
     return s;
 
diff --git a/nbd.c b/nbd.c
index 6d81cfb..595f4d8 100644
--- a/nbd.c
+++ b/nbd.c
@@ -30,6 +30,10 @@
 #include <ctype.h>
 #include <inttypes.h>
 
+#ifdef __linux__
+#include <linux/fs.h>
+#endif
+
 #include "qemu_socket.h"
 
 //#define DEBUG_NBD
@@ -63,6 +67,8 @@
 #define NBD_PRINT_DEBUG         _IO(0xab, 6)
 #define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
 #define NBD_DISCONNECT          _IO(0xab, 8)
+#define NBD_SET_TIMEOUT         _IO(0xab, 9)
+#define NBD_SET_FLAGS           _IO(0xab, 10)
 
 #define NBD_OPT_EXPORT_NAME     (1 << 0)
 
@@ -172,7 +178,7 @@
                   Request (type == 2)
 */
 
-int nbd_negotiate(int csock, off_t size)
+int nbd_negotiate(int csock, off_t size, uint32_t flags)
 {
     char buf[8 + 8 + 8 + 128];
 
@@ -180,14 +186,16 @@
         [ 0 ..   7]   passwd   ("NBDMAGIC")
         [ 8 ..  15]   magic    (0x00420281861253)
         [16 ..  23]   size
-        [24 .. 151]   reserved (0)
+        [24 ..  27]   flags
+        [28 .. 151]   reserved (0)
      */
 
     TRACE("Beginning negotiation.");
     memcpy(buf, "NBDMAGIC", 8);
     cpu_to_be64w((uint64_t*)(buf + 8), 0x00420281861253LL);
     cpu_to_be64w((uint64_t*)(buf + 16), size);
-    memset(buf + 24, 0, 128);
+    cpu_to_be32w((uint32_t*)(buf + 24), flags | NBD_FLAG_HAS_FLAGS);
+    memset(buf + 28, 0, 124);
 
     if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
         LOG("write failed");
@@ -337,8 +345,8 @@
         return 0;
 }
 
-#ifndef _WIN32
-int nbd_init(int fd, int csock, off_t size, size_t blocksize)
+#ifdef __linux__
+int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
 {
     TRACE("Setting block size to %lu", (unsigned long)blocksize);
 
@@ -358,6 +366,26 @@
         return -1;
     }
 
+    if (flags & NBD_FLAG_READ_ONLY) {
+        int read_only = 1;
+        TRACE("Setting readonly attribute");
+
+        if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
+            int serrno = errno;
+            LOG("Failed setting read-only attribute");
+            errno = serrno;
+            return -1;
+        }
+    }
+
+    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0
+        && errno != ENOTTY) {
+        int serrno = errno;
+        LOG("Failed setting flags");
+        errno = serrno;
+        return -1;
+    }
+
     TRACE("Clearing NBD socket");
 
     if (ioctl(fd, NBD_CLEAR_SOCK) == -1) {
@@ -548,7 +576,7 @@
 }
 
 int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
-             off_t *offset, bool readonly, uint8_t *data, int data_size)
+             off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size)
 {
     struct nbd_request request;
     struct nbd_reply reply;
@@ -632,7 +660,7 @@
             return -1;
         }
 
-        if (readonly) {
+        if (nbdflags & NBD_FLAG_READ_ONLY) {
             TRACE("Server is read-only, return error");
             reply.error = 1;
         } else {
diff --git a/nbd.h b/nbd.h
index df7b7af..61553f4 100644
--- a/nbd.h
+++ b/nbd.h
@@ -37,10 +37,22 @@
     uint64_t handle;
 } QEMU_PACKED;
 
+#define NBD_FLAG_HAS_FLAGS      (1 << 0)        /* Flags are there */
+#define NBD_FLAG_READ_ONLY      (1 << 1)        /* Device is read-only */
+#define NBD_FLAG_SEND_FLUSH     (1 << 2)        /* Send FLUSH */
+#define NBD_FLAG_SEND_FUA       (1 << 3)        /* Send FUA (Force Unit Access) */
+#define NBD_FLAG_ROTATIONAL     (1 << 4)        /* Use elevator algorithm - rotational media */
+#define NBD_FLAG_SEND_TRIM      (1 << 5)        /* Send TRIM (discard) */
+
+#define NBD_CMD_MASK_COMMAND	0x0000ffff
+#define NBD_CMD_FLAG_FUA	(1 << 16)
+
 enum {
     NBD_CMD_READ = 0,
     NBD_CMD_WRITE = 1,
-    NBD_CMD_DISC = 2
+    NBD_CMD_DISC = 2,
+    NBD_CMD_FLUSH = 3,
+    NBD_CMD_TRIM = 4
 };
 
 #define NBD_DEFAULT_PORT	10809
@@ -53,14 +65,14 @@
 int unix_socket_outgoing(const char *path);
 int unix_socket_incoming(const char *path);
 
-int nbd_negotiate(int csock, off_t size);
+int nbd_negotiate(int csock, off_t size, uint32_t flags);
 int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
                           off_t *size, size_t *blocksize);
-int nbd_init(int fd, int csock, off_t size, size_t blocksize);
+int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize);
 int nbd_send_request(int csock, struct nbd_request *request);
 int nbd_receive_reply(int csock, struct nbd_reply *reply);
 int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
-             off_t *offset, bool readonly, uint8_t *data, int data_size);
+             off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size);
 int nbd_client(int fd);
 int nbd_disconnect(int fd);
 
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 3193dbf..d3c1174 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -42,7 +42,6 @@
     int aio_niov;
     size_t aio_nbytes;
 #define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
-    int ev_signo;
     off_t aio_offset;
 
     QTAILQ_ENTRY(qemu_paiocb) node;
@@ -181,7 +180,6 @@
 
 static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
 {
-    size_t offset = 0;
     ssize_t len;
 
     do {
@@ -189,12 +187,12 @@
             len = qemu_pwritev(aiocb->aio_fildes,
                                aiocb->aio_iov,
                                aiocb->aio_niov,
-                               aiocb->aio_offset + offset);
+                               aiocb->aio_offset);
          else
             len = qemu_preadv(aiocb->aio_fildes,
                               aiocb->aio_iov,
                               aiocb->aio_niov,
-                              aiocb->aio_offset + offset);
+                              aiocb->aio_offset);
     } while (len == -1 && errno == EINTR);
 
     if (len == -1)
@@ -309,12 +307,10 @@
     return nbytes;
 }
 
+static void posix_aio_notify_event(void);
+
 static void *aio_thread(void *unused)
 {
-    pid_t pid;
-
-    pid = getpid();
-
     mutex_lock(&lock);
     pending_threads--;
     mutex_unlock(&lock);
@@ -381,7 +377,7 @@
         aiocb->ret = ret;
         mutex_unlock(&lock);
 
-        if (kill(pid, aiocb->ev_signo)) die("kill failed");
+        posix_aio_notify_event();
     }
 
     cur_threads--;
@@ -548,18 +544,14 @@
 
 static PosixAioState *posix_aio_state;
 
-static void aio_signal_handler(int signum)
+static void posix_aio_notify_event(void)
 {
-    if (posix_aio_state) {
-        char byte = 0;
-        ssize_t ret;
+    char byte = 0;
+    ssize_t ret;
 
-        ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
-        if (ret < 0 && errno != EAGAIN)
-            die("write()");
-    }
-
-    qemu_service_io();
+    ret = write(posix_aio_state->wfd, &byte, sizeof(byte));
+    if (ret < 0 && errno != EAGAIN)
+        die("write()");
 }
 
 static void paio_remove(struct qemu_paiocb *acb)
@@ -623,7 +615,6 @@
         return NULL;
     acb->aio_type = type;
     acb->aio_fildes = fd;
-    acb->ev_signo = SIGUSR2;
 
     if (qiov) {
         acb->aio_iov = qiov->iov;
@@ -651,7 +642,6 @@
         return NULL;
     acb->aio_type = QEMU_AIO_IOCTL;
     acb->aio_fildes = fd;
-    acb->ev_signo = SIGUSR2;
     acb->aio_offset = 0;
     acb->aio_ioctl_buf = buf;
     acb->aio_ioctl_cmd = req;
@@ -665,7 +655,6 @@
 
 int paio_init(void)
 {
-    struct sigaction act;
     PosixAioState *s;
     int fds[2];
     int ret;
@@ -675,11 +664,6 @@
 
     s = g_malloc(sizeof(PosixAioState));
 
-    sigfillset(&act.sa_mask);
-    act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
-    act.sa_handler = aio_signal_handler;
-    sigaction(SIGUSR2, &act, NULL);
-
     s->first_aio = NULL;
     if (qemu_pipe(fds) == -1) {
         fprintf(stderr, "failed to create pipe\n");
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 3a39145..d8d3e15 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -185,7 +185,7 @@
     BlockDriverState *bs;
     off_t dev_offset = 0;
     off_t offset = 0;
-    bool readonly = false;
+    uint32_t nbdflags = 0;
     bool disconnect = false;
     const char *bindto = "0.0.0.0";
     int port = NBD_DEFAULT_PORT;
@@ -230,7 +230,6 @@
     int nb_fds = 0;
     int max_fd;
     int persistent = 0;
-    uint32_t nbdflags;
 
     while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
         switch (ch) {
@@ -263,7 +262,7 @@
             }
             break;
         case 'r':
-            readonly = true;
+            nbdflags |= NBD_FLAG_READ_ONLY;
             flags &= ~BDRV_O_RDWR;
             break;
         case 'P':
@@ -398,13 +397,13 @@
             }
 
             ret = nbd_receive_negotiate(sock, NULL, &nbdflags,
-					&size, &blocksize);
+                                        &size, &blocksize);
             if (ret == -1) {
                 ret = 1;
                 goto out;
             }
 
-            ret = nbd_init(fd, sock, size, blocksize);
+            ret = nbd_init(fd, sock, nbdflags, size, blocksize);
             if (ret == -1) {
                 ret = 1;
                 goto out;
@@ -463,7 +462,7 @@
         for (i = 1; i < nb_fds && ret; i++) {
             if (FD_ISSET(sharing_fds[i], &fds)) {
                 if (nbd_trip(bs, sharing_fds[i], fd_size, dev_offset,
-                    &offset, readonly, data, NBD_BUFFER_SIZE) != 0) {
+                    &offset, nbdflags, data, NBD_BUFFER_SIZE) != 0) {
                     close(sharing_fds[i]);
                     nb_fds--;
                     sharing_fds[i] = sharing_fds[nb_fds];
@@ -479,7 +478,7 @@
                                              (struct sockaddr *)&addr,
                                              &addr_len);
                 if (sharing_fds[nb_fds] != -1 &&
-                    nbd_negotiate(sharing_fds[nb_fds], fd_size) != -1) {
+                    nbd_negotiate(sharing_fds[nb_fds], fd_size, nbdflags) != -1) {
                         if (sharing_fds[nb_fds] > max_fd)
                             max_fd = sharing_fds[nb_fds];
                         nb_fds++;