Merge remote-tracking branch 'remotes/kraxel/tags/pull-input-20141002-1' into staging

input monitor patches: fix send-key release ordering
and new input-send-event command

# gpg: Signature made Thu 02 Oct 2014 09:10:44 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-input-20141002-1:
  add input-send-event command
  input: fix send-key monitor command release event ordering

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/block/iscsi.c b/block/iscsi.c
index 5c72ffe..3a01de0 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -316,6 +316,13 @@
     return 1;
 }
 
+static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
+{
+    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
+                                                       iscsilun),
+                                       iscsilun->cluster_sectors));
+}
+
 static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
                                     int nb_sectors)
 {
@@ -1402,9 +1409,10 @@
         iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
                                      iscsilun->block_size) >> BDRV_SECTOR_BITS;
         if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
-            iscsilun->allocationmap =
-                bitmap_new(DIV_ROUND_UP(bs->total_sectors,
-                                        iscsilun->cluster_sectors));
+            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
+            if (iscsilun->allocationmap == NULL) {
+                ret = -ENOMEM;
+            }
         }
     }
 
@@ -1497,10 +1505,7 @@
 
     if (iscsilun->allocationmap != NULL) {
         g_free(iscsilun->allocationmap);
-        iscsilun->allocationmap =
-            bitmap_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
-                                                    iscsilun),
-                                    iscsilun->cluster_sectors));
+        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
     }
 
     return 0;
diff --git a/disas/sparc.c b/disas/sparc.c
index 8eb22e6..8e755d1 100644
--- a/disas/sparc.c
+++ b/disas/sparc.c
@@ -1175,15 +1175,11 @@
 { "subcc",      F3(2, 0x14, 0), F3(~2, ~0x14, ~0)|ASI(~0),      "1,2,d", 0, v6 },
 { "subcc",      F3(2, 0x14, 1), F3(~2, ~0x14, ~1),              "1,i,d", 0, v6 },
 
-{ "subx",       F3(2, 0x0c, 0), F3(~2, ~0x0c, ~0)|ASI(~0),      "1,2,d", 0, v6notv9 },
-{ "subx",       F3(2, 0x0c, 1), F3(~2, ~0x0c, ~1),              "1,i,d", 0, v6notv9 },
-{ "subc",       F3(2, 0x0c, 0), F3(~2, ~0x0c, ~0)|ASI(~0),      "1,2,d", 0, v9 },
-{ "subc",       F3(2, 0x0c, 1), F3(~2, ~0x0c, ~1),              "1,i,d", 0, v9 },
+{ "subc",       F3(2, 0x0c, 0), F3(~2, ~0x0c, ~0)|ASI(~0),      "1,2,d", 0, v6 },
+{ "subc",       F3(2, 0x0c, 1), F3(~2, ~0x0c, ~1),              "1,i,d", 0, v6 },
 
-{ "subxcc",     F3(2, 0x1c, 0), F3(~2, ~0x1c, ~0)|ASI(~0),      "1,2,d", 0, v6notv9 },
-{ "subxcc",     F3(2, 0x1c, 1), F3(~2, ~0x1c, ~1),              "1,i,d", 0, v6notv9 },
-{ "subccc",     F3(2, 0x1c, 0), F3(~2, ~0x1c, ~0)|ASI(~0),      "1,2,d", 0, v9 },
-{ "subccc",     F3(2, 0x1c, 1), F3(~2, ~0x1c, ~1),              "1,i,d", 0, v9 },
+{ "subccc",     F3(2, 0x1c, 0), F3(~2, ~0x1c, ~0)|ASI(~0),      "1,2,d", 0, v6 },
+{ "subccc",     F3(2, 0x1c, 1), F3(~2, ~0x1c, ~1),              "1,i,d", 0, v6 },
 
 { "and",        F3(2, 0x01, 0), F3(~2, ~0x01, ~0)|ASI(~0),      "1,2,d", 0, v6 },
 { "and",        F3(2, 0x01, 1), F3(~2, ~0x01, ~1),              "1,i,d", 0, v6 },
@@ -1215,19 +1211,13 @@
 { "addcc",      F3(2, 0x10, 1), F3(~2, ~0x10, ~1),              "1,i,d", 0, v6 },
 { "addcc",      F3(2, 0x10, 1), F3(~2, ~0x10, ~1),              "i,1,d", 0, v6 },
 
-{ "addx",       F3(2, 0x08, 0), F3(~2, ~0x08, ~0)|ASI(~0),      "1,2,d", 0, v6notv9 },
-{ "addx",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "1,i,d", 0, v6notv9 },
-{ "addx",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "i,1,d", 0, v6notv9 },
-{ "addc",       F3(2, 0x08, 0), F3(~2, ~0x08, ~0)|ASI(~0),      "1,2,d", 0, v9 },
-{ "addc",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "1,i,d", 0, v9 },
-{ "addc",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "i,1,d", 0, v9 },
+{ "addc",       F3(2, 0x08, 0), F3(~2, ~0x08, ~0)|ASI(~0),      "1,2,d", 0, v6 },
+{ "addc",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "1,i,d", 0, v6 },
+{ "addc",       F3(2, 0x08, 1), F3(~2, ~0x08, ~1),              "i,1,d", 0, v6 },
 
-{ "addxcc",     F3(2, 0x18, 0), F3(~2, ~0x18, ~0)|ASI(~0),      "1,2,d", 0, v6notv9 },
-{ "addxcc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "1,i,d", 0, v6notv9 },
-{ "addxcc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "i,1,d", 0, v6notv9 },
-{ "addccc",     F3(2, 0x18, 0), F3(~2, ~0x18, ~0)|ASI(~0),      "1,2,d", 0, v9 },
-{ "addccc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "1,i,d", 0, v9 },
-{ "addccc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "i,1,d", 0, v9 },
+{ "addccc",     F3(2, 0x18, 0), F3(~2, ~0x18, ~0)|ASI(~0),      "1,2,d", 0, v6 },
+{ "addccc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "1,i,d", 0, v6 },
+{ "addccc",     F3(2, 0x18, 1), F3(~2, ~0x18, ~1),              "i,1,d", 0, v6 },
 
 { "smul",       F3(2, 0x0b, 0), F3(~2, ~0x0b, ~0)|ASI(~0),      "1,2,d", 0, v8 },
 { "smul",       F3(2, 0x0b, 1), F3(~2, ~0x0b, ~1),              "1,i,d", 0, v8 },
@@ -2042,6 +2032,10 @@
 
 #undef IMPDEP
 
+{ "addxc", F3F(2, 0x36, 0x011), F3F(~2, ~0x36, ~0x011), "1,2,d", 0, v9b },
+{ "addxccc", F3F(2, 0x36, 0x013), F3F(~2, ~0x36, ~0x013), "1,2,d", 0, v9b },
+{ "umulxhi", F3F(2, 0x36, 0x016), F3F(~2, ~0x36, ~0x016), "1,2,d", 0, v9b },
+
 };
 
 static const int sparc_num_opcodes = ((sizeof sparc_opcodes)/(sizeof sparc_opcodes[0]));
diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c
index 0e22bb9..111ec0e 100644
--- a/hw/audio/ac97.c
+++ b/hw/audio/ac97.c
@@ -1321,9 +1321,9 @@
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
-static void ac97_on_reset (void *opaque)
+static void ac97_on_reset (DeviceState *dev)
 {
-    AC97LinkState *s = opaque;
+    AC97LinkState *s = container_of(dev, AC97LinkState, dev.qdev);
 
     reset_bm_regs (s, &s->bm_regs[0]);
     reset_bm_regs (s, &s->bm_regs[1]);
@@ -1382,9 +1382,8 @@
                            "ac97-nabm", 256);
     pci_register_bar (&s->dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &s->io_nam);
     pci_register_bar (&s->dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &s->io_nabm);
-    qemu_register_reset (ac97_on_reset, s);
     AUD_register_card ("ac97", &s->card);
-    ac97_on_reset (s);
+    ac97_on_reset (&s->dev.qdev);
     return 0;
 }
 
@@ -1413,6 +1412,7 @@
     dc->desc = "Intel 82801AA AC97 Audio";
     dc->vmsd = &vmstate_ac97;
     dc->props = ac97_properties;
+    dc->reset = ac97_on_reset;
 }
 
 static const TypeInfo ac97_info = {
diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c
index db330e9..8a5b76c 100644
--- a/hw/display/cirrus_vga.c
+++ b/hw/display/cirrus_vga.c
@@ -29,6 +29,7 @@
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
 #include "ui/console.h"
+#include "ui/pixel_ops.h"
 #include "vga_int.h"
 #include "hw/loader.h"
 
@@ -2170,20 +2171,44 @@
     }
 }
 
-#define DEPTH 8
-#include "cirrus_vga_template.h"
+static void vga_draw_cursor_line(uint8_t *d1,
+                                 const uint8_t *src1,
+                                 int poffset, int w,
+                                 unsigned int color0,
+                                 unsigned int color1,
+                                 unsigned int color_xor)
+{
+    const uint8_t *plane0, *plane1;
+    int x, b0, b1;
+    uint8_t *d;
 
-#define DEPTH 16
-#include "cirrus_vga_template.h"
-
-#define DEPTH 32
-#include "cirrus_vga_template.h"
+    d = d1;
+    plane0 = src1;
+    plane1 = src1 + poffset;
+    for (x = 0; x < w; x++) {
+        b0 = (plane0[x >> 3] >> (7 - (x & 7))) & 1;
+        b1 = (plane1[x >> 3] >> (7 - (x & 7))) & 1;
+        switch (b0 | (b1 << 1)) {
+        case 0:
+            break;
+        case 1:
+            ((uint32_t *)d)[0] ^= color_xor;
+            break;
+        case 2:
+            ((uint32_t *)d)[0] = color0;
+            break;
+        case 3:
+            ((uint32_t *)d)[0] = color1;
+            break;
+        }
+        d += 4;
+    }
+}
 
 static void cirrus_cursor_draw_line(VGACommonState *s1, uint8_t *d1, int scr_y)
 {
     CirrusVGAState *s = container_of(s1, CirrusVGAState, vga);
-    DisplaySurface *surface = qemu_console_surface(s->vga.con);
-    int w, h, bpp, x1, x2, poffset;
+    int w, h, x1, x2, poffset;
     unsigned int color0, color1;
     const uint8_t *palette, *src;
     uint32_t content;
@@ -2212,6 +2237,8 @@
     } else {
         src += (s->vga.sr[0x13] & 0x3f) * 256;
         src += (scr_y - s->hw_cursor_y) * 4;
+
+
         poffset = 128;
         content = ((uint32_t *)src)[0] |
             ((uint32_t *)(src + 128))[0];
@@ -2229,30 +2256,14 @@
         x2 = s->vga.last_scr_width;
     w = x2 - x1;
     palette = s->cirrus_hidden_palette;
-    color0 = s->vga.rgb_to_pixel(c6_to_8(palette[0x0 * 3]),
-                                 c6_to_8(palette[0x0 * 3 + 1]),
-                                 c6_to_8(palette[0x0 * 3 + 2]));
-    color1 = s->vga.rgb_to_pixel(c6_to_8(palette[0xf * 3]),
-                                 c6_to_8(palette[0xf * 3 + 1]),
-                                 c6_to_8(palette[0xf * 3 + 2]));
-    bpp = surface_bytes_per_pixel(surface);
-    d1 += x1 * bpp;
-    switch (surface_bits_per_pixel(surface)) {
-    default:
-        break;
-    case 8:
-        vga_draw_cursor_line_8(d1, src, poffset, w, color0, color1, 0xff);
-        break;
-    case 15:
-        vga_draw_cursor_line_16(d1, src, poffset, w, color0, color1, 0x7fff);
-        break;
-    case 16:
-        vga_draw_cursor_line_16(d1, src, poffset, w, color0, color1, 0xffff);
-        break;
-    case 32:
-        vga_draw_cursor_line_32(d1, src, poffset, w, color0, color1, 0xffffff);
-        break;
-    }
+    color0 = rgb_to_pixel32(c6_to_8(palette[0x0 * 3]),
+                            c6_to_8(palette[0x0 * 3 + 1]),
+                            c6_to_8(palette[0x0 * 3 + 2]));
+    color1 = rgb_to_pixel32(c6_to_8(palette[0xf * 3]),
+                            c6_to_8(palette[0xf * 3 + 1]),
+                            c6_to_8(palette[0xf * 3 + 2]));
+    d1 += x1 * 4;
+    vga_draw_cursor_line(d1, src, poffset, w, color0, color1, 0xffffff);
 }
 
 /***************************************
diff --git a/hw/display/cirrus_vga_template.h b/hw/display/cirrus_vga_template.h
deleted file mode 100644
index 3b28280..0000000
--- a/hw/display/cirrus_vga_template.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * QEMU Cirrus VGA Emulator templates
- *
- * Copyright (c) 2003 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#if DEPTH == 8
-#define BPP 1
-#elif DEPTH == 15 || DEPTH == 16
-#define BPP 2
-#elif DEPTH == 32
-#define BPP 4
-#else
-#error unsupported depth
-#endif
-
-static void glue(vga_draw_cursor_line_, DEPTH)(uint8_t *d1,
-                                               const uint8_t *src1,
-                                               int poffset, int w,
-                                               unsigned int color0,
-                                               unsigned int color1,
-                                               unsigned int color_xor)
-{
-    const uint8_t *plane0, *plane1;
-    int x, b0, b1;
-    uint8_t *d;
-
-    d = d1;
-    plane0 = src1;
-    plane1 = src1 + poffset;
-    for (x = 0; x < w; x++) {
-        b0 = (plane0[x >> 3] >> (7 - (x & 7))) & 1;
-        b1 = (plane1[x >> 3] >> (7 - (x & 7))) & 1;
-#if DEPTH == 8
-        switch (b0 | (b1 << 1)) {
-        case 0:
-            break;
-        case 1:
-            d[0] ^= color_xor;
-            break;
-        case 2:
-            d[0] = color0;
-            break;
-        case 3:
-            d[0] = color1;
-            break;
-        }
-#elif DEPTH == 16
-        switch (b0 | (b1 << 1)) {
-        case 0:
-            break;
-        case 1:
-            ((uint16_t *)d)[0] ^= color_xor;
-            break;
-        case 2:
-            ((uint16_t *)d)[0] = color0;
-            break;
-        case 3:
-            ((uint16_t *)d)[0] = color1;
-            break;
-        }
-#elif DEPTH == 32
-        switch (b0 | (b1 << 1)) {
-        case 0:
-            break;
-        case 1:
-            ((uint32_t *)d)[0] ^= color_xor;
-            break;
-        case 2:
-            ((uint32_t *)d)[0] = color0;
-            break;
-        case 3:
-            ((uint32_t *)d)[0] = color1;
-            break;
-        }
-#else
-#error unsupported depth
-#endif
-        d += BPP;
-    }
-}
-
-#undef DEPTH
-#undef BPP
diff --git a/hw/display/vga-helpers.h b/hw/display/vga-helpers.h
new file mode 100644
index 0000000..94f6de2
--- /dev/null
+++ b/hw/display/vga-helpers.h
@@ -0,0 +1,439 @@
+/*
+ * QEMU VGA Emulator templates
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+static inline void vga_draw_glyph_line(uint8_t *d, uint32_t font_data,
+                                       uint32_t xorcol, uint32_t bgcol)
+{
+        ((uint32_t *)d)[0] = (-((font_data >> 7)) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[1] = (-((font_data >> 6) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[2] = (-((font_data >> 5) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[3] = (-((font_data >> 4) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[4] = (-((font_data >> 3) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[5] = (-((font_data >> 2) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[6] = (-((font_data >> 1) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[7] = (-((font_data >> 0) & 1) & xorcol) ^ bgcol;
+}
+
+static void vga_draw_glyph8(uint8_t *d, int linesize,
+                            const uint8_t *font_ptr, int h,
+                            uint32_t fgcol, uint32_t bgcol)
+{
+    uint32_t font_data, xorcol;
+
+    xorcol = bgcol ^ fgcol;
+    do {
+        font_data = font_ptr[0];
+        vga_draw_glyph_line(d, font_data, xorcol, bgcol);
+        font_ptr += 4;
+        d += linesize;
+    } while (--h);
+}
+
+static void vga_draw_glyph16(uint8_t *d, int linesize,
+                                          const uint8_t *font_ptr, int h,
+                                          uint32_t fgcol, uint32_t bgcol)
+{
+    uint32_t font_data, xorcol;
+
+    xorcol = bgcol ^ fgcol;
+    do {
+        font_data = font_ptr[0];
+        vga_draw_glyph_line(d, expand4to8[font_data >> 4],
+                            xorcol, bgcol);
+        vga_draw_glyph_line(d + 32, expand4to8[font_data & 0x0f],
+                            xorcol, bgcol);
+        font_ptr += 4;
+        d += linesize;
+    } while (--h);
+}
+
+static void vga_draw_glyph9(uint8_t *d, int linesize,
+                            const uint8_t *font_ptr, int h,
+                            uint32_t fgcol, uint32_t bgcol, int dup9)
+{
+    uint32_t font_data, xorcol, v;
+
+    xorcol = bgcol ^ fgcol;
+    do {
+        font_data = font_ptr[0];
+        ((uint32_t *)d)[0] = (-((font_data >> 7)) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[1] = (-((font_data >> 6) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[2] = (-((font_data >> 5) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[3] = (-((font_data >> 4) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[4] = (-((font_data >> 3) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[5] = (-((font_data >> 2) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[6] = (-((font_data >> 1) & 1) & xorcol) ^ bgcol;
+        v = (-((font_data >> 0) & 1) & xorcol) ^ bgcol;
+        ((uint32_t *)d)[7] = v;
+        if (dup9)
+            ((uint32_t *)d)[8] = v;
+        else
+            ((uint32_t *)d)[8] = bgcol;
+        font_ptr += 4;
+        d += linesize;
+    } while (--h);
+}
+
+/*
+ * 4 color mode
+ */
+static void vga_draw_line2(VGACommonState *s1, uint8_t *d,
+                           const uint8_t *s, int width)
+{
+    uint32_t plane_mask, *palette, data, v;
+    int x;
+
+    palette = s1->last_palette;
+    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        data = ((uint32_t *)s)[0];
+        data &= plane_mask;
+        v = expand2[GET_PLANE(data, 0)];
+        v |= expand2[GET_PLANE(data, 2)] << 2;
+        ((uint32_t *)d)[0] = palette[v >> 12];
+        ((uint32_t *)d)[1] = palette[(v >> 8) & 0xf];
+        ((uint32_t *)d)[2] = palette[(v >> 4) & 0xf];
+        ((uint32_t *)d)[3] = palette[(v >> 0) & 0xf];
+
+        v = expand2[GET_PLANE(data, 1)];
+        v |= expand2[GET_PLANE(data, 3)] << 2;
+        ((uint32_t *)d)[4] = palette[v >> 12];
+        ((uint32_t *)d)[5] = palette[(v >> 8) & 0xf];
+        ((uint32_t *)d)[6] = palette[(v >> 4) & 0xf];
+        ((uint32_t *)d)[7] = palette[(v >> 0) & 0xf];
+        d += 32;
+        s += 4;
+    }
+}
+
+#define PUT_PIXEL2(d, n, v) \
+((uint32_t *)d)[2*(n)] = ((uint32_t *)d)[2*(n)+1] = (v)
+
+/*
+ * 4 color mode, dup2 horizontal
+ */
+static void vga_draw_line2d2(VGACommonState *s1, uint8_t *d,
+                             const uint8_t *s, int width)
+{
+    uint32_t plane_mask, *palette, data, v;
+    int x;
+
+    palette = s1->last_palette;
+    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        data = ((uint32_t *)s)[0];
+        data &= plane_mask;
+        v = expand2[GET_PLANE(data, 0)];
+        v |= expand2[GET_PLANE(data, 2)] << 2;
+        PUT_PIXEL2(d, 0, palette[v >> 12]);
+        PUT_PIXEL2(d, 1, palette[(v >> 8) & 0xf]);
+        PUT_PIXEL2(d, 2, palette[(v >> 4) & 0xf]);
+        PUT_PIXEL2(d, 3, palette[(v >> 0) & 0xf]);
+
+        v = expand2[GET_PLANE(data, 1)];
+        v |= expand2[GET_PLANE(data, 3)] << 2;
+        PUT_PIXEL2(d, 4, palette[v >> 12]);
+        PUT_PIXEL2(d, 5, palette[(v >> 8) & 0xf]);
+        PUT_PIXEL2(d, 6, palette[(v >> 4) & 0xf]);
+        PUT_PIXEL2(d, 7, palette[(v >> 0) & 0xf]);
+        d += 64;
+        s += 4;
+    }
+}
+
+/*
+ * 16 color mode
+ */
+static void vga_draw_line4(VGACommonState *s1, uint8_t *d,
+                           const uint8_t *s, int width)
+{
+    uint32_t plane_mask, data, v, *palette;
+    int x;
+
+    palette = s1->last_palette;
+    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        data = ((uint32_t *)s)[0];
+        data &= plane_mask;
+        v = expand4[GET_PLANE(data, 0)];
+        v |= expand4[GET_PLANE(data, 1)] << 1;
+        v |= expand4[GET_PLANE(data, 2)] << 2;
+        v |= expand4[GET_PLANE(data, 3)] << 3;
+        ((uint32_t *)d)[0] = palette[v >> 28];
+        ((uint32_t *)d)[1] = palette[(v >> 24) & 0xf];
+        ((uint32_t *)d)[2] = palette[(v >> 20) & 0xf];
+        ((uint32_t *)d)[3] = palette[(v >> 16) & 0xf];
+        ((uint32_t *)d)[4] = palette[(v >> 12) & 0xf];
+        ((uint32_t *)d)[5] = palette[(v >> 8) & 0xf];
+        ((uint32_t *)d)[6] = palette[(v >> 4) & 0xf];
+        ((uint32_t *)d)[7] = palette[(v >> 0) & 0xf];
+        d += 32;
+        s += 4;
+    }
+}
+
+/*
+ * 16 color mode, dup2 horizontal
+ */
+static void vga_draw_line4d2(VGACommonState *s1, uint8_t *d,
+                             const uint8_t *s, int width)
+{
+    uint32_t plane_mask, data, v, *palette;
+    int x;
+
+    palette = s1->last_palette;
+    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        data = ((uint32_t *)s)[0];
+        data &= plane_mask;
+        v = expand4[GET_PLANE(data, 0)];
+        v |= expand4[GET_PLANE(data, 1)] << 1;
+        v |= expand4[GET_PLANE(data, 2)] << 2;
+        v |= expand4[GET_PLANE(data, 3)] << 3;
+        PUT_PIXEL2(d, 0, palette[v >> 28]);
+        PUT_PIXEL2(d, 1, palette[(v >> 24) & 0xf]);
+        PUT_PIXEL2(d, 2, palette[(v >> 20) & 0xf]);
+        PUT_PIXEL2(d, 3, palette[(v >> 16) & 0xf]);
+        PUT_PIXEL2(d, 4, palette[(v >> 12) & 0xf]);
+        PUT_PIXEL2(d, 5, palette[(v >> 8) & 0xf]);
+        PUT_PIXEL2(d, 6, palette[(v >> 4) & 0xf]);
+        PUT_PIXEL2(d, 7, palette[(v >> 0) & 0xf]);
+        d += 64;
+        s += 4;
+    }
+}
+
+/*
+ * 256 color mode, double pixels
+ *
+ * XXX: add plane_mask support (never used in standard VGA modes)
+ */
+static void vga_draw_line8d2(VGACommonState *s1, uint8_t *d,
+                             const uint8_t *s, int width)
+{
+    uint32_t *palette;
+    int x;
+
+    palette = s1->last_palette;
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        PUT_PIXEL2(d, 0, palette[s[0]]);
+        PUT_PIXEL2(d, 1, palette[s[1]]);
+        PUT_PIXEL2(d, 2, palette[s[2]]);
+        PUT_PIXEL2(d, 3, palette[s[3]]);
+        d += 32;
+        s += 4;
+    }
+}
+
+/*
+ * standard 256 color mode
+ *
+ * XXX: add plane_mask support (never used in standard VGA modes)
+ */
+static void vga_draw_line8(VGACommonState *s1, uint8_t *d,
+                           const uint8_t *s, int width)
+{
+    uint32_t *palette;
+    int x;
+
+    palette = s1->last_palette;
+    width >>= 3;
+    for(x = 0; x < width; x++) {
+        ((uint32_t *)d)[0] = palette[s[0]];
+        ((uint32_t *)d)[1] = palette[s[1]];
+        ((uint32_t *)d)[2] = palette[s[2]];
+        ((uint32_t *)d)[3] = palette[s[3]];
+        ((uint32_t *)d)[4] = palette[s[4]];
+        ((uint32_t *)d)[5] = palette[s[5]];
+        ((uint32_t *)d)[6] = palette[s[6]];
+        ((uint32_t *)d)[7] = palette[s[7]];
+        d += 32;
+        s += 8;
+    }
+}
+
+/*
+ * 15 bit color
+ */
+static void vga_draw_line15_le(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t v, r, g, b;
+
+    w = width;
+    do {
+        v = lduw_le_p((void *)s);
+        r = (v >> 7) & 0xf8;
+        g = (v >> 2) & 0xf8;
+        b = (v << 3) & 0xf8;
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 2;
+        d += 4;
+    } while (--w != 0);
+}
+
+static void vga_draw_line15_be(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t v, r, g, b;
+
+    w = width;
+    do {
+        v = lduw_be_p((void *)s);
+        r = (v >> 7) & 0xf8;
+        g = (v >> 2) & 0xf8;
+        b = (v << 3) & 0xf8;
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 2;
+        d += 4;
+    } while (--w != 0);
+}
+
+/*
+ * 16 bit color
+ */
+static void vga_draw_line16_le(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t v, r, g, b;
+
+    w = width;
+    do {
+        v = lduw_le_p((void *)s);
+        r = (v >> 8) & 0xf8;
+        g = (v >> 3) & 0xfc;
+        b = (v << 3) & 0xf8;
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 2;
+        d += 4;
+    } while (--w != 0);
+}
+
+static void vga_draw_line16_be(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t v, r, g, b;
+
+    w = width;
+    do {
+        v = lduw_be_p((void *)s);
+        r = (v >> 8) & 0xf8;
+        g = (v >> 3) & 0xfc;
+        b = (v << 3) & 0xf8;
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 2;
+        d += 4;
+    } while (--w != 0);
+}
+
+/*
+ * 24 bit color
+ */
+static void vga_draw_line24_le(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t r, g, b;
+
+    w = width;
+    do {
+        b = s[0];
+        g = s[1];
+        r = s[2];
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 3;
+        d += 4;
+    } while (--w != 0);
+}
+
+static void vga_draw_line24_be(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+    int w;
+    uint32_t r, g, b;
+
+    w = width;
+    do {
+        r = s[0];
+        g = s[1];
+        b = s[2];
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 3;
+        d += 4;
+    } while (--w != 0);
+}
+
+/*
+ * 32 bit color
+ */
+static void vga_draw_line32_le(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+#ifndef HOST_WORDS_BIGENDIAN
+    memcpy(d, s, width * 4);
+#else
+    int w;
+    uint32_t r, g, b;
+
+    w = width;
+    do {
+        b = s[0];
+        g = s[1];
+        r = s[2];
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 4;
+        d += 4;
+    } while (--w != 0);
+#endif
+}
+
+static void vga_draw_line32_be(VGACommonState *s1, uint8_t *d,
+                               const uint8_t *s, int width)
+{
+#ifdef HOST_WORDS_BIGENDIAN
+    memcpy(d, s, width * 4);
+#else
+    int w;
+    uint32_t r, g, b;
+
+    w = width;
+    do {
+        r = s[1];
+        g = s[2];
+        b = s[3];
+        ((uint32_t *)d)[0] = rgb_to_pixel32(r, g, b);
+        s += 4;
+        d += 4;
+    } while (--w != 0);
+#endif
+}
diff --git a/hw/display/vga.c b/hw/display/vga.c
index df0c010..19e7f23 100644
--- a/hw/display/vga.c
+++ b/hw/display/vga.c
@@ -761,14 +761,13 @@
                 s->vbe_regs[VBE_DISPI_INDEX_ENABLE] |= VBE_DISPI_ENABLED;
                 vbe_fixup_regs(s);
 
-                /* clear the screen (should be done in BIOS) */
+                /* clear the screen */
                 if (!(val & VBE_DISPI_NOCLEARMEM)) {
                     memset(s->vram_ptr, 0,
                            s->vbe_regs[VBE_DISPI_INDEX_YRES] * s->vbe_line_offset);
                 }
 
-                /* we initialize the VGA graphic mode (should be done
-                   in BIOS) */
+                /* we initialize the VGA graphic mode */
                 /* graphic mode + memory map 1 */
                 s->gr[VGA_GFX_MISC] = (s->gr[VGA_GFX_MISC] & ~0x0c) | 0x04 |
                     VGA_GR06_GRAPHICS_MODE;
@@ -801,7 +800,6 @@
                     (shift_control << 5);
                 s->cr[VGA_CRTC_MAX_SCAN] &= ~0x9f; /* no double scan */
             } else {
-                /* XXX: the bios should do that */
                 s->bank_offset = 0;
             }
             s->dac_8bit = (val & VBE_DISPI_8BIT_DAC) > 0;
@@ -1006,95 +1004,10 @@
     }
 }
 
-typedef void vga_draw_glyph8_func(uint8_t *d, int linesize,
-                             const uint8_t *font_ptr, int h,
-                             uint32_t fgcol, uint32_t bgcol);
-typedef void vga_draw_glyph9_func(uint8_t *d, int linesize,
-                                  const uint8_t *font_ptr, int h,
-                                  uint32_t fgcol, uint32_t bgcol, int dup9);
 typedef void vga_draw_line_func(VGACommonState *s1, uint8_t *d,
                                 const uint8_t *s, int width);
 
-#define DEPTH 8
-#include "vga_template.h"
-
-#define DEPTH 15
-#include "vga_template.h"
-
-#define BGR_FORMAT
-#define DEPTH 15
-#include "vga_template.h"
-
-#define DEPTH 16
-#include "vga_template.h"
-
-#define BGR_FORMAT
-#define DEPTH 16
-#include "vga_template.h"
-
-#define DEPTH 32
-#include "vga_template.h"
-
-#define BGR_FORMAT
-#define DEPTH 32
-#include "vga_template.h"
-
-static unsigned int rgb_to_pixel8_dup(unsigned int r, unsigned int g, unsigned b)
-{
-    unsigned int col;
-    col = rgb_to_pixel8(r, g, b);
-    col |= col << 8;
-    col |= col << 16;
-    return col;
-}
-
-static unsigned int rgb_to_pixel15_dup(unsigned int r, unsigned int g, unsigned b)
-{
-    unsigned int col;
-    col = rgb_to_pixel15(r, g, b);
-    col |= col << 16;
-    return col;
-}
-
-static unsigned int rgb_to_pixel15bgr_dup(unsigned int r, unsigned int g,
-                                          unsigned int b)
-{
-    unsigned int col;
-    col = rgb_to_pixel15bgr(r, g, b);
-    col |= col << 16;
-    return col;
-}
-
-static unsigned int rgb_to_pixel16_dup(unsigned int r, unsigned int g, unsigned b)
-{
-    unsigned int col;
-    col = rgb_to_pixel16(r, g, b);
-    col |= col << 16;
-    return col;
-}
-
-static unsigned int rgb_to_pixel16bgr_dup(unsigned int r, unsigned int g,
-                                          unsigned int b)
-{
-    unsigned int col;
-    col = rgb_to_pixel16bgr(r, g, b);
-    col |= col << 16;
-    return col;
-}
-
-static unsigned int rgb_to_pixel32_dup(unsigned int r, unsigned int g, unsigned b)
-{
-    unsigned int col;
-    col = rgb_to_pixel32(r, g, b);
-    return col;
-}
-
-static unsigned int rgb_to_pixel32bgr_dup(unsigned int r, unsigned int g, unsigned b)
-{
-    unsigned int col;
-    col = rgb_to_pixel32bgr(r, g, b);
-    return col;
-}
+#include "vga-helpers.h"
 
 /* return true if the palette was modified */
 static int update_palette16(VGACommonState *s)
@@ -1112,9 +1025,9 @@
             v = ((s->ar[VGA_ATC_COLOR_PAGE] & 0xc) << 4) | (v & 0x3f);
         }
         v = v * 3;
-        col = s->rgb_to_pixel(c6_to_8(s->palette[v]),
-                              c6_to_8(s->palette[v + 1]),
-                              c6_to_8(s->palette[v + 2]));
+        col = rgb_to_pixel32(c6_to_8(s->palette[v]),
+                             c6_to_8(s->palette[v + 1]),
+                             c6_to_8(s->palette[v + 2]));
         if (col != palette[i]) {
             full_update = 1;
             palette[i] = col;
@@ -1134,13 +1047,13 @@
     v = 0;
     for(i = 0; i < 256; i++) {
         if (s->dac_8bit) {
-          col = s->rgb_to_pixel(s->palette[v],
-                                s->palette[v + 1],
-                                s->palette[v + 2]);
+            col = rgb_to_pixel32(s->palette[v],
+                                 s->palette[v + 1],
+                                 s->palette[v + 2]);
         } else {
-          col = s->rgb_to_pixel(c6_to_8(s->palette[v]),
-                                c6_to_8(s->palette[v + 1]),
-                                c6_to_8(s->palette[v + 2]));
+            col = rgb_to_pixel32(c6_to_8(s->palette[v]),
+                                 c6_to_8(s->palette[v + 1]),
+                                 c6_to_8(s->palette[v + 2]));
         }
         if (col != palette[i]) {
             full_update = 1;
@@ -1202,56 +1115,6 @@
     return full_update;
 }
 
-#define NB_DEPTHS 7
-
-static inline int get_depth_index(DisplaySurface *s)
-{
-    switch (surface_bits_per_pixel(s)) {
-    default:
-    case 8:
-        return 0;
-    case 15:
-        return 1;
-    case 16:
-        return 2;
-    case 32:
-        if (is_surface_bgr(s)) {
-            return 4;
-        } else {
-            return 3;
-        }
-    }
-}
-
-static vga_draw_glyph8_func * const vga_draw_glyph8_table[NB_DEPTHS] = {
-    vga_draw_glyph8_8,
-    vga_draw_glyph8_16,
-    vga_draw_glyph8_16,
-    vga_draw_glyph8_32,
-    vga_draw_glyph8_32,
-    vga_draw_glyph8_16,
-    vga_draw_glyph8_16,
-};
-
-static vga_draw_glyph8_func * const vga_draw_glyph16_table[NB_DEPTHS] = {
-    vga_draw_glyph16_8,
-    vga_draw_glyph16_16,
-    vga_draw_glyph16_16,
-    vga_draw_glyph16_32,
-    vga_draw_glyph16_32,
-    vga_draw_glyph16_16,
-    vga_draw_glyph16_16,
-};
-
-static vga_draw_glyph9_func * const vga_draw_glyph9_table[NB_DEPTHS] = {
-    vga_draw_glyph9_8,
-    vga_draw_glyph9_16,
-    vga_draw_glyph9_16,
-    vga_draw_glyph9_32,
-    vga_draw_glyph9_32,
-    vga_draw_glyph9_16,
-    vga_draw_glyph9_16,
-};
 
 static const uint8_t cursor_glyph[32 * 4] = {
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -1303,18 +1166,6 @@
     *pcheight = cheight;
 }
 
-typedef unsigned int rgb_to_pixel_dup_func(unsigned int r, unsigned int g, unsigned b);
-
-static rgb_to_pixel_dup_func * const rgb_to_pixel_dup_table[NB_DEPTHS] = {
-    rgb_to_pixel8_dup,
-    rgb_to_pixel15_dup,
-    rgb_to_pixel16_dup,
-    rgb_to_pixel32_dup,
-    rgb_to_pixel32bgr_dup,
-    rgb_to_pixel15bgr_dup,
-    rgb_to_pixel16bgr_dup,
-};
-
 /*
  * Text mode update
  * Missing:
@@ -1331,11 +1182,9 @@
     uint32_t offset, fgcol, bgcol, v, cursor_offset;
     uint8_t *d1, *d, *src, *dest, *cursor_ptr;
     const uint8_t *font_ptr, *font_base[2];
-    int dup9, line_offset, depth_index;
+    int dup9, line_offset;
     uint32_t *palette;
     uint32_t *ch_attr_ptr;
-    vga_draw_glyph8_func *vga_draw_glyph8;
-    vga_draw_glyph9_func *vga_draw_glyph9;
     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 
     /* compute font data address (in plane 2) */
@@ -1387,8 +1236,6 @@
         s->last_cw = cw;
         full_update = 1;
     }
-    s->rgb_to_pixel =
-        rgb_to_pixel_dup_table[get_depth_index(surface)];
     full_update |= update_palette16(s);
     palette = s->last_palette;
     x_incr = cw * surface_bytes_per_pixel(surface);
@@ -1422,13 +1269,6 @@
         s->cursor_visible_phase = !s->cursor_visible_phase;
     }
 
-    depth_index = get_depth_index(surface);
-    if (cw == 16)
-        vga_draw_glyph8 = vga_draw_glyph16_table[depth_index];
-    else
-        vga_draw_glyph8 = vga_draw_glyph8_table[depth_index];
-    vga_draw_glyph9 = vga_draw_glyph9_table[depth_index];
-
     dest = surface_data(surface);
     linesize = surface_stride(surface);
     ch_attr_ptr = s->last_ch_attr;
@@ -1458,7 +1298,10 @@
                 font_ptr += 32 * 4 * ch;
                 bgcol = palette[cattr >> 4];
                 fgcol = palette[cattr & 0x0f];
-                if (cw != 9) {
+                if (cw == 16) {
+                    vga_draw_glyph16(d1, linesize,
+                                     font_ptr, cheight, fgcol, bgcol);
+                } else if (cw != 9) {
                     vga_draw_glyph8(d1, linesize,
                                     font_ptr, cheight, fgcol, bgcol);
                 } else {
@@ -1483,7 +1326,10 @@
                     if (line_last >= line_start && line_start < cheight) {
                         h = line_last - line_start + 1;
                         d = d1 + linesize * line_start;
-                        if (cw != 9) {
+                        if (cw == 16) {
+                            vga_draw_glyph16(d, linesize,
+                                             cursor_glyph, h, fgcol, bgcol);
+                        } else if (cw != 9) {
                             vga_draw_glyph8(d, linesize,
                                             cursor_glyph, h, fgcol, bgcol);
                         } else {
@@ -1518,93 +1364,32 @@
     VGA_DRAW_LINE4D2,
     VGA_DRAW_LINE8D2,
     VGA_DRAW_LINE8,
-    VGA_DRAW_LINE15,
-    VGA_DRAW_LINE16,
-    VGA_DRAW_LINE24,
-    VGA_DRAW_LINE32,
+    VGA_DRAW_LINE15_LE,
+    VGA_DRAW_LINE16_LE,
+    VGA_DRAW_LINE24_LE,
+    VGA_DRAW_LINE32_LE,
+    VGA_DRAW_LINE15_BE,
+    VGA_DRAW_LINE16_BE,
+    VGA_DRAW_LINE24_BE,
+    VGA_DRAW_LINE32_BE,
     VGA_DRAW_LINE_NB,
 };
 
-static vga_draw_line_func * const vga_draw_line_table[NB_DEPTHS * VGA_DRAW_LINE_NB] = {
-    vga_draw_line2_8,
-    vga_draw_line2_16,
-    vga_draw_line2_16,
-    vga_draw_line2_32,
-    vga_draw_line2_32,
-    vga_draw_line2_16,
-    vga_draw_line2_16,
-
-    vga_draw_line2d2_8,
-    vga_draw_line2d2_16,
-    vga_draw_line2d2_16,
-    vga_draw_line2d2_32,
-    vga_draw_line2d2_32,
-    vga_draw_line2d2_16,
-    vga_draw_line2d2_16,
-
-    vga_draw_line4_8,
-    vga_draw_line4_16,
-    vga_draw_line4_16,
-    vga_draw_line4_32,
-    vga_draw_line4_32,
-    vga_draw_line4_16,
-    vga_draw_line4_16,
-
-    vga_draw_line4d2_8,
-    vga_draw_line4d2_16,
-    vga_draw_line4d2_16,
-    vga_draw_line4d2_32,
-    vga_draw_line4d2_32,
-    vga_draw_line4d2_16,
-    vga_draw_line4d2_16,
-
-    vga_draw_line8d2_8,
-    vga_draw_line8d2_16,
-    vga_draw_line8d2_16,
-    vga_draw_line8d2_32,
-    vga_draw_line8d2_32,
-    vga_draw_line8d2_16,
-    vga_draw_line8d2_16,
-
-    vga_draw_line8_8,
-    vga_draw_line8_16,
-    vga_draw_line8_16,
-    vga_draw_line8_32,
-    vga_draw_line8_32,
-    vga_draw_line8_16,
-    vga_draw_line8_16,
-
-    vga_draw_line15_8,
-    vga_draw_line15_15,
-    vga_draw_line15_16,
-    vga_draw_line15_32,
-    vga_draw_line15_32bgr,
-    vga_draw_line15_15bgr,
-    vga_draw_line15_16bgr,
-
-    vga_draw_line16_8,
-    vga_draw_line16_15,
-    vga_draw_line16_16,
-    vga_draw_line16_32,
-    vga_draw_line16_32bgr,
-    vga_draw_line16_15bgr,
-    vga_draw_line16_16bgr,
-
-    vga_draw_line24_8,
-    vga_draw_line24_15,
-    vga_draw_line24_16,
-    vga_draw_line24_32,
-    vga_draw_line24_32bgr,
-    vga_draw_line24_15bgr,
-    vga_draw_line24_16bgr,
-
-    vga_draw_line32_8,
-    vga_draw_line32_15,
-    vga_draw_line32_16,
-    vga_draw_line32_32,
-    vga_draw_line32_32bgr,
-    vga_draw_line32_15bgr,
-    vga_draw_line32_16bgr,
+static vga_draw_line_func * const vga_draw_line_table[VGA_DRAW_LINE_NB] = {
+    vga_draw_line2,
+    vga_draw_line2d2,
+    vga_draw_line4,
+    vga_draw_line4d2,
+    vga_draw_line8d2,
+    vga_draw_line8,
+    vga_draw_line15_le,
+    vga_draw_line16_le,
+    vga_draw_line24_le,
+    vga_draw_line32_le,
+    vga_draw_line15_be,
+    vga_draw_line16_be,
+    vga_draw_line24_be,
+    vga_draw_line32_be,
 };
 
 static int vga_get_bpp(VGACommonState *s)
@@ -1676,11 +1461,11 @@
     int disp_width, multi_scan, multi_run;
     uint8_t *d;
     uint32_t v, addr1, addr;
-    vga_draw_line_func *vga_draw_line;
-#if defined(HOST_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN)
-    static const bool byteswap = false;
+    vga_draw_line_func *vga_draw_line = NULL;
+#ifdef HOST_WORDS_BIGENDIAN
+    bool byteswap = !s->big_endian_fb;
 #else
-    static const bool byteswap = true;
+    bool byteswap = s->big_endian_fb;
 #endif
 
     full_update |= update_basic_params(s);
@@ -1723,7 +1508,8 @@
     if (s->line_offset != s->last_line_offset ||
         disp_width != s->last_width ||
         height != s->last_height ||
-        s->last_depth != depth) {
+        s->last_depth != depth ||
+        s->last_byteswap != byteswap) {
         if (depth == 32 || (depth == 16 && !byteswap)) {
             pixman_format_code_t format =
                 qemu_default_pixman_format(depth, !byteswap);
@@ -1741,6 +1527,7 @@
         s->last_height = height;
         s->last_line_offset = s->line_offset;
         s->last_depth = depth;
+        s->last_byteswap = byteswap;
         full_update = 1;
     } else if (is_buffer_shared(surface) &&
                (full_update || surface_data(surface) != s->vram_ptr
@@ -1753,9 +1540,6 @@
         dpy_gfx_replace_surface(s->con, surface);
     }
 
-    s->rgb_to_pixel =
-        rgb_to_pixel_dup_table[get_depth_index(surface)];
-
     if (shift_control == 0) {
         full_update |= update_palette16(s);
         if (s->sr[VGA_SEQ_CLOCK_MODE] & 8) {
@@ -1786,25 +1570,24 @@
             bits = 8;
             break;
         case 15:
-            v = VGA_DRAW_LINE15;
+            v = s->big_endian_fb ? VGA_DRAW_LINE15_BE : VGA_DRAW_LINE15_LE;
             bits = 16;
             break;
         case 16:
-            v = VGA_DRAW_LINE16;
+            v = s->big_endian_fb ? VGA_DRAW_LINE16_BE : VGA_DRAW_LINE16_LE;
             bits = 16;
             break;
         case 24:
-            v = VGA_DRAW_LINE24;
+            v = s->big_endian_fb ? VGA_DRAW_LINE24_BE : VGA_DRAW_LINE24_LE;
             bits = 24;
             break;
         case 32:
-            v = VGA_DRAW_LINE32;
+            v = s->big_endian_fb ? VGA_DRAW_LINE32_BE : VGA_DRAW_LINE32_LE;
             bits = 32;
             break;
         }
     }
-    vga_draw_line = vga_draw_line_table[v * NB_DEPTHS +
-                                        get_depth_index(surface)];
+    vga_draw_line = vga_draw_line_table[v];
 
     if (!is_buffer_shared(surface) && s->cursor_invalidate) {
         s->cursor_invalidate(s);
@@ -1894,7 +1677,7 @@
 static void vga_draw_blank(VGACommonState *s, int full_update)
 {
     DisplaySurface *surface = qemu_console_surface(s->con);
-    int i, w, val;
+    int i, w;
     uint8_t *d;
 
     if (!full_update)
@@ -1902,17 +1685,10 @@
     if (s->last_scr_width <= 0 || s->last_scr_height <= 0)
         return;
 
-    s->rgb_to_pixel =
-        rgb_to_pixel_dup_table[get_depth_index(surface)];
-    if (surface_bits_per_pixel(surface) == 8) {
-        val = s->rgb_to_pixel(0, 0, 0);
-    } else {
-        val = 0;
-    }
     w = s->last_scr_width * surface_bytes_per_pixel(surface);
     d = surface_data(surface);
     for(i = 0; i < s->last_scr_height; i++) {
-        memset(d, val, w);
+        memset(d, 0, w);
         d += surface_stride(surface);
     }
     dpy_gfx_update(s->con, 0, 0,
@@ -2015,6 +1791,7 @@
     s->cursor_start = 0;
     s->cursor_end = 0;
     s->cursor_offset = 0;
+    s->big_endian_fb = s->default_endian_fb;
     memset(s->invalidated_y_table, '\0', sizeof(s->invalidated_y_table));
     memset(s->last_palette, '\0', sizeof(s->last_palette));
     memset(s->last_ch_attr, '\0', sizeof(s->last_ch_attr));
@@ -2246,6 +2023,28 @@
     return 0;
 }
 
+static bool vga_endian_state_needed(void *opaque)
+{
+    VGACommonState *s = opaque;
+
+    /*
+     * Only send the endian state if it's different from the
+     * default one, thus ensuring backward compatibility for
+     * migration of the common case
+     */
+    return s->default_endian_fb != s->big_endian_fb;
+}
+
+const VMStateDescription vmstate_vga_endian = {
+    .name = "vga.endian",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(big_endian_fb, VGACommonState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 const VMStateDescription vmstate_vga_common = {
     .name = "vga",
     .version_id = 2,
@@ -2282,6 +2081,14 @@
         VMSTATE_UINT32(vbe_line_offset, VGACommonState),
         VMSTATE_UINT32(vbe_bank_mask, VGACommonState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (VMStateSubsection []) {
+        {
+            .vmsd = &vmstate_vga_endian,
+            .needed = vga_endian_state_needed,
+        }, {
+            /* empty */
+        }
     }
 };
 
@@ -2350,6 +2157,17 @@
         s->update_retrace_info = vga_precise_update_retrace_info;
         break;
     }
+
+    /*
+     * Set default fb endian based on target, could probably be turned
+     * into a device attribute set by the machine/platform to remove
+     * all target endian dependencies from this file.
+     */
+#ifdef TARGET_WORDS_BIGENDIAN
+    s->default_endian_fb = true;
+#else
+    s->default_endian_fb = false;
+#endif
     vga_dirty_log_start(s);
 }
 
diff --git a/hw/display/vga_int.h b/hw/display/vga_int.h
index bbc0cb2..ed69e06 100644
--- a/hw/display/vga_int.h
+++ b/hw/display/vga_int.h
@@ -150,15 +150,16 @@
     uint32_t last_width, last_height; /* in chars or pixels */
     uint32_t last_scr_width, last_scr_height; /* in pixels */
     uint32_t last_depth; /* in bits */
+    bool last_byteswap;
     uint8_t cursor_start, cursor_end;
     bool cursor_visible_phase;
     int64_t cursor_blink_time;
     uint32_t cursor_offset;
-    unsigned int (*rgb_to_pixel)(unsigned int r,
-                                 unsigned int g, unsigned b);
     const GraphicHwOps *hw_ops;
     bool full_update_text;
     bool full_update_gfx;
+    bool big_endian_fb;
+    bool default_endian_fb;
     /* hardware mouse cursor support */
     uint32_t invalidated_y_table[VGA_MAX_HEIGHT / 32];
     void (*cursor_invalidate)(struct VGACommonState *s);
diff --git a/hw/display/vga_template.h b/hw/display/vga_template.h
deleted file mode 100644
index 90ec9c2..0000000
--- a/hw/display/vga_template.h
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * QEMU VGA Emulator templates
- *
- * Copyright (c) 2003 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#if DEPTH == 8
-#define BPP 1
-#define PIXEL_TYPE uint8_t
-#elif DEPTH == 15 || DEPTH == 16
-#define BPP 2
-#define PIXEL_TYPE uint16_t
-#elif DEPTH == 32
-#define BPP 4
-#define PIXEL_TYPE uint32_t
-#else
-#error unsupport depth
-#endif
-
-#ifdef BGR_FORMAT
-#define PIXEL_NAME glue(DEPTH, bgr)
-#else
-#define PIXEL_NAME DEPTH
-#endif /* BGR_FORMAT */
-
-#if DEPTH != 15 && !defined(BGR_FORMAT)
-
-static inline void glue(vga_draw_glyph_line_, DEPTH)(uint8_t *d,
-                                                     uint32_t font_data,
-                                                     uint32_t xorcol,
-                                                     uint32_t bgcol)
-{
-#if BPP == 1
-        ((uint32_t *)d)[0] = (dmask16[(font_data >> 4)] & xorcol) ^ bgcol;
-        ((uint32_t *)d)[1] = (dmask16[(font_data >> 0) & 0xf] & xorcol) ^ bgcol;
-#elif BPP == 2
-        ((uint32_t *)d)[0] = (dmask4[(font_data >> 6)] & xorcol) ^ bgcol;
-        ((uint32_t *)d)[1] = (dmask4[(font_data >> 4) & 3] & xorcol) ^ bgcol;
-        ((uint32_t *)d)[2] = (dmask4[(font_data >> 2) & 3] & xorcol) ^ bgcol;
-        ((uint32_t *)d)[3] = (dmask4[(font_data >> 0) & 3] & xorcol) ^ bgcol;
-#else
-        ((uint32_t *)d)[0] = (-((font_data >> 7)) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[1] = (-((font_data >> 6) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[2] = (-((font_data >> 5) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[3] = (-((font_data >> 4) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[4] = (-((font_data >> 3) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[5] = (-((font_data >> 2) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[6] = (-((font_data >> 1) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[7] = (-((font_data >> 0) & 1) & xorcol) ^ bgcol;
-#endif
-}
-
-static void glue(vga_draw_glyph8_, DEPTH)(uint8_t *d, int linesize,
-                                          const uint8_t *font_ptr, int h,
-                                          uint32_t fgcol, uint32_t bgcol)
-{
-    uint32_t font_data, xorcol;
-
-    xorcol = bgcol ^ fgcol;
-    do {
-        font_data = font_ptr[0];
-        glue(vga_draw_glyph_line_, DEPTH)(d, font_data, xorcol, bgcol);
-        font_ptr += 4;
-        d += linesize;
-    } while (--h);
-}
-
-static void glue(vga_draw_glyph16_, DEPTH)(uint8_t *d, int linesize,
-                                          const uint8_t *font_ptr, int h,
-                                          uint32_t fgcol, uint32_t bgcol)
-{
-    uint32_t font_data, xorcol;
-
-    xorcol = bgcol ^ fgcol;
-    do {
-        font_data = font_ptr[0];
-        glue(vga_draw_glyph_line_, DEPTH)(d,
-                                          expand4to8[font_data >> 4],
-                                          xorcol, bgcol);
-        glue(vga_draw_glyph_line_, DEPTH)(d + 8 * BPP,
-                                          expand4to8[font_data & 0x0f],
-                                          xorcol, bgcol);
-        font_ptr += 4;
-        d += linesize;
-    } while (--h);
-}
-
-static void glue(vga_draw_glyph9_, DEPTH)(uint8_t *d, int linesize,
-                                          const uint8_t *font_ptr, int h,
-                                          uint32_t fgcol, uint32_t bgcol, int dup9)
-{
-    uint32_t font_data, xorcol, v;
-
-    xorcol = bgcol ^ fgcol;
-    do {
-        font_data = font_ptr[0];
-#if BPP == 1
-        stl_p((uint32_t *)d, (dmask16[(font_data >> 4)] & xorcol) ^ bgcol);
-        v = (dmask16[(font_data >> 0) & 0xf] & xorcol) ^ bgcol;
-        stl_p(((uint32_t *)d)+1, v);
-        if (dup9)
-            ((uint8_t *)d)[8] = v >> (24 * (1 - BIG));
-        else
-            ((uint8_t *)d)[8] = bgcol;
-
-#elif BPP == 2
-        stl_p(((uint32_t *)d)+0, (dmask4[(font_data >> 6)] & xorcol) ^ bgcol);
-        stl_p(((uint32_t *)d)+1,
-              (dmask4[(font_data >> 4) & 3] & xorcol) ^ bgcol);
-        stl_p(((uint32_t *)d)+2,
-              (dmask4[(font_data >> 2) & 3] & xorcol) ^ bgcol);
-        v = (dmask4[(font_data >> 0) & 3] & xorcol) ^ bgcol;
-        stl_p(((uint32_t *)d)+3, v);
-        if (dup9)
-            ((uint16_t *)d)[8] = v >> (16 * (1 - BIG));
-        else
-            ((uint16_t *)d)[8] = bgcol;
-#else
-        ((uint32_t *)d)[0] = (-((font_data >> 7)) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[1] = (-((font_data >> 6) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[2] = (-((font_data >> 5) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[3] = (-((font_data >> 4) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[4] = (-((font_data >> 3) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[5] = (-((font_data >> 2) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[6] = (-((font_data >> 1) & 1) & xorcol) ^ bgcol;
-        v = (-((font_data >> 0) & 1) & xorcol) ^ bgcol;
-        ((uint32_t *)d)[7] = v;
-        if (dup9)
-            ((uint32_t *)d)[8] = v;
-        else
-            ((uint32_t *)d)[8] = bgcol;
-#endif
-        font_ptr += 4;
-        d += linesize;
-    } while (--h);
-}
-
-/*
- * 4 color mode
- */
-static void glue(vga_draw_line2_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                         const uint8_t *s, int width)
-{
-    uint32_t plane_mask, *palette, data, v;
-    int x;
-
-    palette = s1->last_palette;
-    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        data = ((uint32_t *)s)[0];
-        data &= plane_mask;
-        v = expand2[GET_PLANE(data, 0)];
-        v |= expand2[GET_PLANE(data, 2)] << 2;
-        ((PIXEL_TYPE *)d)[0] = palette[v >> 12];
-        ((PIXEL_TYPE *)d)[1] = palette[(v >> 8) & 0xf];
-        ((PIXEL_TYPE *)d)[2] = palette[(v >> 4) & 0xf];
-        ((PIXEL_TYPE *)d)[3] = palette[(v >> 0) & 0xf];
-
-        v = expand2[GET_PLANE(data, 1)];
-        v |= expand2[GET_PLANE(data, 3)] << 2;
-        ((PIXEL_TYPE *)d)[4] = palette[v >> 12];
-        ((PIXEL_TYPE *)d)[5] = palette[(v >> 8) & 0xf];
-        ((PIXEL_TYPE *)d)[6] = palette[(v >> 4) & 0xf];
-        ((PIXEL_TYPE *)d)[7] = palette[(v >> 0) & 0xf];
-        d += BPP * 8;
-        s += 4;
-    }
-}
-
-#if BPP == 1
-#define PUT_PIXEL2(d, n, v) ((uint16_t *)d)[(n)] = (v)
-#elif BPP == 2
-#define PUT_PIXEL2(d, n, v) ((uint32_t *)d)[(n)] = (v)
-#else
-#define PUT_PIXEL2(d, n, v) \
-((uint32_t *)d)[2*(n)] = ((uint32_t *)d)[2*(n)+1] = (v)
-#endif
-
-/*
- * 4 color mode, dup2 horizontal
- */
-static void glue(vga_draw_line2d2_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                           const uint8_t *s, int width)
-{
-    uint32_t plane_mask, *palette, data, v;
-    int x;
-
-    palette = s1->last_palette;
-    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        data = ((uint32_t *)s)[0];
-        data &= plane_mask;
-        v = expand2[GET_PLANE(data, 0)];
-        v |= expand2[GET_PLANE(data, 2)] << 2;
-        PUT_PIXEL2(d, 0, palette[v >> 12]);
-        PUT_PIXEL2(d, 1, palette[(v >> 8) & 0xf]);
-        PUT_PIXEL2(d, 2, palette[(v >> 4) & 0xf]);
-        PUT_PIXEL2(d, 3, palette[(v >> 0) & 0xf]);
-
-        v = expand2[GET_PLANE(data, 1)];
-        v |= expand2[GET_PLANE(data, 3)] << 2;
-        PUT_PIXEL2(d, 4, palette[v >> 12]);
-        PUT_PIXEL2(d, 5, palette[(v >> 8) & 0xf]);
-        PUT_PIXEL2(d, 6, palette[(v >> 4) & 0xf]);
-        PUT_PIXEL2(d, 7, palette[(v >> 0) & 0xf]);
-        d += BPP * 16;
-        s += 4;
-    }
-}
-
-/*
- * 16 color mode
- */
-static void glue(vga_draw_line4_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                         const uint8_t *s, int width)
-{
-    uint32_t plane_mask, data, v, *palette;
-    int x;
-
-    palette = s1->last_palette;
-    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        data = ((uint32_t *)s)[0];
-        data &= plane_mask;
-        v = expand4[GET_PLANE(data, 0)];
-        v |= expand4[GET_PLANE(data, 1)] << 1;
-        v |= expand4[GET_PLANE(data, 2)] << 2;
-        v |= expand4[GET_PLANE(data, 3)] << 3;
-        ((PIXEL_TYPE *)d)[0] = palette[v >> 28];
-        ((PIXEL_TYPE *)d)[1] = palette[(v >> 24) & 0xf];
-        ((PIXEL_TYPE *)d)[2] = palette[(v >> 20) & 0xf];
-        ((PIXEL_TYPE *)d)[3] = palette[(v >> 16) & 0xf];
-        ((PIXEL_TYPE *)d)[4] = palette[(v >> 12) & 0xf];
-        ((PIXEL_TYPE *)d)[5] = palette[(v >> 8) & 0xf];
-        ((PIXEL_TYPE *)d)[6] = palette[(v >> 4) & 0xf];
-        ((PIXEL_TYPE *)d)[7] = palette[(v >> 0) & 0xf];
-        d += BPP * 8;
-        s += 4;
-    }
-}
-
-/*
- * 16 color mode, dup2 horizontal
- */
-static void glue(vga_draw_line4d2_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                           const uint8_t *s, int width)
-{
-    uint32_t plane_mask, data, v, *palette;
-    int x;
-
-    palette = s1->last_palette;
-    plane_mask = mask16[s1->ar[VGA_ATC_PLANE_ENABLE] & 0xf];
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        data = ((uint32_t *)s)[0];
-        data &= plane_mask;
-        v = expand4[GET_PLANE(data, 0)];
-        v |= expand4[GET_PLANE(data, 1)] << 1;
-        v |= expand4[GET_PLANE(data, 2)] << 2;
-        v |= expand4[GET_PLANE(data, 3)] << 3;
-        PUT_PIXEL2(d, 0, palette[v >> 28]);
-        PUT_PIXEL2(d, 1, palette[(v >> 24) & 0xf]);
-        PUT_PIXEL2(d, 2, palette[(v >> 20) & 0xf]);
-        PUT_PIXEL2(d, 3, palette[(v >> 16) & 0xf]);
-        PUT_PIXEL2(d, 4, palette[(v >> 12) & 0xf]);
-        PUT_PIXEL2(d, 5, palette[(v >> 8) & 0xf]);
-        PUT_PIXEL2(d, 6, palette[(v >> 4) & 0xf]);
-        PUT_PIXEL2(d, 7, palette[(v >> 0) & 0xf]);
-        d += BPP * 16;
-        s += 4;
-    }
-}
-
-/*
- * 256 color mode, double pixels
- *
- * XXX: add plane_mask support (never used in standard VGA modes)
- */
-static void glue(vga_draw_line8d2_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                           const uint8_t *s, int width)
-{
-    uint32_t *palette;
-    int x;
-
-    palette = s1->last_palette;
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        PUT_PIXEL2(d, 0, palette[s[0]]);
-        PUT_PIXEL2(d, 1, palette[s[1]]);
-        PUT_PIXEL2(d, 2, palette[s[2]]);
-        PUT_PIXEL2(d, 3, palette[s[3]]);
-        d += BPP * 8;
-        s += 4;
-    }
-}
-
-/*
- * standard 256 color mode
- *
- * XXX: add plane_mask support (never used in standard VGA modes)
- */
-static void glue(vga_draw_line8_, DEPTH)(VGACommonState *s1, uint8_t *d,
-                                         const uint8_t *s, int width)
-{
-    uint32_t *palette;
-    int x;
-
-    palette = s1->last_palette;
-    width >>= 3;
-    for(x = 0; x < width; x++) {
-        ((PIXEL_TYPE *)d)[0] = palette[s[0]];
-        ((PIXEL_TYPE *)d)[1] = palette[s[1]];
-        ((PIXEL_TYPE *)d)[2] = palette[s[2]];
-        ((PIXEL_TYPE *)d)[3] = palette[s[3]];
-        ((PIXEL_TYPE *)d)[4] = palette[s[4]];
-        ((PIXEL_TYPE *)d)[5] = palette[s[5]];
-        ((PIXEL_TYPE *)d)[6] = palette[s[6]];
-        ((PIXEL_TYPE *)d)[7] = palette[s[7]];
-        d += BPP * 8;
-        s += 8;
-    }
-}
-
-#endif /* DEPTH != 15 */
-
-
-/* XXX: optimize */
-
-/*
- * 15 bit color
- */
-static void glue(vga_draw_line15_, PIXEL_NAME)(VGACommonState *s1, uint8_t *d,
-                                          const uint8_t *s, int width)
-{
-#if DEPTH == 15 && defined(HOST_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN)
-    memcpy(d, s, width * 2);
-#else
-    int w;
-    uint32_t v, r, g, b;
-
-    w = width;
-    do {
-        v = lduw_p((void *)s);
-        r = (v >> 7) & 0xf8;
-        g = (v >> 2) & 0xf8;
-        b = (v << 3) & 0xf8;
-        ((PIXEL_TYPE *)d)[0] = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
-        s += 2;
-        d += BPP;
-    } while (--w != 0);
-#endif
-}
-
-/*
- * 16 bit color
- */
-static void glue(vga_draw_line16_, PIXEL_NAME)(VGACommonState *s1, uint8_t *d,
-                                          const uint8_t *s, int width)
-{
-#if DEPTH == 16 && defined(HOST_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN)
-    memcpy(d, s, width * 2);
-#else
-    int w;
-    uint32_t v, r, g, b;
-
-    w = width;
-    do {
-        v = lduw_p((void *)s);
-        r = (v >> 8) & 0xf8;
-        g = (v >> 3) & 0xfc;
-        b = (v << 3) & 0xf8;
-        ((PIXEL_TYPE *)d)[0] = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
-        s += 2;
-        d += BPP;
-    } while (--w != 0);
-#endif
-}
-
-/*
- * 24 bit color
- */
-static void glue(vga_draw_line24_, PIXEL_NAME)(VGACommonState *s1, uint8_t *d,
-                                          const uint8_t *s, int width)
-{
-    int w;
-    uint32_t r, g, b;
-
-    w = width;
-    do {
-#if defined(TARGET_WORDS_BIGENDIAN)
-        r = s[0];
-        g = s[1];
-        b = s[2];
-#else
-        b = s[0];
-        g = s[1];
-        r = s[2];
-#endif
-        ((PIXEL_TYPE *)d)[0] = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
-        s += 3;
-        d += BPP;
-    } while (--w != 0);
-}
-
-/*
- * 32 bit color
- */
-static void glue(vga_draw_line32_, PIXEL_NAME)(VGACommonState *s1, uint8_t *d,
-                                          const uint8_t *s, int width)
-{
-#if DEPTH == 32 && defined(HOST_WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN) && !defined(BGR_FORMAT)
-    memcpy(d, s, width * 4);
-#else
-    int w;
-    uint32_t r, g, b;
-
-    w = width;
-    do {
-#if defined(TARGET_WORDS_BIGENDIAN)
-        r = s[1];
-        g = s[2];
-        b = s[3];
-#else
-        b = s[0];
-        g = s[1];
-        r = s[2];
-#endif
-        ((PIXEL_TYPE *)d)[0] = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
-        s += 4;
-        d += BPP;
-    } while (--w != 0);
-#endif
-}
-
-#undef PUT_PIXEL2
-#undef DEPTH
-#undef BPP
-#undef PIXEL_TYPE
-#undef PIXEL_NAME
-#undef BGR_FORMAT
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 5bfc5b7..a800ea7 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -252,7 +252,7 @@
         error_setg(errp, "'" PC_DIMM_MEMDEV_PROP "' property is not set");
         return;
     }
-    if (dimm->node >= nb_numa_nodes) {
+    if ((nb_numa_nodes > 0) && (dimm->node >= nb_numa_nodes)) {
         error_setg(errp, "'DIMM property " PC_DIMM_NODE_PROP " has value %"
                    PRIu32 "' which exceeds the number of numa nodes: %d",
                    dimm->node, nb_numa_nodes);
diff --git a/hw/pci/pci-hotplug-old.c b/hw/pci/pci-hotplug-old.c
index cf2caeb..d87c469 100644
--- a/hw/pci/pci-hotplug-old.c
+++ b/hw/pci/pci-hotplug-old.c
@@ -107,6 +107,7 @@
 {
     SCSIBus *scsibus;
     SCSIDevice *scsidev;
+    Error *local_err = NULL;
 
     scsibus = (SCSIBus *)
         object_dynamic_cast(OBJECT(QLIST_FIRST(&adapter->child_bus)),
@@ -127,8 +128,10 @@
     dinfo->unit = qemu_opt_get_number(dinfo->opts, "unit", -1);
     dinfo->bus = scsibus->busnr;
     scsidev = scsi_bus_legacy_add_drive(scsibus, dinfo->bdrv, dinfo->unit,
-                                        false, -1, NULL, NULL);
+                                        false, -1, NULL, &local_err);
     if (!scsidev) {
+        error_report("%s", error_get_pretty(local_err));
+        error_free(local_err);
         return -1;
     }
     dinfo->unit = scsidev->id;
diff --git a/hw/s390x/s390-virtio-bus.c b/hw/s390x/s390-virtio-bus.c
index 6b6fb61..f451ca1 100644
--- a/hw/s390x/s390-virtio-bus.c
+++ b/hw/s390x/s390-virtio-bus.c
@@ -159,8 +159,9 @@
 static void s390_virtio_net_instance_init(Object *obj)
 {
     VirtIONetS390 *dev = VIRTIO_NET_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_NET);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_NET);
 }
 
 static int s390_virtio_blk_init(VirtIOS390Device *s390_dev)
@@ -177,10 +178,9 @@
 static void s390_virtio_blk_instance_init(Object *obj)
 {
     VirtIOBlkS390 *dev = VIRTIO_BLK_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_BLK);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
-    object_unref(OBJECT(&dev->vdev));
-    qdev_alias_all_properties(DEVICE(&dev->vdev), obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_BLK);
     object_property_add_alias(obj, "iothread", OBJECT(&dev->vdev),"iothread",
                               &error_abort);
 }
@@ -222,8 +222,9 @@
 static void s390_virtio_serial_instance_init(Object *obj)
 {
     VirtIOSerialS390 *dev = VIRTIO_SERIAL_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SERIAL);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SERIAL);
 }
 
 static int s390_virtio_scsi_init(VirtIOS390Device *s390_dev)
@@ -254,8 +255,9 @@
 static void s390_virtio_scsi_instance_init(Object *obj)
 {
     VirtIOSCSIS390 *dev = VIRTIO_SCSI_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SCSI);
 }
 
 #ifdef CONFIG_VHOST_SCSI
@@ -275,8 +277,9 @@
 static void s390_vhost_scsi_instance_init(Object *obj)
 {
     VHostSCSIS390 *dev = VHOST_SCSI_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VHOST_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_SCSI);
 }
 #endif
 
@@ -301,8 +304,9 @@
 static void s390_virtio_rng_instance_init(Object *obj)
 {
     VirtIORNGS390 *dev = VIRTIO_RNG_S390(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_RNG);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_RNG);
     object_property_add_link(obj, "rng", TYPE_RNG_BACKEND,
                              (Object **)&dev->vdev.conf.rng,
                              qdev_prop_allow_set_link_before_realize,
@@ -493,10 +497,8 @@
 /**************** S390 Virtio Bus Device Descriptions *******************/
 
 static Property s390_virtio_net_properties[] = {
-    DEFINE_NIC_PROPERTIES(VirtIONetS390, vdev.nic_conf),
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIOS390Device, host_features),
     DEFINE_VIRTIO_NET_FEATURES(VirtIOS390Device, host_features),
-    DEFINE_VIRTIO_NET_PROPERTIES(VirtIONetS390, vdev.net_conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -533,7 +535,6 @@
 };
 
 static Property s390_virtio_serial_properties[] = {
-    DEFINE_VIRTIO_SERIAL_PROPERTIES(VirtIOSerialS390, vdev.serial),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -556,7 +557,6 @@
 
 static Property s390_virtio_rng_properties[] = {
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIOS390Device, host_features),
-    DEFINE_VIRTIO_RNG_PROPERTIES(VirtIORNGS390, vdev.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -614,7 +614,6 @@
 };
 
 static Property s390_virtio_scsi_properties[] = {
-    DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOSCSIS390, vdev.parent_obj.conf),
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIOS390Device, host_features),
     DEFINE_VIRTIO_SCSI_FEATURES(VirtIOS390Device, host_features),
     DEFINE_PROP_END_OF_LIST(),
@@ -640,7 +639,6 @@
 #ifdef CONFIG_VHOST_SCSI
 static Property s390_vhost_scsi_properties[] = {
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIOS390Device, host_features),
-    DEFINE_VHOST_SCSI_PROPERTIES(VHostSCSIS390, vdev.parent_obj.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 33a1d86..e7d3ea1 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -792,8 +792,9 @@
 static void virtio_ccw_net_instance_init(Object *obj)
 {
     VirtIONetCcw *dev = VIRTIO_NET_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_NET);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_NET);
 }
 
 static int virtio_ccw_blk_init(VirtioCcwDevice *ccw_dev)
@@ -811,10 +812,9 @@
 static void virtio_ccw_blk_instance_init(Object *obj)
 {
     VirtIOBlkCcw *dev = VIRTIO_BLK_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_BLK);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
-    object_unref(OBJECT(&dev->vdev));
-    qdev_alias_all_properties(DEVICE(&dev->vdev), obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_BLK);
     object_property_add_alias(obj, "iothread", OBJECT(&dev->vdev),"iothread",
                               &error_abort);
 }
@@ -848,8 +848,9 @@
 static void virtio_ccw_serial_instance_init(Object *obj)
 {
     VirtioSerialCcw *dev = VIRTIO_SERIAL_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SERIAL);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SERIAL);
 }
 
 static int virtio_ccw_balloon_init(VirtioCcwDevice *ccw_dev)
@@ -896,7 +897,7 @@
     VirtIOBalloonCcw *dev = VIRTIO_BALLOON_CCW(obj);
     object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_BALLOON);
     object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
-
+    object_unref(OBJECT(&dev->vdev));
     object_property_add(obj, "guest-stats", "guest statistics",
                         balloon_ccw_stats_get_all, NULL, NULL, dev, NULL);
 
@@ -934,8 +935,11 @@
 static void virtio_ccw_scsi_instance_init(Object *obj)
 {
     VirtIOSCSICcw *dev = VIRTIO_SCSI_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SCSI);
+    object_property_add_alias(obj, "iothread", OBJECT(&dev->vdev), "iothread",
+                              &error_abort);
 }
 
 #ifdef CONFIG_VHOST_SCSI
@@ -955,8 +959,9 @@
 static void vhost_ccw_scsi_instance_init(Object *obj)
 {
     VHostSCSICcw *dev = VHOST_SCSI_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VHOST_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_SCSI);
 }
 #endif
 
@@ -1374,8 +1379,6 @@
 static Property virtio_ccw_net_properties[] = {
     DEFINE_PROP_STRING("devno", VirtioCcwDevice, bus_id),
     DEFINE_VIRTIO_NET_FEATURES(VirtioCcwDevice, host_features[0]),
-    DEFINE_VIRTIO_NET_PROPERTIES(VirtIONetCcw, vdev.net_conf),
-    DEFINE_NIC_PROPERTIES(VirtIONetCcw, vdev.nic_conf),
     DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
                     VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
     DEFINE_PROP_END_OF_LIST(),
@@ -1428,7 +1431,6 @@
 
 static Property virtio_ccw_serial_properties[] = {
     DEFINE_PROP_STRING("devno", VirtioCcwDevice, bus_id),
-    DEFINE_VIRTIO_SERIAL_PROPERTIES(VirtioSerialCcw, vdev.serial),
     DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
                     VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
     DEFINE_PROP_END_OF_LIST(),
@@ -1481,7 +1483,6 @@
 
 static Property virtio_ccw_scsi_properties[] = {
     DEFINE_PROP_STRING("devno", VirtioCcwDevice, bus_id),
-    DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOSCSICcw, vdev.parent_obj.conf),
     DEFINE_VIRTIO_SCSI_FEATURES(VirtioCcwDevice, host_features[0]),
     DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
                     VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
@@ -1510,7 +1511,6 @@
 #ifdef CONFIG_VHOST_SCSI
 static Property vhost_ccw_scsi_properties[] = {
     DEFINE_PROP_STRING("devno", VirtioCcwDevice, bus_id),
-    DEFINE_VHOST_SCSI_PROPERTIES(VirtIOSCSICcw, vdev.parent_obj.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1537,8 +1537,9 @@
 static void virtio_ccw_rng_instance_init(Object *obj)
 {
     VirtIORNGCcw *dev = VIRTIO_RNG_CCW(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_RNG);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_RNG);
     object_property_add_link(obj, "rng", TYPE_RNG_BACKEND,
                              (Object **)&dev->vdev.conf.rng,
                              qdev_prop_allow_set_link_before_realize,
@@ -1547,7 +1548,6 @@
 
 static Property virtio_ccw_rng_properties[] = {
     DEFINE_PROP_STRING("devno", VirtioCcwDevice, bus_id),
-    DEFINE_VIRTIO_RNG_PROPERTIES(VirtIORNGCcw, vdev.conf),
     DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
                     VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/scsi/Makefile.objs b/hw/scsi/Makefile.objs
index 121ddc5..40c79d3 100644
--- a/hw/scsi/Makefile.objs
+++ b/hw/scsi/Makefile.objs
@@ -8,6 +8,6 @@
 obj-$(CONFIG_PSERIES) += spapr_vscsi.o
 
 ifeq ($(CONFIG_VIRTIO),y)
-obj-y += virtio-scsi.o
+obj-y += virtio-scsi.o virtio-scsi-dataplane.o
 obj-$(CONFIG_VHOST_SCSI) += vhost-scsi.o
 endif
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 954c607..0f3e039 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -551,8 +551,11 @@
     SCSIRequest *req;
     SCSIBus *bus = scsi_bus_from_device(d);
     BusState *qbus = BUS(bus);
+    const int memset_off = offsetof(SCSIRequest, sense)
+                           + sizeof(req->sense);
 
-    req = g_malloc0(reqops->size);
+    req = g_slice_alloc(reqops->size);
+    memset((uint8_t *)req + memset_off, 0, reqops->size - memset_off);
     req->refcount = 1;
     req->bus = bus;
     req->dev = d;
@@ -560,10 +563,10 @@
     req->lun = lun;
     req->hba_private = hba_private;
     req->status = -1;
-    req->sense_len = 0;
     req->ops = reqops;
     object_ref(OBJECT(d));
     object_ref(OBJECT(qbus->parent));
+    notifier_list_init(&req->cancel_notifiers);
     trace_scsi_req_alloc(req->dev->id, req->lun, req->tag);
     return req;
 }
@@ -1603,7 +1606,7 @@
         }
         object_unref(OBJECT(req->dev));
         object_unref(OBJECT(qbus->parent));
-        g_free(req);
+        g_slice_free1(req->ops->size, req);
     }
 }
 
@@ -1713,9 +1716,44 @@
     scsi_req_ref(req);
     scsi_req_dequeue(req);
     req->bus->info->complete(req, req->status, req->resid);
+
+    /* Cancelled requests might end up being completed instead of cancelled */
+    notifier_list_notify(&req->cancel_notifiers, req);
     scsi_req_unref(req);
 }
 
+/* Called by the devices when the request is canceled. */
+void scsi_req_cancel_complete(SCSIRequest *req)
+{
+    assert(req->io_canceled);
+    if (req->bus->info->cancel) {
+        req->bus->info->cancel(req);
+    }
+    notifier_list_notify(&req->cancel_notifiers, req);
+    scsi_req_unref(req);
+}
+
+/* Cancel @req asynchronously. @notifier is added to @req's cancellation
+ * notifier list, the bus will be notified the requests cancellation is
+ * completed.
+ * */
+void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier)
+{
+    trace_scsi_req_cancel(req->dev->id, req->lun, req->tag);
+    if (notifier) {
+        notifier_list_add(&req->cancel_notifiers, notifier);
+    }
+    if (req->io_canceled) {
+        return;
+    }
+    scsi_req_ref(req);
+    scsi_req_dequeue(req);
+    req->io_canceled = true;
+    if (req->aiocb) {
+        bdrv_aio_cancel_async(req->aiocb);
+    }
+}
+
 void scsi_req_cancel(SCSIRequest *req)
 {
     trace_scsi_req_cancel(req->dev->id, req->lun, req->tag);
@@ -1725,28 +1763,9 @@
     scsi_req_ref(req);
     scsi_req_dequeue(req);
     req->io_canceled = true;
-    if (req->ops->cancel_io) {
-        req->ops->cancel_io(req);
+    if (req->aiocb) {
+        bdrv_aio_cancel(req->aiocb);
     }
-    if (req->bus->info->cancel) {
-        req->bus->info->cancel(req);
-    }
-    scsi_req_unref(req);
-}
-
-void scsi_req_abort(SCSIRequest *req, int status)
-{
-    if (!req->enqueued) {
-        return;
-    }
-    scsi_req_ref(req);
-    scsi_req_dequeue(req);
-    req->io_canceled = true;
-    if (req->ops->cancel_io) {
-        req->ops->cancel_io(req);
-    }
-    scsi_req_complete(req, status);
-    scsi_req_unref(req);
 }
 
 static int scsi_ua_precedence(SCSISense sense)
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 9645d01..7a7938a 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -105,23 +105,6 @@
     scsi_req_complete(&r->req, CHECK_CONDITION);
 }
 
-/* Cancel a pending data transfer.  */
-static void scsi_cancel_io(SCSIRequest *req)
-{
-    SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
-
-    DPRINTF("Cancel tag=0x%x\n", req->tag);
-    if (r->req.aiocb) {
-        bdrv_aio_cancel(r->req.aiocb);
-
-        /* This reference was left in by scsi_*_data.  We take ownership of
-         * it the moment scsi_req_cancel is called, independent of whether
-         * bdrv_aio_cancel completes the request or not.  */
-        scsi_req_unref(&r->req);
-    }
-    r->req.aiocb = NULL;
-}
-
 static uint32_t scsi_init_iovec(SCSIDiskReq *r, size_t size)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
@@ -185,6 +168,7 @@
     r->req.aiocb = NULL;
     block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -197,9 +181,7 @@
     scsi_req_complete(&r->req, GOOD);
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 static bool scsi_is_cmd_fua(SCSICommand *cmd)
@@ -233,6 +215,7 @@
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -246,9 +229,7 @@
     scsi_req_complete(&r->req, GOOD);
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 static void scsi_dma_complete_noio(void *opaque, int ret)
@@ -261,6 +242,7 @@
         block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     }
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -280,9 +262,7 @@
     }
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 static void scsi_dma_complete(void *opaque, int ret)
@@ -303,6 +283,7 @@
     r->req.aiocb = NULL;
     block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -320,9 +301,7 @@
     scsi_req_data(&r->req, r->qiov.size);
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 /* Actually issue a read to the block device.  */
@@ -337,6 +316,7 @@
         block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     }
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -363,9 +343,7 @@
     }
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 /* Read more data from scsi device into buffer.  */
@@ -459,6 +437,7 @@
         block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     }
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -481,9 +460,7 @@
     }
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
 }
 
 static void scsi_write_data(SCSIRequest *req)
@@ -1553,6 +1530,7 @@
 
     r->req.aiocb = NULL;
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -1582,9 +1560,7 @@
     scsi_req_complete(&r->req, GOOD);
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
     g_free(data);
 }
 
@@ -1654,6 +1630,7 @@
     r->req.aiocb = NULL;
     block_acct_done(bdrv_get_stats(s->qdev.conf.bs), &r->acct);
     if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
@@ -1678,9 +1655,7 @@
     scsi_req_complete(&r->req, GOOD);
 
 done:
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
+    scsi_req_unref(&r->req);
     qemu_vfree(data->iov.iov_base);
     g_free(data);
 }
@@ -2346,7 +2321,6 @@
     .send_command = scsi_disk_emulate_command,
     .read_data    = scsi_disk_emulate_read_data,
     .write_data   = scsi_disk_emulate_write_data,
-    .cancel_io    = scsi_cancel_io,
     .get_buf      = scsi_get_buf,
 };
 
@@ -2356,7 +2330,6 @@
     .send_command = scsi_disk_dma_command,
     .read_data    = scsi_read_data,
     .write_data   = scsi_write_data,
-    .cancel_io    = scsi_cancel_io,
     .get_buf      = scsi_get_buf,
     .load_request = scsi_disk_load_request,
     .save_request = scsi_disk_save_request,
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 20587b4..01bca08 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -93,6 +93,10 @@
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
 
     r->req.aiocb = NULL;
+    if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
+        goto done;
+    }
     if (r->io_header.driver_status & SG_ERR_DRIVER_SENSE) {
         r->req.sense_len = r->io_header.sb_len_wr;
     }
@@ -133,26 +137,8 @@
             r, r->req.tag, status);
 
     scsi_req_complete(&r->req, status);
-    if (!r->req.io_canceled) {
-        scsi_req_unref(&r->req);
-    }
-}
-
-/* Cancel a pending data transfer.  */
-static void scsi_cancel_io(SCSIRequest *req)
-{
-    SCSIGenericReq *r = DO_UPCAST(SCSIGenericReq, req, req);
-
-    DPRINTF("Cancel tag=0x%x\n", req->tag);
-    if (r->req.aiocb) {
-        bdrv_aio_cancel(r->req.aiocb);
-
-        /* This reference was left in by scsi_*_data.  We take ownership of
-         * it independent of whether bdrv_aio_cancel completes the request
-         * or not.  */
-        scsi_req_unref(&r->req);
-    }
-    r->req.aiocb = NULL;
+done:
+    scsi_req_unref(&r->req);
 }
 
 static int execute_command(BlockDriverState *bdrv,
@@ -186,8 +172,7 @@
     int len;
 
     r->req.aiocb = NULL;
-    if (ret) {
-        DPRINTF("IO error ret %d\n", ret);
+    if (ret || r->req.io_canceled) {
         scsi_command_complete(r, ret);
         return;
     }
@@ -211,9 +196,7 @@
         bdrv_set_guest_block_size(s->conf.bs, s->blocksize);
 
         scsi_req_data(&r->req, len);
-        if (!r->req.io_canceled) {
-            scsi_req_unref(&r->req);
-        }
+        scsi_req_unref(&r->req);
     }
 }
 
@@ -246,8 +229,7 @@
 
     DPRINTF("scsi_write_complete() ret = %d\n", ret);
     r->req.aiocb = NULL;
-    if (ret) {
-        DPRINTF("IO error\n");
+    if (ret || r->req.io_canceled) {
         scsi_command_complete(r, ret);
         return;
     }
@@ -465,7 +447,6 @@
     .send_command = scsi_send_command,
     .read_data    = scsi_read_data,
     .write_data   = scsi_write_data,
-    .cancel_io    = scsi_cancel_io,
     .get_buf      = scsi_get_buf,
     .load_request = scsi_generic_load_request,
     .save_request = scsi_generic_save_request,
diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c
index 048cfc7..20b20f0 100644
--- a/hw/scsi/spapr_vscsi.c
+++ b/hw/scsi/spapr_vscsi.c
@@ -77,8 +77,9 @@
     SCSIRequest             *sreq;
     uint32_t                qtag; /* qemu tag != srp tag */
     bool                    active;
-    uint32_t                data_len;
     bool                    writing;
+    bool                    dma_error;
+    uint32_t                data_len;
     uint32_t                senselen;
     uint8_t                 sense[SCSI_SENSE_BUF_SIZE];
 
@@ -536,8 +537,8 @@
     }
     if (rc < 0) {
         fprintf(stderr, "VSCSI: RDMA error rc=%d!\n", rc);
-        vscsi_makeup_sense(s, req, HARDWARE_ERROR, 0, 0);
-        scsi_req_abort(req->sreq, CHECK_CONDITION);
+        req->dma_error = true;
+        scsi_req_cancel(req->sreq);
         return;
     }
 
@@ -591,6 +592,12 @@
 {
     vscsi_req *req = sreq->hba_private;
 
+    if (req->dma_error) {
+        VSCSIState *s = VIO_SPAPR_VSCSI_DEVICE(sreq->bus->qbus.parent);
+
+        vscsi_makeup_sense(s, req, HARDWARE_ERROR, 0, 0);
+        vscsi_send_rsp(s, req, CHECK_CONDITION, 0, 0);
+    }
     vscsi_put_req(req);
 }
 
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 7146e0e..308b393 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -23,6 +23,7 @@
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/virtio-scsi.h"
 #include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
 
 /* Features supported by host kernel. */
 static const int kernel_feature_bits[] = {
@@ -163,8 +164,8 @@
     VirtIOSCSIConfig *scsiconf = (VirtIOSCSIConfig *)config;
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
 
-    if ((uint32_t) ldl_p(&scsiconf->sense_size) != vs->sense_size ||
-        (uint32_t) ldl_p(&scsiconf->cdb_size) != vs->cdb_size) {
+    if ((uint32_t) virtio_ldl_p(vdev, &scsiconf->sense_size) != vs->sense_size ||
+        (uint32_t) virtio_ldl_p(vdev, &scsiconf->cdb_size) != vs->cdb_size) {
         error_report("vhost-scsi does not support changing the sense data and CDB sizes");
         exit(1);
     }
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
new file mode 100644
index 0000000..b778e05
--- /dev/null
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -0,0 +1,229 @@
+/*
+ * Virtio SCSI dataplane
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *   Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "hw/virtio/virtio-scsi.h"
+#include "qemu/error-report.h"
+#include <hw/scsi/scsi.h>
+#include <block/scsi.h>
+#include <hw/virtio/virtio-bus.h>
+#include "hw/virtio/virtio-access.h"
+#include "stdio.h"
+
+/* Context: QEMU global mutex held */
+void virtio_scsi_set_iothread(VirtIOSCSI *s, IOThread *iothread)
+{
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+
+    assert(!s->ctx);
+    s->ctx = iothread_get_aio_context(vs->conf.iothread);
+
+    /* Don't try if transport does not support notifiers. */
+    if (!k->set_guest_notifiers || !k->set_host_notifier) {
+        fprintf(stderr, "virtio-scsi: Failed to set iothread "
+                   "(transport does not support notifiers)");
+        exit(1);
+    }
+}
+
+static VirtIOSCSIVring *virtio_scsi_vring_init(VirtIOSCSI *s,
+                                               VirtQueue *vq,
+                                               EventNotifierHandler *handler,
+                                               int n)
+{
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    VirtIOSCSIVring *r = g_slice_new(VirtIOSCSIVring);
+
+    /* Set up virtqueue notify */
+    if (k->set_host_notifier(qbus->parent, n, true) != 0) {
+        fprintf(stderr, "virtio-scsi: Failed to set host notifier\n");
+        exit(1);
+    }
+    r->host_notifier = *virtio_queue_get_host_notifier(vq);
+    r->guest_notifier = *virtio_queue_get_guest_notifier(vq);
+    aio_set_event_notifier(s->ctx, &r->host_notifier, handler);
+
+    r->parent = s;
+
+    if (!vring_setup(&r->vring, VIRTIO_DEVICE(s), n)) {
+        fprintf(stderr, "virtio-scsi: VRing setup failed\n");
+        exit(1);
+    }
+    return r;
+}
+
+VirtIOSCSIReq *virtio_scsi_pop_req_vring(VirtIOSCSI *s,
+                                         VirtIOSCSIVring *vring)
+{
+    VirtIOSCSIReq *req = virtio_scsi_init_req(s, NULL);
+    int r;
+
+    req->vring = vring;
+    r = vring_pop((VirtIODevice *)s, &vring->vring, &req->elem);
+    if (r < 0) {
+        virtio_scsi_free_req(req);
+        req = NULL;
+    }
+    return req;
+}
+
+void virtio_scsi_vring_push_notify(VirtIOSCSIReq *req)
+{
+    vring_push(&req->vring->vring, &req->elem,
+               req->qsgl.size + req->resp_iov.size);
+    event_notifier_set(&req->vring->guest_notifier);
+}
+
+static void virtio_scsi_iothread_handle_ctrl(EventNotifier *notifier)
+{
+    VirtIOSCSIVring *vring = container_of(notifier,
+                                          VirtIOSCSIVring, host_notifier);
+    VirtIOSCSI *s = VIRTIO_SCSI(vring->parent);
+    VirtIOSCSIReq *req;
+
+    event_notifier_test_and_clear(notifier);
+    while ((req = virtio_scsi_pop_req_vring(s, vring))) {
+        virtio_scsi_handle_ctrl_req(s, req);
+    }
+}
+
+static void virtio_scsi_iothread_handle_event(EventNotifier *notifier)
+{
+    VirtIOSCSIVring *vring = container_of(notifier,
+                                          VirtIOSCSIVring, host_notifier);
+    VirtIOSCSI *s = vring->parent;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    event_notifier_test_and_clear(notifier);
+
+    if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        return;
+    }
+
+    if (s->events_dropped) {
+        virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+    }
+}
+
+static void virtio_scsi_iothread_handle_cmd(EventNotifier *notifier)
+{
+    VirtIOSCSIVring *vring = container_of(notifier,
+                                          VirtIOSCSIVring, host_notifier);
+    VirtIOSCSI *s = (VirtIOSCSI *)vring->parent;
+    VirtIOSCSIReq *req, *next;
+    QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
+
+    event_notifier_test_and_clear(notifier);
+    while ((req = virtio_scsi_pop_req_vring(s, vring))) {
+        if (virtio_scsi_handle_cmd_req_prepare(s, req)) {
+            QTAILQ_INSERT_TAIL(&reqs, req, next);
+        }
+    }
+
+    QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
+        virtio_scsi_handle_cmd_req_submit(s, req);
+    }
+}
+
+/* Context: QEMU global mutex held */
+void virtio_scsi_dataplane_start(VirtIOSCSI *s)
+{
+    int i;
+    int rc;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+
+    if (s->dataplane_started ||
+        s->dataplane_starting ||
+        s->ctx != iothread_get_aio_context(vs->conf.iothread)) {
+        return;
+    }
+
+    s->dataplane_starting = true;
+
+    /* Set up guest notifier (irq) */
+    rc = k->set_guest_notifiers(qbus->parent, vs->conf.num_queues + 2, true);
+    if (rc != 0) {
+        fprintf(stderr, "virtio-scsi: Failed to set guest notifiers, "
+                "ensure -enable-kvm is set\n");
+        exit(1);
+    }
+
+    aio_context_acquire(s->ctx);
+    s->ctrl_vring = virtio_scsi_vring_init(s, vs->ctrl_vq,
+                                           virtio_scsi_iothread_handle_ctrl,
+                                           0);
+    s->event_vring = virtio_scsi_vring_init(s, vs->event_vq,
+                                            virtio_scsi_iothread_handle_event,
+                                            1);
+    s->cmd_vrings = g_malloc0(sizeof(VirtIOSCSIVring) * vs->conf.num_queues);
+    for (i = 0; i < vs->conf.num_queues; i++) {
+        s->cmd_vrings[i] =
+            virtio_scsi_vring_init(s, vs->cmd_vqs[i],
+                                   virtio_scsi_iothread_handle_cmd,
+                                   i + 2);
+    }
+
+    aio_context_release(s->ctx);
+    s->dataplane_starting = false;
+    s->dataplane_started = true;
+}
+
+/* Context: QEMU global mutex held */
+void virtio_scsi_dataplane_stop(VirtIOSCSI *s)
+{
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+    int i;
+
+    if (!s->dataplane_started || s->dataplane_stopping) {
+        return;
+    }
+    s->dataplane_stopping = true;
+    assert(s->ctx == iothread_get_aio_context(vs->conf.iothread));
+
+    aio_context_acquire(s->ctx);
+
+    aio_set_event_notifier(s->ctx, &s->ctrl_vring->host_notifier, NULL);
+    aio_set_event_notifier(s->ctx, &s->event_vring->host_notifier, NULL);
+    for (i = 0; i < vs->conf.num_queues; i++) {
+        aio_set_event_notifier(s->ctx, &s->cmd_vrings[i]->host_notifier, NULL);
+    }
+
+    bdrv_drain_all(); /* ensure there are no in-flight requests */
+
+    aio_context_release(s->ctx);
+
+    /* Sync vring state back to virtqueue so that non-dataplane request
+     * processing can continue when we disable the host notifier below.
+     */
+    vring_teardown(&s->ctrl_vring->vring, vdev, 0);
+    vring_teardown(&s->event_vring->vring, vdev, 1);
+    for (i = 0; i < vs->conf.num_queues; i++) {
+        vring_teardown(&s->cmd_vrings[i]->vring, vdev, 2 + i);
+    }
+
+    for (i = 0; i < vs->conf.num_queues + 2; i++) {
+        k->set_host_notifier(qbus->parent, i, false);
+    }
+
+    /* Clean up guest notifier (irq) */
+    k->set_guest_notifiers(qbus->parent, vs->conf.num_queues + 2, false);
+    s->dataplane_stopping = false;
+    s->dataplane_started = false;
+}
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 86aba88..203e624 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -20,34 +20,7 @@
 #include <block/scsi.h>
 #include <hw/virtio/virtio-bus.h>
 #include "hw/virtio/virtio-access.h"
-
-typedef struct VirtIOSCSIReq {
-    VirtIOSCSI *dev;
-    VirtQueue *vq;
-    VirtQueueElement elem;
-    QEMUSGList qsgl;
-    SCSIRequest *sreq;
-    size_t resp_size;
-    enum SCSIXferMode mode;
-    QEMUIOVector resp_iov;
-    union {
-        VirtIOSCSICmdResp     cmd;
-        VirtIOSCSICtrlTMFResp tmf;
-        VirtIOSCSICtrlANResp  an;
-        VirtIOSCSIEvent       event;
-    } resp;
-    union {
-        struct {
-            VirtIOSCSICmdReq  cmd;
-            uint8_t           cdb[];
-        } QEMU_PACKED;
-        VirtIOSCSICtrlTMFReq  tmf;
-        VirtIOSCSICtrlANReq   an;
-    } req;
-} VirtIOSCSIReq;
-
-QEMU_BUILD_BUG_ON(offsetof(VirtIOSCSIReq, req.cdb) !=
-                  offsetof(VirtIOSCSIReq, req.cmd) + sizeof(VirtIOSCSICmdReq));
+#include "migration/migration.h"
 
 static inline int virtio_scsi_get_lun(uint8_t *lun)
 {
@@ -65,26 +38,29 @@
     return scsi_device_find(&s->bus, 0, lun[1], virtio_scsi_get_lun(lun));
 }
 
-static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
+VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
 {
     VirtIOSCSIReq *req;
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
+    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
+                             + sizeof(VirtQueueElement);
 
-    req = g_malloc0(sizeof(*req) + vs->cdb_size);
-
+    req = g_slice_alloc(sizeof(*req) + vs->cdb_size);
     req->vq = vq;
     req->dev = s;
-    req->sreq = NULL;
     qemu_sglist_init(&req->qsgl, DEVICE(s), 8, &address_space_memory);
     qemu_iovec_init(&req->resp_iov, 1);
+    memset((uint8_t *)req + zero_skip, 0, sizeof(*req) - zero_skip);
     return req;
 }
 
-static void virtio_scsi_free_req(VirtIOSCSIReq *req)
+void virtio_scsi_free_req(VirtIOSCSIReq *req)
 {
+    VirtIOSCSICommon *vs = (VirtIOSCSICommon *)req->dev;
+
     qemu_iovec_destroy(&req->resp_iov);
     qemu_sglist_destroy(&req->qsgl);
-    g_free(req);
+    g_slice_free1(sizeof(*req) + vs->cdb_size, req);
 }
 
 static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
@@ -94,13 +70,19 @@
     VirtIODevice *vdev = VIRTIO_DEVICE(s);
 
     qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size);
-    virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
+    if (req->vring) {
+        assert(req->vq == NULL);
+        virtio_scsi_vring_push_notify(req);
+    } else {
+        virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
+        virtio_notify(vdev, vq);
+    }
+
     if (req->sreq) {
         req->sreq->hba_private = NULL;
         scsi_req_unref(req->sreq);
     }
     virtio_scsi_free_req(req);
-    virtio_notify(vdev, vq);
 }
 
 static void virtio_scsi_bad_req(void)
@@ -226,13 +208,39 @@
     return req;
 }
 
-static void virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
+typedef struct {
+    Notifier        notifier;
+    VirtIOSCSIReq  *tmf_req;
+} VirtIOSCSICancelNotifier;
+
+static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
+{
+    VirtIOSCSICancelNotifier *n = container_of(notifier,
+                                               VirtIOSCSICancelNotifier,
+                                               notifier);
+
+    if (--n->tmf_req->remaining == 0) {
+        virtio_scsi_complete_req(n->tmf_req);
+    }
+    g_slice_free(VirtIOSCSICancelNotifier, n);
+}
+
+/* Return 0 if the request is ready to be completed and return to guest;
+ * -EINPROGRESS if the request is submitted and will be completed later, in the
+ *  case of async cancellation. */
+static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
 {
     SCSIDevice *d = virtio_scsi_device_find(s, req->req.tmf.lun);
     SCSIRequest *r, *next;
     BusChild *kid;
     int target;
+    int ret = 0;
 
+    if (s->dataplane_started && bdrv_get_aio_context(d->conf.bs) != s->ctx) {
+        aio_context_acquire(s->ctx);
+        bdrv_set_aio_context(d->conf.bs, s->ctx);
+        aio_context_release(s->ctx);
+    }
     /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE".  */
     req->resp.tmf.response = VIRTIO_SCSI_S_OK;
 
@@ -264,7 +272,14 @@
                  */
                 req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
             } else {
-                scsi_req_cancel(r);
+                VirtIOSCSICancelNotifier *notifier;
+
+                req->remaining = 1;
+                notifier = g_slice_new(VirtIOSCSICancelNotifier);
+                notifier->tmf_req = req;
+                notifier->notifier.notify = virtio_scsi_cancel_notify;
+                scsi_req_cancel_async(r, &notifier->notifier);
+                ret = -EINPROGRESS;
             }
         }
         break;
@@ -290,6 +305,13 @@
         if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
             goto incorrect_lun;
         }
+
+        /* Add 1 to "remaining" until virtio_scsi_do_tmf returns.
+         * This way, if the bus starts calling back to the notifiers
+         * even before we finish the loop, virtio_scsi_cancel_notify
+         * will not complete the TMF too early.
+         */
+        req->remaining = 1;
         QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
             if (r->hba_private) {
                 if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) {
@@ -299,10 +321,19 @@
                     req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
                     break;
                 } else {
-                    scsi_req_cancel(r);
+                    VirtIOSCSICancelNotifier *notifier;
+
+                    req->remaining++;
+                    notifier = g_slice_new(VirtIOSCSICancelNotifier);
+                    notifier->notifier.notify = virtio_scsi_cancel_notify;
+                    notifier->tmf_req = req;
+                    scsi_req_cancel_async(r, &notifier->notifier);
                 }
             }
         }
+        if (--req->remaining > 0) {
+            ret = -EINPROGRESS;
+        }
         break;
 
     case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
@@ -323,14 +354,53 @@
         break;
     }
 
-    return;
+    return ret;
 
 incorrect_lun:
     req->resp.tmf.response = VIRTIO_SCSI_S_INCORRECT_LUN;
-    return;
+    return ret;
 
 fail:
     req->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
+    return ret;
+}
+
+void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
+{
+    VirtIODevice *vdev = (VirtIODevice *)s;
+    int type;
+    int r = 0;
+
+    if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0,
+                &type, sizeof(type)) < sizeof(type)) {
+        virtio_scsi_bad_req();
+        return;
+    }
+
+    virtio_tswap32s(vdev, &req->req.tmf.type);
+    if (req->req.tmf.type == VIRTIO_SCSI_T_TMF) {
+        if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq),
+                    sizeof(VirtIOSCSICtrlTMFResp)) < 0) {
+            virtio_scsi_bad_req();
+        } else {
+            r = virtio_scsi_do_tmf(s, req);
+        }
+
+    } else if (req->req.tmf.type == VIRTIO_SCSI_T_AN_QUERY ||
+            req->req.tmf.type == VIRTIO_SCSI_T_AN_SUBSCRIBE) {
+        if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq),
+                    sizeof(VirtIOSCSICtrlANResp)) < 0) {
+            virtio_scsi_bad_req();
+        } else {
+            req->resp.an.event_actual = 0;
+            req->resp.an.response = VIRTIO_SCSI_S_OK;
+        }
+    }
+    if (r == 0) {
+        virtio_scsi_complete_req(req);
+    } else {
+        assert(r == -EINPROGRESS);
+    }
 }
 
 static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
@@ -338,35 +408,12 @@
     VirtIOSCSI *s = (VirtIOSCSI *)vdev;
     VirtIOSCSIReq *req;
 
+    if (s->ctx && !s->dataplane_disabled) {
+        virtio_scsi_dataplane_start(s);
+        return;
+    }
     while ((req = virtio_scsi_pop_req(s, vq))) {
-        int type;
-
-        if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0,
-                       &type, sizeof(type)) < sizeof(type)) {
-            virtio_scsi_bad_req();
-            continue;
-        }
-
-        virtio_tswap32s(vdev, &req->req.tmf.type);
-        if (req->req.tmf.type == VIRTIO_SCSI_T_TMF) {
-            if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq),
-                                      sizeof(VirtIOSCSICtrlTMFResp)) < 0) {
-                virtio_scsi_bad_req();
-            } else {
-                virtio_scsi_do_tmf(s, req);
-            }
-
-        } else if (req->req.tmf.type == VIRTIO_SCSI_T_AN_QUERY ||
-                   req->req.tmf.type == VIRTIO_SCSI_T_AN_SUBSCRIBE) {
-            if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq),
-                                      sizeof(VirtIOSCSICtrlANResp)) < 0) {
-                virtio_scsi_bad_req();
-            } else {
-                req->resp.an.event_actual = 0;
-                req->resp.an.response = VIRTIO_SCSI_S_OK;
-            }
-        }
-        virtio_scsi_complete_req(req);
+        virtio_scsi_handle_ctrl_req(s, req);
     }
 }
 
@@ -420,13 +467,7 @@
      * host device passthrough.
      */
     cmd->xfer = req->qsgl.size;
-    if (cmd->xfer == 0) {
-        cmd->mode = SCSI_XFER_NONE;
-    } else if (iov_size(req->elem.in_sg, req->elem.in_num) > req->resp_size) {
-        cmd->mode = SCSI_XFER_FROM_DEV;
-    } else {
-        cmd->mode = SCSI_XFER_TO_DEV;
-    }
+    cmd->mode = req->mode;
     return 0;
 }
 
@@ -458,52 +499,78 @@
     virtio_scsi_complete_cmd_req(req);
 }
 
+bool virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
+{
+    VirtIOSCSICommon *vs = &s->parent_obj;
+    SCSIDevice *d;
+    int rc;
+
+    rc = virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size,
+                               sizeof(VirtIOSCSICmdResp) + vs->sense_size);
+    if (rc < 0) {
+        if (rc == -ENOTSUP) {
+            virtio_scsi_fail_cmd_req(req);
+        } else {
+            virtio_scsi_bad_req();
+        }
+        return false;
+    }
+
+    d = virtio_scsi_device_find(s, req->req.cmd.lun);
+    if (!d) {
+        req->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET;
+        virtio_scsi_complete_cmd_req(req);
+        return false;
+    }
+    if (s->dataplane_started && bdrv_get_aio_context(d->conf.bs) != s->ctx) {
+        aio_context_acquire(s->ctx);
+        bdrv_set_aio_context(d->conf.bs, s->ctx);
+        aio_context_release(s->ctx);
+    }
+    req->sreq = scsi_req_new(d, req->req.cmd.tag,
+                             virtio_scsi_get_lun(req->req.cmd.lun),
+                             req->req.cdb, req);
+
+    if (req->sreq->cmd.mode != SCSI_XFER_NONE
+        && (req->sreq->cmd.mode != req->mode ||
+            req->sreq->cmd.xfer > req->qsgl.size)) {
+        req->resp.cmd.response = VIRTIO_SCSI_S_OVERRUN;
+        virtio_scsi_complete_cmd_req(req);
+        return false;
+    }
+    scsi_req_ref(req->sreq);
+    bdrv_io_plug(d->conf.bs);
+    return true;
+}
+
+void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
+{
+    if (scsi_req_enqueue(req->sreq)) {
+        scsi_req_continue(req->sreq);
+    }
+    bdrv_io_unplug(req->sreq->dev->conf.bs);
+    scsi_req_unref(req->sreq);
+}
+
 static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq)
 {
     /* use non-QOM casts in the data path */
     VirtIOSCSI *s = (VirtIOSCSI *)vdev;
-    VirtIOSCSICommon *vs = &s->parent_obj;
+    VirtIOSCSIReq *req, *next;
+    QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
 
-    VirtIOSCSIReq *req;
-    int n;
-
+    if (s->ctx && !s->dataplane_disabled) {
+        virtio_scsi_dataplane_start(s);
+        return;
+    }
     while ((req = virtio_scsi_pop_req(s, vq))) {
-        SCSIDevice *d;
-        int rc;
-
-        rc = virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size,
-                                   sizeof(VirtIOSCSICmdResp) + vs->sense_size);
-        if (rc < 0) {
-            if (rc == -ENOTSUP) {
-                virtio_scsi_fail_cmd_req(req);
-            } else {
-                virtio_scsi_bad_req();
-            }
-            continue;
+        if (virtio_scsi_handle_cmd_req_prepare(s, req)) {
+            QTAILQ_INSERT_TAIL(&reqs, req, next);
         }
+    }
 
-        d = virtio_scsi_device_find(s, req->req.cmd.lun);
-        if (!d) {
-            req->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET;
-            virtio_scsi_complete_cmd_req(req);
-            continue;
-        }
-        req->sreq = scsi_req_new(d, req->req.cmd.tag,
-                                 virtio_scsi_get_lun(req->req.cmd.lun),
-                                 req->req.cdb, req);
-
-        if (req->sreq->cmd.mode != SCSI_XFER_NONE
-            && (req->sreq->cmd.mode != req->mode ||
-                req->sreq->cmd.xfer > req->qsgl.size)) {
-            req->resp.cmd.response = VIRTIO_SCSI_S_OVERRUN;
-            virtio_scsi_complete_cmd_req(req);
-            continue;
-        }
-
-        n = scsi_req_enqueue(req->sreq);
-        if (n) {
-            scsi_req_continue(req->sreq);
-        }
+    QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
+        virtio_scsi_handle_cmd_req_submit(s, req);
     }
 }
 
@@ -552,6 +619,9 @@
     VirtIOSCSI *s = VIRTIO_SCSI(vdev);
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
 
+    if (s->ctx) {
+        virtio_scsi_dataplane_stop(s);
+    }
     s->resetting++;
     qbus_reset_all(&s->bus.qbus);
     s->resetting--;
@@ -582,8 +652,8 @@
     return 0;
 }
 
-static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
-                                   uint32_t event, uint32_t reason)
+void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
+                            uint32_t event, uint32_t reason)
 {
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
     VirtIOSCSIReq *req;
@@ -594,10 +664,19 @@
         return;
     }
 
-    req = virtio_scsi_pop_req(s, vs->event_vq);
+    if (s->dataplane_started) {
+        assert(s->ctx);
+        aio_context_acquire(s->ctx);
+    }
+
+    if (s->dataplane_started) {
+        req = virtio_scsi_pop_req_vring(s, s->event_vring);
+    } else {
+        req = virtio_scsi_pop_req(s, vs->event_vq);
+    }
     if (!req) {
         s->events_dropped = true;
-        return;
+        goto out;
     }
 
     if (s->events_dropped) {
@@ -626,12 +705,20 @@
         evt->lun[3] = dev->lun & 0xFF;
     }
     virtio_scsi_complete_req(req);
+out:
+    if (s->dataplane_started) {
+        aio_context_release(s->ctx);
+    }
 }
 
 static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOSCSI *s = VIRTIO_SCSI(vdev);
 
+    if (s->ctx && !s->dataplane_disabled) {
+        virtio_scsi_dataplane_start(s);
+        return;
+    }
     if (s->events_dropped) {
         virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
     }
@@ -717,6 +804,35 @@
         s->cmd_vqs[i] = virtio_add_queue(vdev, VIRTIO_SCSI_VQ_SIZE,
                                          cmd);
     }
+
+    if (s->conf.iothread) {
+        virtio_scsi_set_iothread(VIRTIO_SCSI(s), s->conf.iothread);
+    }
+}
+
+/* Disable dataplane thread during live migration since it does not
+ * update the dirty memory bitmap yet.
+ */
+static void virtio_scsi_migration_state_changed(Notifier *notifier, void *data)
+{
+    VirtIOSCSI *s = container_of(notifier, VirtIOSCSI,
+                                 migration_state_notifier);
+    MigrationState *mig = data;
+
+    if (migration_in_setup(mig)) {
+        if (!s->dataplane_started) {
+            return;
+        }
+        virtio_scsi_dataplane_stop(s);
+        s->dataplane_disabled = true;
+    } else if (migration_has_finished(mig) ||
+               migration_has_failed(mig)) {
+        if (s->dataplane_started) {
+            return;
+        }
+        bdrv_drain_all(); /* complete in-flight non-dataplane requests */
+        s->dataplane_disabled = false;
+    }
 }
 
 static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
@@ -747,6 +863,18 @@
 
     register_savevm(dev, "virtio-scsi", virtio_scsi_id++, 1,
                     virtio_scsi_save, virtio_scsi_load, s);
+    s->migration_state_notifier.notify = virtio_scsi_migration_state_changed;
+    add_migration_state_change_notifier(&s->migration_state_notifier);
+}
+
+static void virtio_scsi_instance_init(Object *obj)
+{
+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(obj);
+
+    object_property_add_link(obj, "iothread", TYPE_IOTHREAD,
+                             (Object **)&vs->conf.iothread,
+                             qdev_prop_allow_set_link_before_realize,
+                             OBJ_PROP_LINK_UNREF_ON_RELEASE, &error_abort);
 }
 
 void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp)
@@ -763,6 +891,7 @@
     VirtIOSCSI *s = VIRTIO_SCSI(dev);
 
     unregister_savevm(dev, "virtio-scsi", s);
+    remove_migration_state_change_notifier(&s->migration_state_notifier);
 
     virtio_scsi_common_unrealize(dev, errp);
 }
@@ -807,6 +936,7 @@
     .name = TYPE_VIRTIO_SCSI,
     .parent = TYPE_VIRTIO_SCSI_COMMON,
     .instance_size = sizeof(VirtIOSCSI),
+    .instance_init = virtio_scsi_instance_init,
     .class_init = virtio_scsi_class_init,
 };
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index f560814..390f824 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -86,6 +86,9 @@
  * 12 is historical, and due to x86 page size. */
 #define VIRTIO_PCI_QUEUE_ADDR_SHIFT    12
 
+/* Flags track per-device state like workarounds for quirks in older guests. */
+#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG  (1 << 0)
+
 static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
                                VirtIOPCIProxy *dev);
 
@@ -320,6 +323,14 @@
                                      proxy->pci_dev.config[PCI_COMMAND] |
                                      PCI_COMMAND_MASTER, 1);
         }
+
+        /* Linux before 2.6.34 sets the device as OK without enabling
+           the PCI device bus master bit. In this case we need to disable
+           some safety checks. */
+        if ((val & VIRTIO_CONFIG_S_DRIVER_OK) &&
+            !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+            proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
+        }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
         msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
@@ -469,18 +480,13 @@
     VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev);
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
 
-    uint8_t cmd = proxy->pci_dev.config[PCI_COMMAND];
-
     pci_default_write_config(pci_dev, address, val, len);
 
     if (range_covers_byte(address, len, PCI_COMMAND) &&
         !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER) &&
-        (cmd & PCI_COMMAND_MASTER)) {
-        /* Bus driver disables bus mastering - make it act
-         * as a kind of reset to render the device quiescent. */
+        !(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) {
         virtio_pci_stop_ioeventfd(proxy);
-        virtio_reset(vdev);
-        msix_unuse_all_vectors(&proxy->pci_dev);
+        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
     }
 }
 
@@ -889,19 +895,11 @@
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
 
     if (running) {
-        /* Linux before 2.6.34 drives the device without enabling
-           the PCI device bus master bit. Enable it automatically
-           for the guest. This is a PCI spec violation but so is
-           initiating DMA with bus master bit clear.
-           Note: this only makes a difference when migrating
-           across QEMU versions from an old QEMU, as for new QEMU
-           bus master and driver bits are always in sync.
-           TODO: consider enabling conditionally for compat machine types. */
-        if (vdev->status & (VIRTIO_CONFIG_S_ACKNOWLEDGE |
-                            VIRTIO_CONFIG_S_DRIVER)) {
-            pci_default_write_config(&proxy->pci_dev, PCI_COMMAND,
-                                     proxy->pci_dev.config[PCI_COMMAND] |
-                                     PCI_COMMAND_MASTER, 1);
+        /* Try to find out if the guest has bus master disabled, but is
+           in ready state. Then we have a buggy guest OS. */
+        if ((vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+            !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+            proxy->flags |= VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
         }
         virtio_pci_start_ioeventfd(proxy);
     } else {
@@ -926,7 +924,6 @@
     DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
                     VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
     DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
-    DEFINE_VIRTIO_9P_PROPERTIES(V9fsPCIState, vdev.fsconf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -948,8 +945,9 @@
 static void virtio_9p_pci_instance_init(Object *obj)
 {
     V9fsPCIState *dev = VIRTIO_9P_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_9P);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_9P);
 }
 
 static const TypeInfo virtio_9p_pci_info = {
@@ -1042,6 +1040,7 @@
     virtio_pci_stop_ioeventfd(proxy);
     virtio_bus_reset(bus);
     msix_unuse_all_vectors(&proxy->pci_dev);
+    proxy->flags &= ~VIRTIO_PCI_FLAG_BUS_MASTER_BUG;
 }
 
 static Property virtio_pci_properties[] = {
@@ -1111,10 +1110,9 @@
 static void virtio_blk_pci_instance_init(Object *obj)
 {
     VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_BLK);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
-    object_unref(OBJECT(&dev->vdev));
-    qdev_alias_all_properties(DEVICE(&dev->vdev), obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_BLK);
     object_property_add_alias(obj, "iothread", OBJECT(&dev->vdev),"iothread",
                               &error_abort);
 }
@@ -1135,7 +1133,6 @@
     DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
                        DEV_NVECTORS_UNSPECIFIED),
     DEFINE_VIRTIO_SCSI_FEATURES(VirtIOPCIProxy, host_features),
-    DEFINE_VIRTIO_SCSI_PROPERTIES(VirtIOSCSIPCI, vdev.parent_obj.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1185,8 +1182,11 @@
 static void virtio_scsi_pci_instance_init(Object *obj)
 {
     VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SCSI);
+    object_property_add_alias(obj, "iothread", OBJECT(&dev->vdev), "iothread",
+                              &error_abort);
 }
 
 static const TypeInfo virtio_scsi_pci_info = {
@@ -1203,7 +1203,6 @@
 static Property vhost_scsi_pci_properties[] = {
     DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
                        DEV_NVECTORS_UNSPECIFIED),
-    DEFINE_VHOST_SCSI_PROPERTIES(VHostSCSIPCI, vdev.parent_obj.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1241,8 +1240,9 @@
 static void vhost_scsi_pci_instance_init(Object *obj)
 {
     VHostSCSIPCI *dev = VHOST_SCSI_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VHOST_SCSI);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_SCSI);
 }
 
 static const TypeInfo vhost_scsi_pci_info = {
@@ -1323,7 +1323,7 @@
     VirtIOBalloonPCI *dev = VIRTIO_BALLOON_PCI(obj);
     object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_BALLOON);
     object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
-
+    object_unref(OBJECT(&dev->vdev));
     object_property_add(obj, "guest-stats", "guest statistics",
                         balloon_pci_stats_get_all, NULL, NULL, dev,
                         NULL);
@@ -1385,7 +1385,6 @@
                     VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
     DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
     DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
-    DEFINE_VIRTIO_SERIAL_PROPERTIES(VirtIOSerialPCI, vdev.serial),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1406,8 +1405,9 @@
 static void virtio_serial_pci_instance_init(Object *obj)
 {
     VirtIOSerialPCI *dev = VIRTIO_SERIAL_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_SERIAL);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_SERIAL);
 }
 
 static const TypeInfo virtio_serial_pci_info = {
@@ -1425,8 +1425,6 @@
                     VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, false),
     DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
     DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features),
-    DEFINE_NIC_PROPERTIES(VirtIONetPCI, vdev.nic_conf),
-    DEFINE_VIRTIO_NET_PROPERTIES(VirtIONetPCI, vdev.net_conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1465,8 +1463,9 @@
 static void virtio_net_pci_instance_init(Object *obj)
 {
     VirtIONetPCI *dev = VIRTIO_NET_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_NET);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_NET);
 }
 
 static const TypeInfo virtio_net_pci_info = {
@@ -1480,7 +1479,6 @@
 /* virtio-rng-pci */
 
 static Property virtio_rng_pci_properties[] = {
-    DEFINE_VIRTIO_RNG_PROPERTIES(VirtIORngPCI, vdev.conf),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -1520,8 +1518,9 @@
 static void virtio_rng_initfn(Object *obj)
 {
     VirtIORngPCI *dev = VIRTIO_RNG_PCI(obj);
-    object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_RNG);
-    object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VIRTIO_RNG);
     object_property_add_link(obj, "rng", TYPE_RNG_BACKEND,
                              (Object **)&dev->vdev.conf.rng,
                              qdev_prop_allow_set_link_before_realize,
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 5c98180..2c236bf 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1123,6 +1123,17 @@
     }
 }
 
+void virtio_instance_init_common(Object *proxy_obj, void *data,
+                                 size_t vdev_size, const char *vdev_name)
+{
+    DeviceState *vdev = data;
+
+    object_initialize(vdev, vdev_size, vdev_name);
+    object_property_add_child(proxy_obj, "virtio-backend", OBJECT(vdev), NULL);
+    object_unref(OBJECT(vdev));
+    qdev_alias_all_properties(vdev, proxy_obj);
+}
+
 void virtio_init(VirtIODevice *vdev, const char *name,
                  uint16_t device_id, size_t config_size)
 {
diff --git a/include/elf.h b/include/elf.h
index 70107f0..a516584 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -473,14 +473,35 @@
 #define PPC_FEATURE_TRUE_LE             0x00000002
 #define PPC_FEATURE_PPC_LE              0x00000001
 
-/* Bits present in AT_HWCAP, primarily for Sparc32.  */
+/* Bits present in AT_HWCAP for Sparc.  */
 
-#define HWCAP_SPARC_FLUSH       1    /* CPU supports flush instruction. */
-#define HWCAP_SPARC_STBAR       2
-#define HWCAP_SPARC_SWAP        4
-#define HWCAP_SPARC_MULDIV      8
-#define HWCAP_SPARC_V9		16
-#define HWCAP_SPARC_ULTRA3	32
+#define HWCAP_SPARC_FLUSH               0x00000001
+#define HWCAP_SPARC_STBAR               0x00000002
+#define HWCAP_SPARC_SWAP                0x00000004
+#define HWCAP_SPARC_MULDIV              0x00000008
+#define HWCAP_SPARC_V9                  0x00000010
+#define HWCAP_SPARC_ULTRA3              0x00000020
+#define HWCAP_SPARC_BLKINIT             0x00000040
+#define HWCAP_SPARC_N2                  0x00000080
+#define HWCAP_SPARC_MUL32               0x00000100
+#define HWCAP_SPARC_DIV32               0x00000200
+#define HWCAP_SPARC_FSMULD              0x00000400
+#define HWCAP_SPARC_V8PLUS              0x00000800
+#define HWCAP_SPARC_POPC                0x00001000
+#define HWCAP_SPARC_VIS                 0x00002000
+#define HWCAP_SPARC_VIS2                0x00004000
+#define HWCAP_SPARC_ASI_BLK_INIT        0x00008000
+#define HWCAP_SPARC_FMAF                0x00010000
+#define HWCAP_SPARC_VIS3                0x00020000
+#define HWCAP_SPARC_HPC                 0x00040000
+#define HWCAP_SPARC_RANDOM              0x00080000
+#define HWCAP_SPARC_TRANS               0x00100000
+#define HWCAP_SPARC_FJFMAU              0x00200000
+#define HWCAP_SPARC_IMA                 0x00400000
+#define HWCAP_SPARC_ASI_CACHE_SPARING   0x00800000
+#define HWCAP_SPARC_PAUSE               0x01000000
+#define HWCAP_SPARC_CBCOND              0x02000000
+#define HWCAP_SPARC_CRYPTO              0x04000000
 
 /* Bits present in AT_HWCAP for s390.  */
 
diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
index c6b5129..a517753 100644
--- a/include/hw/elf_ops.h
+++ b/include/hw/elf_ops.h
@@ -147,18 +147,13 @@
         }
         i++;
     }
-    if (nsyms) {
-        syms = g_realloc(syms, nsyms * sizeof(*syms));
+    syms = g_realloc(syms, nsyms * sizeof(*syms));
 
-        qsort(syms, nsyms, sizeof(*syms), glue(symcmp, SZ));
-        for (i = 0; i < nsyms - 1; i++) {
-            if (syms[i].st_size == 0) {
-                syms[i].st_size = syms[i + 1].st_value - syms[i].st_value;
-            }
+    qsort(syms, nsyms, sizeof(*syms), glue(symcmp, SZ));
+    for (i = 0; i < nsyms - 1; i++) {
+        if (syms[i].st_size == 0) {
+            syms[i].st_size = syms[i + 1].st_value - syms[i].st_value;
         }
-    } else {
-        g_free(syms);
-        syms = NULL;
     }
 
     /* String table */
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 2e3a8f9..b61bedb 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -5,6 +5,7 @@
 #include "block/block.h"
 #include "hw/block/block.h"
 #include "sysemu/sysemu.h"
+#include "qemu/notify.h"
 
 #define MAX_SCSI_DEVS	255
 
@@ -50,17 +51,25 @@
     uint32_t          tag;
     uint32_t          lun;
     uint32_t          status;
+    void              *hba_private;
     size_t            resid;
     SCSICommand       cmd;
+    NotifierList      cancel_notifiers;
+
+    /* Note:
+     * - fields before sense are initialized by scsi_req_alloc;
+     * - sense[] is uninitialized;
+     * - fields after sense are memset to 0 by scsi_req_alloc.
+     * */
+
+    uint8_t           sense[SCSI_SENSE_BUF_SIZE];
+    uint32_t          sense_len;
+    bool              enqueued;
+    bool              io_canceled;
+    bool              retry;
+    bool              dma_started;
     BlockDriverAIOCB  *aiocb;
     QEMUSGList        *sg;
-    bool              dma_started;
-    uint8_t sense[SCSI_SENSE_BUF_SIZE];
-    uint32_t sense_len;
-    bool enqueued;
-    bool io_canceled;
-    bool retry;
-    void *hba_private;
     QTAILQ_ENTRY(SCSIRequest) next;
 };
 
@@ -123,7 +132,6 @@
     int32_t (*send_command)(SCSIRequest *req, uint8_t *buf);
     void (*read_data)(SCSIRequest *req);
     void (*write_data)(SCSIRequest *req);
-    void (*cancel_io)(SCSIRequest *req);
     uint8_t *(*get_buf)(SCSIRequest *req);
 
     void (*save_request)(QEMUFile *f, SCSIRequest *req);
@@ -258,8 +266,9 @@
 void scsi_req_complete(SCSIRequest *req, int status);
 uint8_t *scsi_req_get_buf(SCSIRequest *req);
 int scsi_req_get_sense(SCSIRequest *req, uint8_t *buf, int len);
-void scsi_req_abort(SCSIRequest *req, int status);
+void scsi_req_cancel_complete(SCSIRequest *req);
 void scsi_req_cancel(SCSIRequest *req);
+void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier);
 void scsi_req_retry(SCSIRequest *req);
 void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense);
 void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense);
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 188a2d9..d6e5e79 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -17,6 +17,8 @@
 #include "hw/virtio/virtio.h"
 #include "hw/pci/pci.h"
 #include "hw/scsi/scsi.h"
+#include "sysemu/iothread.h"
+#include "hw/virtio/dataplane/vring.h"
 
 #define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
 #define VIRTIO_SCSI_COMMON(obj) \
@@ -151,8 +153,18 @@
     uint32_t cmd_per_lun;
     char *vhostfd;
     char *wwpn;
+    IOThread *iothread;
 };
 
+struct VirtIOSCSI;
+
+typedef struct {
+    struct VirtIOSCSI *parent;
+    Vring vring;
+    EventNotifier host_notifier;
+    EventNotifier guest_notifier;
+} VirtIOSCSIVring;
+
 typedef struct VirtIOSCSICommon {
     VirtIODevice parent_obj;
     VirtIOSCSIConf conf;
@@ -164,14 +176,74 @@
     VirtQueue **cmd_vqs;
 } VirtIOSCSICommon;
 
-typedef struct {
+typedef struct VirtIOSCSI {
     VirtIOSCSICommon parent_obj;
 
     SCSIBus bus;
     int resetting;
     bool events_dropped;
+
+    /* Fields for dataplane below */
+    AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
+
+    /* Vring is used instead of vq in dataplane code, because of the underlying
+     * memory layer thread safety */
+    VirtIOSCSIVring *ctrl_vring;
+    VirtIOSCSIVring *event_vring;
+    VirtIOSCSIVring **cmd_vrings;
+    bool dataplane_started;
+    bool dataplane_starting;
+    bool dataplane_stopping;
+    bool dataplane_disabled;
+    Notifier migration_state_notifier;
 } VirtIOSCSI;
 
+typedef struct VirtIOSCSIReq {
+    VirtIOSCSI *dev;
+    VirtQueue *vq;
+    QEMUSGList qsgl;
+    QEMUIOVector resp_iov;
+
+    /* Note:
+     * - fields before elem are initialized by virtio_scsi_init_req;
+     * - elem is uninitialized at the time of allocation.
+     * - fields after elem are zeroed by virtio_scsi_init_req.
+     * */
+
+    VirtQueueElement elem;
+    /* Set by dataplane code. */
+    VirtIOSCSIVring *vring;
+
+    union {
+        /* Used for two-stage request submission */
+        QTAILQ_ENTRY(VirtIOSCSIReq) next;
+
+        /* Used for cancellation of request during TMFs */
+        int remaining;
+    };
+
+    SCSIRequest *sreq;
+    size_t resp_size;
+    enum SCSIXferMode mode;
+    union {
+        VirtIOSCSICmdResp     cmd;
+        VirtIOSCSICtrlTMFResp tmf;
+        VirtIOSCSICtrlANResp  an;
+        VirtIOSCSIEvent       event;
+    } resp;
+    union {
+        struct {
+            VirtIOSCSICmdReq  cmd;
+            uint8_t           cdb[];
+        } QEMU_PACKED;
+        VirtIOSCSICtrlTMFReq  tmf;
+        VirtIOSCSICtrlANReq   an;
+    } req;
+} VirtIOSCSIReq;
+
+QEMU_BUILD_BUG_ON(offsetof(VirtIOSCSIReq, req.cdb) !=
+                  offsetof(VirtIOSCSIReq, req.cmd) + sizeof(VirtIOSCSICmdReq));
+
 #define DEFINE_VIRTIO_SCSI_PROPERTIES(_state, _conf_field)                     \
     DEFINE_PROP_UINT32("num_queues", _state, _conf_field.num_queues, 1),       \
     DEFINE_PROP_UINT32("max_sectors", _state, _conf_field.max_sectors, 0xFFFF),\
@@ -192,5 +264,19 @@
                                 HandleOutput cmd);
 
 void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp);
+void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req);
+bool virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req);
+void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req);
+VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq);
+void virtio_scsi_free_req(VirtIOSCSIReq *req);
+void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
+                            uint32_t event, uint32_t reason);
+
+void virtio_scsi_set_iothread(VirtIOSCSI *s, IOThread *iothread);
+void virtio_scsi_dataplane_start(VirtIOSCSI *s);
+void virtio_scsi_dataplane_stop(VirtIOSCSI *s);
+void virtio_scsi_vring_push_notify(VirtIOSCSIReq *req);
+VirtIOSCSIReq *virtio_scsi_pop_req_vring(VirtIOSCSI *s,
+                                         VirtIOSCSIVring *vring);
 
 #endif /* _QEMU_VIRTIO_SCSI_H */
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index a60104c..0726d76 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -161,6 +161,9 @@
     int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id);
 } VirtioDeviceClass;
 
+void virtio_instance_init_common(Object *proxy_obj, void *data,
+                                 size_t vdev_size, const char *vdev_name);
+
 void virtio_init(VirtIODevice *vdev, const char *name,
                          uint16_t device_id, size_t config_size);
 void virtio_cleanup(VirtIODevice *vdev);
diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index 1babd5d..edf4f17 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -88,10 +88,19 @@
 int slow_bitmap_intersects(const unsigned long *bitmap1,
                            const unsigned long *bitmap2, long bits);
 
-static inline unsigned long *bitmap_new(long nbits)
+static inline unsigned long *bitmap_try_new(long nbits)
 {
     long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-    return g_malloc0(len);
+    return g_try_malloc0(len);
+}
+
+static inline unsigned long *bitmap_new(long nbits)
+{
+    unsigned long *ptr = bitmap_try_new(nbits);
+    if (ptr == NULL) {
+        abort();
+    }
+    return ptr;
 }
 
 static inline void bitmap_zero(unsigned long *dst, long nbits)
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 155b358..ac7c4c4 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -24,6 +24,12 @@
 #define QEMU_WARN_UNUSED_RESULT
 #endif
 
+#if QEMU_GNUC_PREREQ(4, 3)
+#define QEMU_ARTIFICIAL __attribute__((always_inline, artificial))
+#else
+#define QEMU_ARTIFICIAL
+#endif
+
 #if defined(_WIN32)
 # define QEMU_PACKED __attribute__((gcc_struct, packed))
 #else
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 56dae66..987c0bd 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -1007,7 +1007,7 @@
     tcg_out_adr(s, TCG_REG_X3, lb->raddr);
     tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
     if (opc & MO_SIGN) {
-        tcg_out_sxt(s, TCG_TYPE_I64, size, lb->datalo_reg, TCG_REG_X0);
+        tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
     } else {
         tcg_out_mov(s, size == MO_64, lb->datalo_reg, TCG_REG_X0);
     }
@@ -1032,7 +1032,7 @@
 }
 
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
-                                TCGReg data_reg, TCGReg addr_reg,
+                                TCGType ext, TCGReg data_reg, TCGReg addr_reg,
                                 int mem_index, tcg_insn_unit *raddr,
                                 tcg_insn_unit *label_ptr)
 {
@@ -1040,6 +1040,7 @@
 
     label->is_ld = is_ld;
     label->opc = opc;
+    label->type = ext;
     label->datalo_reg = data_reg;
     label->addrlo_reg = addr_reg;
     label->mem_index = mem_index;
@@ -1108,7 +1109,7 @@
 
 #endif /* CONFIG_SOFTMMU */
 
-static void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp memop,
+static void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp memop, TCGType ext,
                                    TCGReg data_r, TCGReg addr_r, TCGReg off_r)
 {
     const TCGMemOp bswap = memop & MO_BSWAP;
@@ -1118,7 +1119,8 @@
         tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, off_r);
         break;
     case MO_SB:
-        tcg_out_ldst_r(s, I3312_LDRSBX, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
+                       data_r, addr_r, off_r);
         break;
     case MO_UW:
         tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
@@ -1130,9 +1132,10 @@
         if (bswap) {
             tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
             tcg_out_rev16(s, data_r, data_r);
-            tcg_out_sxt(s, TCG_TYPE_I64, MO_16, data_r, data_r);
+            tcg_out_sxt(s, ext, MO_16, data_r, data_r);
         } else {
-            tcg_out_ldst_r(s, I3312_LDRSHX, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, ext ? I3312_LDRSHX : I3312_LDRSHW,
+                           data_r, addr_r, off_r);
         }
         break;
     case MO_UL:
@@ -1197,18 +1200,18 @@
 }
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp memop, int mem_index)
+                            TCGMemOp memop, TCGType ext, int mem_index)
 {
 #ifdef CONFIG_SOFTMMU
     TCGMemOp s_bits = memop & MO_SIZE;
     tcg_insn_unit *label_ptr;
 
     tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
-    tcg_out_qemu_ld_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, true, memop, data_reg, addr_reg,
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg, TCG_REG_X1);
+    add_qemu_ldst_label(s, true, memop, ext, data_reg, addr_reg,
                         mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    tcg_out_qemu_ld_direct(s, memop, data_reg, addr_reg,
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg,
                            GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
 #endif /* CONFIG_SOFTMMU */
 }
@@ -1222,7 +1225,7 @@
 
     tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
     tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, false, memop, data_reg, addr_reg,
+    add_qemu_ldst_label(s, false, memop, s_bits == MO_64, data_reg, addr_reg,
                         mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg,
@@ -1515,7 +1518,7 @@
 
     case INDEX_op_qemu_ld_i32:
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, a0, a1, a2, args[3]);
+        tcg_out_qemu_ld(s, a0, a1, a2, ext, args[3]);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 40f2ec1..0c4b028 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -197,8 +197,8 @@
 #define ARITH_XOR  (INSN_OP(2) | INSN_OP3(0x03))
 #define ARITH_SUB  (INSN_OP(2) | INSN_OP3(0x04))
 #define ARITH_SUBCC (INSN_OP(2) | INSN_OP3(0x14))
-#define ARITH_ADDX (INSN_OP(2) | INSN_OP3(0x08))
-#define ARITH_SUBX (INSN_OP(2) | INSN_OP3(0x0c))
+#define ARITH_ADDC (INSN_OP(2) | INSN_OP3(0x08))
+#define ARITH_SUBC (INSN_OP(2) | INSN_OP3(0x0c))
 #define ARITH_UMUL (INSN_OP(2) | INSN_OP3(0x0a))
 #define ARITH_SMUL (INSN_OP(2) | INSN_OP3(0x0b))
 #define ARITH_UDIV (INSN_OP(2) | INSN_OP3(0x0e))
@@ -209,6 +209,9 @@
 #define ARITH_MOVCC (INSN_OP(2) | INSN_OP3(0x2c))
 #define ARITH_MOVR (INSN_OP(2) | INSN_OP3(0x2f))
 
+#define ARITH_ADDXC (INSN_OP(2) | INSN_OP3(0x36) | INSN_OPF(0x11))
+#define ARITH_UMULXHI (INSN_OP(2) | INSN_OP3(0x36) | INSN_OPF(0x16))
+
 #define SHIFT_SLL  (INSN_OP(2) | INSN_OP3(0x25))
 #define SHIFT_SRL  (INSN_OP(2) | INSN_OP3(0x26))
 #define SHIFT_SRA  (INSN_OP(2) | INSN_OP3(0x27))
@@ -262,6 +265,10 @@
 #define STW_LE     (STWA  | INSN_ASI(ASI_PRIMARY_LITTLE))
 #define STX_LE     (STXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
 
+#ifndef use_vis3_instructions
+bool use_vis3_instructions;
+#endif
+
 static inline int check_fit_i64(int64_t val, unsigned int bits)
 {
     return val == sextract64(val, 0, bits);
@@ -657,7 +664,7 @@
 static void tcg_out_setcond_i32(TCGContext *s, TCGCond cond, TCGReg ret,
                                 TCGReg c1, int32_t c2, int c2const)
 {
-    /* For 32-bit comparisons, we can play games with ADDX/SUBX.  */
+    /* For 32-bit comparisons, we can play games with ADDC/SUBC.  */
     switch (cond) {
     case TCG_COND_LTU:
     case TCG_COND_GEU:
@@ -668,9 +675,12 @@
     case TCG_COND_NE:
         /* For equality, we can transform to inequality vs zero.  */
         if (c2 != 0) {
-            tcg_out_arithc(s, ret, c1, c2, c2const, ARITH_XOR);
+            tcg_out_arithc(s, TCG_REG_T1, c1, c2, c2const, ARITH_XOR);
+            c2 = TCG_REG_T1;
+        } else {
+            c2 = c1;
         }
-        c1 = TCG_REG_G0, c2 = ret, c2const = 0;
+        c1 = TCG_REG_G0, c2const = 0;
         cond = (cond == TCG_COND_EQ ? TCG_COND_GEU : TCG_COND_LTU);
 	break;
 
@@ -698,15 +708,32 @@
 
     tcg_out_cmp(s, c1, c2, c2const);
     if (cond == TCG_COND_LTU) {
-        tcg_out_arithi(s, ret, TCG_REG_G0, 0, ARITH_ADDX);
+        tcg_out_arithi(s, ret, TCG_REG_G0, 0, ARITH_ADDC);
     } else {
-        tcg_out_arithi(s, ret, TCG_REG_G0, -1, ARITH_SUBX);
+        tcg_out_arithi(s, ret, TCG_REG_G0, -1, ARITH_SUBC);
     }
 }
 
 static void tcg_out_setcond_i64(TCGContext *s, TCGCond cond, TCGReg ret,
                                 TCGReg c1, int32_t c2, int c2const)
 {
+    if (use_vis3_instructions) {
+        switch (cond) {
+        case TCG_COND_NE:
+            if (c2 != 0) {
+                break;
+            }
+            c2 = c1, c2const = 0, c1 = TCG_REG_G0;
+            /* FALLTHRU */
+        case TCG_COND_LTU:
+            tcg_out_cmp(s, c1, c2, c2const);
+            tcg_out_arith(s, ret, TCG_REG_G0, TCG_REG_G0, ARITH_ADDXC);
+            return;
+        default:
+            break;
+        }
+    }
+
     /* For 64-bit signed comparisons vs zero, we can avoid the compare
        if the input does not overlap the output.  */
     if (c2 == 0 && !is_unsigned_cond(cond) && c1 != ret) {
@@ -719,9 +746,9 @@
     }
 }
 
-static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh,
-                            TCGReg al, TCGReg ah, int32_t bl, int blconst,
-                            int32_t bh, int bhconst, int opl, int oph)
+static void tcg_out_addsub2_i32(TCGContext *s, TCGReg rl, TCGReg rh,
+                                TCGReg al, TCGReg ah, int32_t bl, int blconst,
+                                int32_t bh, int bhconst, int opl, int oph)
 {
     TCGReg tmp = TCG_REG_T1;
 
@@ -735,6 +762,54 @@
     tcg_out_mov(s, TCG_TYPE_I32, rl, tmp);
 }
 
+static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, TCGReg rh,
+                                TCGReg al, TCGReg ah, int32_t bl, int blconst,
+                                int32_t bh, int bhconst, bool is_sub)
+{
+    TCGReg tmp = TCG_REG_T1;
+
+    /* Note that the low parts are fully consumed before tmp is set.  */
+    if (rl != ah && (bhconst || rl != bh)) {
+        tmp = rl;
+    }
+
+    tcg_out_arithc(s, tmp, al, bl, blconst, is_sub ? ARITH_SUBCC : ARITH_ADDCC);
+
+    if (use_vis3_instructions && !is_sub) {
+        /* Note that ADDXC doesn't accept immediates.  */
+        if (bhconst && bh != 0) {
+           tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh);
+           bh = TCG_REG_T2;
+        }
+        tcg_out_arith(s, rh, ah, bh, ARITH_ADDXC);
+    } else if (bh == TCG_REG_G0) {
+	/* If we have a zero, we can perform the operation in two insns,
+           with the arithmetic first, and a conditional move into place.  */
+	if (rh == ah) {
+            tcg_out_arithi(s, TCG_REG_T2, ah, 1,
+			   is_sub ? ARITH_SUB : ARITH_ADD);
+            tcg_out_movcc(s, TCG_COND_LTU, MOVCC_XCC, rh, TCG_REG_T2, 0);
+	} else {
+            tcg_out_arithi(s, rh, ah, 1, is_sub ? ARITH_SUB : ARITH_ADD);
+	    tcg_out_movcc(s, TCG_COND_GEU, MOVCC_XCC, rh, ah, 0);
+	}
+    } else {
+        /* Otherwise adjust BH as if there is carry into T2 ... */
+        if (bhconst) {
+            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_T2, bh + (is_sub ? -1 : 1));
+        } else {
+            tcg_out_arithi(s, TCG_REG_T2, bh, 1,
+                           is_sub ? ARITH_SUB : ARITH_ADD);
+        }
+        /* ... smoosh T2 back to original BH if carry is clear ... */
+        tcg_out_movcc(s, TCG_COND_GEU, MOVCC_XCC, TCG_REG_T2, bh, bhconst);
+	/* ... and finally perform the arithmetic with the new operand.  */
+        tcg_out_arith(s, rh, ah, TCG_REG_T2, is_sub ? ARITH_SUB : ARITH_ADD);
+    }
+
+    tcg_out_mov(s, TCG_TYPE_I64, rl, tmp);
+}
+
 static void tcg_out_call_nodelay(TCGContext *s, tcg_insn_unit *dest)
 {
     ptrdiff_t disp = tcg_pcrel_diff(s, dest);
@@ -1264,12 +1339,14 @@
         break;
 
     case INDEX_op_add2_i32:
-        tcg_out_addsub2(s, a0, a1, a2, args[3], args[4], const_args[4],
-                        args[5], const_args[5], ARITH_ADDCC, ARITH_ADDX);
+        tcg_out_addsub2_i32(s, args[0], args[1], args[2], args[3],
+                            args[4], const_args[4], args[5], const_args[5],
+                            ARITH_ADDCC, ARITH_ADDC);
         break;
     case INDEX_op_sub2_i32:
-        tcg_out_addsub2(s, a0, a1, a2, args[3], args[4], const_args[4],
-                        args[5], const_args[5], ARITH_SUBCC, ARITH_SUBX);
+        tcg_out_addsub2_i32(s, args[0], args[1], args[2], args[3],
+                            args[4], const_args[4], args[5], const_args[5],
+                            ARITH_SUBCC, ARITH_SUBC);
         break;
     case INDEX_op_mulu2_i32:
         c = ARITH_UMUL;
@@ -1351,6 +1428,17 @@
     case INDEX_op_movcond_i64:
         tcg_out_movcond_i64(s, args[5], a0, a1, a2, c2, args[3], const_args[3]);
         break;
+    case INDEX_op_add2_i64:
+        tcg_out_addsub2_i64(s, args[0], args[1], args[2], args[3], args[4],
+                            const_args[4], args[5], const_args[5], false);
+        break;
+    case INDEX_op_sub2_i64:
+        tcg_out_addsub2_i64(s, args[0], args[1], args[2], args[3], args[4],
+                            const_args[4], args[5], const_args[5], true);
+        break;
+    case INDEX_op_muluh_i64:
+        tcg_out_arith(s, args[0], args[1], args[2], ARITH_UMULXHI);
+        break;
 
     gen_arith:
         tcg_out_arithc(s, a0, a1, a2, c2, c);
@@ -1449,6 +1537,10 @@
     { INDEX_op_setcond_i64, { "R", "RZ", "RJ" } },
     { INDEX_op_movcond_i64, { "R", "RZ", "RJ", "RI", "0" } },
 
+    { INDEX_op_add2_i64, { "R", "R", "RZ", "RZ", "RJ", "RI" } },
+    { INDEX_op_sub2_i64, { "R", "R", "RZ", "RZ", "RJ", "RI" } },
+    { INDEX_op_muluh_i64, { "R", "RZ", "RZ" } },
+
     { INDEX_op_qemu_ld_i32, { "r", "A" } },
     { INDEX_op_qemu_ld_i64, { "R", "A" } },
     { INDEX_op_qemu_st_i32, { "sZ", "A" } },
@@ -1459,6 +1551,15 @@
 
 static void tcg_target_init(TCGContext *s)
 {
+    /* Only probe for the platform and capabilities if we havn't already
+       determined maximum values at compile time.  */
+#ifndef use_vis3_instructions
+    {
+        unsigned long hwcap = qemu_getauxval(AT_HWCAP);
+        use_vis3_instructions = (hwcap & HWCAP_SPARC_VIS3) != 0;
+    }
+#endif
+
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffffffff);
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, ALL_64);
 
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 089f976..0c4c8af 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -85,6 +85,12 @@
 #define TCG_TARGET_EXTEND_ARGS 1
 #endif
 
+#if defined(__VIS__) && __VIS__ >= 0x300
+#define use_vis3_instructions  1
+#else
+extern bool use_vis3_instructions;
+#endif
+
 /* optional instructions */
 #define TCG_TARGET_HAS_div_i32		1
 #define TCG_TARGET_HAS_rem_i32		0
@@ -133,11 +139,11 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
 #define TCG_TARGET_HAS_movcond_i64      1
-#define TCG_TARGET_HAS_add2_i64         0
-#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_add2_i64         1
+#define TCG_TARGET_HAS_sub2_i64         1
 #define TCG_TARGET_HAS_mulu2_i64        0
 #define TCG_TARGET_HAS_muls2_i64        0
-#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_muluh_i64        use_vis3_instructions
 #define TCG_TARGET_HAS_mulsh_i64        0
 
 #define TCG_AREG0 TCG_REG_I0
diff --git a/tcg/tcg-be-ldst.h b/tcg/tcg-be-ldst.h
index 49b3de6..429cba2 100644
--- a/tcg/tcg-be-ldst.h
+++ b/tcg/tcg-be-ldst.h
@@ -24,8 +24,9 @@
 #define TCG_MAX_QEMU_LDST       640
 
 typedef struct TCGLabelQemuLdst {
-    bool is_ld:1;           /* qemu_ld: true, qemu_st: false */
-    TCGMemOp opc:4;
+    bool is_ld;             /* qemu_ld: true, qemu_st: false */
+    TCGMemOp opc;
+    TCGType type;           /* result type of a load */
     TCGReg addrlo_reg;      /* reg index for low word of guest virtual addr */
     TCGReg addrhi_reg;      /* reg index for high word of guest virtual addr */
     TCGReg datalo_reg;      /* reg index for low word to be loaded or stored */
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 997a704..7285f71 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -274,75 +274,54 @@
 
 typedef tcg_target_ulong TCGArg;
 
-/* Define a type and accessor macros for variables.  Using a struct is
-   nice because it gives some level of type safely.  Ideally the compiler
-   be able to see through all this.  However in practice this is not true,
-   especially on targets with braindamaged ABIs (e.g. i386).
-   We use plain int by default to avoid this runtime overhead.
-   Users of tcg_gen_* don't need to know about any of this, and should
-   treat TCGv as an opaque type.
+/* Define a type and accessor macros for variables.  Using pointer types
+   is nice because it gives some level of type safely.  Converting to and
+   from intptr_t rather than int reduces the number of sign-extension
+   instructions that get implied on 64-bit hosts.  Users of tcg_gen_* don't
+   need to know about any of this, and should treat TCGv as an opaque type.
    In addition we do typechecking for different types of variables.  TCGv_i32
    and TCGv_i64 are 32/64-bit variables respectively.  TCGv and TCGv_ptr
-   are aliases for target_ulong and host pointer sized values respectively.
- */
+   are aliases for target_ulong and host pointer sized values respectively.  */
 
-#ifdef CONFIG_DEBUG_TCG
-#define DEBUG_TCGV 1
-#endif
+typedef struct TCGv_i32_d *TCGv_i32;
+typedef struct TCGv_i64_d *TCGv_i64;
+typedef struct TCGv_ptr_d *TCGv_ptr;
 
-#ifdef DEBUG_TCGV
-
-typedef struct
+static inline TCGv_i32 QEMU_ARTIFICIAL MAKE_TCGV_I32(intptr_t i)
 {
-    int i32;
-} TCGv_i32;
+    return (TCGv_i32)i;
+}
 
-typedef struct
+static inline TCGv_i64 QEMU_ARTIFICIAL MAKE_TCGV_I64(intptr_t i)
 {
-    int i64;
-} TCGv_i64;
+    return (TCGv_i64)i;
+}
 
-typedef struct {
-    int iptr;
-} TCGv_ptr;
+static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
+{
+    return (TCGv_ptr)i;
+}
 
-#define MAKE_TCGV_I32(i) __extension__                  \
-    ({ TCGv_i32 make_tcgv_tmp = {i}; make_tcgv_tmp;})
-#define MAKE_TCGV_I64(i) __extension__                  \
-    ({ TCGv_i64 make_tcgv_tmp = {i}; make_tcgv_tmp;})
-#define MAKE_TCGV_PTR(i) __extension__                  \
-    ({ TCGv_ptr make_tcgv_tmp = {i}; make_tcgv_tmp; })
-#define GET_TCGV_I32(t) ((t).i32)
-#define GET_TCGV_I64(t) ((t).i64)
-#define GET_TCGV_PTR(t) ((t).iptr)
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
+{
+    return (intptr_t)t;
+}
+
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I64(TCGv_i64 t)
+{
+    return (intptr_t)t;
+}
+
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
+{
+    return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
 #endif
 
-#else /* !DEBUG_TCGV */
-
-typedef int TCGv_i32;
-typedef int TCGv_i64;
-#if TCG_TARGET_REG_BITS == 32
-#define TCGv_ptr TCGv_i32
-#else
-#define TCGv_ptr TCGv_i64
-#endif
-#define MAKE_TCGV_I32(x) (x)
-#define MAKE_TCGV_I64(x) (x)
-#define MAKE_TCGV_PTR(x) (x)
-#define GET_TCGV_I32(t) (t)
-#define GET_TCGV_I64(t) (t)
-#define GET_TCGV_PTR(t) (t)
-
-#if TCG_TARGET_REG_BITS == 32
-#define TCGV_LOW(t) (t)
-#define TCGV_HIGH(t) ((t) + 1)
-#endif
-
-#endif /* DEBUG_TCGV */
-
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
diff --git a/ui/qemu-pixman.c b/ui/qemu-pixman.c
index 30c7fdd..1f6fea5 100644
--- a/ui/qemu-pixman.c
+++ b/ui/qemu-pixman.c
@@ -80,7 +80,7 @@
         case 24:
             return PIXMAN_b8g8r8;
         case 32:
-            return PIXMAN_b8g8r8a8;
+            return PIXMAN_b8g8r8x8;
         break;
         }
     }
diff --git a/vl.c b/vl.c
index dbdca59..9d2aaaf 100644
--- a/vl.c
+++ b/vl.c
@@ -134,6 +134,7 @@
 ram_addr_t ram_size;
 const char *mem_path = NULL;
 int mem_prealloc = 0; /* force preallocation of physical target memory */
+bool enable_mlock = false;
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart;
@@ -1421,12 +1422,8 @@
 
 }
 
-static void configure_realtime(QemuOpts *opts)
+static void realtime_init(void)
 {
-    bool enable_mlock;
-
-    enable_mlock = qemu_opt_get_bool(opts, "mlock", true);
-
     if (enable_mlock) {
         if (os_mlock() < 0) {
             fprintf(stderr, "qemu: locking memory failed\n");
@@ -3974,7 +3971,7 @@
                 if (!opts) {
                     exit(1);
                 }
-                configure_realtime(opts);
+                enable_mlock = qemu_opt_get_bool(opts, "mlock", true);
                 break;
             case QEMU_OPTION_msg:
                 opts = qemu_opts_parse(qemu_find_opts("msg"), optarg, 0);
@@ -4442,6 +4439,8 @@
 
     machine_class->init(current_machine);
 
+    realtime_init();
+
     audio_init();
 
     cpu_synchronize_all_post_init();