Merge remote-tracking branch 'remotes/kraxel/tags/pull-vga-20150707-1' into staging

virtio-gpu property fixes, add testcase

# gpg: Signature made Tue Jul  7 10:24:16 2015 BST using RSA key ID D3E87138
# gpg: Good signature from "Gerd Hoffmann (work) <kraxel@redhat.com>"
# gpg:                 aka "Gerd Hoffmann <gerd@kraxel.org>"
# gpg:                 aka "Gerd Hoffmann (private) <kraxel@gmail.com>"

* remotes/kraxel/tags/pull-vga-20150707-1:
  virtio-gpu: add to display-vga test
  virtio-gpu: use virtio_instance_init_common, fixup properties
  virtio-gpu: update console device property.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/cpu-exec.c b/cpu-exec.c
index 2ffeb6e..b2724c1 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -27,6 +27,7 @@
 #include "exec/address-spaces.h"
 #include "exec/memory-internal.h"
 #include "qemu/rcu.h"
+#include "exec/tb-hash.h"
 
 /* -icount align implementation. */
 
diff --git a/cpus.c b/cpus.c
index 4f0e54d..f547aeb 100644
--- a/cpus.c
+++ b/cpus.c
@@ -954,7 +954,7 @@
     CPUState *cpu = arg;
     int r;
 
-    qemu_mutex_lock(&qemu_global_mutex);
+    qemu_mutex_lock_iothread();
     qemu_thread_get_self(cpu->thread);
     cpu->thread_id = qemu_get_thread_id();
     cpu->can_do_io = 1;
@@ -1034,10 +1034,10 @@
 {
     CPUState *cpu = arg;
 
+    qemu_mutex_lock_iothread();
     qemu_tcg_init_cpu_signals();
     qemu_thread_get_self(cpu->thread);
 
-    qemu_mutex_lock(&qemu_global_mutex);
     CPU_FOREACH(cpu) {
         cpu->thread_id = qemu_get_thread_id();
         cpu->created = true;
@@ -1146,10 +1146,21 @@
     return current_cpu && qemu_cpu_is_self(current_cpu);
 }
 
+static __thread bool iothread_locked = false;
+
+bool qemu_mutex_iothread_locked(void)
+{
+    return iothread_locked;
+}
+
 void qemu_mutex_lock_iothread(void)
 {
     atomic_inc(&iothread_requesting_mutex);
-    if (!tcg_enabled() || !first_cpu || !first_cpu->thread) {
+    /* In the simple case there is no need to bump the VCPU thread out of
+     * TCG code execution.
+     */
+    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
+        !first_cpu || !first_cpu->thread) {
         qemu_mutex_lock(&qemu_global_mutex);
         atomic_dec(&iothread_requesting_mutex);
     } else {
@@ -1160,10 +1171,12 @@
         atomic_dec(&iothread_requesting_mutex);
         qemu_cond_broadcast(&qemu_io_proceeded_cond);
     }
+    iothread_locked = true;
 }
 
 void qemu_mutex_unlock_iothread(void)
 {
+    iothread_locked = false;
     qemu_mutex_unlock(&qemu_global_mutex);
 }
 
diff --git a/docs/multiseat.txt b/docs/multiseat.txt
index 814496e..ebf2446 100644
--- a/docs/multiseat.txt
+++ b/docs/multiseat.txt
@@ -2,8 +2,8 @@
 multiseat howto (with some multihead coverage)
 ==============================================
 
-host side
----------
+host devices
+------------
 
 First you must compile qemu with a user interface supporting
 multihead/multiseat and input event routing.  Right now this
@@ -41,6 +41,19 @@
 the window which belongs to the video.2 display adapter will be routed
 to these input devices.
 
+Starting with qemu 2.4 and linux kernel 4.1 you can also use virtio
+for the input devices, using this ...
+
+	-device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \
+	-device secondary-vga,bus=head.2,addr=02.0,id=video.2 \
+	-device virtio-keyboard-pci,bus=head.2,addr=03.0,display=video.2 \
+	-device virtio-tablet-pci,bus=head.2,addr=03.0,display=video.2
+
+... instead of xhci and usb hid devices.
+
+host ui
+-------
+
 The sdl2 ui will start up with two windows, one for each display
 device.  The gtk ui will start with a single window and each display
 in a separate tab.  You can either simply switch tabs to switch heads,
@@ -110,7 +123,7 @@
 guest side with pci-bridge-seat
 -------------------------------
 
-Qemu version FIXME and newer has a new pci-bridge-seat device which
+Qemu version 2.4 and newer has a new pci-bridge-seat device which
 can be used instead of pci-bridge.  Just swap the device name in the
 qemu command line above.  The only difference between the two devices
 is the pci id.  We can match the pci id instead of the device path
@@ -121,9 +134,10 @@
     SUBSYSTEM=="pci", ATTR{vendor}=="0x1b36", ATTR{device}=="0x000a", \
             TAG+="seat", ENV{ID_AUTOSEAT}="1"
 
-Patch with this rule will be submitted to upstream udev/systemd, so
-long-term, when systemd with this lands in distros, things will work
-just fine without any manual guest configuration.
+Patch with this rule has been submitted to upstream udev/systemd, was
+accepted and and should be included in the next systemd release (222).
+So, if your guest has this or a newer version, multiseat will work just
+fine without any manual guest configuration.
 
 Enjoy!
 
diff --git a/exec.c b/exec.c
index f7883d2..251dc79 100644
--- a/exec.c
+++ b/exec.c
@@ -48,6 +48,7 @@
 #endif
 #include "exec/cpu-all.h"
 #include "qemu/rcu_queue.h"
+#include "qemu/main-loop.h"
 #include "exec/cputlb.h"
 #include "translate-all.h"
 
@@ -352,6 +353,18 @@
     *xlat = addr + section->offset_within_region;
 
     mr = section->mr;
+
+    /* MMIO registers can be expected to perform full-width accesses based only
+     * on their address, without considering adjacent registers that could
+     * decode to completely different MemoryRegions.  When such registers
+     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
+     * regions overlap wildly.  For this reason we cannot clamp the accesses
+     * here.
+     *
+     * If the length is small (as is the case for address_space_ldl/stl),
+     * everything works fine.  If the incoming length is large, however,
+     * the caller really has to do the clamping through memory_access_size.
+     */
     if (memory_region_is_ram(mr)) {
         diff = int128_sub(section->size, int128_make64(addr));
         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
@@ -2316,6 +2329,29 @@
     return l;
 }
 
+static bool prepare_mmio_access(MemoryRegion *mr)
+{
+    bool unlocked = !qemu_mutex_iothread_locked();
+    bool release_lock = false;
+
+    if (unlocked && mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        unlocked = false;
+        release_lock = true;
+    }
+    if (mr->flush_coalesced_mmio) {
+        if (unlocked) {
+            qemu_mutex_lock_iothread();
+        }
+        qemu_flush_coalesced_mmio_buffer();
+        if (unlocked) {
+            qemu_mutex_unlock_iothread();
+        }
+    }
+
+    return release_lock;
+}
+
 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                              uint8_t *buf, int len, bool is_write)
 {
@@ -2325,6 +2361,7 @@
     hwaddr addr1;
     MemoryRegion *mr;
     MemTxResult result = MEMTX_OK;
+    bool release_lock = false;
 
     rcu_read_lock();
     while (len > 0) {
@@ -2333,6 +2370,7 @@
 
         if (is_write) {
             if (!memory_access_is_direct(mr, is_write)) {
+                release_lock |= prepare_mmio_access(mr);
                 l = memory_access_size(mr, l, addr1);
                 /* XXX: could force current_cpu to NULL to avoid
                    potential bugs */
@@ -2374,6 +2412,7 @@
         } else {
             if (!memory_access_is_direct(mr, is_write)) {
                 /* I/O case */
+                release_lock |= prepare_mmio_access(mr);
                 l = memory_access_size(mr, l, addr1);
                 switch (l) {
                 case 8:
@@ -2409,6 +2448,12 @@
                 memcpy(buf, ptr, l);
             }
         }
+
+        if (release_lock) {
+            qemu_mutex_unlock_iothread();
+            release_lock = false;
+        }
+
         len -= l;
         buf += l;
         addr += l;
@@ -2458,7 +2503,7 @@
 
         if (!(memory_region_is_ram(mr) ||
               memory_region_is_romd(mr))) {
-            /* do nothing */
+            l = memory_access_size(mr, l, addr1);
         } else {
             addr1 += memory_region_get_ram_addr(mr);
             /* ROM/RAM case */
@@ -2735,10 +2780,13 @@
     hwaddr l = 4;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l, false);
     if (l < 4 || !memory_access_is_direct(mr, false)) {
+        release_lock |= prepare_mmio_access(mr);
+
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -2771,6 +2819,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -2823,11 +2874,14 @@
     hwaddr l = 8;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  false);
     if (l < 8 || !memory_access_is_direct(mr, false)) {
+        release_lock |= prepare_mmio_access(mr);
+
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -2860,6 +2914,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -2932,11 +2989,14 @@
     hwaddr l = 2;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  false);
     if (l < 2 || !memory_access_is_direct(mr, false)) {
+        release_lock |= prepare_mmio_access(mr);
+
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -2969,6 +3029,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
     return val;
 }
@@ -3021,11 +3084,14 @@
     hwaddr addr1;
     MemTxResult r;
     uint8_t dirty_log_mask;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  true);
     if (l < 4 || !memory_access_is_direct(mr, true)) {
+        release_lock |= prepare_mmio_access(mr);
+
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
@@ -3040,6 +3106,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }
 
@@ -3060,11 +3129,14 @@
     hwaddr l = 4;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l,
                                  true);
     if (l < 4 || !memory_access_is_direct(mr, true)) {
+        release_lock |= prepare_mmio_access(mr);
+
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
             val = bswap32(val);
@@ -3096,6 +3168,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }
 
@@ -3165,10 +3240,13 @@
     hwaddr l = 2;
     hwaddr addr1;
     MemTxResult r;
+    bool release_lock = false;
 
     rcu_read_lock();
     mr = address_space_translate(as, addr, &addr1, &l, true);
     if (l < 2 || !memory_access_is_direct(mr, true)) {
+        release_lock |= prepare_mmio_access(mr);
+
 #if defined(TARGET_WORDS_BIGENDIAN)
         if (endian == DEVICE_LITTLE_ENDIAN) {
             val = bswap16(val);
@@ -3200,6 +3278,9 @@
     if (result) {
         *result = r;
     }
+    if (release_lock) {
+        qemu_mutex_unlock_iothread();
+    }
     rcu_read_unlock();
 }
 
diff --git a/hw/acpi/core.c b/hw/acpi/core.c
index 0f201d8..fe6215a 100644
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -528,6 +528,7 @@
     ar->tmr.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, acpi_pm_tmr_timer, ar);
     memory_region_init_io(&ar->tmr.io, memory_region_owner(parent),
                           &acpi_pm_tmr_ops, ar, "acpi-tmr", 4);
+    memory_region_clear_global_locking(&ar->tmr.io);
     memory_region_add_subregion(parent, 8, &ar->tmr.io);
 }
 
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 8a64ffb..f4dc7a8 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -192,7 +192,7 @@
     acpi_pm_tmr_reset(&pm->acpi_regs);
     acpi_gpe_reset(&pm->acpi_regs);
 
-    if (kvm_enabled()) {
+    if (!pm->smm_enabled) {
         /* Mark SMM as already inited to prevent SMM from running. KVM does not
          * support SMM mode. */
         pm->smi_en |= ICH9_PMIO_SMI_EN_APMC_EN;
@@ -209,7 +209,7 @@
     acpi_pm1_evt_power_down(&pm->acpi_regs);
 }
 
-void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
+void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, bool smm_enabled,
                   qemu_irq sci_irq)
 {
     memory_region_init(&pm->io, OBJECT(lpc_pci), "ich9-pm", ICH9_PMIO_SIZE);
@@ -231,6 +231,7 @@
                           "acpi-smi", 8);
     memory_region_add_subregion(&pm->io, ICH9_PMIO_SMI_EN, &pm->io_smi);
 
+    pm->smm_enabled = smm_enabled;
     pm->irq = sci_irq;
     qemu_register_reset(pm_reset, pm);
     pm->powerdown_notifier.notify = pm_powerdown_req;
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index 3bd1d5a..2cd2fee 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -72,7 +72,7 @@
 
     qemu_irq irq;
     qemu_irq smi_irq;
-    int kvm_enabled;
+    int smm_enabled;
     Notifier machine_ready;
     Notifier powerdown_notifier;
 
@@ -112,6 +112,9 @@
 
     /* ACPI specs 3.0, 4.7.2.5 */
     acpi_pm1_cnt_update(&s->ar, val == ACPI_ENABLE, val == ACPI_DISABLE);
+    if (val == ACPI_ENABLE || val == ACPI_DISABLE) {
+        return;
+    }
 
     if (d->config[0x5b] & (1 << 1)) {
         if (s->smi_irq) {
@@ -319,7 +322,7 @@
     pci_conf[0x40] = 0x01; /* PM io base read only bit */
     pci_conf[0x80] = 0;
 
-    if (s->kvm_enabled) {
+    if (!s->smm_enabled) {
         /* Mark SMM as already inited (until KVM supports SMM). */
         pci_conf[0x5B] = 0x02;
     }
@@ -450,7 +453,7 @@
     /* APM */
     apm_init(dev, &s->apm, apm_ctrl_changed, s);
 
-    if (s->kvm_enabled) {
+    if (!s->smm_enabled) {
         /* Mark SMM as already inited to prevent SMM from running.  KVM does not
          * support SMM mode. */
         pci_conf[0x5B] = 0x02;
@@ -501,7 +504,7 @@
 
 I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
                       qemu_irq sci_irq, qemu_irq smi_irq,
-                      int kvm_enabled, DeviceState **piix4_pm)
+                      int smm_enabled, DeviceState **piix4_pm)
 {
     DeviceState *dev;
     PIIX4PMState *s;
@@ -515,7 +518,7 @@
     s = PIIX4_PM(dev);
     s->irq = sci_irq;
     s->smi_irq = smi_irq;
-    s->kvm_enabled = kvm_enabled;
+    s->smm_enabled = smm_enabled;
     if (xen_enabled()) {
         s->use_acpi_pci_hotplug = false;
     }
diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c
index 278a2d1..3c58629 100644
--- a/hw/core/sysbus.c
+++ b/hw/core/sysbus.c
@@ -109,7 +109,13 @@
 
 void sysbus_connect_irq(SysBusDevice *dev, int n, qemu_irq irq)
 {
+    SysBusDeviceClass *sbd = SYS_BUS_DEVICE_GET_CLASS(dev);
+
     qdev_connect_gpio_out_named(DEVICE(dev), SYSBUS_DEVICE_GPIO_IRQ, n, irq);
+
+    if (sbd->connect_irq_notifier) {
+        sbd->connect_irq_notifier(dev, irq);
+    }
 }
 
 /* Check whether an MMIO region exists */
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 00818b9..aed811a 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1509,7 +1509,7 @@
      */
     if (hotplugabble_address_space_size) {
         numamem = acpi_data_push(table_data, sizeof *numamem);
-        acpi_build_srat_memory(numamem, pcms->hotplug_memory_base,
+        acpi_build_srat_memory(numamem, pcms->hotplug_memory.base,
                                hotplugabble_address_space_size, 0,
                                MEM_AFFINITY_HOTPLUGGABLE |
                                MEM_AFFINITY_ENABLED);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7072930..7959b44 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -64,7 +64,6 @@
 #include "hw/pci/pci_host.h"
 #include "acpi-build.h"
 #include "hw/mem/pc-dimm.h"
-#include "trace.h"
 #include "qapi/visitor.h"
 #include "qapi-visit.h"
 
@@ -1297,7 +1296,7 @@
             exit(EXIT_FAILURE);
         }
 
-        pcms->hotplug_memory_base =
+        pcms->hotplug_memory.base =
             ROUND_UP(0x100000000ULL + above_4g_mem_size, 1ULL << 30);
 
         if (pcms->enforce_aligned_dimm) {
@@ -1305,17 +1304,17 @@
             hotplug_mem_size += (1ULL << 30) * machine->ram_slots;
         }
 
-        if ((pcms->hotplug_memory_base + hotplug_mem_size) <
+        if ((pcms->hotplug_memory.base + hotplug_mem_size) <
             hotplug_mem_size) {
             error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
                          machine->maxram_size);
             exit(EXIT_FAILURE);
         }
 
-        memory_region_init(&pcms->hotplug_memory, OBJECT(pcms),
+        memory_region_init(&pcms->hotplug_memory.mr, OBJECT(pcms),
                            "hotplug-memory", hotplug_mem_size);
-        memory_region_add_subregion(system_memory, pcms->hotplug_memory_base,
-                                    &pcms->hotplug_memory);
+        memory_region_add_subregion(system_memory, pcms->hotplug_memory.base,
+                                    &pcms->hotplug_memory.mr);
     }
 
     /* Initialize PC system firmware */
@@ -1333,9 +1332,9 @@
     fw_cfg = bochs_bios_init();
     rom_set_fw(fw_cfg);
 
-    if (guest_info->has_reserved_memory && pcms->hotplug_memory_base) {
+    if (guest_info->has_reserved_memory && pcms->hotplug_memory.base) {
         uint64_t *val = g_malloc(sizeof(*val));
-        *val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory_base, 0x1ULL << 30));
+        *val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory.base, 0x1ULL << 30));
         fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
     }
 
@@ -1554,88 +1553,31 @@
 static void pc_dimm_plug(HotplugHandler *hotplug_dev,
                          DeviceState *dev, Error **errp)
 {
-    int slot;
     HotplugHandlerClass *hhc;
     Error *local_err = NULL;
     PCMachineState *pcms = PC_MACHINE(hotplug_dev);
-    MachineState *machine = MACHINE(hotplug_dev);
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
     MemoryRegion *mr = ddc->get_memory_region(dimm);
-    uint64_t existing_dimms_capacity = 0;
     uint64_t align = TARGET_PAGE_SIZE;
-    uint64_t addr;
-
-    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
-    if (local_err) {
-        goto out;
-    }
 
     if (memory_region_get_alignment(mr) && pcms->enforce_aligned_dimm) {
         align = memory_region_get_alignment(mr);
     }
 
-    addr = pc_dimm_get_free_addr(pcms->hotplug_memory_base,
-                                 memory_region_size(&pcms->hotplug_memory),
-                                 !addr ? NULL : &addr, align,
-                                 memory_region_size(mr), &local_err);
-    if (local_err) {
-        goto out;
-    }
-
-    existing_dimms_capacity = pc_existing_dimms_capacity(&local_err);
-    if (local_err) {
-        goto out;
-    }
-
-    if (existing_dimms_capacity + memory_region_size(mr) >
-        machine->maxram_size - machine->ram_size) {
-        error_setg(&local_err, "not enough space, currently 0x%" PRIx64
-                   " in use of total hot pluggable 0x" RAM_ADDR_FMT,
-                   existing_dimms_capacity,
-                   machine->maxram_size - machine->ram_size);
-        goto out;
-    }
-
-    object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err);
-    if (local_err) {
-        goto out;
-    }
-    trace_mhp_pc_dimm_assigned_address(addr);
-
-    slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err);
-    if (local_err) {
-        goto out;
-    }
-
-    slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? NULL : &slot,
-                                 machine->ram_slots, &local_err);
-    if (local_err) {
-        goto out;
-    }
-    object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err);
-    if (local_err) {
-        goto out;
-    }
-    trace_mhp_pc_dimm_assigned_slot(slot);
-
     if (!pcms->acpi_dev) {
         error_setg(&local_err,
                    "memory hotplug is not enabled: missing acpi device");
         goto out;
     }
 
-    if (kvm_enabled() && !kvm_has_free_slot(machine)) {
-        error_setg(&local_err, "hypervisor has no free memory slots left");
+    pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err);
+    if (local_err) {
         goto out;
     }
 
-    memory_region_add_subregion(&pcms->hotplug_memory,
-                                addr - pcms->hotplug_memory_base, mr);
-    vmstate_register_ram(mr, dev);
-
     hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev);
-    hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err);
+    hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &error_abort);
 out:
     error_propagate(errp, local_err);
 }
@@ -1677,9 +1619,7 @@
         goto out;
     }
 
-    memory_region_del_subregion(&pcms->hotplug_memory, mr);
-    vmstate_unregister_ram(mr, dev);
-
+    pc_dimm_memory_unplug(dev, &pcms->hotplug_memory, mr);
     object_unparent(OBJECT(dev));
 
  out:
@@ -1766,7 +1706,7 @@
                                           const char *name, Error **errp)
 {
     PCMachineState *pcms = PC_MACHINE(obj);
-    int64_t value = memory_region_size(&pcms->hotplug_memory);
+    int64_t value = memory_region_size(&pcms->hotplug_memory.mr);
 
     visit_type_int(v, &value, name, errp);
 }
@@ -1828,6 +1768,48 @@
     visit_type_OnOffAuto(v, &pcms->vmport, name, errp);
 }
 
+bool pc_machine_is_smm_enabled(PCMachineState *pcms)
+{
+    bool smm_available = false;
+
+    if (pcms->smm == ON_OFF_AUTO_OFF) {
+        return false;
+    }
+
+    if (tcg_enabled() || qtest_enabled()) {
+        smm_available = true;
+    } else if (kvm_enabled()) {
+        smm_available = kvm_has_smm();
+    }
+
+    if (smm_available) {
+        return true;
+    }
+
+    if (pcms->smm == ON_OFF_AUTO_ON) {
+        error_report("System Management Mode not supported by this hypervisor.");
+        exit(1);
+    }
+    return false;
+}
+
+static void pc_machine_get_smm(Object *obj, Visitor *v, void *opaque,
+                              const char *name, Error **errp)
+{
+    PCMachineState *pcms = PC_MACHINE(obj);
+    OnOffAuto smm = pcms->smm;
+
+    visit_type_OnOffAuto(v, &smm, name, errp);
+}
+
+static void pc_machine_set_smm(Object *obj, Visitor *v, void *opaque,
+                                  const char *name, Error **errp)
+{
+    PCMachineState *pcms = PC_MACHINE(obj);
+
+    visit_type_OnOffAuto(v, &pcms->smm, name, errp);
+}
+
 static bool pc_machine_get_aligned_dimm(Object *obj, Error **errp)
 {
     PCMachineState *pcms = PC_MACHINE(obj);
@@ -1852,6 +1834,15 @@
                                     "Maximum ram below the 4G boundary (32bit boundary)",
                                     NULL);
 
+    pcms->smm = ON_OFF_AUTO_AUTO;
+    object_property_add(obj, PC_MACHINE_SMM, "OnOffAuto",
+                        pc_machine_get_smm,
+                        pc_machine_set_smm,
+                        NULL, NULL, NULL);
+    object_property_set_description(obj, PC_MACHINE_SMM,
+                                    "Enable SMM (pc & q35)",
+                                    NULL);
+
     pcms->vmport = ON_OFF_AUTO_AUTO;
     object_property_add(obj, PC_MACHINE_VMPORT, "OnOffAuto",
                         pc_machine_get_vmport,
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index e142f75..56cdcb9 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -287,7 +287,8 @@
         /* TODO: Populate SPD eeprom data.  */
         smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100,
                               gsi[9], smi_irq,
-                              kvm_enabled(), &piix4_pm);
+                              pc_machine_is_smm_enabled(pc_machine),
+                              &piix4_pm);
         smbus_eeprom_init(smbus, 8, NULL, 0);
 
         object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
@@ -306,7 +307,11 @@
 
 static void pc_compat_2_3(MachineState *machine)
 {
+    PCMachineState *pcms = PC_MACHINE(machine);
     savevm_skip_section_footers();
+    if (kvm_enabled()) {
+        pcms->smm = ON_OFF_AUTO_OFF;
+    }
 }
 
 static void pc_compat_2_2(MachineState *machine)
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 082cd93..8aa3a67 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -253,7 +253,7 @@
                          (pc_machine->vmport != ON_OFF_AUTO_ON), 0xff0104);
 
     /* connect pm stuff to lpc */
-    ich9_lpc_pm_init(lpc);
+    ich9_lpc_pm_init(lpc, pc_machine_is_smm_enabled(pc_machine));
 
     /* ahci and SATA device, for q35 1 ahci controller is built-in */
     ahci = pci_create_simple_multifunction(host_bus,
@@ -290,7 +290,11 @@
 
 static void pc_compat_2_3(MachineState *machine)
 {
+    PCMachineState *pcms = PC_MACHINE(machine);
     savevm_skip_section_footers();
+    if (kvm_enabled()) {
+        pcms->smm = ON_OFF_AUTO_OFF;
+    }
 }
 
 static void pc_compat_2_2(MachineState *machine)
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index b4b65c1..bb6a92f 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -45,11 +45,11 @@
 } while (0)
 
 static void check_cmd(AHCIState *s, int port);
-static int handle_cmd(AHCIState *s,int port,int slot);
+static int handle_cmd(AHCIState *s, int port, uint8_t slot);
 static void ahci_reset_port(AHCIState *s, int port);
 static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis);
 static void ahci_init_d2h(AHCIDevice *ad);
-static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write);
+static int ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit);
 static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes);
 static bool ahci_map_clb_address(AHCIDevice *ad);
 static bool ahci_map_fis_address(AHCIDevice *ad);
@@ -106,8 +106,6 @@
         val = pr->scr_err;
         break;
     case PORT_SCR_ACT:
-        pr->scr_act &= ~s->dev[port].finished;
-        s->dev[port].finished = 0;
         val = pr->scr_act;
         break;
     case PORT_CMD_ISSUE:
@@ -331,8 +329,7 @@
     }
 }
 
-static uint64_t ahci_mem_read(void *opaque, hwaddr addr,
-                              unsigned size)
+static uint64_t ahci_mem_read_32(void *opaque, hwaddr addr)
 {
     AHCIState *s = opaque;
     uint32_t val = 0;
@@ -368,6 +365,30 @@
 }
 
 
+/**
+ * AHCI 1.3 section 3 ("HBA Memory Registers")
+ * Support unaligned 8/16/32 bit reads, and 64 bit aligned reads.
+ * Caller is responsible for masking unwanted higher order bytes.
+ */
+static uint64_t ahci_mem_read(void *opaque, hwaddr addr, unsigned size)
+{
+    hwaddr aligned = addr & ~0x3;
+    int ofst = addr - aligned;
+    uint64_t lo = ahci_mem_read_32(opaque, aligned);
+    uint64_t hi;
+
+    /* if < 8 byte read does not cross 4 byte boundary */
+    if (ofst + size <= 4) {
+        return lo >> (ofst * 8);
+    }
+    g_assert_cmpint(size, >, 1);
+
+    /* If the 64bit read is unaligned, we will produce undefined
+     * results. AHCI does not support unaligned 64bit reads. */
+    hi = ahci_mem_read_32(opaque, aligned + 4);
+    return (hi << 32 | lo) >> (ofst * 8);
+}
+
 
 static void ahci_mem_write(void *opaque, hwaddr addr,
                            uint64_t val, unsigned size)
@@ -483,7 +504,7 @@
 static void check_cmd(AHCIState *s, int port)
 {
     AHCIPortRegs *pr = &s->dev[port].port_regs;
-    int slot;
+    uint8_t slot;
 
     if ((pr->cmd & PORT_CMD_START) && pr->cmd_issue) {
         for (slot = 0; (slot < 32) && pr->cmd_issue; slot++) {
@@ -558,6 +579,7 @@
     /* reset ncq queue */
     for (i = 0; i < AHCI_MAX_CMDS; i++) {
         NCQTransferState *ncq_tfs = &s->dev[port].ncq_tfs[i];
+        ncq_tfs->halt = false;
         if (!ncq_tfs->used) {
             continue;
         }
@@ -642,14 +664,14 @@
     ad->lst = NULL;
 }
 
-static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished)
+static void ahci_write_fis_sdb(AHCIState *s, NCQTransferState *ncq_tfs)
 {
-    AHCIDevice *ad = &s->dev[port];
+    AHCIDevice *ad = ncq_tfs->drive;
     AHCIPortRegs *pr = &ad->port_regs;
     IDEState *ide_state;
     SDBFIS *sdb_fis;
 
-    if (!s->dev[port].res_fis ||
+    if (!ad->res_fis ||
         !(pr->cmd & PORT_CMD_FIS_RX)) {
         return;
     }
@@ -659,53 +681,35 @@
 
     sdb_fis->type = SATA_FIS_TYPE_SDB;
     /* Interrupt pending & Notification bit */
-    sdb_fis->flags = (ad->hba->control_regs.irqstatus ? (1 << 6) : 0);
+    sdb_fis->flags = 0x40; /* Interrupt bit, always 1 for NCQ */
     sdb_fis->status = ide_state->status & 0x77;
     sdb_fis->error = ide_state->error;
     /* update SAct field in SDB_FIS */
-    s->dev[port].finished |= finished;
     sdb_fis->payload = cpu_to_le32(ad->finished);
 
     /* Update shadow registers (except BSY 0x80 and DRQ 0x08) */
     pr->tfdata = (ad->port.ifs[0].error << 8) |
         (ad->port.ifs[0].status & 0x77) |
         (pr->tfdata & 0x88);
+    pr->scr_act &= ~ad->finished;
+    ad->finished = 0;
 
-    ahci_trigger_irq(s, ad, PORT_IRQ_SDB_FIS);
+    /* Trigger IRQ if interrupt bit is set (which currently, it always is) */
+    if (sdb_fis->flags & 0x40) {
+        ahci_trigger_irq(s, ad, PORT_IRQ_SDB_FIS);
+    }
 }
 
 static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len)
 {
     AHCIPortRegs *pr = &ad->port_regs;
-    uint8_t *pio_fis, *cmd_fis;
-    uint64_t tbl_addr;
-    dma_addr_t cmd_len = 0x80;
+    uint8_t *pio_fis;
     IDEState *s = &ad->port.ifs[0];
 
     if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) {
         return;
     }
 
-    /* map cmd_fis */
-    tbl_addr = le64_to_cpu(ad->cur_cmd->tbl_addr);
-    cmd_fis = dma_memory_map(ad->hba->as, tbl_addr, &cmd_len,
-                             DMA_DIRECTION_TO_DEVICE);
-
-    if (cmd_fis == NULL) {
-        DPRINTF(ad->port_no, "dma_memory_map failed in ahci_write_fis_pio");
-        ahci_trigger_irq(ad->hba, ad, PORT_IRQ_HBUS_ERR);
-        return;
-    }
-
-    if (cmd_len != 0x80) {
-        DPRINTF(ad->port_no,
-                "dma_memory_map mapped too few bytes in ahci_write_fis_pio");
-        dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len,
-                         DMA_DIRECTION_TO_DEVICE, cmd_len);
-        ahci_trigger_irq(ad->hba, ad, PORT_IRQ_HBUS_ERR);
-        return;
-    }
-
     pio_fis = &ad->res_fis[RES_FIS_PSFIS];
 
     pio_fis[0] = SATA_FIS_TYPE_PIO_SETUP;
@@ -721,8 +725,8 @@
     pio_fis[9] = s->hob_lcyl;
     pio_fis[10] = s->hob_hcyl;
     pio_fis[11] = 0;
-    pio_fis[12] = cmd_fis[12];
-    pio_fis[13] = cmd_fis[13];
+    pio_fis[12] = s->nsector & 0xFF;
+    pio_fis[13] = (s->nsector >> 8) & 0xFF;
     pio_fis[14] = 0;
     pio_fis[15] = s->status;
     pio_fis[16] = len & 255;
@@ -739,9 +743,6 @@
     }
 
     ahci_trigger_irq(ad->hba, ad, PORT_IRQ_PIOS_FIS);
-
-    dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len,
-                     DMA_DIRECTION_TO_DEVICE, cmd_len);
 }
 
 static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis)
@@ -749,22 +750,12 @@
     AHCIPortRegs *pr = &ad->port_regs;
     uint8_t *d2h_fis;
     int i;
-    dma_addr_t cmd_len = 0x80;
-    int cmd_mapped = 0;
     IDEState *s = &ad->port.ifs[0];
 
     if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) {
         return;
     }
 
-    if (!cmd_fis) {
-        /* map cmd_fis */
-        uint64_t tbl_addr = le64_to_cpu(ad->cur_cmd->tbl_addr);
-        cmd_fis = dma_memory_map(ad->hba->as, tbl_addr, &cmd_len,
-                                 DMA_DIRECTION_TO_DEVICE);
-        cmd_mapped = 1;
-    }
-
     d2h_fis = &ad->res_fis[RES_FIS_RFIS];
 
     d2h_fis[0] = SATA_FIS_TYPE_REGISTER_D2H;
@@ -780,8 +771,8 @@
     d2h_fis[9] = s->hob_lcyl;
     d2h_fis[10] = s->hob_hcyl;
     d2h_fis[11] = 0;
-    d2h_fis[12] = cmd_fis[12];
-    d2h_fis[13] = cmd_fis[13];
+    d2h_fis[12] = s->nsector & 0xFF;
+    d2h_fis[13] = (s->nsector >> 8) & 0xFF;
     for (i = 14; i < 20; i++) {
         d2h_fis[i] = 0;
     }
@@ -795,26 +786,22 @@
     }
 
     ahci_trigger_irq(ad->hba, ad, PORT_IRQ_D2H_REG_FIS);
-
-    if (cmd_mapped) {
-        dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len,
-                         DMA_DIRECTION_TO_DEVICE, cmd_len);
-    }
 }
 
 static int prdt_tbl_entry_size(const AHCI_SG *tbl)
 {
+    /* flags_size is zero-based */
     return (le32_to_cpu(tbl->flags_size) & AHCI_PRDT_SIZE_MASK) + 1;
 }
 
 static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist,
-                                int32_t offset)
+                                AHCICmdHdr *cmd, int64_t limit, int32_t offset)
 {
-    AHCICmdHdr *cmd = ad->cur_cmd;
-    uint32_t opts = le32_to_cpu(cmd->opts);
-    uint64_t prdt_addr = le64_to_cpu(cmd->tbl_addr) + 0x80;
-    int sglist_alloc_hint = opts >> AHCI_CMD_HDR_PRDT_LEN;
-    dma_addr_t prdt_len = (sglist_alloc_hint * sizeof(AHCI_SG));
+    uint16_t opts = le16_to_cpu(cmd->opts);
+    uint16_t prdtl = le16_to_cpu(cmd->prdtl);
+    uint64_t cfis_addr = le64_to_cpu(cmd->tbl_addr);
+    uint64_t prdt_addr = cfis_addr + 0x80;
+    dma_addr_t prdt_len = (prdtl * sizeof(AHCI_SG));
     dma_addr_t real_prdt_len = prdt_len;
     uint8_t *prdt;
     int i;
@@ -834,7 +821,7 @@
      * request for sector sizes up to 32K.
      */
 
-    if (!sglist_alloc_hint) {
+    if (!prdtl) {
         DPRINTF(ad->port_no, "no sg list given by guest: 0x%08x\n", opts);
         return -1;
     }
@@ -853,13 +840,12 @@
     }
 
     /* Get entries in the PRDT, init a qemu sglist accordingly */
-    if (sglist_alloc_hint > 0) {
+    if (prdtl > 0) {
         AHCI_SG *tbl = (AHCI_SG *)prdt;
         sum = 0;
-        for (i = 0; i < sglist_alloc_hint; i++) {
-            /* flags_size is zero-based */
+        for (i = 0; i < prdtl; i++) {
             tbl_entry_size = prdt_tbl_entry_size(&tbl[i]);
-            if (offset <= (sum + tbl_entry_size)) {
+            if (offset < (sum + tbl_entry_size)) {
                 off_idx = i;
                 off_pos = offset - sum;
                 break;
@@ -874,15 +860,16 @@
             goto out;
         }
 
-        qemu_sglist_init(sglist, qbus->parent, (sglist_alloc_hint - off_idx),
+        qemu_sglist_init(sglist, qbus->parent, (prdtl - off_idx),
                          ad->hba->as);
         qemu_sglist_add(sglist, le64_to_cpu(tbl[off_idx].addr) + off_pos,
-                        prdt_tbl_entry_size(&tbl[off_idx]) - off_pos);
+                        MIN(prdt_tbl_entry_size(&tbl[off_idx]) - off_pos,
+                            limit));
 
-        for (i = off_idx + 1; i < sglist_alloc_hint; i++) {
-            /* flags_size is zero-based */
+        for (i = off_idx + 1; i < prdtl && sglist->size < limit; i++) {
             qemu_sglist_add(sglist, le64_to_cpu(tbl[i].addr),
-                            prdt_tbl_entry_size(&tbl[i]));
+                            MIN(prdt_tbl_entry_size(&tbl[i]),
+                                limit - sglist->size));
             if (sglist->size > INT32_MAX) {
                 error_report("AHCI Physical Region Descriptor Table describes "
                              "more than 2 GiB.\n");
@@ -899,28 +886,25 @@
     return r;
 }
 
-static void ncq_cb(void *opaque, int ret)
+static void ncq_err(NCQTransferState *ncq_tfs)
 {
-    NCQTransferState *ncq_tfs = (NCQTransferState *)opaque;
     IDEState *ide_state = &ncq_tfs->drive->port.ifs[0];
 
-    if (ret == -ECANCELED) {
-        return;
-    }
-    /* Clear bit for this tag in SActive */
-    ncq_tfs->drive->port_regs.scr_act &= ~(1 << ncq_tfs->tag);
+    ide_state->error = ABRT_ERR;
+    ide_state->status = READY_STAT | ERR_STAT;
+    ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag);
+}
 
-    if (ret < 0) {
-        /* error */
-        ide_state->error = ABRT_ERR;
-        ide_state->status = READY_STAT | ERR_STAT;
-        ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag);
-    } else {
-        ide_state->status = READY_STAT | SEEK_STAT;
+static void ncq_finish(NCQTransferState *ncq_tfs)
+{
+    /* If we didn't error out, set our finished bit. Errored commands
+     * do not get a bit set for the SDB FIS ACT register, nor do they
+     * clear the outstanding bit in scr_act (PxSACT). */
+    if (!(ncq_tfs->drive->port_regs.scr_err & (1 << ncq_tfs->tag))) {
+        ncq_tfs->drive->finished |= (1 << ncq_tfs->tag);
     }
 
-    ahci_write_fis_sdb(ncq_tfs->drive->hba, ncq_tfs->drive->port_no,
-                       (1 << ncq_tfs->tag));
+    ahci_write_fis_sdb(ncq_tfs->drive->hba, ncq_tfs);
 
     DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
             ncq_tfs->tag);
@@ -931,6 +915,35 @@
     ncq_tfs->used = 0;
 }
 
+static void ncq_cb(void *opaque, int ret)
+{
+    NCQTransferState *ncq_tfs = (NCQTransferState *)opaque;
+    IDEState *ide_state = &ncq_tfs->drive->port.ifs[0];
+
+    if (ret == -ECANCELED) {
+        return;
+    }
+
+    if (ret < 0) {
+        bool is_read = ncq_tfs->cmd == READ_FPDMA_QUEUED;
+        BlockErrorAction action = blk_get_error_action(ide_state->blk,
+                                                       is_read, -ret);
+        if (action == BLOCK_ERROR_ACTION_STOP) {
+            ncq_tfs->halt = true;
+            ide_state->bus->error_status = IDE_RETRY_HBA;
+        } else if (action == BLOCK_ERROR_ACTION_REPORT) {
+            ncq_err(ncq_tfs);
+        }
+        blk_error_action(ide_state->blk, action, is_read, -ret);
+    } else {
+        ide_state->status = READY_STAT | SEEK_STAT;
+    }
+
+    if (!ncq_tfs->halt) {
+        ncq_finish(ncq_tfs);
+    }
+}
+
 static int is_ncq(uint8_t ata_cmd)
 {
     /* Based on SATA 3.2 section 13.6.3.2 */
@@ -946,13 +959,60 @@
     }
 }
 
-static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis,
-                                int slot)
+static void execute_ncq_command(NCQTransferState *ncq_tfs)
 {
+    AHCIDevice *ad = ncq_tfs->drive;
+    IDEState *ide_state = &ad->port.ifs[0];
+    int port = ad->port_no;
+
+    g_assert(is_ncq(ncq_tfs->cmd));
+    ncq_tfs->halt = false;
+
+    switch (ncq_tfs->cmd) {
+    case READ_FPDMA_QUEUED:
+        DPRINTF(port, "NCQ reading %d sectors from LBA %"PRId64", tag %d\n",
+                ncq_tfs->sector_count, ncq_tfs->lba, ncq_tfs->tag);
+
+        DPRINTF(port, "tag %d aio read %"PRId64"\n",
+                ncq_tfs->tag, ncq_tfs->lba);
+
+        dma_acct_start(ide_state->blk, &ncq_tfs->acct,
+                       &ncq_tfs->sglist, BLOCK_ACCT_READ);
+        ncq_tfs->aiocb = dma_blk_read(ide_state->blk, &ncq_tfs->sglist,
+                                      ncq_tfs->lba, ncq_cb, ncq_tfs);
+        break;
+    case WRITE_FPDMA_QUEUED:
+        DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n",
+                ncq_tfs->sector_count, ncq_tfs->lba, ncq_tfs->tag);
+
+        DPRINTF(port, "tag %d aio write %"PRId64"\n",
+                ncq_tfs->tag, ncq_tfs->lba);
+
+        dma_acct_start(ide_state->blk, &ncq_tfs->acct,
+                       &ncq_tfs->sglist, BLOCK_ACCT_WRITE);
+        ncq_tfs->aiocb = dma_blk_write(ide_state->blk, &ncq_tfs->sglist,
+                                       ncq_tfs->lba, ncq_cb, ncq_tfs);
+        break;
+    default:
+        DPRINTF(port, "error: unsupported NCQ command (0x%02x) received\n",
+                ncq_tfs->cmd);
+        qemu_sglist_destroy(&ncq_tfs->sglist);
+        ncq_err(ncq_tfs);
+    }
+}
+
+
+static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis,
+                                uint8_t slot)
+{
+    AHCIDevice *ad = &s->dev[port];
+    IDEState *ide_state = &ad->port.ifs[0];
     NCQFrame *ncq_fis = (NCQFrame*)cmd_fis;
     uint8_t tag = ncq_fis->tag >> 3;
-    NCQTransferState *ncq_tfs = &s->dev[port].ncq_tfs[tag];
+    NCQTransferState *ncq_tfs = &ad->ncq_tfs[tag];
+    size_t size;
 
+    g_assert(is_ncq(ncq_fis->command));
     if (ncq_tfs->used) {
         /* error - already in use */
         fprintf(stderr, "%s: tag %d already used\n", __FUNCTION__, tag);
@@ -960,75 +1020,82 @@
     }
 
     ncq_tfs->used = 1;
-    ncq_tfs->drive = &s->dev[port];
+    ncq_tfs->drive = ad;
     ncq_tfs->slot = slot;
+    ncq_tfs->cmdh = &((AHCICmdHdr *)ad->lst)[slot];
+    ncq_tfs->cmd = ncq_fis->command;
     ncq_tfs->lba = ((uint64_t)ncq_fis->lba5 << 40) |
                    ((uint64_t)ncq_fis->lba4 << 32) |
                    ((uint64_t)ncq_fis->lba3 << 24) |
                    ((uint64_t)ncq_fis->lba2 << 16) |
                    ((uint64_t)ncq_fis->lba1 << 8) |
                    (uint64_t)ncq_fis->lba0;
+    ncq_tfs->tag = tag;
 
-    /* Note: We calculate the sector count, but don't currently rely on it.
-     * The total size of the DMA buffer tells us the transfer size instead. */
-    ncq_tfs->sector_count = ((uint16_t)ncq_fis->sector_count_high << 8) |
-                                ncq_fis->sector_count_low;
+    /* Sanity-check the NCQ packet */
+    if (tag != slot) {
+        DPRINTF(port, "Warn: NCQ slot (%d) did not match the given tag (%d)\n",
+                slot, tag);
+    }
+
+    if (ncq_fis->aux0 || ncq_fis->aux1 || ncq_fis->aux2 || ncq_fis->aux3) {
+        DPRINTF(port, "Warn: Attempt to use NCQ auxiliary fields.\n");
+    }
+    if (ncq_fis->prio || ncq_fis->icc) {
+        DPRINTF(port, "Warn: Unsupported attempt to use PRIO/ICC fields\n");
+    }
+    if (ncq_fis->fua & NCQ_FIS_FUA_MASK) {
+        DPRINTF(port, "Warn: Unsupported attempt to use Force Unit Access\n");
+    }
+    if (ncq_fis->tag & NCQ_FIS_RARC_MASK) {
+        DPRINTF(port, "Warn: Unsupported attempt to use Rebuild Assist\n");
+    }
+
+    ncq_tfs->sector_count = ((ncq_fis->sector_count_high << 8) |
+                             ncq_fis->sector_count_low);
+    if (!ncq_tfs->sector_count) {
+        ncq_tfs->sector_count = 0x10000;
+    }
+    size = ncq_tfs->sector_count * 512;
+    ahci_populate_sglist(ad, &ncq_tfs->sglist, ncq_tfs->cmdh, size, 0);
+
+    if (ncq_tfs->sglist.size < size) {
+        error_report("ahci: PRDT length for NCQ command (0x%zx) "
+                     "is smaller than the requested size (0x%zx)",
+                     ncq_tfs->sglist.size, size);
+        qemu_sglist_destroy(&ncq_tfs->sglist);
+        ncq_err(ncq_tfs);
+        ahci_trigger_irq(ad->hba, ad, PORT_IRQ_OVERFLOW);
+        return;
+    } else if (ncq_tfs->sglist.size != size) {
+        DPRINTF(port, "Warn: PRDTL (0x%zx)"
+                " does not match requested size (0x%zx)",
+                ncq_tfs->sglist.size, size);
+    }
 
     DPRINTF(port, "NCQ transfer LBA from %"PRId64" to %"PRId64", "
             "drive max %"PRId64"\n",
-            ncq_tfs->lba, ncq_tfs->lba + ncq_tfs->sector_count - 2,
-            s->dev[port].port.ifs[0].nb_sectors - 1);
+            ncq_tfs->lba, ncq_tfs->lba + ncq_tfs->sector_count - 1,
+            ide_state->nb_sectors - 1);
 
-    ahci_populate_sglist(&s->dev[port], &ncq_tfs->sglist, 0);
-    ncq_tfs->tag = tag;
+    execute_ncq_command(ncq_tfs);
+}
 
-    switch(ncq_fis->command) {
-        case READ_FPDMA_QUEUED:
-            DPRINTF(port, "NCQ reading %d sectors from LBA %"PRId64", "
-                    "tag %d\n",
-                    ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
-
-            DPRINTF(port, "tag %d aio read %"PRId64"\n",
-                    ncq_tfs->tag, ncq_tfs->lba);
-
-            dma_acct_start(ncq_tfs->drive->port.ifs[0].blk, &ncq_tfs->acct,
-                           &ncq_tfs->sglist, BLOCK_ACCT_READ);
-            ncq_tfs->aiocb = dma_blk_read(ncq_tfs->drive->port.ifs[0].blk,
-                                          &ncq_tfs->sglist, ncq_tfs->lba,
-                                          ncq_cb, ncq_tfs);
-            break;
-        case WRITE_FPDMA_QUEUED:
-            DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n",
-                    ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
-
-            DPRINTF(port, "tag %d aio write %"PRId64"\n",
-                    ncq_tfs->tag, ncq_tfs->lba);
-
-            dma_acct_start(ncq_tfs->drive->port.ifs[0].blk, &ncq_tfs->acct,
-                           &ncq_tfs->sglist, BLOCK_ACCT_WRITE);
-            ncq_tfs->aiocb = dma_blk_write(ncq_tfs->drive->port.ifs[0].blk,
-                                           &ncq_tfs->sglist, ncq_tfs->lba,
-                                           ncq_cb, ncq_tfs);
-            break;
-        default:
-            if (is_ncq(cmd_fis[2])) {
-                DPRINTF(port,
-                        "error: unsupported NCQ command (0x%02x) received\n",
-                        cmd_fis[2]);
-            } else {
-                DPRINTF(port,
-                        "error: tried to process non-NCQ command as NCQ\n");
-            }
-            qemu_sglist_destroy(&ncq_tfs->sglist);
+static AHCICmdHdr *get_cmd_header(AHCIState *s, uint8_t port, uint8_t slot)
+{
+    if (port >= s->ports || slot >= AHCI_MAX_CMDS) {
+        return NULL;
     }
+
+    return s->dev[port].lst ? &((AHCICmdHdr *)s->dev[port].lst)[slot] : NULL;
 }
 
 static void handle_reg_h2d_fis(AHCIState *s, int port,
-                               int slot, uint8_t *cmd_fis)
+                               uint8_t slot, uint8_t *cmd_fis)
 {
     IDEState *ide_state = &s->dev[port].port.ifs[0];
-    AHCICmdHdr *cmd = s->dev[port].cur_cmd;
-    uint32_t opts = le32_to_cpu(cmd->opts);
+    AHCICmdHdr *cmd = get_cmd_header(s, port, slot);
+    uint16_t opts = le16_to_cpu(cmd->opts);
 
     if (cmd_fis[1] & 0x0F) {
         DPRINTF(port, "Port Multiplier not supported."
@@ -1108,7 +1175,7 @@
     ide_exec_cmd(&s->dev[port].port, cmd_fis[2]);
 }
 
-static int handle_cmd(AHCIState *s, int port, int slot)
+static int handle_cmd(AHCIState *s, int port, uint8_t slot)
 {
     IDEState *ide_state;
     uint64_t tbl_addr;
@@ -1126,7 +1193,7 @@
         DPRINTF(port, "error: lst not given but cmd handled");
         return -1;
     }
-    cmd = &((AHCICmdHdr *)s->dev[port].lst)[slot];
+    cmd = get_cmd_header(s, port, slot);
     /* remember current slot handle for later */
     s->dev[port].cur_cmd = cmd;
 
@@ -1185,7 +1252,7 @@
     IDEState *s = &ad->port.ifs[0];
     uint32_t size = (uint32_t)(s->data_end - s->data_ptr);
     /* write == ram -> device */
-    uint32_t opts = le32_to_cpu(ad->cur_cmd->opts);
+    uint16_t opts = le16_to_cpu(ad->cur_cmd->opts);
     int is_write = opts & AHCI_CMD_WRITE;
     int is_atapi = opts & AHCI_CMD_ATAPI;
     int has_sglist = 0;
@@ -1197,7 +1264,7 @@
         goto out;
     }
 
-    if (ahci_dma_prepare_buf(dma, is_write)) {
+    if (ahci_dma_prepare_buf(dma, size)) {
         has_sglist = 1;
     }
 
@@ -1243,16 +1310,34 @@
 }
 
 /**
+ * IDE/PIO restarts are handled by the core layer, but NCQ commands
+ * need an extra kick from the AHCI HBA.
+ */
+static void ahci_restart(IDEDMA *dma)
+{
+    AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
+    int i;
+
+    for (i = 0; i < AHCI_MAX_CMDS; i++) {
+        NCQTransferState *ncq_tfs = &ad->ncq_tfs[i];
+        if (ncq_tfs->halt) {
+            execute_ncq_command(ncq_tfs);
+        }
+    }
+}
+
+/**
  * Called in DMA R/W chains to read the PRDT, utilizing ahci_populate_sglist.
  * Not currently invoked by PIO R/W chains,
  * which invoke ahci_populate_sglist via ahci_start_transfer.
  */
-static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int is_write)
+static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit)
 {
     AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
     IDEState *s = &ad->port.ifs[0];
 
-    if (ahci_populate_sglist(ad, &s->sg, s->io_buffer_offset) == -1) {
+    if (ahci_populate_sglist(ad, &s->sg, ad->cur_cmd,
+                             limit, s->io_buffer_offset) == -1) {
         DPRINTF(ad->port_no, "ahci_dma_prepare_buf failed.\n");
         return -1;
     }
@@ -1287,7 +1372,7 @@
     uint8_t *p = s->io_buffer + s->io_buffer_index;
     int l = s->io_buffer_size - s->io_buffer_index;
 
-    if (ahci_populate_sglist(ad, &s->sg, s->io_buffer_offset)) {
+    if (ahci_populate_sglist(ad, &s->sg, ad->cur_cmd, l, s->io_buffer_offset)) {
         return 0;
     }
 
@@ -1330,6 +1415,7 @@
 
 static const IDEDMAOps ahci_dma_ops = {
     .start_dma = ahci_start_dma,
+    .restart = ahci_restart,
     .restart_dma = ahci_restart_dma,
     .start_transfer = ahci_start_transfer,
     .prepare_buf = ahci_dma_prepare_buf,
@@ -1400,6 +1486,21 @@
     }
 }
 
+static const VMStateDescription vmstate_ncq_tfs = {
+    .name = "ncq state",
+    .version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(sector_count, NCQTransferState),
+        VMSTATE_UINT64(lba, NCQTransferState),
+        VMSTATE_UINT8(tag, NCQTransferState),
+        VMSTATE_UINT8(cmd, NCQTransferState),
+        VMSTATE_UINT8(slot, NCQTransferState),
+        VMSTATE_BOOL(used, NCQTransferState),
+        VMSTATE_BOOL(halt, NCQTransferState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_ahci_device = {
     .name = "ahci port",
     .version_id = 1,
@@ -1425,14 +1526,17 @@
         VMSTATE_BOOL(done_atapi_packet, AHCIDevice),
         VMSTATE_INT32(busy_slot, AHCIDevice),
         VMSTATE_BOOL(init_d2h_sent, AHCIDevice),
+        VMSTATE_STRUCT_ARRAY(ncq_tfs, AHCIDevice, AHCI_MAX_CMDS,
+                             1, vmstate_ncq_tfs, NCQTransferState),
         VMSTATE_END_OF_LIST()
     },
 };
 
 static int ahci_state_post_load(void *opaque, int version_id)
 {
-    int i;
+    int i, j;
     struct AHCIDevice *ad;
+    NCQTransferState *ncq_tfs;
     AHCIState *s = opaque;
 
     for (i = 0; i < s->ports; i++) {
@@ -1444,6 +1548,37 @@
             return -1;
         }
 
+        for (j = 0; j < AHCI_MAX_CMDS; j++) {
+            ncq_tfs = &ad->ncq_tfs[j];
+            ncq_tfs->drive = ad;
+
+            if (ncq_tfs->used != ncq_tfs->halt) {
+                return -1;
+            }
+            if (!ncq_tfs->halt) {
+                continue;
+            }
+            if (!is_ncq(ncq_tfs->cmd)) {
+                return -1;
+            }
+            if (ncq_tfs->slot != ncq_tfs->tag) {
+                return -1;
+            }
+            /* If ncq_tfs->halt is justly set, the engine should be engaged,
+             * and the command list buffer should be mapped. */
+            ncq_tfs->cmdh = get_cmd_header(s, i, ncq_tfs->slot);
+            if (!ncq_tfs->cmdh) {
+                return -1;
+            }
+            ahci_populate_sglist(ncq_tfs->drive, &ncq_tfs->sglist,
+                                 ncq_tfs->cmdh, ncq_tfs->sector_count * 512,
+                                 0);
+            if (ncq_tfs->sector_count != ncq_tfs->sglist.size >> 9) {
+                return -1;
+            }
+        }
+
+
         /*
          * If an error is present, ad->busy_slot will be valid and not -1.
          * In this case, an operation is waiting to resume and will re-check
@@ -1460,7 +1595,7 @@
             if (ad->busy_slot < 0 || ad->busy_slot >= AHCI_MAX_CMDS) {
                 return -1;
             }
-            ad->cur_cmd = &((AHCICmdHdr *)ad->lst)[ad->busy_slot];
+            ad->cur_cmd = get_cmd_header(s, i, ad->busy_slot);
         }
     }
 
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index 501c002..9f5b4d2 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -195,6 +195,9 @@
 #define RECEIVE_FPDMA_QUEUED               0x65
 #define SEND_FPDMA_QUEUED                  0x64
 
+#define NCQ_FIS_FUA_MASK                   0x80
+#define NCQ_FIS_RARC_MASK                  0x01
+
 #define RES_FIS_DSFIS                      0x00
 #define RES_FIS_PSFIS                      0x20
 #define RES_FIS_RFIS                       0x40
@@ -233,7 +236,8 @@
 } AHCIPortRegs;
 
 typedef struct AHCICmdHdr {
-    uint32_t    opts;
+    uint16_t    opts;
+    uint16_t    prdtl;
     uint32_t    status;
     uint64_t    tbl_addr;
     uint32_t    reserved[4];
@@ -250,13 +254,16 @@
 typedef struct NCQTransferState {
     AHCIDevice *drive;
     BlockAIOCB *aiocb;
+    AHCICmdHdr *cmdh;
     QEMUSGList sglist;
     BlockAcctCookie acct;
-    uint16_t sector_count;
+    uint32_t sector_count;
     uint64_t lba;
     uint8_t tag;
-    int slot;
-    int used;
+    uint8_t cmd;
+    uint8_t slot;
+    bool used;
+    bool halt;
 } NCQTransferState;
 
 struct AHCIDevice {
@@ -312,27 +319,39 @@
     .offset     = vmstate_offset_value(_state, _field, AHCIState),   \
 }
 
+/**
+ * NCQFrame is the same as a Register H2D FIS (described in SATA 3.2),
+ * but some fields have been re-mapped and re-purposed, as seen in
+ * SATA 3.2 section 13.6.4.1 ("READ FPDMA QUEUED")
+ *
+ * cmd_fis[3], feature 7:0, becomes sector count 7:0.
+ * cmd_fis[7], device 7:0, uses bit 7 as the Force Unit Access bit.
+ * cmd_fis[11], feature 15:8, becomes sector count 15:8.
+ * cmd_fis[12], count 7:0, becomes the NCQ TAG (7:3) and RARC bit (0)
+ * cmd_fis[13], count 15:8, becomes the priority value (7:6)
+ * bytes 16-19 become an le32 "auxiliary" field.
+ */
 typedef struct NCQFrame {
     uint8_t fis_type;
     uint8_t c;
     uint8_t command;
-    uint8_t sector_count_low;
+    uint8_t sector_count_low;  /* (feature 7:0) */
     uint8_t lba0;
     uint8_t lba1;
     uint8_t lba2;
-    uint8_t fua;
+    uint8_t fua;               /* (device 7:0) */
     uint8_t lba3;
     uint8_t lba4;
     uint8_t lba5;
-    uint8_t sector_count_high;
-    uint8_t tag;
-    uint8_t reserved5;
-    uint8_t reserved6;
+    uint8_t sector_count_high; /* (feature 15:8) */
+    uint8_t tag;               /* (count 0:7) */
+    uint8_t prio;              /* (count 15:8) */
+    uint8_t icc;
     uint8_t control;
-    uint8_t reserved7;
-    uint8_t reserved8;
-    uint8_t reserved9;
-    uint8_t reserved10;
+    uint8_t aux0;
+    uint8_t aux1;
+    uint8_t aux2;
+    uint8_t aux3;
 } QEMU_PACKED NCQFrame;
 
 typedef struct SDBFIS {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 1efd98a..122e955 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -716,8 +716,8 @@
 
     sector_num = ide_get_sector(s);
     if (n > 0) {
-        assert(s->io_buffer_size == s->sg.size);
-        dma_buf_commit(s, s->io_buffer_size);
+        assert(n * 512 == s->sg.size);
+        dma_buf_commit(s, s->sg.size);
         sector_num += n;
         ide_set_sector(s, sector_num);
         s->nsector -= n;
@@ -734,7 +734,7 @@
     n = s->nsector;
     s->io_buffer_index = 0;
     s->io_buffer_size = n * 512;
-    if (s->bus->dma->ops->prepare_buf(s->bus->dma, ide_cmd_is_read(s)) < 512) {
+    if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) {
         /* The PRDs were too short. Reset the Active bit, but don't raise an
          * interrupt. */
         s->status = READY_STAT | SEEK_STAT;
@@ -2326,7 +2326,7 @@
 {
 }
 
-static int32_t ide_nop_int32(IDEDMA *dma, int x)
+static int32_t ide_nop_int32(IDEDMA *dma, int32_t l)
 {
     return 0;
 }
@@ -2371,6 +2371,13 @@
      * called function can set a new error status. */
     bus->error_status = 0;
 
+    /* The HBA has generically asked to be kicked on retry */
+    if (error_status & IDE_RETRY_HBA) {
+        if (s->bus->dma->ops->restart) {
+            s->bus->dma->ops->restart(s->bus->dma);
+        }
+    }
+
     if (error_status & IDE_RETRY_DMA) {
         if (error_status & IDE_RETRY_TRIM) {
             ide_restart_dma(s, IDE_DMA_TRIM);
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 965cc55..30fdcbc 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -324,7 +324,7 @@
 typedef void DMAStartFunc(IDEDMA *, IDEState *, BlockCompletionFunc *);
 typedef void DMAVoidFunc(IDEDMA *);
 typedef int DMAIntFunc(IDEDMA *, int);
-typedef int32_t DMAInt32Func(IDEDMA *, int);
+typedef int32_t DMAInt32Func(IDEDMA *, int32_t len);
 typedef void DMAu32Func(IDEDMA *, uint32_t);
 typedef void DMAStopFunc(IDEDMA *, bool);
 typedef void DMARestartFunc(void *, int, RunState);
@@ -436,6 +436,7 @@
     DMAInt32Func *prepare_buf;
     DMAu32Func *commit_buf;
     DMAIntFunc *rw_buf;
+    DMAVoidFunc *restart;
     DMAVoidFunc *restart_dma;
     DMAStopFunc *set_inactive;
     DMAVoidFunc *cmd_done;
@@ -499,6 +500,7 @@
 #define IDE_RETRY_READ  0x20
 #define IDE_RETRY_FLUSH 0x40
 #define IDE_RETRY_TRIM 0x80
+#define IDE_RETRY_HBA  0x100
 
 static inline IDEState *idebus_active_if(IDEBus *bus)
 {
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index dd52d50..a55a479 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -499,7 +499,7 @@
     return 0;
 }
 
-static int32_t ide_nop_int32(IDEDMA *dma, int x)
+static int32_t ide_nop_int32(IDEDMA *dma, int32_t l)
 {
     return 0;
 }
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 4afd0cf..d31ff88 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -53,10 +53,14 @@
 }
 
 /**
- * Return the number of bytes successfully prepared.
- * -1 on error.
+ * Prepare an sglist based on available PRDs.
+ * @limit: How many bytes to prepare total.
+ *
+ * Returns the number of bytes prepared, -1 on error.
+ * IDEState.io_buffer_size will contain the number of bytes described
+ * by the PRDs, whether or not we added them to the sglist.
  */
-static int32_t bmdma_prepare_buf(IDEDMA *dma, int is_write)
+static int32_t bmdma_prepare_buf(IDEDMA *dma, int32_t limit)
 {
     BMDMAState *bm = DO_UPCAST(BMDMAState, dma, dma);
     IDEState *s = bmdma_active_if(bm);
@@ -75,7 +79,7 @@
             /* end of table (with a fail safe of one page) */
             if (bm->cur_prd_last ||
                 (bm->cur_addr - bm->addr) >= BMDMA_PAGE_SIZE) {
-                return s->io_buffer_size;
+                return s->sg.size;
             }
             pci_dma_read(pci_dev, bm->cur_addr, &prd, 8);
             bm->cur_addr += 8;
@@ -90,7 +94,14 @@
         }
         l = bm->cur_prd_len;
         if (l > 0) {
-            qemu_sglist_add(&s->sg, bm->cur_prd_addr, l);
+            uint64_t sg_len;
+
+            /* Don't add extra bytes to the SGList; consume any remaining
+             * PRDs from the guest, but ignore them. */
+            sg_len = MIN(limit - s->sg.size, bm->cur_prd_len);
+            if (sg_len) {
+                qemu_sglist_add(&s->sg, bm->cur_prd_addr, sg_len);
+            }
 
             /* Note: We limit the max transfer to be 2GiB.
              * This should accommodate the largest ATA transaction
diff --git a/hw/input/virtio-input-hid.c b/hw/input/virtio-input-hid.c
index 32cc94a..616a815 100644
--- a/hw/input/virtio-input-hid.c
+++ b/hw/input/virtio-input-hid.c
@@ -252,7 +252,11 @@
 static void virtio_input_hid_realize(DeviceState *dev, Error **errp)
 {
     VirtIOInputHID *vhid = VIRTIO_INPUT_HID(dev);
+
     vhid->hs = qemu_input_handler_register(dev, vhid->handler);
+    if (vhid->display && vhid->hs) {
+        qemu_input_handler_bind(vhid->hs, vhid->display, vhid->head, NULL);
+    }
 }
 
 static void virtio_input_hid_unrealize(DeviceState *dev, Error **errp)
@@ -301,10 +305,17 @@
     }
 }
 
+static Property virtio_input_hid_properties[] = {
+    DEFINE_PROP_STRING("display", VirtIOInputHID, display),
+    DEFINE_PROP_UINT32("head", VirtIOInputHID, head, 0),
+};
+
 static void virtio_input_hid_class_init(ObjectClass *klass, void *data)
 {
+    DeviceClass *dc = DEVICE_CLASS(klass);
     VirtIOInputClass *vic = VIRTIO_INPUT_CLASS(klass);
 
+    dc->props          = virtio_input_hid_properties;
     vic->realize       = virtio_input_hid_realize;
     vic->unrealize     = virtio_input_hid_unrealize;
     vic->change_active = virtio_input_hid_change_active;
diff --git a/hw/intc/arm_gic_common.c b/hw/intc/arm_gic_common.c
index 044ad66..a64d071 100644
--- a/hw/intc/arm_gic_common.c
+++ b/hw/intc/arm_gic_common.c
@@ -123,7 +123,7 @@
 static void arm_gic_common_reset(DeviceState *dev)
 {
     GICState *s = ARM_GIC_COMMON(dev);
-    int i;
+    int i, j;
     memset(s->irq_state, 0, GIC_MAXIRQ * sizeof(gic_irq_state));
     for (i = 0 ; i < s->num_cpu; i++) {
         if (s->revision == REV_11MPCORE) {
@@ -135,15 +135,30 @@
         s->running_irq[i] = 1023;
         s->running_priority[i] = 0x100;
         s->cpu_ctlr[i] = 0;
+        s->bpr[i] = GIC_MIN_BPR;
+        s->abpr[i] = GIC_MIN_ABPR;
+        for (j = 0; j < GIC_INTERNAL; j++) {
+            s->priority1[j][i] = 0;
+        }
+        for (j = 0; j < GIC_NR_SGIS; j++) {
+            s->sgi_pending[j][i] = 0;
+        }
     }
     for (i = 0; i < GIC_NR_SGIS; i++) {
         GIC_SET_ENABLED(i, ALL_CPU_MASK);
         GIC_SET_EDGE_TRIGGER(i);
     }
-    if (s->num_cpu == 1) {
+
+    for (i = 0; i < ARRAY_SIZE(s->priority2); i++) {
+        s->priority2[i] = 0;
+    }
+
+    for (i = 0; i < GIC_MAXIRQ; i++) {
         /* For uniprocessor GICs all interrupts always target the sole CPU */
-        for (i = 0; i < GIC_MAXIRQ; i++) {
+        if (s->num_cpu == 1) {
             s->irq_target[i] = 1;
+        } else {
+            s->irq_target[i] = 0;
         }
     }
     s->ctlr = 0;
diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c
index 2cb7d25..f56bff1 100644
--- a/hw/intc/arm_gic_kvm.c
+++ b/hw/intc/arm_gic_kvm.c
@@ -570,6 +570,12 @@
      */
     i += (GIC_INTERNAL * s->num_cpu);
     qdev_init_gpio_in(dev, kvm_arm_gic_set_irq, i);
+
+    for (i = 0; i < s->num_irq - GIC_INTERNAL; i++) {
+        qemu_irq irq = qdev_get_gpio_in(dev, i);
+        kvm_irqchip_set_qemuirq_gsi(kvm_state, irq, i);
+    }
+
     /* We never use our outbound IRQ/FIQ lines but provide them so that
      * we maintain the same interface as the non-KVM GIC.
      */
diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c
index b3e0b1f..bd655b8 100644
--- a/hw/isa/lpc_ich9.c
+++ b/hw/isa/lpc_ich9.c
@@ -357,11 +357,13 @@
     }
 }
 
-void ich9_lpc_pm_init(PCIDevice *lpc_pci)
+void ich9_lpc_pm_init(PCIDevice *lpc_pci, bool smm_enabled)
 {
     ICH9LPCState *lpc = ICH9_LPC_DEVICE(lpc_pci);
+    qemu_irq sci_irq;
 
-    ich9_pm_init(lpc_pci, &lpc->pm, qemu_allocate_irq(ich9_set_sci, lpc, 0));
+    sci_irq = qemu_allocate_irq(ich9_set_sci, lpc, 0);
+    ich9_pm_init(lpc_pci, &lpc->pm, smm_enabled, sci_irq);
     ich9_lpc_reset(&lpc->d.qdev);
 }
 
@@ -375,6 +377,9 @@
     acpi_pm1_cnt_update(&lpc->pm.acpi_regs,
                         val == ICH9_APM_ACPI_ENABLE,
                         val == ICH9_APM_ACPI_DISABLE);
+    if (val == ICH9_APM_ACPI_ENABLE || val == ICH9_APM_ACPI_DISABLE) {
+        return;
+    }
 
     /* SMI_EN = PMBASE + 30. SMI control and enable register */
     if (lpc->pm.smi_en & ICH9_PMIO_SMI_EN_APMC_EN) {
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index e70633d..bb04862 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -23,12 +23,96 @@
 #include "qapi/visitor.h"
 #include "qemu/range.h"
 #include "sysemu/numa.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
 
 typedef struct pc_dimms_capacity {
      uint64_t size;
      Error    **errp;
 } pc_dimms_capacity;
 
+void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
+                         MemoryRegion *mr, uint64_t align, Error **errp)
+{
+    int slot;
+    MachineState *machine = MACHINE(qdev_get_machine());
+    PCDIMMDevice *dimm = PC_DIMM(dev);
+    Error *local_err = NULL;
+    uint64_t existing_dimms_capacity = 0;
+    uint64_t addr;
+
+    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    addr = pc_dimm_get_free_addr(hpms->base,
+                                 memory_region_size(&hpms->mr),
+                                 !addr ? NULL : &addr, align,
+                                 memory_region_size(mr), &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    existing_dimms_capacity = pc_existing_dimms_capacity(&local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    if (existing_dimms_capacity + memory_region_size(mr) >
+        machine->maxram_size - machine->ram_size) {
+        error_setg(&local_err, "not enough space, currently 0x%" PRIx64
+                   " in use of total hot pluggable 0x" RAM_ADDR_FMT,
+                   existing_dimms_capacity,
+                   machine->maxram_size - machine->ram_size);
+        goto out;
+    }
+
+    object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    trace_mhp_pc_dimm_assigned_address(addr);
+
+    slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? NULL : &slot,
+                                 machine->ram_slots, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    trace_mhp_pc_dimm_assigned_slot(slot);
+
+    if (kvm_enabled() && !kvm_has_free_slot(machine)) {
+        error_setg(&local_err, "hypervisor has no free memory slots left");
+        goto out;
+    }
+
+    memory_region_add_subregion(&hpms->mr, addr - hpms->base, mr);
+    vmstate_register_ram(mr, dev);
+    numa_set_mem_node_id(addr, memory_region_size(mr), dimm->node);
+
+out:
+    error_propagate(errp, local_err);
+}
+
+void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
+                           MemoryRegion *mr)
+{
+    PCDIMMDevice *dimm = PC_DIMM(dev);
+
+    numa_unset_mem_node_id(dimm->addr, memory_region_size(mr), dimm->node);
+    memory_region_del_subregion(&hpms->mr, mr);
+    vmstate_unregister_ram(mr, dev);
+}
+
 static int pc_existing_dimms_capacity_internal(Object *obj, void *opaque)
 {
     pc_dimms_capacity *cap = opaque;
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index d631337..e345a6e 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -1316,8 +1316,8 @@
     VirtQueue *vq = virtio_get_queue(vdev, n);
     EventNotifier *notifier = virtio_queue_get_guest_notifier(vq);
 
-    return kvm_irqchip_add_irqfd_notifier(kvm_state, notifier, NULL,
-                                          dev->routes.gsi[n]);
+    return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL,
+                                              dev->routes.gsi[n]);
 }
 
 static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n)
@@ -1327,8 +1327,8 @@
     EventNotifier *notifier = virtio_queue_get_guest_notifier(vq);
     int ret;
 
-    ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, notifier,
-                                            dev->routes.gsi[n]);
+    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier,
+                                                dev->routes.gsi[n]);
     assert(ret == 0);
 }
 
diff --git a/hw/timer/arm_mptimer.c b/hw/timer/arm_mptimer.c
index 8b93b3c..3e59c2a 100644
--- a/hw/timer/arm_mptimer.c
+++ b/hw/timer/arm_mptimer.c
@@ -38,7 +38,7 @@
 
 static inline void timerblock_update_irq(TimerBlock *tb)
 {
-    qemu_set_irq(tb->irq, tb->status);
+    qemu_set_irq(tb->irq, tb->status && (tb->control & 4));
 }
 
 /* Return conversion factor from mpcore timer ticks to qemu timer ticks.  */
@@ -122,11 +122,18 @@
     case 8: /* Control.  */
         old = tb->control;
         tb->control = value;
-        if (((old & 1) == 0) && (value & 1)) {
-            if (tb->count == 0 && (tb->control & 2)) {
+        if (value & 1) {
+            if ((old & 1) && (tb->count != 0)) {
+                /* Do nothing if timer is ticking right now.  */
+                break;
+            }
+            if (tb->control & 2) {
                 tb->count = tb->load;
             }
             timerblock_reload(tb, 1);
+        } else if (old & 1) {
+            /* Shutdown the timer.  */
+            timer_del(tb->timer);
         }
         break;
     case 12: /* Interrupt status.  */
diff --git a/hw/timer/cadence_ttc.c b/hw/timer/cadence_ttc.c
index d46db3c..35bc880 100644
--- a/hw/timer/cadence_ttc.c
+++ b/hw/timer/cadence_ttc.c
@@ -208,15 +208,14 @@
             s->reg_intr |= (2 << i);
         }
     }
+    if ((x < 0) || (x >= interval)) {
+        s->reg_intr |= (s->reg_count & COUNTER_CTRL_INT) ?
+            COUNTER_INTR_IV : COUNTER_INTR_OV;
+    }
     while (x < 0) {
         x += interval;
     }
     s->reg_value = (uint32_t)(x % interval);
-
-    if (s->reg_value != x) {
-        s->reg_intr |= (s->reg_count & COUNTER_CTRL_INT) ?
-            COUNTER_INTR_IV : COUNTER_INTR_OV;
-    }
     cadence_timer_update(s);
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index b1045da..85ee9b0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -772,11 +772,19 @@
 
     if (QLIST_EMPTY(&container->group_list)) {
         VFIOAddressSpace *space = container->space;
+        VFIOGuestIOMMU *giommu, *tmp;
 
         if (container->iommu_data.release) {
             container->iommu_data.release(container);
         }
         QLIST_REMOVE(container, next);
+
+        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
+            memory_region_unregister_iommu_notifier(&giommu->n);
+            QLIST_REMOVE(giommu, giommu_next);
+            g_free(giommu);
+        }
+
         trace_vfio_disconnect_container(container->fd);
         close(container->fd);
         g_free(container);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e0e339a..2ed877f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -597,7 +597,7 @@
         return;
     }
 
-    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
+    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                        NULL, virq) < 0) {
         kvm_irqchip_release_virq(kvm_state, virq);
         event_notifier_cleanup(&vector->kvm_interrupt);
@@ -609,8 +609,8 @@
 
 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 {
-    kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt,
-                                      vector->virq);
+    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
+                                          vector->virq);
     kvm_irqchip_release_virq(kvm_state, vector->virq);
     vector->virq = -1;
     event_notifier_cleanup(&vector->kvm_interrupt);
@@ -939,7 +939,7 @@
     };
     uint64_t size;
     off_t off = 0;
-    size_t bytes;
+    ssize_t bytes;
 
     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
         error_report("vfio: Error getting ROM info: %m");
@@ -2252,6 +2252,33 @@
     vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
     vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
 
+    /*
+     * Test the size of the pba_offset variable and catch if it extends outside
+     * of the specified BAR. If it is the case, we need to apply a hardware
+     * specific quirk if the device is known or we have a broken configuration.
+     */
+    if (vdev->msix->pba_offset >=
+        vdev->bars[vdev->msix->pba_bar].region.size) {
+
+        PCIDevice *pdev = &vdev->pdev;
+        uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
+        uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID);
+
+        /*
+         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
+         * adapters. The T5 hardware returns an incorrect value of 0x8000 for
+         * the VF PBA offset while the BAR itself is only 8k. The correct value
+         * is 0x1000, so we hard code that here.
+         */
+        if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) {
+            vdev->msix->pba_offset = 0x1000;
+        } else {
+            error_report("vfio: Hardware reports invalid configuration, "
+                         "MSIX PBA outside of specified BAR");
+            return -EINVAL;
+        }
+    }
+
     trace_vfio_early_setup_msix(vdev->vbasedev.name, pos,
                                 vdev->msix->table_bar,
                                 vdev->msix->table_offset,
@@ -2388,7 +2415,7 @@
      * potentially insert a direct-mapped subregion before and after it.
      */
     if (vdev->msix && vdev->msix->table_bar == nr) {
-        size = vdev->msix->table_offset & qemu_host_page_mask;
+        size = vdev->msix->table_offset & qemu_real_host_page_mask;
     }
 
     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
@@ -2401,8 +2428,9 @@
     if (vdev->msix && vdev->msix->table_bar == nr) {
         uint64_t start;
 
-        start = HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
-                                (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+        start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
+                                     (vdev->msix->entries *
+                                      PCI_MSIX_ENTRY_SIZE));
 
         size = start < bar->region.size ? bar->region.size - start : 0;
         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 5c678b9..60365d1 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -26,6 +26,7 @@
 #include "hw/sysbus.h"
 #include "trace.h"
 #include "hw/platform-bus.h"
+#include "sysemu/kvm.h"
 
 /*
  * Functions used whatever the injection method
@@ -51,6 +52,7 @@
     intp->pin = info.index;
     intp->flags = info.flags;
     intp->state = VFIO_IRQ_INACTIVE;
+    intp->kvm_accel = false;
 
     sysbus_init_irq(sbdev, &intp->qemuirq);
 
@@ -61,6 +63,13 @@
         error_report("vfio: Error: trigger event_notifier_init failed ");
         return NULL;
     }
+    /* Get an eventfd for resample/unmask */
+    ret = event_notifier_init(&intp->unmask, 0);
+    if (ret) {
+        g_free(intp);
+        error_report("vfio: Error: resamplefd event_notifier_init failed");
+        return NULL;
+    }
 
     QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
     return intp;
@@ -315,6 +324,94 @@
     return ret;
 }
 
+/*
+ * Functions used for irqfd
+ */
+
+/**
+ * vfio_set_resample_eventfd - sets the resamplefd for an IRQ
+ * @intp: the IRQ struct handle
+ * programs the VFIO driver to unmask this IRQ when the
+ * intp->unmask eventfd is triggered
+ */
+static int vfio_set_resample_eventfd(VFIOINTp *intp)
+{
+    VFIODevice *vbasedev = &intp->vdev->vbasedev;
+    struct vfio_irq_set *irq_set;
+    int argsz, ret;
+    int32_t *pfd;
+
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
+    irq_set->index = intp->pin;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = event_notifier_get_fd(&intp->unmask);
+    qemu_set_fd_handler(*pfd, NULL, NULL, NULL);
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (ret < 0) {
+        error_report("vfio: Failed to set resample eventfd: %m");
+    }
+    return ret;
+}
+
+static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq)
+{
+    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev);
+    VFIOINTp *intp;
+
+    if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() ||
+        !vdev->irqfd_allowed) {
+        return;
+    }
+
+    QLIST_FOREACH(intp, &vdev->intp_list, next) {
+        if (intp->qemuirq == irq) {
+            break;
+        }
+    }
+    assert(intp);
+
+    /* Get to a known interrupt state */
+    qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt),
+                        NULL, NULL, vdev);
+
+    vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin);
+    qemu_set_irq(intp->qemuirq, 0);
+
+    if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt,
+                                   &intp->unmask, irq) < 0) {
+        goto fail_irqfd;
+    }
+
+    if (vfio_set_trigger_eventfd(intp, NULL) < 0) {
+        goto fail_vfio;
+    }
+    if (vfio_set_resample_eventfd(intp) < 0) {
+        goto fail_vfio;
+    }
+
+    /* Let's resume injection with irqfd setup */
+    vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin);
+
+    intp->kvm_accel = true;
+
+    trace_vfio_platform_start_irqfd_injection(intp->pin,
+                                     event_notifier_get_fd(&intp->interrupt),
+                                     event_notifier_get_fd(&intp->unmask));
+    return;
+fail_vfio:
+    kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq);
+fail_irqfd:
+    vfio_start_eventfd_injection(intp);
+    vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin);
+    return;
+}
+
 /* VFIO skeleton */
 
 static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev)
@@ -584,17 +681,20 @@
     DEFINE_PROP_BOOL("x-mmap", VFIOPlatformDevice, vbasedev.allow_mmap, true),
     DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
                        mmap_timeout, 1100),
+    DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
 static void vfio_platform_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass);
 
     dc->realize = vfio_platform_realize;
     dc->props = vfio_platform_dev_properties;
     dc->vmsd = &vfio_platform_vmstate;
     dc->desc = "VFIO-based platform device assignment";
+    sbc->connect_irq_notifier = vfio_start_irqfd_injection;
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 }
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 6a0174e..7a89081 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -495,7 +495,7 @@
     VirtQueue *vq = virtio_get_queue(vdev, queue_no);
     EventNotifier *n = virtio_queue_get_guest_notifier(vq);
     int ret;
-    ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, NULL, irqfd->virq);
+    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq);
     return ret;
 }
 
@@ -509,7 +509,7 @@
     VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
     int ret;
 
-    ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq);
+    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq);
     assert(ret == 0);
 }
 
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index ac06c67..ea6a9a6 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -26,6 +26,12 @@
 #include "qom/cpu.h"
 #include "qemu/rcu.h"
 
+#define EXCP_INTERRUPT 	0x10000 /* async interruption */
+#define EXCP_HLT        0x10001 /* hlt instruction reached */
+#define EXCP_DEBUG      0x10002 /* cpu stopped after a breakpoint or singlestep */
+#define EXCP_HALTED     0x10003 /* cpu is halted (waiting for external event) */
+#define EXCP_YIELD      0x10004 /* cpu wants to yield timeslice to another */
+
 /* some important defines:
  *
  * WORDS_ALIGNED : if defined, the host cpu can only make word aligned
@@ -177,10 +183,13 @@
 
 /* ??? These should be the larger of uintptr_t and target_ulong.  */
 extern uintptr_t qemu_real_host_page_size;
+extern uintptr_t qemu_real_host_page_mask;
 extern uintptr_t qemu_host_page_size;
 extern uintptr_t qemu_host_page_mask;
 
 #define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask)
+#define REAL_HOST_PAGE_ALIGN(addr) (((addr) + qemu_real_host_page_size - 1) & \
+                                    qemu_real_host_page_mask)
 
 /* same as PROT_xxx */
 #define PAGE_READ      0x0001
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index de8a720..9fb1d54 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -13,6 +13,8 @@
 
 #include "qemu/bswap.h"
 #include "qemu/queue.h"
+#include "qemu/fprintf-fn.h"
+#include "qemu/typedefs.h"
 
 /**
  * CPUListState:
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index d5aecaf..98b9cff 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -56,20 +56,6 @@
 #error TARGET_LONG_SIZE undefined
 #endif
 
-#define EXCP_INTERRUPT 	0x10000 /* async interruption */
-#define EXCP_HLT        0x10001 /* hlt instruction reached */
-#define EXCP_DEBUG      0x10002 /* cpu stopped after a breakpoint or singlestep */
-#define EXCP_HALTED     0x10003 /* cpu is halted (waiting for external event) */
-#define EXCP_YIELD      0x10004 /* cpu wants to yield timeslice to another */
-
-/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for
-   addresses on the same page.  The top bits are the same.  This allows
-   TLB invalidation to quickly clear a subset of the hash table.  */
-#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2)
-#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS)
-#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1)
-#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE)
-
 #if !defined(CONFIG_USER_ONLY)
 /* use a fully associative victim tlb of 8 entries */
 #define CPU_VTLB_SIZE 8
@@ -161,7 +147,6 @@
 #endif
 
 
-#define CPU_TEMP_BUF_NLONGS 128
 #define CPU_COMMON                                                      \
     /* soft mmu support */                                              \
     CPU_COMMON_TLB                                                      \
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 2573e8c..d678114 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -195,26 +195,6 @@
     int tb_invalidated_flag;
 };
 
-static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc)
-{
-    target_ulong tmp;
-    tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
-    return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK;
-}
-
-static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
-{
-    target_ulong tmp;
-    tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
-    return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK)
-	    | (tmp & TB_JMP_ADDR_MASK));
-}
-
-static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc)
-{
-    return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1);
-}
-
 void tb_free(TranslationBlock *tb);
 void tb_flush(CPUArchState *env);
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 8ae004e..1394715 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -23,7 +23,6 @@
 
 #include <stdint.h>
 #include <stdbool.h>
-#include "qemu-common.h"
 #include "exec/cpu-common.h"
 #ifndef CONFIG_USER_ONLY
 #include "exec/hwaddr.h"
@@ -180,6 +179,7 @@
     bool rom_device;
     bool warning_printed; /* For reservations */
     bool flush_coalesced_mmio;
+    bool global_locking;
     MemoryRegion *alias;
     hwaddr alias_offset;
     int32_t priority;
@@ -825,6 +825,31 @@
 void memory_region_clear_flush_coalesced(MemoryRegion *mr);
 
 /**
+ * memory_region_set_global_locking: Declares the access processing requires
+ *                                   QEMU's global lock.
+ *
+ * When this is invoked, accesses to the memory region will be processed while
+ * holding the global lock of QEMU. This is the default behavior of memory
+ * regions.
+ *
+ * @mr: the memory region to be updated.
+ */
+void memory_region_set_global_locking(MemoryRegion *mr);
+
+/**
+ * memory_region_clear_global_locking: Declares that access processing does
+ *                                     not depend on the QEMU global lock.
+ *
+ * By clearing this property, accesses to the memory region will be processed
+ * outside of QEMU's global lock (unless the lock is held on when issuing the
+ * access request). In this case, the device model implementing the access
+ * handlers is responsible for synchronization of concurrency.
+ *
+ * @mr: the memory region to be updated.
+ */
+void memory_region_clear_global_locking(MemoryRegion *mr);
+
+/**
  * memory_region_add_eventfd: Request an eventfd to be triggered when a word
  *                            is written to a location.
  *
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
new file mode 100644
index 0000000..0f4e8a0
--- /dev/null
+++ b/include/exec/tb-hash.h
@@ -0,0 +1,51 @@
+/*
+ * internal execution defines for qemu
+ *
+ *  Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EXEC_TB_HASH
+#define EXEC_TB_HASH
+
+/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for
+   addresses on the same page.  The top bits are the same.  This allows
+   TLB invalidation to quickly clear a subset of the hash table.  */
+#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2)
+#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS)
+#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1)
+#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE)
+
+static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc)
+{
+    target_ulong tmp;
+    tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
+    return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK;
+}
+
+static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
+{
+    target_ulong tmp;
+    tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
+    return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK)
+           | (tmp & TB_JMP_ADDR_MASK));
+}
+
+static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc)
+{
+    return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1);
+}
+
+#endif
diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h
index 77cc65c..ac24bbe 100644
--- a/include/hw/acpi/ich9.h
+++ b/include/hw/acpi/ich9.h
@@ -54,10 +54,11 @@
     uint8_t disable_s3;
     uint8_t disable_s4;
     uint8_t s4_val;
+    uint8_t smm_enabled;
 } ICH9LPCPMRegs;
 
 void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
-                  qemu_irq sci_irq);
+                  bool smm_enabled, qemu_irq sci_irq);
 void ich9_pm_iospace_update(ICH9LPCPMRegs *pm, uint32_t pm_io_base);
 extern const VMStateDescription vmstate_ich9_pm;
 
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index 760804c..4dcd4f9 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -14,6 +14,7 @@
 #include "exec/memory.h"
 #include "hw/irq.h"
 #include "qemu/notify.h"
+#include "cpu.h"
 
 /* armv7m.c */
 qemu_irq *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,
diff --git a/include/hw/i386/ich9.h b/include/hw/i386/ich9.h
index a2cc15c..b317a48 100644
--- a/include/hw/i386/ich9.h
+++ b/include/hw/i386/ich9.h
@@ -17,7 +17,7 @@
 void ich9_lpc_set_irq(void *opaque, int irq_num, int level);
 int ich9_lpc_map_irq(PCIDevice *pci_dev, int intx);
 PCIINTxRoute ich9_route_intx_pin_to_irq(void *opaque, int pirq_pin);
-void ich9_lpc_pm_init(PCIDevice *pci_lpc);
+void ich9_lpc_pm_init(PCIDevice *pci_lpc, bool smm_enabled);
 I2CBus *ich9_smb_init(PCIBus *bus, int devfn, uint32_t smb_io_base);
 
 #define ICH9_CC_SIZE                            (16 * 1024)     /* 16KB */
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 86c5651..786a1d5 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -15,14 +15,12 @@
 #include "hw/pci/pci.h"
 #include "hw/boards.h"
 #include "hw/compat.h"
+#include "hw/mem/pc-dimm.h"
 
 #define HPET_INTCAP "hpet-intcap"
 
 /**
  * PCMachineState:
- * @hotplug_memory_base: address in guest RAM address space where hotplug memory
- * address space begins.
- * @hotplug_memory: hotplug memory addess space container
  * @acpi_dev: link to ACPI PM device that performs ACPI hotplug handling
  * @enforce_aligned_dimm: check that DIMM's address/size is aligned by
  *                        backend's alignment value if provided
@@ -32,14 +30,14 @@
     MachineState parent_obj;
 
     /* <public> */
-    ram_addr_t hotplug_memory_base;
-    MemoryRegion hotplug_memory;
+    MemoryHotplugState hotplug_memory;
 
     HotplugHandler *acpi_dev;
     ISADevice *rtc;
 
     uint64_t max_ram_below_4g;
     OnOffAuto vmport;
+    OnOffAuto smm;
     bool enforce_aligned_dimm;
 };
 
@@ -47,6 +45,7 @@
 #define PC_MACHINE_MEMHP_REGION_SIZE "hotplug-memory-region-size"
 #define PC_MACHINE_MAX_RAM_BELOW_4G "max-ram-below-4g"
 #define PC_MACHINE_VMPORT           "vmport"
+#define PC_MACHINE_SMM              "smm"
 #define PC_MACHINE_ENFORCE_ALIGNED_DIMM "enforce-aligned-dimm"
 
 /**
@@ -158,6 +157,7 @@
 /* pc.c */
 extern int fd_bootchk;
 
+bool pc_machine_is_smm_enabled(PCMachineState *pcms);
 void pc_register_ferr_irq(qemu_irq irq);
 void pc_acpi_smi_interrupt(void *opaque, int irq, int level);
 
@@ -217,7 +217,7 @@
 
 I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
                       qemu_irq sci_irq, qemu_irq smi_irq,
-                      int kvm_enabled, DeviceState **piix4_pm);
+                      int smm_enabled, DeviceState **piix4_pm);
 void piix4_smbus_register_device(SMBusDevice *dev, uint8_t addr);
 
 /* hpet.c */
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index f7b80b4..d83bf30 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -70,6 +70,17 @@
     MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm);
 } PCDIMMDeviceClass;
 
+/**
+ * MemoryHotplugState:
+ * @base: address in guest RAM address space where hotplug memory
+ * address space begins.
+ * @mr: hotplug memory address space container
+ */
+typedef struct MemoryHotplugState {
+    ram_addr_t base;
+    MemoryRegion mr;
+} MemoryHotplugState;
+
 uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
                                uint64_t address_space_size,
                                uint64_t *hint, uint64_t align, uint64_t size,
@@ -79,4 +90,8 @@
 
 int qmp_pc_dimm_device_list(Object *obj, void *opaque);
 uint64_t pc_existing_dimms_capacity(Error **errp);
+void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
+                         MemoryRegion *mr, uint64_t align, Error **errp);
+void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
+                           MemoryRegion *mr);
 #endif
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 49c062b..d98e6c9 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -114,6 +114,8 @@
 #define PCI_VENDOR_ID_ENSONIQ            0x1274
 #define PCI_DEVICE_ID_ENSONIQ_ES1370     0x5000
 
+#define PCI_VENDOR_ID_CHELSIO            0x1425
+
 #define PCI_VENDOR_ID_FREESCALE          0x1957
 #define PCI_DEVICE_ID_MPC8533E           0x0030
 
diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h
index 34f93c3..cc1dba4 100644
--- a/include/hw/sysbus.h
+++ b/include/hw/sysbus.h
@@ -58,6 +58,7 @@
      * omitted then. (This is not considered a fatal error.)
      */
     char *(*explicit_ofw_unit_address)(const SysBusDevice *dev);
+    void (*connect_irq_notifier)(SysBusDevice *dev, qemu_irq irq);
 } SysBusDeviceClass;
 
 struct SysBusDevice {
diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
index 26b2ad6..c5cf1d7 100644
--- a/include/hw/vfio/vfio-platform.h
+++ b/include/hw/vfio/vfio-platform.h
@@ -41,6 +41,7 @@
     int state; /* inactive, pending, active */
     uint8_t pin; /* index */
     uint32_t flags; /* IRQ info flags */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
 } VFIOINTp;
 
 /* function type for user side eventfd handler */
@@ -57,6 +58,7 @@
     uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
     QEMUTimer *mmap_timer; /* allows fast-path resume after IRQ hit */
     QemuMutex intp_mutex; /* protect the intp_list IRQ state */
+    bool irqfd_allowed; /* debug option to force irqfd on/off */
 } VFIOPlatformDevice;
 
 typedef struct VFIOPlatformDeviceClass {
diff --git a/include/hw/virtio/virtio-input.h b/include/hw/virtio/virtio-input.h
index fd5417d..af1c207 100644
--- a/include/hw/virtio/virtio-input.h
+++ b/include/hw/virtio/virtio-input.h
@@ -95,6 +95,8 @@
 
 struct VirtIOInputHID {
     VirtIOInput                       parent_obj;
+    char                              *display;
+    uint32_t                          head;
     QemuInputHandler                  *handler;
     QemuInputHandlerState             *hs;
     int                               ledstate;
diff --git a/include/qemu-common.h b/include/qemu-common.h
index d52d09c..237d654 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -15,6 +15,7 @@
 #include "qemu/compiler.h"
 #include "config-host.h"
 #include "qemu/typedefs.h"
+#include "qemu/fprintf-fn.h"
 
 #if defined(__arm__) || defined(__sparc__) || defined(__mips__) || defined(__hppa__) || defined(__ia64__)
 #define WORDS_ALIGNED
@@ -85,9 +86,6 @@
 # error Unknown pointer size
 #endif
 
-typedef int (*fprintf_function)(FILE *f, const char *fmt, ...)
-    GCC_FMT_ATTR(2, 3);
-
 #ifdef _WIN32
 #define fsync _commit
 #if !defined(lseek)
@@ -455,6 +453,7 @@
 #define VECTYPE        __vector unsigned char
 #define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
 #define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
+#define VEC_OR(v1, v2) ((v1) | (v2))
 /* altivec.h may redefine the bool macro as vector type.
  * Reset it to POSIX semantics. */
 #define bool _Bool
@@ -463,10 +462,12 @@
 #define VECTYPE        __m128i
 #define SPLAT(p)       _mm_set1_epi8(*(p))
 #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
+#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
 #else
 #define VECTYPE        unsigned long
 #define SPLAT(p)       (*(p) * (~0UL / 255))
 #define ALL_EQ(v1, v2) ((v1) == (v2))
+#define VEC_OR(v1, v2) ((v1) | (v2))
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
diff --git a/include/qemu/fprintf-fn.h b/include/qemu/fprintf-fn.h
new file mode 100644
index 0000000..9ddc90f
--- /dev/null
+++ b/include/qemu/fprintf-fn.h
@@ -0,0 +1,17 @@
+/*
+ * Typedef for fprintf-alike function pointers.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_FPRINTF_FN_H
+#define QEMU_FPRINTF_FN_H 1
+
+#include "qemu/compiler.h"
+#include <stdio.h>
+
+typedef int (*fprintf_function)(FILE *f, const char *fmt, ...)
+    GCC_FMT_ATTR(2, 3);
+
+#endif
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 0f4a0fd..bc18ca3 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -223,6 +223,16 @@
 #endif
 
 /**
+ * qemu_mutex_iothread_locked: Return lock status of the main loop mutex.
+ *
+ * The main loop mutex is the coarsest lock in QEMU, and as such it
+ * must always be taken outside other locks.  This function helps
+ * functions take different paths depending on whether the current
+ * thread is running within the main loop mutex.
+ */
+bool qemu_mutex_iothread_locked(void);
+
+/**
  * qemu_mutex_lock_iothread: Lock the main loop mutex.
  *
  * This function locks the main loop mutex.  The mutex is taken by
diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h
index b94d365..a459dd2 100644
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@@ -367,7 +367,8 @@
 #define KEY_MSDOS		151
 #define KEY_COFFEE		152	/* AL Terminal Lock/Screensaver */
 #define KEY_SCREENLOCK		KEY_COFFEE
-#define KEY_DIRECTION		153
+#define KEY_ROTATE_DISPLAY	153	/* Display orientation for e.g. tablets */
+#define KEY_DIRECTION		KEY_ROTATE_DISPLAY
 #define KEY_CYCLEWINDOWS	154
 #define KEY_MAIL		155
 #define KEY_BOOKMARKS		156	/* AC Bookmarks */
@@ -700,6 +701,10 @@
 #define KEY_NUMERIC_9		0x209
 #define KEY_NUMERIC_STAR	0x20a
 #define KEY_NUMERIC_POUND	0x20b
+#define KEY_NUMERIC_A		0x20c	/* Phone key A - HUT Telephony 0xb9 */
+#define KEY_NUMERIC_B		0x20d
+#define KEY_NUMERIC_C		0x20e
+#define KEY_NUMERIC_D		0x20f
 
 #define KEY_CAMERA_FOCUS	0x210
 #define KEY_WPS_BUTTON		0x211	/* WiFi Protected Setup key */
@@ -971,7 +976,8 @@
  */
 #define MT_TOOL_FINGER		0
 #define MT_TOOL_PEN		1
-#define MT_TOOL_MAX		1
+#define MT_TOOL_PALM		2
+#define MT_TOOL_MAX		2
 
 /*
  * Values describing the status of a force-feedback effect
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index 88ada1d..2e2a6dc 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -26,6 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. */
 #include "standard-headers/linux/types.h"
+#include "standard-headers/linux/virtio_types.h"
 #include "standard-headers/linux/virtio_ids.h"
 #include "standard-headers/linux/virtio_config.h"
 
diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h
index cfcfb46..72ef815 100644
--- a/include/standard-headers/linux/virtio_gpu.h
+++ b/include/standard-headers/linux/virtio_gpu.h
@@ -38,6 +38,8 @@
 #ifndef VIRTIO_GPU_HW_H
 #define VIRTIO_GPU_HW_H
 
+#include "standard-headers/linux/types.h"
+
 enum virtio_gpu_ctrl_type {
 	VIRTIO_GPU_UNDEFINED = 0,
 
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index f459fbd..983e99e 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -19,6 +19,7 @@
 #include "qemu/queue.h"
 #include "qom/cpu.h"
 #include "exec/memattrs.h"
+#include "hw/irq.h"
 
 #ifdef CONFIG_KVM
 #include <linux/kvm.h>
@@ -151,6 +152,7 @@
 #define kvm_halt_in_kernel() (false)
 #define kvm_eventfds_enabled() (false)
 #define kvm_irqfds_enabled() (false)
+#define kvm_resamplefds_enabled() (false)
 #define kvm_msi_via_irqfd_enabled() (false)
 #define kvm_gsi_routing_allowed() (false)
 #define kvm_gsi_direct_mapping() (false)
@@ -416,9 +418,15 @@
 
 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter);
 
+int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                       EventNotifier *rn, int virq);
+int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                          int virq);
 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
-                                   EventNotifier *rn, int virq);
-int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq);
+                                   EventNotifier *rn, qemu_irq irq);
+int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
+                                      qemu_irq irq);
+void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi);
 void kvm_pc_gsi_handler(void *opaque, int n, int level);
 void kvm_pc_setup_irq_routing(bool pci_enabled);
 void kvm_init_irq_routing(KVMState *s);
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
new file mode 100644
index 0000000..888557a
--- /dev/null
+++ b/include/sysemu/kvm_int.h
@@ -0,0 +1,39 @@
+/*
+ * Internal definitions for a target's KVM support
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_KVM_INT_H
+#define QEMU_KVM_INT_H
+
+#include "sysemu/sysemu.h"
+#include "sysemu/accel.h"
+#include "sysemu/kvm.h"
+
+typedef struct KVMSlot
+{
+    hwaddr start_addr;
+    ram_addr_t memory_size;
+    void *ram;
+    int slot;
+    int flags;
+} KVMSlot;
+
+typedef struct KVMMemoryListener {
+    MemoryListener listener;
+    KVMSlot *slots;
+    int as_id;
+} KVMMemoryListener;
+
+#define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm")
+
+#define KVM_STATE(obj) \
+    OBJECT_CHECK(KVMState, (obj), TYPE_KVM_ACCEL)
+
+void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
+                                  AddressSpace *as, int as_id);
+
+#endif
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 6523b4d..a6392bc 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -10,16 +10,27 @@
 
 extern int nb_numa_nodes;   /* Number of NUMA nodes */
 
+struct numa_addr_range {
+    ram_addr_t mem_start;
+    ram_addr_t mem_end;
+    QLIST_ENTRY(numa_addr_range) entry;
+};
+
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
     bool present;
+    QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
 } NodeInfo;
+
 extern NodeInfo numa_info[MAX_NODES];
 void parse_numa_opts(MachineClass *mc);
 void numa_post_machine_init(void);
 void query_numa_node_mem(uint64_t node_mem[]);
 extern QemuOptsList qemu_numa_opts;
+void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
+void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
+uint32_t numa_get_node(ram_addr_t addr, Error **errp);
 
 #endif
diff --git a/kvm-all.c b/kvm-all.c
index 53e01d4..06e06f2 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -24,19 +24,18 @@
 #include "qemu/atomic.h"
 #include "qemu/option.h"
 #include "qemu/config-file.h"
-#include "sysemu/sysemu.h"
-#include "sysemu/accel.h"
 #include "hw/hw.h"
 #include "hw/pci/msi.h"
 #include "hw/s390x/adapter.h"
 #include "exec/gdbstub.h"
-#include "sysemu/kvm.h"
+#include "sysemu/kvm_int.h"
 #include "qemu/bswap.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
 #include "exec/address-spaces.h"
 #include "qemu/event_notifier.h"
 #include "trace.h"
+#include "hw/irq.h"
 
 #include "hw/boards.h"
 
@@ -60,22 +59,10 @@
 
 #define KVM_MSI_HASHTAB_SIZE    256
 
-typedef struct KVMSlot
-{
-    hwaddr start_addr;
-    ram_addr_t memory_size;
-    void *ram;
-    int slot;
-    int flags;
-} KVMSlot;
-
-typedef struct kvm_dirty_log KVMDirtyLog;
-
 struct KVMState
 {
     AccelState parent_obj;
 
-    KVMSlot *slots;
     int nr_slots;
     int fd;
     int vmfd;
@@ -98,6 +85,7 @@
      * unsigned, and treating them as signed here can break things */
     unsigned irq_set_ioctl;
     unsigned int sigmask_len;
+    GHashTable *gsimap;
 #ifdef KVM_CAP_IRQ_ROUTING
     struct kvm_irq_routing *irq_routes;
     int nr_allocated_irq_routes;
@@ -106,13 +94,9 @@
     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
     bool direct_msi;
 #endif
+    KVMMemoryListener memory_listener;
 };
 
-#define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm")
-
-#define KVM_STATE(obj) \
-    OBJECT_CHECK(KVMState, (obj), TYPE_KVM_ACCEL)
-
 KVMState *kvm_state;
 bool kvm_kernel_irqchip;
 bool kvm_async_interrupts_allowed;
@@ -133,13 +117,14 @@
     KVM_CAP_LAST_INFO
 };
 
-static KVMSlot *kvm_get_free_slot(KVMState *s)
+static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 {
+    KVMState *s = kvm_state;
     int i;
 
     for (i = 0; i < s->nr_slots; i++) {
-        if (s->slots[i].memory_size == 0) {
-            return &s->slots[i];
+        if (kml->slots[i].memory_size == 0) {
+            return &kml->slots[i];
         }
     }
 
@@ -148,12 +133,14 @@
 
 bool kvm_has_free_slot(MachineState *ms)
 {
-    return kvm_get_free_slot(KVM_STATE(ms->accelerator));
+    KVMState *s = KVM_STATE(ms->accelerator);
+
+    return kvm_get_free_slot(&s->memory_listener);
 }
 
-static KVMSlot *kvm_alloc_slot(KVMState *s)
+static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
 {
-    KVMSlot *slot = kvm_get_free_slot(s);
+    KVMSlot *slot = kvm_get_free_slot(kml);
 
     if (slot) {
         return slot;
@@ -163,14 +150,15 @@
     abort();
 }
 
-static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
+static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                          hwaddr start_addr,
                                          hwaddr end_addr)
 {
+    KVMState *s = kvm_state;
     int i;
 
     for (i = 0; i < s->nr_slots; i++) {
-        KVMSlot *mem = &s->slots[i];
+        KVMSlot *mem = &kml->slots[i];
 
         if (start_addr == mem->start_addr &&
             end_addr == mem->start_addr + mem->memory_size) {
@@ -184,15 +172,16 @@
 /*
  * Find overlapping slot with lowest start address
  */
-static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
+static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
                                             hwaddr start_addr,
                                             hwaddr end_addr)
 {
+    KVMState *s = kvm_state;
     KVMSlot *found = NULL;
     int i;
 
     for (i = 0; i < s->nr_slots; i++) {
-        KVMSlot *mem = &s->slots[i];
+        KVMSlot *mem = &kml->slots[i];
 
         if (mem->memory_size == 0 ||
             (found && found->start_addr < mem->start_addr)) {
@@ -211,10 +200,11 @@
 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                        hwaddr *phys_addr)
 {
+    KVMMemoryListener *kml = &s->memory_listener;
     int i;
 
     for (i = 0; i < s->nr_slots; i++) {
-        KVMSlot *mem = &s->slots[i];
+        KVMSlot *mem = &kml->slots[i];
 
         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
             *phys_addr = mem->start_addr + (ram - mem->ram);
@@ -225,11 +215,12 @@
     return 0;
 }
 
-static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
+static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
 {
+    KVMState *s = kvm_state;
     struct kvm_userspace_memory_region mem;
 
-    mem.slot = slot->slot;
+    mem.slot = slot->slot | (kml->as_id << 16);
     mem.guest_phys_addr = slot->start_addr;
     mem.userspace_addr = (unsigned long)slot->ram;
     mem.flags = slot->flags;
@@ -291,45 +282,47 @@
  * dirty pages logging control
  */
 
-static int kvm_mem_flags(KVMState *s, bool log_dirty, bool readonly)
+static int kvm_mem_flags(MemoryRegion *mr)
 {
+    bool readonly = mr->readonly || memory_region_is_romd(mr);
     int flags = 0;
-    flags = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0;
+
+    if (memory_region_get_dirty_log_mask(mr) != 0) {
+        flags |= KVM_MEM_LOG_DIRTY_PAGES;
+    }
     if (readonly && kvm_readonly_mem_allowed) {
         flags |= KVM_MEM_READONLY;
     }
     return flags;
 }
 
-static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
+static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
+                                 MemoryRegion *mr)
 {
-    KVMState *s = kvm_state;
-    int flags, mask = KVM_MEM_LOG_DIRTY_PAGES;
     int old_flags;
 
     old_flags = mem->flags;
-
-    flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty, false);
-    mem->flags = flags;
+    mem->flags = kvm_mem_flags(mr);
 
     /* If nothing changed effectively, no need to issue ioctl */
-    if (flags == old_flags) {
+    if (mem->flags == old_flags) {
         return 0;
     }
 
-    return kvm_set_user_memory_region(s, mem);
+    return kvm_set_user_memory_region(kml, mem);
 }
 
-static int kvm_dirty_pages_log_change(hwaddr phys_addr,
-                                      ram_addr_t size, bool log_dirty)
+static int kvm_section_update_flags(KVMMemoryListener *kml,
+                                    MemoryRegionSection *section)
 {
-    KVMState *s = kvm_state;
-    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
+    hwaddr phys_addr = section->offset_within_address_space;
+    ram_addr_t size = int128_get64(section->size);
+    KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, phys_addr + size);
 
     if (mem == NULL)  {
         return 0;
     } else {
-        return kvm_slot_dirty_pages_log_change(mem, log_dirty);
+        return kvm_slot_update_flags(kml, mem, section->mr);
     }
 }
 
@@ -337,14 +330,14 @@
                           MemoryRegionSection *section,
                           int old, int new)
 {
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
     int r;
 
     if (old != 0) {
         return;
     }
 
-    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
-                                   int128_get64(section->size), true);
+    r = kvm_section_update_flags(kml, section);
     if (r < 0) {
         abort();
     }
@@ -354,14 +347,14 @@
                           MemoryRegionSection *section,
                           int old, int new)
 {
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
     int r;
 
     if (new != 0) {
         return;
     }
 
-    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
-                                   int128_get64(section->size), false);
+    r = kvm_section_update_flags(kml, section);
     if (r < 0) {
         abort();
     }
@@ -389,11 +382,12 @@
  * @start_add: start of logged region.
  * @end_addr: end of logged region.
  */
-static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
+static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
+                                          MemoryRegionSection *section)
 {
     KVMState *s = kvm_state;
     unsigned long size, allocated_size = 0;
-    KVMDirtyLog d = {};
+    struct kvm_dirty_log d = {};
     KVMSlot *mem;
     int ret = 0;
     hwaddr start_addr = section->offset_within_address_space;
@@ -401,7 +395,7 @@
 
     d.dirty_bitmap = NULL;
     while (start_addr < end_addr) {
-        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
+        mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr);
         if (mem == NULL) {
             break;
         }
@@ -428,8 +422,7 @@
         allocated_size = size;
         memset(d.dirty_bitmap, 0, allocated_size);
 
-        d.slot = mem->slot;
-
+        d.slot = mem->slot | (kml->as_id << 16);
         if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
             DPRINTF("ioctl failed %d\n", errno);
             ret = -1;
@@ -632,15 +625,14 @@
     return NULL;
 }
 
-static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
+static void kvm_set_phys_mem(KVMMemoryListener *kml,
+                             MemoryRegionSection *section, bool add)
 {
     KVMState *s = kvm_state;
     KVMSlot *mem, old;
     int err;
     MemoryRegion *mr = section->mr;
-    bool log_dirty = memory_region_get_dirty_log_mask(mr) != 0;
     bool writeable = !mr->readonly && !mr->rom_device;
-    bool readonly_flag = mr->readonly || memory_region_is_romd(mr);
     hwaddr start_addr = section->offset_within_address_space;
     ram_addr_t size = int128_get64(section->size);
     void *ram = NULL;
@@ -674,7 +666,7 @@
     ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
 
     while (1) {
-        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
+        mem = kvm_lookup_overlapping_slot(kml, start_addr, start_addr + size);
         if (!mem) {
             break;
         }
@@ -684,19 +676,19 @@
             (ram - start_addr == mem->ram - mem->start_addr)) {
             /* The new slot fits into the existing one and comes with
              * identical parameters - update flags and done. */
-            kvm_slot_dirty_pages_log_change(mem, log_dirty);
+            kvm_slot_update_flags(kml, mem, mr);
             return;
         }
 
         old = *mem;
 
         if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-            kvm_physical_sync_dirty_bitmap(section);
+            kvm_physical_sync_dirty_bitmap(kml, section);
         }
 
         /* unregister the overlapping slot */
         mem->memory_size = 0;
-        err = kvm_set_user_memory_region(s, mem);
+        err = kvm_set_user_memory_region(kml, mem);
         if (err) {
             fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                     __func__, strerror(-err));
@@ -713,13 +705,13 @@
          * - and actually require a recent KVM version. */
         if (s->broken_set_mem_region &&
             old.start_addr == start_addr && old.memory_size < size && add) {
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot(kml);
             mem->memory_size = old.memory_size;
             mem->start_addr = old.start_addr;
             mem->ram = old.ram;
-            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
+            mem->flags = kvm_mem_flags(mr);
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(kml, mem);
             if (err) {
                 fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                         strerror(-err));
@@ -734,13 +726,13 @@
 
         /* register prefix slot */
         if (old.start_addr < start_addr) {
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot(kml);
             mem->memory_size = start_addr - old.start_addr;
             mem->start_addr = old.start_addr;
             mem->ram = old.ram;
-            mem->flags =  kvm_mem_flags(s, log_dirty, readonly_flag);
+            mem->flags =  kvm_mem_flags(mr);
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(kml, mem);
             if (err) {
                 fprintf(stderr, "%s: error registering prefix slot: %s\n",
                         __func__, strerror(-err));
@@ -757,14 +749,14 @@
         if (old.start_addr + old.memory_size > start_addr + size) {
             ram_addr_t size_delta;
 
-            mem = kvm_alloc_slot(s);
+            mem = kvm_alloc_slot(kml);
             mem->start_addr = start_addr + size;
             size_delta = mem->start_addr - old.start_addr;
             mem->memory_size = old.memory_size - size_delta;
             mem->ram = old.ram + size_delta;
-            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
+            mem->flags = kvm_mem_flags(mr);
 
-            err = kvm_set_user_memory_region(s, mem);
+            err = kvm_set_user_memory_region(kml, mem);
             if (err) {
                 fprintf(stderr, "%s: error registering suffix slot: %s\n",
                         __func__, strerror(-err));
@@ -780,13 +772,13 @@
     if (!add) {
         return;
     }
-    mem = kvm_alloc_slot(s);
+    mem = kvm_alloc_slot(kml);
     mem->memory_size = size;
     mem->start_addr = start_addr;
     mem->ram = ram;
-    mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
+    mem->flags = kvm_mem_flags(mr);
 
-    err = kvm_set_user_memory_region(s, mem);
+    err = kvm_set_user_memory_region(kml, mem);
     if (err) {
         fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                 strerror(-err));
@@ -797,23 +789,28 @@
 static void kvm_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+
     memory_region_ref(section->mr);
-    kvm_set_phys_mem(section, true);
+    kvm_set_phys_mem(kml, section, true);
 }
 
 static void kvm_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
-    kvm_set_phys_mem(section, false);
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+
+    kvm_set_phys_mem(kml, section, false);
     memory_region_unref(section->mr);
 }
 
 static void kvm_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
 {
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
     int r;
 
-    r = kvm_physical_sync_dirty_bitmap(section);
+    r = kvm_physical_sync_dirty_bitmap(kml, section);
     if (r < 0) {
         abort();
     }
@@ -888,18 +885,27 @@
     }
 }
 
-static MemoryListener kvm_memory_listener = {
-    .region_add = kvm_region_add,
-    .region_del = kvm_region_del,
-    .log_start = kvm_log_start,
-    .log_stop = kvm_log_stop,
-    .log_sync = kvm_log_sync,
-    .eventfd_add = kvm_mem_ioeventfd_add,
-    .eventfd_del = kvm_mem_ioeventfd_del,
-    .coalesced_mmio_add = kvm_coalesce_mmio_region,
-    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
-    .priority = 10,
-};
+void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
+                                  AddressSpace *as, int as_id)
+{
+    int i;
+
+    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
+    kml->as_id = as_id;
+
+    for (i = 0; i < s->nr_slots; i++) {
+        kml->slots[i].slot = i;
+    }
+
+    kml->listener.region_add = kvm_region_add;
+    kml->listener.region_del = kvm_region_del;
+    kml->listener.log_start = kvm_log_start;
+    kml->listener.log_stop = kvm_log_stop;
+    kml->listener.log_sync = kvm_log_sync;
+    kml->listener.priority = 10;
+
+    memory_listener_register(&kml->listener, as);
+}
 
 static MemoryListener kvm_io_listener = {
     .eventfd_add = kvm_io_ioeventfd_add,
@@ -1099,9 +1105,17 @@
     uint32_t *word = s->used_gsi_bitmap;
     int max_words = ALIGN(s->gsi_count, 32) / 32;
     int i, zeroes;
-    bool retry = true;
 
-again:
+    /*
+     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
+     * GSI numbers are more than the number of IRQ route. Allocating a GSI
+     * number can succeed even though a new route entry cannot be added.
+     * When this happens, flush dynamic MSI entries to free IRQ route entries.
+     */
+    if (!s->direct_msi && s->irq_routes->nr == s->gsi_count) {
+        kvm_flush_dynamic_msi_routes(s);
+    }
+
     /* Return the lowest unused GSI in the bitmap */
     for (i = 0; i < max_words; i++) {
         zeroes = ctz32(~word[i]);
@@ -1111,11 +1125,6 @@
 
         return zeroes + i * 32;
     }
-    if (!s->direct_msi && retry) {
-        retry = false;
-        kvm_flush_dynamic_msi_routes(s);
-        goto again;
-    }
     return -ENOSPC;
 
 }
@@ -1325,40 +1334,74 @@
 }
 #endif /* !KVM_CAP_IRQ_ROUTING */
 
-int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
-                                   EventNotifier *rn, int virq)
+int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                       EventNotifier *rn, int virq)
 {
     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
            rn ? event_notifier_get_fd(rn) : -1, virq, true);
 }
 
-int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
+int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                          int virq)
 {
     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
            false);
 }
 
-static int kvm_irqchip_create(MachineState *machine, KVMState *s)
+int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
+                                   EventNotifier *rn, qemu_irq irq)
+{
+    gpointer key, gsi;
+    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
+
+    if (!found) {
+        return -ENXIO;
+    }
+    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
+}
+
+int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
+                                      qemu_irq irq)
+{
+    gpointer key, gsi;
+    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
+
+    if (!found) {
+        return -ENXIO;
+    }
+    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
+}
+
+void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
+{
+    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
+}
+
+static void kvm_irqchip_create(MachineState *machine, KVMState *s)
 {
     int ret;
 
-    if (!machine_kernel_irqchip_allowed(machine) ||
-        (!kvm_check_extension(s, KVM_CAP_IRQCHIP) &&
-         (kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0) < 0))) {
-        return 0;
+    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
+        ;
+    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
+        if (ret < 0) {
+            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
+            exit(1);
+        }
+    } else {
+        return;
     }
 
     /* First probe and see if there's a arch-specific hook to create the
      * in-kernel irqchip for us */
     ret = kvm_arch_irqchip_create(s);
-    if (ret < 0) {
-        return ret;
-    } else if (ret == 0) {
+    if (ret == 0) {
         ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
-        if (ret < 0) {
-            fprintf(stderr, "Create kernel irqchip failed\n");
-            return ret;
-        }
+    }
+    if (ret < 0) {
+        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
+        exit(1);
     }
 
     kvm_kernel_irqchip = true;
@@ -1370,7 +1413,7 @@
 
     kvm_init_irq_routing(s);
 
-    return 0;
+    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
 }
 
 /* Find number of supported CPUs using the recommended
@@ -1407,7 +1450,7 @@
     KVMState *s;
     const KVMCapabilityInfo *missing_cap;
     int ret;
-    int i, type = 0;
+    int type = 0;
     const char *kvm_type;
 
     s = KVM_STATE(ms->accelerator);
@@ -1456,12 +1499,6 @@
         s->nr_slots = 32;
     }
 
-    s->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
-
-    for (i = 0; i < s->nr_slots; i++) {
-        s->slots[i].slot = i;
-    }
-
     /* check the vcpu limits */
     soft_vcpus_limit = kvm_recommended_vcpus(s);
     hard_vcpus_limit = kvm_max_vcpus(s);
@@ -1593,14 +1630,21 @@
         goto err;
     }
 
-    ret = kvm_irqchip_create(ms, s);
-    if (ret < 0) {
-        goto err;
+    if (machine_kernel_irqchip_allowed(ms)) {
+        kvm_irqchip_create(ms, s);
     }
 
     kvm_state = s;
-    memory_listener_register(&kvm_memory_listener, &address_space_memory);
-    memory_listener_register(&kvm_io_listener, &address_space_io);
+
+    s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
+    s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
+    s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
+    s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;
+
+    kvm_memory_listener_register(s, &s->memory_listener,
+                                 &address_space_memory, 0);
+    memory_listener_register(&kvm_io_listener,
+                             &address_space_io);
 
     s->many_ioeventfds = kvm_check_many_ioeventfds();
 
@@ -1616,7 +1660,7 @@
     if (s->fd != -1) {
         close(s->fd);
     }
-    g_free(s->slots);
+    g_free(s->memory_listener.slots);
 
     return ret;
 }
@@ -1752,6 +1796,8 @@
         return EXCP_HLT;
     }
 
+    qemu_mutex_unlock_iothread();
+
     do {
         MemTxAttrs attrs;
 
@@ -1770,11 +1816,9 @@
              */
             qemu_cpu_kick_self();
         }
-        qemu_mutex_unlock_iothread();
 
         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
 
-        qemu_mutex_lock_iothread();
         attrs = kvm_arch_post_run(cpu, run);
 
         if (run_ret < 0) {
@@ -1801,6 +1845,7 @@
         switch (run->exit_reason) {
         case KVM_EXIT_IO:
             DPRINTF("handle_io\n");
+            /* Called outside BQL */
             kvm_handle_io(run->io.port, attrs,
                           (uint8_t *)run + run->io.data_offset,
                           run->io.direction,
@@ -1810,6 +1855,7 @@
             break;
         case KVM_EXIT_MMIO:
             DPRINTF("handle_mmio\n");
+            /* Called outside BQL */
             address_space_rw(&address_space_memory,
                              run->mmio.phys_addr, attrs,
                              run->mmio.data,
@@ -1857,6 +1903,8 @@
         }
     } while (ret == 0);
 
+    qemu_mutex_lock_iothread();
+
     if (ret < 0) {
         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
         vm_stop(RUN_STATE_INTERNAL_ERROR);
diff --git a/kvm-stub.c b/kvm-stub.c
index 7ba90c5..d9ad624 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -24,6 +24,7 @@
 bool kvm_async_interrupts_allowed;
 bool kvm_eventfds_allowed;
 bool kvm_irqfds_allowed;
+bool kvm_resamplefds_allowed;
 bool kvm_msi_via_irqfd_allowed;
 bool kvm_gsi_routing_allowed;
 bool kvm_gsi_direct_mapping;
@@ -137,13 +138,14 @@
     return -ENOSYS;
 }
 
-int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
-                                   EventNotifier *rn, int virq)
+int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                       EventNotifier *rn, int virq)
 {
     return -ENOSYS;
 }
 
-int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
+int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
+                                          int virq)
 {
     return -ENOSYS;
 }
diff --git a/linux-headers/asm-x86/hyperv.h b/linux-headers/asm-x86/hyperv.h
index ce6068d..8fba544 100644
--- a/linux-headers/asm-x86/hyperv.h
+++ b/linux-headers/asm-x86/hyperv.h
@@ -199,6 +199,17 @@
 #define HV_X64_MSR_STIMER3_CONFIG		0x400000B6
 #define HV_X64_MSR_STIMER3_COUNT		0x400000B7
 
+/* Hyper-V guest crash notification MSR's */
+#define HV_X64_MSR_CRASH_P0			0x40000100
+#define HV_X64_MSR_CRASH_P1			0x40000101
+#define HV_X64_MSR_CRASH_P2			0x40000102
+#define HV_X64_MSR_CRASH_P3			0x40000103
+#define HV_X64_MSR_CRASH_P4			0x40000104
+#define HV_X64_MSR_CRASH_CTL			0x40000105
+#define HV_X64_MSR_CRASH_CTL_NOTIFY		(1ULL << 63)
+#define HV_X64_MSR_CRASH_PARAMS		\
+		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+
 #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index fad9e5c..3bac873 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -897,7 +897,7 @@
  *
  * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
  * the irqfd to operate in resampling mode for level triggered interrupt
- * emlation.  See Documentation/virtual/kvm/api.txt.
+ * emulation.  See Documentation/virtual/kvm/api.txt.
  */
 #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
 
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 0508d0b..aa276bc 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -443,6 +445,23 @@
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
+ * The SPAPR TCE DDW info struct provides the information about
+ * the details of Dynamic DMA window capability.
+ *
+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
+ * @max_dynamic_windows_supported tells the maximum number of windows
+ * which the platform can create.
+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
+ * this allows splitting a table into smaller chunks which reduces
+ * the amount of physically contiguous memory required for the table.
+ */
+struct vfio_iommu_spapr_tce_ddw_info {
+	__u64 pgsizes;			/* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 levels;
+};
+
+/*
  * The SPAPR TCE info struct provides the information about the PCI bus
  * address ranges available for DMA, these values are programmed into
  * the hardware so the guest has to know that information.
@@ -452,14 +471,17 @@
  * addresses too so the window works as a filter rather than an offset
  * for IOVA addresses.
  *
- * A flag will need to be added if other page sizes are supported,
- * so as defined here, it is always 4k.
+ * Flags supported:
+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
+ *   (DDW) support is present. @ddw is only supported when DDW is present.
  */
 struct vfio_iommu_spapr_tce_info {
 	__u32 argsz;
-	__u32 flags;			/* reserved for future use */
+	__u32 flags;
+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
 	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
 	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	struct vfio_iommu_spapr_tce_ddw_info ddw;
 };
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
@@ -470,12 +492,23 @@
  * - unfreeze IO/DMA for frozen PE;
  * - read PE state;
  * - reset PE;
- * - configure PE.
+ * - configure PE;
+ * - inject EEH error.
  */
+struct vfio_eeh_pe_err {
+	__u32 type;
+	__u32 func;
+	__u64 addr;
+	__u64 mask;
+};
+
 struct vfio_eeh_pe_op {
 	__u32 argsz;
 	__u32 flags;
 	__u32 op;
+	union {
+		struct vfio_eeh_pe_err err;
+	};
 };
 
 #define VFIO_EEH_PE_DISABLE		0	/* Disable EEH functionality */
@@ -492,9 +525,70 @@
 #define VFIO_EEH_PE_RESET_HOT		6	/* Assert hot reset          */
 #define VFIO_EEH_PE_RESET_FUNDAMENTAL	7	/* Assert fundamental reset  */
 #define VFIO_EEH_PE_CONFIGURE		8	/* PE configuration          */
+#define VFIO_EEH_PE_INJECT_ERR		9	/* Inject EEH error          */
 
 #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
 
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+	__u32	argsz;
+	__u32	flags;
+	__u64	vaddr;				/* Process virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
+ *
+ * Creates an additional TCE table and programs it (sets a new DMA window)
+ * to every IOMMU group in the container. It receives page shift, window
+ * size and number of levels in the TCE table being created.
+ *
+ * It allocates and returns an offset on a PCI bus of the new DMA window.
+ */
+struct vfio_iommu_spapr_tce_create {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u32 page_shift;
+	__u64 window_size;
+	__u32 levels;
+	/* out */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
+ *
+ * Unprograms a TCE table from all groups in the container and destroys it.
+ * It receives a PCI bus offset as a window id.
+ */
+struct vfio_iommu_spapr_tce_remove {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
 /* ***************************************************************** */
 
 #endif /* VFIO_H */
diff --git a/linux-headers/linux/virtio_pci.h b/linux-headers/linux/virtio_pci.h
deleted file mode 100644
index 92624e5..0000000
--- a/linux-headers/linux/virtio_pci.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Virtio PCI driver
- *
- * This module allows virtio devices to be used over a virtual PCI device.
- * This can be used with QEMU based VMMs like KVM or Xen.
- *
- * Copyright IBM Corp. 2007
- *
- * Authors:
- *  Anthony Liguori  <aliguori@us.ibm.com>
- *
- * This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _LINUX_VIRTIO_PCI_H
-#define _LINUX_VIRTIO_PCI_H
-
-#include <linux/types.h>
-
-#ifndef VIRTIO_PCI_NO_LEGACY
-
-/* A 32-bit r/o bitmask of the features supported by the host */
-#define VIRTIO_PCI_HOST_FEATURES	0
-
-/* A 32-bit r/w bitmask of features activated by the guest */
-#define VIRTIO_PCI_GUEST_FEATURES	4
-
-/* A 32-bit r/w PFN for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_PFN		8
-
-/* A 16-bit r/o queue size for the currently selected queue */
-#define VIRTIO_PCI_QUEUE_NUM		12
-
-/* A 16-bit r/w queue selector */
-#define VIRTIO_PCI_QUEUE_SEL		14
-
-/* A 16-bit r/w queue notifier */
-#define VIRTIO_PCI_QUEUE_NOTIFY		16
-
-/* An 8-bit device status register.  */
-#define VIRTIO_PCI_STATUS		18
-
-/* An 8-bit r/o interrupt status register.  Reading the value will return the
- * current contents of the ISR and will also clear it.  This is effectively
- * a read-and-acknowledge. */
-#define VIRTIO_PCI_ISR			19
-
-/* MSI-X registers: only enabled if MSI-X is enabled. */
-/* A 16-bit vector for configuration changes. */
-#define VIRTIO_MSI_CONFIG_VECTOR        20
-/* A 16-bit vector for selected queue notifications. */
-#define VIRTIO_MSI_QUEUE_VECTOR         22
-
-/* The remaining space is defined by each driver as the per-driver
- * configuration space */
-#define VIRTIO_PCI_CONFIG_OFF(msix_enabled)	((msix_enabled) ? 24 : 20)
-/* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */
-#define VIRTIO_PCI_CONFIG(dev)	VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled)
-
-/* Virtio ABI version, this must match exactly */
-#define VIRTIO_PCI_ABI_VERSION		0
-
-/* How many bits to shift physical queue address written to QUEUE_PFN.
- * 12 is historical, and due to x86 page size. */
-#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
-
-/* The alignment to use between consumer and producer parts of vring.
- * x86 pagesize again. */
-#define VIRTIO_PCI_VRING_ALIGN		4096
-
-#endif /* VIRTIO_PCI_NO_LEGACY */
-
-/* The bit of the ISR which indicates a device configuration change. */
-#define VIRTIO_PCI_ISR_CONFIG		0x2
-/* Vector value used to disable MSI for queue */
-#define VIRTIO_MSI_NO_VECTOR            0xffff
-
-#ifndef VIRTIO_PCI_NO_MODERN
-
-/* IDs for different capabilities.  Must all exist. */
-
-/* Common configuration */
-#define VIRTIO_PCI_CAP_COMMON_CFG	1
-/* Notifications */
-#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
-/* ISR access */
-#define VIRTIO_PCI_CAP_ISR_CFG		3
-/* Device specific confiuration */
-#define VIRTIO_PCI_CAP_DEVICE_CFG	4
-
-/* This is the PCI capability header: */
-struct virtio_pci_cap {
-	__u8 cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
-	__u8 cap_next;		/* Generic PCI field: next ptr. */
-	__u8 cap_len;		/* Generic PCI field: capability length */
-	__u8 cfg_type;		/* Identifies the structure. */
-	__u8 bar;		/* Where to find it. */
-	__u8 padding[3];	/* Pad to full dword. */
-	__le32 offset;		/* Offset within bar. */
-	__le32 length;		/* Length of the structure, in bytes. */
-};
-
-struct virtio_pci_notify_cap {
-	struct virtio_pci_cap cap;
-	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
-};
-
-/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
-struct virtio_pci_common_cfg {
-	/* About the whole device. */
-	__le32 device_feature_select;	/* read-write */
-	__le32 device_feature;		/* read-only */
-	__le32 guest_feature_select;	/* read-write */
-	__le32 guest_feature;		/* read-write */
-	__le16 msix_config;		/* read-write */
-	__le16 num_queues;		/* read-only */
-	__u8 device_status;		/* read-write */
-	__u8 config_generation;		/* read-only */
-
-	/* About a specific virtqueue. */
-	__le16 queue_select;		/* read-write */
-	__le16 queue_size;		/* read-write, power of 2. */
-	__le16 queue_msix_vector;	/* read-write */
-	__le16 queue_enable;		/* read-write */
-	__le16 queue_notify_off;	/* read-only */
-	__le32 queue_desc_lo;		/* read-write */
-	__le32 queue_desc_hi;		/* read-write */
-	__le32 queue_avail_lo;		/* read-write */
-	__le32 queue_avail_hi;		/* read-write */
-	__le32 queue_used_lo;		/* read-write */
-	__le32 queue_used_hi;		/* read-write */
-};
-
-/* Macro versions of offsets for the Old Timers! */
-#define VIRTIO_PCI_CAP_VNDR		0
-#define VIRTIO_PCI_CAP_NEXT		1
-#define VIRTIO_PCI_CAP_LEN		2
-#define VIRTIO_PCI_CAP_CFG_TYPE		3
-#define VIRTIO_PCI_CAP_BAR		4
-#define VIRTIO_PCI_CAP_OFFSET		8
-#define VIRTIO_PCI_CAP_LENGTH		12
-
-#define VIRTIO_PCI_NOTIFY_CAP_MULT	16
-
-
-#define VIRTIO_PCI_COMMON_DFSELECT	0
-#define VIRTIO_PCI_COMMON_DF		4
-#define VIRTIO_PCI_COMMON_GFSELECT	8
-#define VIRTIO_PCI_COMMON_GF		12
-#define VIRTIO_PCI_COMMON_MSIX		16
-#define VIRTIO_PCI_COMMON_NUMQ		18
-#define VIRTIO_PCI_COMMON_STATUS	20
-#define VIRTIO_PCI_COMMON_CFGGENERATION	21
-#define VIRTIO_PCI_COMMON_Q_SELECT	22
-#define VIRTIO_PCI_COMMON_Q_SIZE	24
-#define VIRTIO_PCI_COMMON_Q_MSIX	26
-#define VIRTIO_PCI_COMMON_Q_ENABLE	28
-#define VIRTIO_PCI_COMMON_Q_NOFF	30
-#define VIRTIO_PCI_COMMON_Q_DESCLO	32
-#define VIRTIO_PCI_COMMON_Q_DESCHI	36
-#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
-#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
-#define VIRTIO_PCI_COMMON_Q_USEDLO	48
-#define VIRTIO_PCI_COMMON_Q_USEDHI	52
-
-#endif /* VIRTIO_PCI_NO_MODERN */
-
-#endif
diff --git a/memory.c b/memory.c
index 3ac0bd2..5a0cc66 100644
--- a/memory.c
+++ b/memory.c
@@ -396,9 +396,6 @@
 {
     uint64_t tmp;
 
-    if (mr->flush_coalesced_mmio) {
-        qemu_flush_coalesced_mmio_buffer();
-    }
     tmp = mr->ops->read(mr->opaque, addr, size);
     trace_memory_region_ops_read(mr, addr, tmp, size);
     *value |= (tmp & mask) << shift;
@@ -416,9 +413,6 @@
     uint64_t tmp = 0;
     MemTxResult r;
 
-    if (mr->flush_coalesced_mmio) {
-        qemu_flush_coalesced_mmio_buffer();
-    }
     r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs);
     trace_memory_region_ops_read(mr, addr, tmp, size);
     *value |= (tmp & mask) << shift;
@@ -451,9 +445,6 @@
 {
     uint64_t tmp;
 
-    if (mr->flush_coalesced_mmio) {
-        qemu_flush_coalesced_mmio_buffer();
-    }
     tmp = (*value >> shift) & mask;
     trace_memory_region_ops_write(mr, addr, tmp, size);
     mr->ops->write(mr->opaque, addr, tmp, size);
@@ -470,9 +461,6 @@
 {
     uint64_t tmp;
 
-    if (mr->flush_coalesced_mmio) {
-        qemu_flush_coalesced_mmio_buffer();
-    }
     tmp = (*value >> shift) & mask;
     trace_memory_region_ops_write(mr, addr, tmp, size);
     return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs);
@@ -1012,6 +1000,7 @@
     mr->ram_addr = RAM_ADDR_INVALID;
     mr->enabled = true;
     mr->romd_mode = true;
+    mr->global_locking = true;
     mr->destructor = memory_region_destructor_none;
     QTAILQ_INIT(&mr->subregions);
     QTAILQ_INIT(&mr->coalesced);
@@ -1646,6 +1635,16 @@
     }
 }
 
+void memory_region_set_global_locking(MemoryRegion *mr)
+{
+    mr->global_locking = true;
+}
+
+void memory_region_clear_global_locking(MemoryRegion *mr)
+{
+    mr->global_locking = false;
+}
+
 void memory_region_add_eventfd(MemoryRegion *mr,
                                hwaddr addr,
                                unsigned size,
diff --git a/memory_mapping.c b/memory_mapping.c
index 7b69801..36d6b26 100644
--- a/memory_mapping.c
+++ b/memory_mapping.c
@@ -13,8 +13,8 @@
 
 #include <glib.h>
 
+#include "qemu-common.h"
 #include "cpu.h"
-#include "exec/cpu-all.h"
 #include "sysemu/memory_mapping.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
diff --git a/numa.c b/numa.c
index 91fc6c1..3c80059 100644
--- a/numa.c
+++ b/numa.c
@@ -52,6 +52,92 @@
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
+void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
+{
+    struct numa_addr_range *range = g_malloc0(sizeof(*range));
+
+    /*
+     * Memory-less nodes can come here with 0 size in which case,
+     * there is nothing to do.
+     */
+    if (!size) {
+        return;
+    }
+
+    range->mem_start = addr;
+    range->mem_end = addr + size - 1;
+    QLIST_INSERT_HEAD(&numa_info[node].addr, range, entry);
+}
+
+void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
+{
+    struct numa_addr_range *range, *next;
+
+    QLIST_FOREACH_SAFE(range, &numa_info[node].addr, entry, next) {
+        if (addr == range->mem_start && (addr + size - 1) == range->mem_end) {
+            QLIST_REMOVE(range, entry);
+            g_free(range);
+            return;
+        }
+    }
+}
+
+static void numa_set_mem_ranges(void)
+{
+    int i;
+    ram_addr_t mem_start = 0;
+
+    /*
+     * Deduce start address of each node and use it to store
+     * the address range info in numa_info address range list
+     */
+    for (i = 0; i < nb_numa_nodes; i++) {
+        numa_set_mem_node_id(mem_start, numa_info[i].node_mem, i);
+        mem_start += numa_info[i].node_mem;
+    }
+}
+
+/*
+ * Check if @addr falls under NUMA @node.
+ */
+static bool numa_addr_belongs_to_node(ram_addr_t addr, uint32_t node)
+{
+    struct numa_addr_range *range;
+
+    QLIST_FOREACH(range, &numa_info[node].addr, entry) {
+        if (addr >= range->mem_start && addr <= range->mem_end) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Given an address, return the index of the NUMA node to which the
+ * address belongs to.
+ */
+uint32_t numa_get_node(ram_addr_t addr, Error **errp)
+{
+    uint32_t i;
+
+    /* For non NUMA configurations, check if the addr falls under node 0 */
+    if (!nb_numa_nodes) {
+        if (numa_addr_belongs_to_node(addr, 0)) {
+            return 0;
+        }
+    }
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (numa_addr_belongs_to_node(addr, i)) {
+            return i;
+        }
+    }
+
+    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
+                "NUMA node", addr);
+    return -1;
+}
+
 static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 {
     uint16_t nodenr;
@@ -274,6 +360,12 @@
         }
 
         for (i = 0; i < nb_numa_nodes; i++) {
+            QLIST_INIT(&numa_info[i].addr);
+        }
+
+        numa_set_mem_ranges();
+
+        for (i = 0; i < nb_numa_nodes; i++) {
             if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
@@ -297,6 +389,8 @@
         }
 
         validate_numa_cpus();
+    } else {
+        numa_set_mem_node_id(0, ram_size, 0);
     }
 }
 
diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c
index 5d8aca1..dda6f6b 100644
--- a/stubs/iothread-lock.c
+++ b/stubs/iothread-lock.c
@@ -1,6 +1,11 @@
 #include "qemu-common.h"
 #include "qemu/main-loop.h"
 
+bool qemu_mutex_iothread_locked(void)
+{
+    return true;
+}
+
 void qemu_mutex_lock_iothread(void)
 {
 }
diff --git a/target-arm/helper.c b/target-arm/helper.c
index aa34159..b87afe7 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -2441,7 +2441,7 @@
     { .name = "TLBI_ALLE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 4,
       .access = PL2_W, .type = ARM_CP_NO_RAW,
-      .writefn = tlbiall_write },
+      .writefn = tlbiall_is_write },
     { .name = "TLBI_VMALLE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
       .access = PL1_W, .type = ARM_CP_NO_RAW,
diff --git a/target-arm/helper.h b/target-arm/helper.h
index fc885de..827b33d 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -50,6 +50,7 @@
 DEF_HELPER_4(exception_with_syndrome, void, env, i32, i32, i32)
 DEF_HELPER_1(wfi, void, env)
 DEF_HELPER_1(wfe, void, env)
+DEF_HELPER_1(yield, void, env)
 DEF_HELPER_1(pre_hvc, void, env)
 DEF_HELPER_2(pre_smc, void, env, i32)
 
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 7fa32c4..663c05d 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -323,13 +323,25 @@
 
 void HELPER(wfe)(CPUARMState *env)
 {
-    CPUState *cs = CPU(arm_env_get_cpu(env));
-
-    /* Don't actually halt the CPU, just yield back to top
+    /* This is a hint instruction that is semantically different
+     * from YIELD even though we currently implement it identically.
+     * Don't actually halt the CPU, just yield back to top
      * level loop. This is not going into a "low power state"
      * (ie halting until some event occurs), so we never take
      * a configurable trap to a different exception level.
      */
+    HELPER(yield)(env);
+}
+
+void HELPER(yield)(CPUARMState *env)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
+
+    /* This is a non-trappable hint instruction that generally indicates
+     * that the guest is currently busy-looping. Yield control back to the
+     * top level loop so that a more deserving VCPU has a chance to run.
+     */
     cs->exception_index = EXCP_YIELD;
     cpu_loop_exit(cs);
 }
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index e077f2d..689f2be 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1199,6 +1199,8 @@
         s->is_jmp = DISAS_WFI;
         return;
     case 1: /* YIELD */
+        s->is_jmp = DISAS_YIELD;
+        return;
     case 2: /* WFE */
         s->is_jmp = DISAS_WFE;
         return;
@@ -11107,6 +11109,10 @@
             gen_a64_set_pc_im(dc->pc);
             gen_helper_wfe(cpu_env);
             break;
+        case DISAS_YIELD:
+            gen_a64_set_pc_im(dc->pc);
+            gen_helper_yield(cpu_env);
+            break;
         case DISAS_WFI:
             /* This is a special case because we don't want to just halt the CPU
              * if trying to debug across a WFI.
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 971b6db..69ac18c 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4080,6 +4080,10 @@
 static void gen_nop_hint(DisasContext *s, int val)
 {
     switch (val) {
+    case 1: /* yield */
+        gen_set_pc_im(s, s->pc);
+        s->is_jmp = DISAS_YIELD;
+        break;
     case 3: /* wfi */
         gen_set_pc_im(s, s->pc);
         s->is_jmp = DISAS_WFI;
@@ -11459,6 +11463,9 @@
         case DISAS_WFE:
             gen_helper_wfe(cpu_env);
             break;
+        case DISAS_YIELD:
+            gen_helper_yield(cpu_env);
+            break;
         case DISAS_SWI:
             gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb),
                           default_exception_el(dc));
diff --git a/target-arm/translate.h b/target-arm/translate.h
index bcdcf11..9ab978f 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -103,6 +103,7 @@
 #define DISAS_WFE 7
 #define DISAS_HVC 8
 #define DISAS_SMC 9
+#define DISAS_YIELD 10
 
 #ifdef TARGET_AARCH64
 void a64_translate_init(void);
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 603aaf0..ac39291 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -314,6 +314,7 @@
 
 #define MSR_P6_PERFCTR0                 0xc1
 
+#define MSR_IA32_SMBASE                 0x9e
 #define MSR_MTRRcap                     0xfe
 #define MSR_MTRRcap_VCNT                8
 #define MSR_MTRRcap_FIXRANGE_SUPPORT    (1 << 8)
diff --git a/target-i386/kvm-stub.c b/target-i386/kvm-stub.c
index 2b9e801..6fefd65 100644
--- a/target-i386/kvm-stub.c
+++ b/target-i386/kvm-stub.c
@@ -18,6 +18,11 @@
 }
 
 #ifndef __OPTIMIZE__
+bool kvm_has_smm(void)
+{
+    return 1;
+}
+
 /* This function is only called inside conditionals which we
  * rely on the compiler to optimize out when CONFIG_KVM is not
  * defined.
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index daced5c..9038bf7 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -22,7 +22,7 @@
 
 #include "qemu-common.h"
 #include "sysemu/sysemu.h"
-#include "sysemu/kvm.h"
+#include "sysemu/kvm_int.h"
 #include "kvm_i386.h"
 #include "cpu.h"
 #include "exec/gdbstub.h"
@@ -73,6 +73,7 @@
 static bool has_msr_async_pf_en;
 static bool has_msr_pv_eoi_en;
 static bool has_msr_misc_enable;
+static bool has_msr_smbase;
 static bool has_msr_bndcfgs;
 static bool has_msr_kvm_steal_time;
 static int lm_capable_kernel;
@@ -85,6 +86,11 @@
 static bool has_msr_architectural_pmu;
 static uint32_t num_architectural_pmu_counters;
 
+bool kvm_has_smm(void)
+{
+    return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
@@ -819,6 +825,10 @@
                     has_msr_tsc_deadline = true;
                     continue;
                 }
+                if (kvm_msr_list->indices[i] == MSR_IA32_SMBASE) {
+                    has_msr_smbase = true;
+                    continue;
+                }
                 if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                     has_msr_misc_enable = true;
                     continue;
@@ -840,6 +850,40 @@
     return ret;
 }
 
+static Notifier smram_machine_done;
+static KVMMemoryListener smram_listener;
+static AddressSpace smram_address_space;
+static MemoryRegion smram_as_root;
+static MemoryRegion smram_as_mem;
+
+static void register_smram_listener(Notifier *n, void *unused)
+{
+    MemoryRegion *smram =
+        (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
+
+    /* Outer container... */
+    memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
+    memory_region_set_enabled(&smram_as_root, true);
+
+    /* ... with two regions inside: normal system memory with low
+     * priority, and...
+     */
+    memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
+                             get_system_memory(), 0, ~0ull);
+    memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
+    memory_region_set_enabled(&smram_as_mem, true);
+
+    if (smram) {
+        /* ... SMRAM with higher priority */
+        memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
+        memory_region_set_enabled(smram, true);
+    }
+
+    address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
+    kvm_memory_listener_register(kvm_state, &smram_listener,
+                                 &smram_address_space, 1);
+}
+
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
     uint64_t identity_base = 0xfffbc000;
@@ -898,6 +942,11 @@
             return ret;
         }
     }
+
+    if (kvm_check_extension(s, KVM_CAP_X86_SMM)) {
+        smram_machine_done.notify = register_smram_listener;
+        qemu_add_machine_init_done_notifier(&smram_machine_done);
+    }
     return 0;
 }
 
@@ -1245,6 +1294,9 @@
         kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                           env->msr_ia32_misc_enable);
     }
+    if (has_msr_smbase) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_SMBASE, env->smbase);
+    }
     if (has_msr_bndcfgs) {
         kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
     }
@@ -1606,6 +1658,9 @@
     if (has_msr_misc_enable) {
         msrs[n++].index = MSR_IA32_MISC_ENABLE;
     }
+    if (has_msr_smbase) {
+        msrs[n++].index = MSR_IA32_SMBASE;
+    }
     if (has_msr_feature_control) {
         msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
     }
@@ -1760,6 +1815,9 @@
         case MSR_IA32_MISC_ENABLE:
             env->msr_ia32_misc_enable = msrs[i].data;
             break;
+        case MSR_IA32_SMBASE:
+            env->smbase = msrs[i].data;
+            break;
         case MSR_IA32_FEATURE_CONTROL:
             env->msr_ia32_feature_control = msrs[i].data;
             break;
@@ -1923,6 +1981,7 @@
 
 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
 {
+    CPUState *cs = CPU(cpu);
     CPUX86State *env = &cpu->env;
     struct kvm_vcpu_events events = {};
 
@@ -1947,6 +2006,24 @@
 
     events.sipi_vector = env->sipi_vector;
 
+    if (has_msr_smbase) {
+        events.smi.smm = !!(env->hflags & HF_SMM_MASK);
+        events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
+        if (kvm_irqchip_in_kernel()) {
+            /* As soon as these are moved to the kernel, remove them
+             * from cs->interrupt_request.
+             */
+            events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
+            events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
+            cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
+        } else {
+            /* Keep these in cs->interrupt_request.  */
+            events.smi.pending = 0;
+            events.smi.latched_init = 0;
+        }
+        events.flags |= KVM_VCPUEVENT_VALID_SMM;
+    }
+
     events.flags = 0;
     if (level >= KVM_PUT_RESET_STATE) {
         events.flags |=
@@ -1966,6 +2043,7 @@
         return 0;
     }
 
+    memset(&events, 0, sizeof(events));
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
     if (ret < 0) {
        return ret;
@@ -1987,6 +2065,29 @@
         env->hflags2 &= ~HF2_NMI_MASK;
     }
 
+    if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
+        if (events.smi.smm) {
+            env->hflags |= HF_SMM_MASK;
+        } else {
+            env->hflags &= ~HF_SMM_MASK;
+        }
+        if (events.smi.pending) {
+            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
+        } else {
+            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
+        }
+        if (events.smi.smm_inside_nmi) {
+            env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
+        } else {
+            env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
+        }
+        if (events.smi.latched_init) {
+            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
+        } else {
+            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
+        }
+    }
+
     env->sipi_vector = events.sipi_vector;
 
     return 0;
@@ -2190,14 +2291,33 @@
     int ret;
 
     /* Inject NMI */
-    if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
-        cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
-        DPRINTF("injected NMI\n");
-        ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
-        if (ret < 0) {
-            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
-                    strerror(-ret));
+    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
+        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
+            qemu_mutex_lock_iothread();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
+            qemu_mutex_unlock_iothread();
+            DPRINTF("injected NMI\n");
+            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
+            if (ret < 0) {
+                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
+                        strerror(-ret));
+            }
         }
+        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
+            qemu_mutex_lock_iothread();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
+            qemu_mutex_unlock_iothread();
+            DPRINTF("injected SMI\n");
+            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
+            if (ret < 0) {
+                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
+                        strerror(-ret));
+            }
+        }
+    }
+
+    if (!kvm_irqchip_in_kernel()) {
+        qemu_mutex_lock_iothread();
     }
 
     /* Force the VCPU out of its inner loop to process any INIT requests
@@ -2205,7 +2325,13 @@
      * pending TPR access reports.
      */
     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
-        cpu->exit_request = 1;
+        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
+            !(env->hflags & HF_SMM_MASK)) {
+            cpu->exit_request = 1;
+        }
+        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
+            cpu->exit_request = 1;
+        }
     }
 
     if (!kvm_irqchip_in_kernel()) {
@@ -2243,6 +2369,8 @@
 
         DPRINTF("setting tpr\n");
         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
+
+        qemu_mutex_unlock_iothread();
     }
 }
 
@@ -2251,13 +2379,27 @@
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
 
+    if (run->flags & KVM_RUN_X86_SMM) {
+        env->hflags |= HF_SMM_MASK;
+    } else {
+        env->hflags &= HF_SMM_MASK;
+    }
     if (run->if_flag) {
         env->eflags |= IF_MASK;
     } else {
         env->eflags &= ~IF_MASK;
     }
+
+    /* We need to protect the apic state against concurrent accesses from
+     * different threads in case the userspace irqchip is used. */
+    if (!kvm_irqchip_in_kernel()) {
+        qemu_mutex_lock_iothread();
+    }
     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
+    if (!kvm_irqchip_in_kernel()) {
+        qemu_mutex_unlock_iothread();
+    }
     return cpu_get_mem_attrs(env);
 }
 
@@ -2289,7 +2431,8 @@
         }
     }
 
-    if (cs->interrupt_request & CPU_INTERRUPT_INIT) {
+    if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
+        !(env->hflags & HF_SMM_MASK)) {
         kvm_cpu_synchronize_state(cs);
         do_cpu_init(cpu);
     }
@@ -2550,13 +2693,17 @@
     switch (run->exit_reason) {
     case KVM_EXIT_HLT:
         DPRINTF("handle_hlt\n");
+        qemu_mutex_lock_iothread();
         ret = kvm_handle_halt(cpu);
+        qemu_mutex_unlock_iothread();
         break;
     case KVM_EXIT_SET_TPR:
         ret = 0;
         break;
     case KVM_EXIT_TPR_ACCESS:
+        qemu_mutex_lock_iothread();
         ret = kvm_handle_tpr_access(cpu);
+        qemu_mutex_unlock_iothread();
         break;
     case KVM_EXIT_FAIL_ENTRY:
         code = run->fail_entry.hardware_entry_failure_reason;
@@ -2582,7 +2729,9 @@
         break;
     case KVM_EXIT_DEBUG:
         DPRINTF("kvm_exit_debug\n");
+        qemu_mutex_lock_iothread();
         ret = kvm_handle_debug(cpu, &run->debug.arch);
+        qemu_mutex_unlock_iothread();
         break;
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
index cac30fd..e557e94 100644
--- a/target-i386/kvm_i386.h
+++ b/target-i386/kvm_i386.h
@@ -14,6 +14,7 @@
 #include "sysemu/kvm.h"
 
 bool kvm_allows_irq0_override(void);
+bool kvm_has_smm(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);
 
diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 948619f..7d2293d 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -99,6 +99,8 @@
     int r;
     struct kvm_mips_interrupt intr;
 
+    qemu_mutex_lock_iothread();
+
     if ((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
             cpu_mips_io_interrupts_pending(cpu)) {
         intr.cpu = -1;
@@ -109,6 +111,8 @@
                          __func__, cs->cpu_index, intr.irq);
         }
     }
+
+    qemu_mutex_unlock_iothread();
 }
 
 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index afb4696..ddf469f 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -1242,6 +1242,8 @@
     int r;
     unsigned irq;
 
+    qemu_mutex_lock_iothread();
+
     /* PowerPC QEMU tracks the various core input pins (interrupt, critical
      * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
     if (!cap_interrupt_level &&
@@ -1269,6 +1271,8 @@
     /* We don't know if there are more interrupts pending after this. However,
      * the guest will return to userspace in the course of handling this one
      * anyways, so we will get a chance to deliver the rest. */
+
+    qemu_mutex_unlock_iothread();
 }
 
 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
@@ -1570,6 +1574,8 @@
     CPUPPCState *env = &cpu->env;
     int ret;
 
+    qemu_mutex_lock_iothread();
+
     switch (run->exit_reason) {
     case KVM_EXIT_DCR:
         if (run->dcr.is_write) {
@@ -1620,6 +1626,7 @@
         break;
     }
 
+    qemu_mutex_unlock_iothread();
     return ret;
 }
 
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 135111a..ae3a0af 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -2007,6 +2007,8 @@
     S390CPU *cpu = S390_CPU(cs);
     int ret = 0;
 
+    qemu_mutex_lock_iothread();
+
     switch (run->exit_reason) {
         case KVM_EXIT_S390_SIEIC:
             ret = handle_intercept(cpu);
@@ -2027,6 +2029,7 @@
             fprintf(stderr, "Unknown KVM exit: %d\n", run->exit_reason);
             break;
     }
+    qemu_mutex_unlock_iothread();
 
     if (ret == 0) {
         ret = EXCP_INTERRUPT;
diff --git a/target-s390x/mmu_helper.c b/target-s390x/mmu_helper.c
index 815ff42..1ea6d81 100644
--- a/target-s390x/mmu_helper.c
+++ b/target-s390x/mmu_helper.c
@@ -17,8 +17,8 @@
 
 #include "qemu/error-report.h"
 #include "exec/address-spaces.h"
-#include "sysemu/kvm.h"
 #include "cpu.h"
+#include "sysemu/kvm.h"
 
 /* #define DEBUG_S390 */
 /* #define DEBUG_S390_PTE */
diff --git a/target-xtensa/core-dc232b.c b/target-xtensa/core-dc232b.c
index a3b914b..06826c0 100644
--- a/target-xtensa/core-dc232b.c
+++ b/target-xtensa/core-dc232b.c
@@ -33,7 +33,7 @@
 #include "core-dc232b/core-isa.h"
 #include "overlay_tool.h"
 
-static const XtensaConfig dc232b __attribute__((unused)) = {
+static XtensaConfig dc232b __attribute__((unused)) = {
     .name = "dc232b",
     .gdb_regmap = {
         .num_regs = 120,
diff --git a/target-xtensa/core-dc233c.c b/target-xtensa/core-dc233c.c
index ac745d1..8daf7d9 100644
--- a/target-xtensa/core-dc233c.c
+++ b/target-xtensa/core-dc233c.c
@@ -34,7 +34,7 @@
 #include "core-dc233c/core-isa.h"
 #include "overlay_tool.h"
 
-static const XtensaConfig dc233c __attribute__((unused)) = {
+static XtensaConfig dc233c __attribute__((unused)) = {
     .name = "dc233c",
     .gdb_regmap = {
         .num_regs = 121,
diff --git a/target-xtensa/core-fsf.c b/target-xtensa/core-fsf.c
index cfcc840..f6ea6b9 100644
--- a/target-xtensa/core-fsf.c
+++ b/target-xtensa/core-fsf.c
@@ -33,9 +33,14 @@
 #include "core-fsf/core-isa.h"
 #include "overlay_tool.h"
 
-static const XtensaConfig fsf __attribute__((unused)) = {
+static XtensaConfig fsf __attribute__((unused)) = {
     .name = "fsf",
+    .gdb_regmap = {
     /* GDB for this core is not supported currently */
+        .reg = {
+            XTREG_END
+        },
+    },
     .clock_freq_khz = 10000,
     DEFAULT_SECTIONS
 };
diff --git a/target-xtensa/cpu.h b/target-xtensa/cpu.h
index dfd0d1c..b89c602 100644
--- a/target-xtensa/cpu.h
+++ b/target-xtensa/cpu.h
@@ -287,6 +287,7 @@
     int targno;
     int type;
     int group;
+    unsigned size;
 } XtensaGdbReg;
 
 typedef struct XtensaGdbRegmap {
@@ -336,6 +337,18 @@
     struct XtensaConfigList *next;
 } XtensaConfigList;
 
+#ifdef HOST_WORDS_BIGENDIAN
+enum {
+    FP_F32_HIGH,
+    FP_F32_LOW,
+};
+#else
+enum {
+    FP_F32_LOW,
+    FP_F32_HIGH,
+};
+#endif
+
 typedef struct CPUXtensaState {
     const XtensaConfig *config;
     uint32_t regs[16];
@@ -343,7 +356,10 @@
     uint32_t sregs[256];
     uint32_t uregs[256];
     uint32_t phys_regs[MAX_NAREG];
-    float32 fregs[16];
+    union {
+        float32 f32[2];
+        float64 f64;
+    } fregs[16];
     float_status fp_status;
 
     xtensa_tlb_entry itlb[7][MAX_TLB_WAY_SIZE];
@@ -384,6 +400,7 @@
 void xtensa_translate_init(void);
 void xtensa_breakpoint_handler(CPUState *cs);
 int cpu_xtensa_exec(CPUXtensaState *s);
+void xtensa_finalize_config(XtensaConfig *config);
 void xtensa_register_core(XtensaConfigList *node);
 void check_interrupts(CPUXtensaState *s);
 void xtensa_irq_init(CPUXtensaState *env);
diff --git a/target-xtensa/gdbstub.c b/target-xtensa/gdbstub.c
index 9e13b20..bc2e1b5 100644
--- a/target-xtensa/gdbstub.c
+++ b/target-xtensa/gdbstub.c
@@ -26,6 +26,7 @@
     XtensaCPU *cpu = XTENSA_CPU(cs);
     CPUXtensaState *env = &cpu->env;
     const XtensaGdbReg *reg = env->config->gdb_regmap.reg + n;
+    unsigned i;
 
     if (n < 0 || n >= env->config->gdb_regmap.num_regs) {
         return 0;
@@ -47,8 +48,16 @@
         return gdb_get_reg32(mem_buf, env->uregs[reg->targno & 0xff]);
 
     case 4: /*f*/
-        return gdb_get_reg32(mem_buf, float32_val(env->fregs[reg->targno
-                                                             & 0x0f]));
+        i = reg->targno & 0x0f;
+        switch (reg->size) {
+        case 4:
+            return gdb_get_reg32(mem_buf,
+                                 float32_val(env->fregs[i].f32[FP_F32_LOW]));
+        case 8:
+            return gdb_get_reg64(mem_buf, float64_val(env->fregs[i].f64));
+        default:
+            return 0;
+        }
 
     case 8: /*a*/
         return gdb_get_reg32(mem_buf, env->regs[reg->targno & 0x0f]);
@@ -92,8 +101,16 @@
         break;
 
     case 4: /*f*/
-        env->fregs[reg->targno & 0x0f] = make_float32(tmp);
-        break;
+        switch (reg->size) {
+        case 4:
+            env->fregs[reg->targno & 0x0f].f32[FP_F32_LOW] = make_float32(tmp);
+            return 4;
+        case 8:
+            env->fregs[reg->targno & 0x0f].f64 = make_float64(tmp);
+            return 8;
+        default:
+            return 0;
+        }
 
     case 8: /*a*/
         env->regs[reg->targno & 0x0f] = tmp;
diff --git a/target-xtensa/helper.c b/target-xtensa/helper.c
index d84d259..76be50d 100644
--- a/target-xtensa/helper.c
+++ b/target-xtensa/helper.c
@@ -51,6 +51,20 @@
     cc->gdb_num_core_regs = config->gdb_regmap.num_regs;
 }
 
+void xtensa_finalize_config(XtensaConfig *config)
+{
+    unsigned i, n = 0;
+
+    if (config->gdb_regmap.num_regs) {
+        return;
+    }
+
+    for (i = 0; config->gdb_regmap.reg[i].targno >= 0; ++i) {
+        n += (config->gdb_regmap.reg[i].type != 6);
+    }
+    config->gdb_regmap.num_regs = n;
+}
+
 void xtensa_register_core(XtensaConfigList *node)
 {
     TypeInfo type = {
diff --git a/target-xtensa/import_core.sh b/target-xtensa/import_core.sh
index 73791ec..351bee4 100755
--- a/target-xtensa/import_core.sh
+++ b/target-xtensa/import_core.sh
@@ -22,8 +22,7 @@
 tar -xf "$OVERLAY" -C "$TARGET" --strip-components=1 \
     --xform='s/core/core-isa/' config/core.h
 tar -xf "$OVERLAY" -O gdb/xtensa-config.c | \
-    sed -n '1,/*\//p;/pc/,/a15/p' > "$TARGET"/gdb-config.c
-NUM_REGS=$(grep XTREG "$TARGET"/gdb-config.c | wc -l)
+    sed -n '1,/*\//p;/XTREG/,/XTREG_END/p' > "$TARGET"/gdb-config.c
 
 cat <<EOF > "${TARGET}.c"
 #include "cpu.h"
@@ -34,10 +33,9 @@
 #include "core-$NAME/core-isa.h"
 #include "overlay_tool.h"
 
-static const XtensaConfig $NAME __attribute__((unused)) = {
+static XtensaConfig $NAME __attribute__((unused)) = {
     .name = "$NAME",
     .gdb_regmap = {
-        .num_regs = $NUM_REGS,
         .reg = {
 #include "core-$NAME/gdb-config.c"
         }
diff --git a/target-xtensa/overlay_tool.h b/target-xtensa/overlay_tool.h
index 6105d4c..eda03aa 100644
--- a/target-xtensa/overlay_tool.h
+++ b/target-xtensa/overlay_tool.h
@@ -27,7 +27,8 @@
 
 #define XTREG(idx, ofs, bi, sz, al, no, flags, cp, typ, grp, name, \
         a1, a2, a3, a4, a5, a6) \
-    { .targno = (no), .type = (typ), .group = (grp) },
+    { .targno = (no), .type = (typ), .group = (grp), .size = (sz) },
+#define XTREG_END { .targno = -1 },
 
 #ifndef XCHAL_HAVE_DIV32
 #define XCHAL_HAVE_DIV32 0
@@ -316,6 +317,7 @@
         static XtensaConfigList node = { \
             .config = &core, \
         }; \
+        xtensa_finalize_config(&core); \
         xtensa_register_core(&node); \
     }
 #else
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 86e4849..f2118c2 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -228,7 +228,7 @@
 
     for (i = 0; i < 16; i++) {
         cpu_FR[i] = tcg_global_mem_new_i32(TCG_AREG0,
-                offsetof(CPUXtensaState, fregs[i]),
+                offsetof(CPUXtensaState, fregs[i].f32[FP_F32_LOW]),
                 fregnames[i]);
     }
 
@@ -3206,8 +3206,9 @@
 
         for (i = 0; i < 16; ++i) {
             cpu_fprintf(f, "F%02d=%08x (%+10.8e)%c", i,
-                    float32_val(env->fregs[i]),
-                    *(float *)&env->fregs[i], (i % 2) == 1 ? '\n' : ' ');
+                    float32_val(env->fregs[i].f32[FP_F32_LOW]),
+                    *(float *)(env->fregs[i].f32 + FP_F32_LOW),
+                    (i % 2) == 1 ? '\n' : ' ');
         }
     }
 }
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 41e4869..231a781 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -29,6 +29,8 @@
 #include "qemu/bitops.h"
 #include "tcg-target.h"
 
+#define CPU_TEMP_BUF_NLONGS 128
+
 /* Default target word size to pointer size.  */
 #ifndef TCG_TARGET_REG_BITS
 # if UINTPTR_MAX == UINT32_MAX
diff --git a/tests/ahci-test.c b/tests/ahci-test.c
index ae9415d..87d7691 100644
--- a/tests/ahci-test.c
+++ b/tests/ahci-test.c
@@ -228,6 +228,8 @@
 {
     AHCIQState *ahci;
     va_list ap;
+    uint16_t buff[256];
+    uint8_t port;
 
     if (cli) {
         va_start(ap, cli);
@@ -239,6 +241,10 @@
 
     ahci_pci_enable(ahci);
     ahci_hba_enable(ahci);
+    /* Initialize test device */
+    port = ahci_port_select(ahci);
+    ahci_port_clear(ahci, port);
+    ahci_io(ahci, port, CMD_IDENTIFY, &buff, sizeof(buff), 0);
 
     return ahci;
 }
@@ -890,21 +896,23 @@
     g_free(rx);
 }
 
-static void ahci_test_nondata(AHCIQState *ahci, uint8_t ide_cmd)
+static uint8_t ahci_test_nondata(AHCIQState *ahci, uint8_t ide_cmd)
 {
-    uint8_t px;
+    uint8_t port;
     AHCICommand *cmd;
 
     /* Sanitize */
-    px = ahci_port_select(ahci);
-    ahci_port_clear(ahci, px);
+    port = ahci_port_select(ahci);
+    ahci_port_clear(ahci, port);
 
     /* Issue Command */
     cmd = ahci_command_create(ide_cmd);
-    ahci_command_commit(ahci, cmd, px);
+    ahci_command_commit(ahci, cmd, port);
     ahci_command_issue(ahci, cmd);
     ahci_command_verify(ahci, cmd);
     ahci_command_free(cmd);
+
+    return port;
 }
 
 static void ahci_test_flush(AHCIQState *ahci)
@@ -912,6 +920,33 @@
     ahci_test_nondata(ahci, CMD_FLUSH_CACHE);
 }
 
+static void ahci_test_max(AHCIQState *ahci)
+{
+    RegD2HFIS *d2h = g_malloc0(0x20);
+    uint64_t nsect;
+    uint8_t port;
+    uint8_t cmd;
+    uint64_t config_sect = TEST_IMAGE_SECTORS - 1;
+
+    if (config_sect > 0xFFFFFF) {
+        cmd = CMD_READ_MAX_EXT;
+    } else {
+        cmd = CMD_READ_MAX;
+    }
+
+    port = ahci_test_nondata(ahci, cmd);
+    memread(ahci->port[port].fb + 0x40, d2h, 0x20);
+    nsect = (uint64_t)d2h->lba_hi[2] << 40 |
+        (uint64_t)d2h->lba_hi[1] << 32 |
+        (uint64_t)d2h->lba_hi[0] << 24 |
+        (uint64_t)d2h->lba_lo[2] << 16 |
+        (uint64_t)d2h->lba_lo[1] << 8 |
+        (uint64_t)d2h->lba_lo[0];
+
+    g_assert_cmphex(nsect, ==, config_sect);
+    g_free(d2h);
+}
+
 
 /******************************************************************************/
 /* Test Interfaces                                                            */
@@ -1111,9 +1146,9 @@
 }
 
 /**
- * DMA Migration test: Write a pattern, migrate, then read.
+ * Simple migration test: Write a pattern, migrate, then read.
  */
-static void test_migrate_dma(void)
+static void ahci_migrate_simple(uint8_t cmd_read, uint8_t cmd_write)
 {
     AHCIQState *src, *dst;
     uint8_t px;
@@ -1141,9 +1176,9 @@
     }
 
     /* Write, migrate, then read. */
-    ahci_io(src, px, CMD_WRITE_DMA, tx, bufsize, 0);
+    ahci_io(src, px, cmd_write, tx, bufsize, 0);
     ahci_migrate(src, dst, uri);
-    ahci_io(dst, px, CMD_READ_DMA, rx, bufsize, 0);
+    ahci_io(dst, px, cmd_read, rx, bufsize, 0);
 
     /* Verify pattern */
     g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0);
@@ -1154,14 +1189,24 @@
     g_free(tx);
 }
 
+static void test_migrate_dma(void)
+{
+    ahci_migrate_simple(CMD_READ_DMA, CMD_WRITE_DMA);
+}
+
+static void test_migrate_ncq(void)
+{
+    ahci_migrate_simple(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED);
+}
+
 /**
- * DMA Error Test
+ * Halted IO Error Test
  *
  * Simulate an error on first write, Try to write a pattern,
  * Confirm the VM has stopped, resume the VM, verify command
  * has completed, then read back the data and verify.
  */
-static void test_halted_dma(void)
+static void ahci_halted_io_test(uint8_t cmd_read, uint8_t cmd_write)
 {
     AHCIQState *ahci;
     uint8_t port;
@@ -1196,7 +1241,7 @@
     memwrite(ptr, tx, bufsize);
 
     /* Attempt to write (and fail) */
-    cmd = ahci_guest_io_halt(ahci, port, CMD_WRITE_DMA,
+    cmd = ahci_guest_io_halt(ahci, port, cmd_write,
                              ptr, bufsize, 0);
 
     /* Attempt to resume the command */
@@ -1204,7 +1249,7 @@
     ahci_free(ahci, ptr);
 
     /* Read back and verify */
-    ahci_io(ahci, port, CMD_READ_DMA, rx, bufsize, 0);
+    ahci_io(ahci, port, cmd_read, rx, bufsize, 0);
     g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0);
 
     /* Cleanup and go home */
@@ -1213,14 +1258,24 @@
     g_free(tx);
 }
 
+static void test_halted_dma(void)
+{
+    ahci_halted_io_test(CMD_READ_DMA, CMD_WRITE_DMA);
+}
+
+static void test_halted_ncq(void)
+{
+    ahci_halted_io_test(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED);
+}
+
 /**
- * DMA Error Migration Test
+ * IO Error Migration Test
  *
  * Simulate an error on first write, Try to write a pattern,
  * Confirm the VM has stopped, migrate, resume the VM,
  * verify command has completed, then read back the data and verify.
  */
-static void test_migrate_halted_dma(void)
+static void ahci_migrate_halted_io(uint8_t cmd_read, uint8_t cmd_write)
 {
     AHCIQState *src, *dst;
     uint8_t port;
@@ -1266,14 +1321,14 @@
     memwrite(ptr, tx, bufsize);
 
     /* Write, trigger the VM to stop, migrate, then resume. */
-    cmd = ahci_guest_io_halt(src, port, CMD_WRITE_DMA,
+    cmd = ahci_guest_io_halt(src, port, cmd_write,
                              ptr, bufsize, 0);
     ahci_migrate(src, dst, uri);
     ahci_guest_io_resume(dst, cmd);
     ahci_free(dst, ptr);
 
     /* Read back */
-    ahci_io(dst, port, CMD_READ_DMA, rx, bufsize, 0);
+    ahci_io(dst, port, cmd_read, rx, bufsize, 0);
 
     /* Verify TX and RX are identical */
     g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0);
@@ -1285,6 +1340,16 @@
     g_free(tx);
 }
 
+static void test_migrate_halted_dma(void)
+{
+    ahci_migrate_halted_io(CMD_READ_DMA, CMD_WRITE_DMA);
+}
+
+static void test_migrate_halted_ncq(void)
+{
+    ahci_migrate_halted_io(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED);
+}
+
 /**
  * Migration test: Try to flush, migrate, then resume.
  */
@@ -1334,6 +1399,49 @@
     ahci_shutdown(dst);
 }
 
+static void test_max(void)
+{
+    AHCIQState *ahci;
+
+    ahci = ahci_boot_and_enable(NULL);
+    ahci_test_max(ahci);
+    ahci_shutdown(ahci);
+}
+
+static void test_reset(void)
+{
+    AHCIQState *ahci;
+    int i;
+
+    ahci = ahci_boot(NULL);
+    ahci_test_pci_spec(ahci);
+    ahci_pci_enable(ahci);
+
+    for (i = 0; i < 2; i++) {
+        ahci_test_hba_spec(ahci);
+        ahci_hba_enable(ahci);
+        ahci_test_identify(ahci);
+        ahci_test_io_rw_simple(ahci, 4096, 0,
+                               CMD_READ_DMA_EXT,
+                               CMD_WRITE_DMA_EXT);
+        ahci_set(ahci, AHCI_GHC, AHCI_GHC_HR);
+        ahci_clean_mem(ahci);
+    }
+
+    ahci_shutdown(ahci);
+}
+
+static void test_ncq_simple(void)
+{
+    AHCIQState *ahci;
+
+    ahci = ahci_boot_and_enable(NULL);
+    ahci_test_io_rw_simple(ahci, 4096, 0,
+                           READ_FPDMA_QUEUED,
+                           WRITE_FPDMA_QUEUED);
+    ahci_shutdown(ahci);
+}
+
 /******************************************************************************/
 /* AHCI I/O Test Matrix Definitions                                           */
 
@@ -1584,6 +1692,14 @@
     qtest_add_func("/ahci/io/dma/lba28/retry", test_halted_dma);
     qtest_add_func("/ahci/migrate/dma/halted", test_migrate_halted_dma);
 
+    qtest_add_func("/ahci/max", test_max);
+    qtest_add_func("/ahci/reset", test_reset);
+
+    qtest_add_func("/ahci/io/ncq/simple", test_ncq_simple);
+    qtest_add_func("/ahci/migrate/ncq/simple", test_migrate_ncq);
+    qtest_add_func("/ahci/io/ncq/retry", test_halted_ncq);
+    qtest_add_func("/ahci/migrate/ncq/halted", test_migrate_halted_ncq);
+
     ret = g_test_run();
 
     /* Cleanup */
diff --git a/tests/libqos/ahci.c b/tests/libqos/ahci.c
index 7e17bb6..33ecd2a 100644
--- a/tests/libqos/ahci.c
+++ b/tests/libqos/ahci.c
@@ -50,27 +50,47 @@
 } AHCICommandProp;
 
 AHCICommandProp ahci_command_properties[] = {
-    { .cmd = CMD_READ_PIO,      .data = true,  .pio = true,
-                                .lba28 = true, .read = true },
-    { .cmd = CMD_WRITE_PIO,     .data = true,  .pio = true,
-                                .lba28 = true, .write = true },
-    { .cmd = CMD_READ_PIO_EXT,  .data = true,  .pio = true,
-                                .lba48 = true, .read = true },
-    { .cmd = CMD_WRITE_PIO_EXT, .data = true,  .pio = true,
-                                .lba48 = true, .write = true },
-    { .cmd = CMD_READ_DMA,      .data = true,  .dma = true,
-                                .lba28 = true, .read = true },
-    { .cmd = CMD_WRITE_DMA,     .data = true,  .dma = true,
-                                .lba28 = true, .write = true },
-    { .cmd = CMD_READ_DMA_EXT,  .data = true,  .dma = true,
-                                .lba48 = true, .read = true },
-    { .cmd = CMD_WRITE_DMA_EXT, .data = true,  .dma = true,
-                                .lba48 = true, .write = true },
-    { .cmd = CMD_IDENTIFY,      .data = true,  .pio = true,
-                                .size = 512,   .read = true },
-    { .cmd = CMD_READ_MAX,      .lba28 = true },
-    { .cmd = CMD_READ_MAX_EXT,  .lba48 = true },
-    { .cmd = CMD_FLUSH_CACHE,   .data = false }
+    { .cmd = CMD_READ_PIO,       .data = true,  .pio = true,
+                                 .lba28 = true, .read = true },
+    { .cmd = CMD_WRITE_PIO,      .data = true,  .pio = true,
+                                 .lba28 = true, .write = true },
+    { .cmd = CMD_READ_PIO_EXT,   .data = true,  .pio = true,
+                                 .lba48 = true, .read = true },
+    { .cmd = CMD_WRITE_PIO_EXT,  .data = true,  .pio = true,
+                                 .lba48 = true, .write = true },
+    { .cmd = CMD_READ_DMA,       .data = true,  .dma = true,
+                                 .lba28 = true, .read = true },
+    { .cmd = CMD_WRITE_DMA,      .data = true,  .dma = true,
+                                 .lba28 = true, .write = true },
+    { .cmd = CMD_READ_DMA_EXT,   .data = true,  .dma = true,
+                                 .lba48 = true, .read = true },
+    { .cmd = CMD_WRITE_DMA_EXT,  .data = true,  .dma = true,
+                                 .lba48 = true, .write = true },
+    { .cmd = CMD_IDENTIFY,       .data = true,  .pio = true,
+                                 .size = 512,   .read = true },
+    { .cmd = READ_FPDMA_QUEUED,  .data = true,  .dma = true,
+                                 .lba48 = true, .read = true, .ncq = true },
+    { .cmd = WRITE_FPDMA_QUEUED, .data = true,  .dma = true,
+                                 .lba48 = true, .write = true, .ncq = true },
+    { .cmd = CMD_READ_MAX,       .lba28 = true },
+    { .cmd = CMD_READ_MAX_EXT,   .lba48 = true },
+    { .cmd = CMD_FLUSH_CACHE,    .data = false }
+};
+
+struct AHCICommand {
+    /* Test Management Data */
+    uint8_t name;
+    uint8_t port;
+    uint8_t slot;
+    uint32_t interrupts;
+    uint64_t xbytes;
+    uint32_t prd_size;
+    uint64_t buffer;
+    AHCICommandProp *props;
+    /* Data to be transferred to the guest */
+    AHCICommandHeader header;
+    RegH2DFIS fis;
+    void *atapi_cmd;
 };
 
 /**
@@ -138,12 +158,14 @@
     for (port = 0; port < 32; ++port) {
         if (ahci->port[port].fb) {
             ahci_free(ahci, ahci->port[port].fb);
+            ahci->port[port].fb = 0;
         }
         if (ahci->port[port].clb) {
             for (slot = 0; slot < 32; slot++) {
                 ahci_destroy_command(ahci, port, slot);
             }
             ahci_free(ahci, ahci->port[port].clb);
+            ahci->port[port].clb = 0;
         }
     }
 }
@@ -252,7 +274,7 @@
         /* Allocate Memory for the Command List Buffer & FIS Buffer */
         /* PxCLB space ... 0x20 per command, as in 4.2.2 p 36 */
         ahci->port[i].clb = ahci_alloc(ahci, num_cmd_slots * 0x20);
-        qmemset(ahci->port[i].clb, 0x00, 0x100);
+        qmemset(ahci->port[i].clb, 0x00, num_cmd_slots * 0x20);
         g_test_message("CLB: 0x%08" PRIx64, ahci->port[i].clb);
         ahci_px_wreg(ahci, i, AHCI_PX_CLB, ahci->port[i].clb);
         g_assert_cmphex(ahci->port[i].clb, ==,
@@ -460,13 +482,15 @@
     g_free(pio);
 }
 
-void ahci_port_check_cmd_sanity(AHCIQState *ahci, uint8_t port,
-                                uint8_t slot, size_t buffsize)
+void ahci_port_check_cmd_sanity(AHCIQState *ahci, AHCICommand *cmd)
 {
-    AHCICommandHeader cmd;
+    AHCICommandHeader cmdh;
 
-    ahci_get_command_header(ahci, port, slot, &cmd);
-    g_assert_cmphex(buffsize, ==, cmd.prdbc);
+    ahci_get_command_header(ahci, cmd->port, cmd->slot, &cmdh);
+    /* Physical Region Descriptor Byte Count is not required to work for NCQ. */
+    if (!cmd->props->ncq) {
+        g_assert_cmphex(cmd->xbytes, ==, cmdh.prdbc);
+    }
 }
 
 /* Get the command in #slot of port #port. */
@@ -549,7 +573,7 @@
         if (reg & (1 << j)) {
             continue;
         }
-        ahci_destroy_command(ahci, port, i);
+        ahci_destroy_command(ahci, port, j);
         ahci->port[port].next = (j + 1) % 32;
         return j;
     }
@@ -610,22 +634,6 @@
     ahci_command_free(cmd);
 }
 
-struct AHCICommand {
-    /* Test Management Data */
-    uint8_t name;
-    uint8_t port;
-    uint8_t slot;
-    uint32_t interrupts;
-    uint64_t xbytes;
-    uint32_t prd_size;
-    uint64_t buffer;
-    AHCICommandProp *props;
-    /* Data to be transferred to the guest */
-    AHCICommandHeader header;
-    RegH2DFIS fis;
-    void *atapi_cmd;
-};
-
 static AHCICommandProp *ahci_command_find(uint8_t command_name)
 {
     int i;
@@ -691,19 +699,34 @@
 static void command_table_init(AHCICommand *cmd)
 {
     RegH2DFIS *fis = &(cmd->fis);
+    uint16_t sect_count = (cmd->xbytes / AHCI_SECTOR_SIZE);
 
     fis->fis_type = REG_H2D_FIS;
     fis->flags = REG_H2D_FIS_CMD; /* "Command" bit */
     fis->command = cmd->name;
-    cmd->fis.feature_low = 0x00;
-    cmd->fis.feature_high = 0x00;
-    if (cmd->props->lba28 || cmd->props->lba48) {
-        cmd->fis.device = ATA_DEVICE_LBA;
+
+    if (cmd->props->ncq) {
+        NCQFIS *ncqfis = (NCQFIS *)fis;
+        /* NCQ is weird and re-uses FIS frames for unrelated data.
+         * See SATA 3.2, 13.6.4.1 READ FPDMA QUEUED for an example. */
+        ncqfis->sector_low = sect_count & 0xFF;
+        ncqfis->sector_hi = (sect_count >> 8) & 0xFF;
+        ncqfis->device = NCQ_DEVICE_MAGIC;
+        /* Force Unit Access is bit 7 in the device register */
+        ncqfis->tag = 0;  /* bits 3-7 are the NCQ tag */
+        ncqfis->prio = 0; /* bits 6,7 are a prio tag */
+        /* RARC bit is bit 0 of TAG field */
+    } else {
+        fis->feature_low = 0x00;
+        fis->feature_high = 0x00;
+        if (cmd->props->lba28 || cmd->props->lba48) {
+            fis->device = ATA_DEVICE_LBA;
+        }
+        fis->count = (cmd->xbytes / AHCI_SECTOR_SIZE);
     }
-    cmd->fis.count = (cmd->xbytes / AHCI_SECTOR_SIZE);
-    cmd->fis.icc = 0x00;
-    cmd->fis.control = 0x00;
-    memset(cmd->fis.aux, 0x00, ARRAY_SIZE(cmd->fis.aux));
+    fis->icc = 0x00;
+    fis->control = 0x00;
+    memset(fis->aux, 0x00, ARRAY_SIZE(fis->aux));
 }
 
 AHCICommand *ahci_command_create(uint8_t command_name)
@@ -717,6 +740,7 @@
     g_assert(!(props->lba28 && props->lba48));
     g_assert(!(props->read && props->write));
     g_assert(!props->size || props->data);
+    g_assert(!props->ncq || (props->ncq && props->lba48));
 
     /* Defaults and book-keeping */
     cmd->props = props;
@@ -725,12 +749,15 @@
     cmd->prd_size = 4096;
     cmd->buffer = 0xabad1dea;
 
-    cmd->interrupts = AHCI_PX_IS_DHRS;
+    if (!cmd->props->ncq) {
+        cmd->interrupts = AHCI_PX_IS_DHRS;
+    }
     /* BUG: We expect the DPS interrupt for data commands */
     /* cmd->interrupts |= props->data ? AHCI_PX_IS_DPS : 0; */
     /* BUG: We expect the DMA Setup interrupt for DMA commands */
     /* cmd->interrupts |= props->dma ? AHCI_PX_IS_DSS : 0; */
     cmd->interrupts |= props->pio ? AHCI_PX_IS_PSS : 0;
+    cmd->interrupts |= props->ncq ? AHCI_PX_IS_SDBS : 0;
 
     command_header_init(cmd);
     command_table_init(cmd);
@@ -758,7 +785,7 @@
     RegH2DFIS *fis = &(cmd->fis);
     if (cmd->props->lba28) {
         g_assert_cmphex(lba_sect, <=, 0xFFFFFFF);
-    } else if (cmd->props->lba48) {
+    } else if (cmd->props->lba48 || cmd->props->ncq) {
         g_assert_cmphex(lba_sect, <=, 0xFFFFFFFFFFFF);
     } else {
         /* Can't set offset if we don't know the format. */
@@ -785,6 +812,8 @@
 void ahci_command_set_sizes(AHCICommand *cmd, uint64_t xbytes,
                             unsigned prd_size)
 {
+    uint16_t sect_count;
+
     /* Each PRD can describe up to 4MiB, and must not be odd. */
     g_assert_cmphex(prd_size, <=, 4096 * 1024);
     g_assert_cmphex(prd_size & 0x01, ==, 0x00);
@@ -792,7 +821,15 @@
         cmd->prd_size = prd_size;
     }
     cmd->xbytes = xbytes;
-    cmd->fis.count = (cmd->xbytes / AHCI_SECTOR_SIZE);
+    sect_count = (cmd->xbytes / AHCI_SECTOR_SIZE);
+
+    if (cmd->props->ncq) {
+        NCQFIS *nfis = (NCQFIS *)&(cmd->fis);
+        nfis->sector_low = sect_count & 0xFF;
+        nfis->sector_hi = (sect_count >> 8) & 0xFF;
+    } else {
+        cmd->fis.count = sect_count;
+    }
     cmd->header.prdtl = size_to_prdtl(cmd->xbytes, cmd->prd_size);
 }
 
@@ -824,6 +861,11 @@
     cmd->port = port;
     cmd->slot = ahci_pick_cmd(ahci, port);
 
+    if (cmd->props->ncq) {
+        NCQFIS *nfis = (NCQFIS *)&cmd->fis;
+        nfis->tag = (cmd->slot << 3) & 0xFC;
+    }
+
     /* Create a buffer for the command table */
     prdtl = size_to_prdtl(cmd->xbytes, cmd->prd_size);
     table_size = CMD_TBL_SIZ(prdtl);
@@ -878,11 +920,15 @@
     /* We can't rely on STS_BSY until the command has started processing.
      * Therefore, we also use the Command Issue bit as indication of
      * a command in-flight. */
-    while (BITSET(ahci_px_rreg(ahci, cmd->port, AHCI_PX_TFD),
-                  AHCI_PX_TFD_STS_BSY) ||
-           BITSET(ahci_px_rreg(ahci, cmd->port, AHCI_PX_CI), (1 << cmd->slot))) {
+
+#define RSET(REG, MASK) (BITSET(ahci_px_rreg(ahci, cmd->port, (REG)), (MASK)))
+
+    while (RSET(AHCI_PX_TFD, AHCI_PX_TFD_STS_BSY) ||
+           RSET(AHCI_PX_CI, 1 << cmd->slot) ||
+           (cmd->props->ncq && RSET(AHCI_PX_SACT, 1 << cmd->slot))) {
         usleep(50);
     }
+
 }
 
 void ahci_command_issue(AHCIQState *ahci, AHCICommand *cmd)
@@ -899,8 +945,10 @@
     ahci_port_check_error(ahci, port);
     ahci_port_check_interrupts(ahci, port, cmd->interrupts);
     ahci_port_check_nonbusy(ahci, port, slot);
-    ahci_port_check_cmd_sanity(ahci, port, slot, cmd->xbytes);
-    ahci_port_check_d2h_sanity(ahci, port, slot);
+    ahci_port_check_cmd_sanity(ahci, cmd);
+    if (cmd->interrupts & AHCI_PX_IS_DHRS) {
+        ahci_port_check_d2h_sanity(ahci, port, slot);
+    }
     if (cmd->props->pio) {
         ahci_port_check_pio_sanity(ahci, port, slot, cmd->xbytes);
     }
diff --git a/tests/libqos/ahci.h b/tests/libqos/ahci.h
index 779e812..a08a9dd 100644
--- a/tests/libqos/ahci.h
+++ b/tests/libqos/ahci.h
@@ -263,20 +263,23 @@
 /* ATA Commands */
 enum {
     /* DMA */
-    CMD_READ_DMA      = 0xC8,
-    CMD_READ_DMA_EXT  = 0x25,
-    CMD_WRITE_DMA     = 0xCA,
-    CMD_WRITE_DMA_EXT = 0x35,
+    CMD_READ_DMA       = 0xC8,
+    CMD_READ_DMA_EXT   = 0x25,
+    CMD_WRITE_DMA      = 0xCA,
+    CMD_WRITE_DMA_EXT  = 0x35,
     /* PIO */
-    CMD_READ_PIO      = 0x20,
-    CMD_READ_PIO_EXT  = 0x24,
-    CMD_WRITE_PIO     = 0x30,
-    CMD_WRITE_PIO_EXT = 0x34,
+    CMD_READ_PIO       = 0x20,
+    CMD_READ_PIO_EXT   = 0x24,
+    CMD_WRITE_PIO      = 0x30,
+    CMD_WRITE_PIO_EXT  = 0x34,
     /* Misc */
-    CMD_READ_MAX      = 0xF8,
-    CMD_READ_MAX_EXT  = 0x27,
-    CMD_FLUSH_CACHE   = 0xE7,
-    CMD_IDENTIFY      = 0xEC
+    CMD_READ_MAX       = 0xF8,
+    CMD_READ_MAX_EXT   = 0x27,
+    CMD_FLUSH_CACHE    = 0xE7,
+    CMD_IDENTIFY       = 0xEC,
+    /* NCQ */
+    READ_FPDMA_QUEUED  = 0x60,
+    WRITE_FPDMA_QUEUED = 0x61,
 };
 
 /* AHCI Command Header Flags & Masks*/
@@ -291,8 +294,9 @@
 #define CMDH_PMP      (0xF000)
 
 /* ATA device register masks */
-#define ATA_DEVICE_MAGIC 0xA0
+#define ATA_DEVICE_MAGIC 0xA0 /* used in ata1-3 */
 #define ATA_DEVICE_LBA   0x40
+#define NCQ_DEVICE_MAGIC 0x40 /* for ncq device registers */
 #define ATA_DEVICE_DRIVE 0x10
 #define ATA_DEVICE_HEAD  0x0F
 
@@ -397,6 +401,32 @@
 } __attribute__((__packed__)) RegH2DFIS;
 
 /**
+ * Register host-to-device FIS structure, for NCQ commands.
+ * Actually just a RegH2DFIS, but with fields repurposed.
+ * Repurposed fields are annotated below.
+ */
+typedef struct NCQFIS {
+    /* DW0 */
+    uint8_t fis_type;
+    uint8_t flags;
+    uint8_t command;
+    uint8_t sector_low; /* H2D: Feature 7:0 */
+    /* DW1 */
+    uint8_t lba_lo[3];
+    uint8_t device;
+    /* DW2 */
+    uint8_t lba_hi[3];
+    uint8_t sector_hi; /* H2D: Feature 15:8 */
+    /* DW3 */
+    uint8_t tag;       /* H2D: Count 0:7 */
+    uint8_t prio;      /* H2D: Count 15:8 */
+    uint8_t icc;
+    uint8_t control;
+    /* DW4 */
+    uint8_t aux[4];
+} __attribute__((__packed__)) NCQFIS;
+
+/**
  * Command List entry structure.
  * The command list contains between 1-32 of these structures.
  */
@@ -512,8 +542,7 @@
 void ahci_port_check_d2h_sanity(AHCIQState *ahci, uint8_t port, uint8_t slot);
 void ahci_port_check_pio_sanity(AHCIQState *ahci, uint8_t port,
                                 uint8_t slot, size_t buffsize);
-void ahci_port_check_cmd_sanity(AHCIQState *ahci, uint8_t port,
-                                uint8_t slot, size_t buffsize);
+void ahci_port_check_cmd_sanity(AHCIQState *ahci, AHCICommand *cmd);
 void ahci_get_command_header(AHCIQState *ahci, uint8_t port,
                              uint8_t slot, AHCICommandHeader *cmd);
 void ahci_set_command_header(AHCIQState *ahci, uint8_t port,
diff --git a/trace-events b/trace-events
index 52b7efa..d24d80a 100644
--- a/trace-events
+++ b/trace-events
@@ -1594,6 +1594,7 @@
 vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)"
 vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x"
 vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING"
+vfio_platform_start_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d"
 
 #hw/acpi/memory_hotplug.c
 mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32
diff --git a/translate-all.c b/translate-all.c
index b6b0e1c..50d53fd 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -58,6 +58,7 @@
 #endif
 
 #include "exec/cputlb.h"
+#include "exec/tb-hash.h"
 #include "translate-all.h"
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
@@ -117,6 +118,7 @@
 #define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)
 
 uintptr_t qemu_real_host_page_size;
+uintptr_t qemu_real_host_page_mask;
 uintptr_t qemu_host_page_size;
 uintptr_t qemu_host_page_mask;
 
@@ -306,6 +308,7 @@
     /* NOTE: we can always suppose that qemu_host_page_size >=
        TARGET_PAGE_SIZE */
     qemu_real_host_page_size = getpagesize();
+    qemu_real_host_page_mask = ~(qemu_real_host_page_size - 1);
     if (qemu_host_page_size == 0) {
         qemu_host_page_size = qemu_real_host_page_size;
     }
diff --git a/util/cutils.c b/util/cutils.c
index 144b25c..5d1c9eb 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -207,13 +207,13 @@
     for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
          i < len / sizeof(VECTYPE);
          i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        VECTYPE tmp0 = p[i + 0] | p[i + 1];
-        VECTYPE tmp1 = p[i + 2] | p[i + 3];
-        VECTYPE tmp2 = p[i + 4] | p[i + 5];
-        VECTYPE tmp3 = p[i + 6] | p[i + 7];
-        VECTYPE tmp01 = tmp0 | tmp1;
-        VECTYPE tmp23 = tmp2 | tmp3;
-        if (!ALL_EQ(tmp01 | tmp23, zero)) {
+        VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
+        VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
+        VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
+        VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
+        VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
+        VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
+        if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
             break;
         }
     }