Merge remote-tracking branch 'remotes/stefanha/tags/net-pull-request' into staging

# gpg: Signature made Fri 06 Feb 2015 14:10:40 GMT using RSA key ID 81AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>"
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>"

* remotes/stefanha/tags/net-pull-request:
  monitor: more accurate completion for host_net_remove()
  net: del hub port when peer is deleted
  net: remove the wrong comment in net_init_hubport()
  monitor: print hub port name during info network
  rtl8139: simplify timer logic
  MAINTAINERS: add Jason Wang as net subsystem maintainer

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index cd1052f..8c06739 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -783,6 +783,11 @@
 S: Maintained
 F: backends/baum.c
 
+Coverity model
+M: Markus Armbruster <armbru@redhat.com>
+S: Supported
+F: scripts/coverity-model.c
+
 CPU
 M: Andreas Färber <afaerber@suse.de>
 S: Supported
diff --git a/Makefile.objs b/Makefile.objs
index abeb902..28999d3 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -51,6 +51,7 @@
 common-obj-y += migration/
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += page_cache.o
+common-obj-y += qjson.o
 
 common-obj-$(CONFIG_SPICE) += spice-qemu-char.o
 
diff --git a/balloon.c b/balloon.c
index b70da4f..dea19a4 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,21 @@
 static QEMUBalloonStatus *balloon_stat_fn;
 static void *balloon_opaque;
 
+static bool have_ballon(Error **errp)
+{
+    if (kvm_enabled() && !kvm_has_sync_mmu()) {
+        error_set(errp, ERROR_CLASS_KVM_MISSING_CAP,
+                  "Using KVM without synchronous MMU, balloon unavailable");
+        return false;
+    }
+    if (!balloon_event_fn) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
+                  "No balloon device has been activated");
+        return false;
+    }
+    return true;
+}
+
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
                              QEMUBalloonStatus *stat_func, void *opaque)
 {
@@ -62,58 +77,30 @@
     balloon_opaque = NULL;
 }
 
-static int qemu_balloon(ram_addr_t target)
-{
-    if (!balloon_event_fn) {
-        return 0;
-    }
-    trace_balloon_event(balloon_opaque, target);
-    balloon_event_fn(balloon_opaque, target);
-    return 1;
-}
-
-static int qemu_balloon_status(BalloonInfo *info)
-{
-    if (!balloon_stat_fn) {
-        return 0;
-    }
-    balloon_stat_fn(balloon_opaque, info);
-    return 1;
-}
-
 BalloonInfo *qmp_query_balloon(Error **errp)
 {
     BalloonInfo *info;
 
-    if (kvm_enabled() && !kvm_has_sync_mmu()) {
-        error_set(errp, QERR_KVM_MISSING_CAP, "synchronous MMU", "balloon");
+    if (!have_ballon(errp)) {
         return NULL;
     }
 
     info = g_malloc0(sizeof(*info));
-
-    if (qemu_balloon_status(info) == 0) {
-        error_set(errp, QERR_DEVICE_NOT_ACTIVE, "balloon");
-        qapi_free_BalloonInfo(info);
-        return NULL;
-    }
-
+    balloon_stat_fn(balloon_opaque, info);
     return info;
 }
 
-void qmp_balloon(int64_t value, Error **errp)
+void qmp_balloon(int64_t target, Error **errp)
 {
-    if (kvm_enabled() && !kvm_has_sync_mmu()) {
-        error_set(errp, QERR_KVM_MISSING_CAP, "synchronous MMU", "balloon");
+    if (!have_ballon(errp)) {
         return;
     }
 
-    if (value <= 0) {
+    if (target <= 0) {
         error_set(errp, QERR_INVALID_PARAMETER_VALUE, "target", "a size");
         return;
     }
-    
-    if (qemu_balloon(value) == 0) {
-        error_set(errp, QERR_DEVICE_NOT_ACTIVE, "balloon");
-    }
+
+    trace_balloon_event(balloon_opaque, target);
+    balloon_event_fn(balloon_opaque, target);
 }
diff --git a/cpu-exec.c b/cpu-exec.c
index a4f0eff..fa506e6 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -61,8 +61,7 @@
         sleep_delay.tv_sec = sc->diff_clk / 1000000000LL;
         sleep_delay.tv_nsec = sc->diff_clk % 1000000000LL;
         if (nanosleep(&sleep_delay, &rem_delay) < 0) {
-            sc->diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * 1000000000LL;
-            sc->diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec;
+            sc->diff_clk = rem_delay.tv_sec * 1000000000LL + rem_delay.tv_nsec;
         } else {
             sc->diff_clk = 0;
         }
@@ -101,10 +100,8 @@
     if (!icount_align_option) {
         return;
     }
-    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) -
-                   sc->realtime_clock +
-                   cpu_get_clock_offset();
+    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
+    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - sc->realtime_clock;
     sc->last_cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
     if (sc->diff_clk < max_delay) {
         max_delay = sc->diff_clk;
diff --git a/cpus.c b/cpus.c
index 3a5323b..0cdd1d7 100644
--- a/cpus.c
+++ b/cpus.c
@@ -229,23 +229,6 @@
     return ti;
 }
 
-/* return the offset between the host clock and virtual CPU clock */
-int64_t cpu_get_clock_offset(void)
-{
-    int64_t ti;
-    unsigned start;
-
-    do {
-        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
-        ti = timers_state.cpu_clock_offset;
-        if (!timers_state.cpu_ticks_enabled) {
-            ti -= get_clock();
-        }
-    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
-
-    return -ti;
-}
-
 /* enable cpu_get_ticks()
  * Caller must hold BQL which server as mutex for vm_clock_seqlock.
  */
diff --git a/disas/arm-a64.cc b/disas/arm-a64.cc
index ca29f6f..e04f946 100644
--- a/disas/arm-a64.cc
+++ b/disas/arm-a64.cc
@@ -67,7 +67,8 @@
 int print_insn_arm_a64(uint64_t addr, disassemble_info *info)
 {
     uint8_t bytes[INSN_SIZE];
-    uint32_t instr;
+    uint32_t instrval;
+    const Instruction *instr;
     int status;
 
     status = info->read_memory_func(addr, bytes, INSN_SIZE, info);
@@ -80,8 +81,10 @@
         vixl_init(info->stream);
     }
 
-    instr = bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24;
-    vixl_decoder->Decode(reinterpret_cast<Instruction*>(&instr));
+    instrval = bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24;
+    instr = reinterpret_cast<const Instruction *>(&instrval);
+    vixl_disasm->MapCodeAddress(addr, instr);
+    vixl_decoder->Decode(instr);
 
     return INSN_SIZE;
 }
diff --git a/disas/libvixl/README b/disas/libvixl/README
index cba31b4..58db41c 100644
--- a/disas/libvixl/README
+++ b/disas/libvixl/README
@@ -2,7 +2,7 @@
 The code in this directory is a subset of libvixl:
  https://github.com/armvixl/vixl
 (specifically, it is the set of files needed for disassembly only,
-taken from libvixl 1.6).
+taken from libvixl 1.7).
 Bugfixes should preferably be sent upstream initially.
 
 The disassembler does not currently support the entire A64 instruction
diff --git a/disas/libvixl/a64/assembler-a64.h b/disas/libvixl/a64/assembler-a64.h
index 16a704b..35aaf20 100644
--- a/disas/libvixl/a64/assembler-a64.h
+++ b/disas/libvixl/a64/assembler-a64.h
@@ -151,21 +151,21 @@
     return Aliases(other) && (size_ == other.size_);
   }
 
-  inline bool IsZero() const {
+  bool IsZero() const {
     VIXL_ASSERT(IsValid());
     return IsRegister() && (code_ == kZeroRegCode);
   }
 
-  inline bool IsSP() const {
+  bool IsSP() const {
     VIXL_ASSERT(IsValid());
     return IsRegister() && (code_ == kSPRegInternalCode);
   }
 
-  inline bool IsRegister() const {
+  bool IsRegister() const {
     return type_ == kRegister;
   }
 
-  inline bool IsFPRegister() const {
+  bool IsFPRegister() const {
     return type_ == kFPRegister;
   }
 
@@ -179,7 +179,7 @@
   const FPRegister& S() const;
   const FPRegister& D() const;
 
-  inline bool IsSameSizeAndType(const CPURegister& other) const {
+  bool IsSameSizeAndType(const CPURegister& other) const {
     return (size_ == other.size_) && (type_ == other.type_);
   }
 
@@ -198,7 +198,7 @@
 class Register : public CPURegister {
  public:
   Register() : CPURegister() {}
-  inline explicit Register(const CPURegister& other)
+  explicit Register(const CPURegister& other)
       : CPURegister(other.code(), other.size(), other.type()) {
     VIXL_ASSERT(IsValidRegister());
   }
@@ -213,10 +213,6 @@
   static const Register& WRegFromCode(unsigned code);
   static const Register& XRegFromCode(unsigned code);
 
-  // V8 compatibility.
-  static const int kNumRegisters = kNumberOfRegisters;
-  static const int kNumAllocatableRegisters = kNumberOfRegisters - 1;
-
  private:
   static const Register wregisters[];
   static const Register xregisters[];
@@ -225,12 +221,12 @@
 
 class FPRegister : public CPURegister {
  public:
-  inline FPRegister() : CPURegister() {}
-  inline explicit FPRegister(const CPURegister& other)
+  FPRegister() : CPURegister() {}
+  explicit FPRegister(const CPURegister& other)
       : CPURegister(other.code(), other.size(), other.type()) {
     VIXL_ASSERT(IsValidFPRegister());
   }
-  inline FPRegister(unsigned code, unsigned size)
+  FPRegister(unsigned code, unsigned size)
       : CPURegister(code, size, kFPRegister) {}
 
   bool IsValid() const {
@@ -241,10 +237,6 @@
   static const FPRegister& SRegFromCode(unsigned code);
   static const FPRegister& DRegFromCode(unsigned code);
 
-  // V8 compatibility.
-  static const int kNumRegisters = kNumberOfFPRegisters;
-  static const int kNumAllocatableRegisters = kNumberOfFPRegisters - 1;
-
  private:
   static const FPRegister sregisters[];
   static const FPRegister dregisters[];
@@ -312,23 +304,23 @@
 // Lists of registers.
 class CPURegList {
  public:
-  inline explicit CPURegList(CPURegister reg1,
-                             CPURegister reg2 = NoCPUReg,
-                             CPURegister reg3 = NoCPUReg,
-                             CPURegister reg4 = NoCPUReg)
+  explicit CPURegList(CPURegister reg1,
+                      CPURegister reg2 = NoCPUReg,
+                      CPURegister reg3 = NoCPUReg,
+                      CPURegister reg4 = NoCPUReg)
       : list_(reg1.Bit() | reg2.Bit() | reg3.Bit() | reg4.Bit()),
         size_(reg1.size()), type_(reg1.type()) {
     VIXL_ASSERT(AreSameSizeAndType(reg1, reg2, reg3, reg4));
     VIXL_ASSERT(IsValid());
   }
 
-  inline CPURegList(CPURegister::RegisterType type, unsigned size, RegList list)
+  CPURegList(CPURegister::RegisterType type, unsigned size, RegList list)
       : list_(list), size_(size), type_(type) {
     VIXL_ASSERT(IsValid());
   }
 
-  inline CPURegList(CPURegister::RegisterType type, unsigned size,
-                    unsigned first_reg, unsigned last_reg)
+  CPURegList(CPURegister::RegisterType type, unsigned size,
+             unsigned first_reg, unsigned last_reg)
       : size_(size), type_(type) {
     VIXL_ASSERT(((type == CPURegister::kRegister) &&
                  (last_reg < kNumberOfRegisters)) ||
@@ -340,7 +332,7 @@
     VIXL_ASSERT(IsValid());
   }
 
-  inline CPURegister::RegisterType type() const {
+  CPURegister::RegisterType type() const {
     VIXL_ASSERT(IsValid());
     return type_;
   }
@@ -366,13 +358,13 @@
   }
 
   // Variants of Combine and Remove which take a single register.
-  inline void Combine(const CPURegister& other) {
+  void Combine(const CPURegister& other) {
     VIXL_ASSERT(other.type() == type_);
     VIXL_ASSERT(other.size() == size_);
     Combine(other.code());
   }
 
-  inline void Remove(const CPURegister& other) {
+  void Remove(const CPURegister& other) {
     VIXL_ASSERT(other.type() == type_);
     VIXL_ASSERT(other.size() == size_);
     Remove(other.code());
@@ -380,24 +372,51 @@
 
   // Variants of Combine and Remove which take a single register by its code;
   // the type and size of the register is inferred from this list.
-  inline void Combine(int code) {
+  void Combine(int code) {
     VIXL_ASSERT(IsValid());
     VIXL_ASSERT(CPURegister(code, size_, type_).IsValid());
     list_ |= (UINT64_C(1) << code);
   }
 
-  inline void Remove(int code) {
+  void Remove(int code) {
     VIXL_ASSERT(IsValid());
     VIXL_ASSERT(CPURegister(code, size_, type_).IsValid());
     list_ &= ~(UINT64_C(1) << code);
   }
 
-  inline RegList list() const {
+  static CPURegList Union(const CPURegList& list_1, const CPURegList& list_2) {
+    VIXL_ASSERT(list_1.type_ == list_2.type_);
+    VIXL_ASSERT(list_1.size_ == list_2.size_);
+    return CPURegList(list_1.type_, list_1.size_, list_1.list_ | list_2.list_);
+  }
+  static CPURegList Union(const CPURegList& list_1,
+                          const CPURegList& list_2,
+                          const CPURegList& list_3);
+  static CPURegList Union(const CPURegList& list_1,
+                          const CPURegList& list_2,
+                          const CPURegList& list_3,
+                          const CPURegList& list_4);
+
+  static CPURegList Intersection(const CPURegList& list_1,
+                                 const CPURegList& list_2) {
+    VIXL_ASSERT(list_1.type_ == list_2.type_);
+    VIXL_ASSERT(list_1.size_ == list_2.size_);
+    return CPURegList(list_1.type_, list_1.size_, list_1.list_ & list_2.list_);
+  }
+  static CPURegList Intersection(const CPURegList& list_1,
+                                 const CPURegList& list_2,
+                                 const CPURegList& list_3);
+  static CPURegList Intersection(const CPURegList& list_1,
+                                 const CPURegList& list_2,
+                                 const CPURegList& list_3,
+                                 const CPURegList& list_4);
+
+  RegList list() const {
     VIXL_ASSERT(IsValid());
     return list_;
   }
 
-  inline void set_list(RegList new_list) {
+  void set_list(RegList new_list) {
     VIXL_ASSERT(IsValid());
     list_ = new_list;
   }
@@ -417,38 +436,38 @@
   static CPURegList GetCallerSaved(unsigned size = kXRegSize);
   static CPURegList GetCallerSavedFP(unsigned size = kDRegSize);
 
-  inline bool IsEmpty() const {
+  bool IsEmpty() const {
     VIXL_ASSERT(IsValid());
     return list_ == 0;
   }
 
-  inline bool IncludesAliasOf(const CPURegister& other) const {
+  bool IncludesAliasOf(const CPURegister& other) const {
     VIXL_ASSERT(IsValid());
     return (type_ == other.type()) && ((other.Bit() & list_) != 0);
   }
 
-  inline bool IncludesAliasOf(int code) const {
+  bool IncludesAliasOf(int code) const {
     VIXL_ASSERT(IsValid());
     return ((code & list_) != 0);
   }
 
-  inline int Count() const {
+  int Count() const {
     VIXL_ASSERT(IsValid());
     return CountSetBits(list_, kRegListSizeInBits);
   }
 
-  inline unsigned RegisterSizeInBits() const {
+  unsigned RegisterSizeInBits() const {
     VIXL_ASSERT(IsValid());
     return size_;
   }
 
-  inline unsigned RegisterSizeInBytes() const {
+  unsigned RegisterSizeInBytes() const {
     int size_in_bits = RegisterSizeInBits();
     VIXL_ASSERT((size_in_bits % 8) == 0);
     return size_in_bits / 8;
   }
 
-  inline unsigned TotalSizeInBytes() const {
+  unsigned TotalSizeInBytes() const {
     VIXL_ASSERT(IsValid());
     return RegisterSizeInBytes() * Count();
   }
@@ -587,8 +606,10 @@
     VIXL_ASSERT(!IsLinked() || IsBound());
   }
 
-  inline bool IsBound() const { return location_ >= 0; }
-  inline bool IsLinked() const { return !links_.empty(); }
+  bool IsBound() const { return location_ >= 0; }
+  bool IsLinked() const { return !links_.empty(); }
+
+  ptrdiff_t location() const { return location_; }
 
  private:
   // The list of linked instructions is stored in a stack-like structure. We
@@ -647,22 +668,20 @@
     std::stack<ptrdiff_t> * links_extended_;
   };
 
-  inline ptrdiff_t location() const { return location_; }
-
-  inline void Bind(ptrdiff_t location) {
+  void Bind(ptrdiff_t location) {
     // Labels can only be bound once.
     VIXL_ASSERT(!IsBound());
     location_ = location;
   }
 
-  inline void AddLink(ptrdiff_t instruction) {
+  void AddLink(ptrdiff_t instruction) {
     // If a label is bound, the assembler already has the information it needs
     // to write the instruction, so there is no need to add it to links_.
     VIXL_ASSERT(!IsBound());
     links_.push(instruction);
   }
 
-  inline ptrdiff_t GetAndRemoveNextLink() {
+  ptrdiff_t GetAndRemoveNextLink() {
     VIXL_ASSERT(IsLinked());
     ptrdiff_t link = links_.top();
     links_.pop();
@@ -845,14 +864,14 @@
 
   // Return the address of an offset in the buffer.
   template <typename T>
-  inline T GetOffsetAddress(ptrdiff_t offset) {
+  T GetOffsetAddress(ptrdiff_t offset) {
     VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
     return buffer_->GetOffsetAddress<T>(offset);
   }
 
   // Return the address of a bound label.
   template <typename T>
-  inline T GetLabelAddress(const Label * label) {
+  T GetLabelAddress(const Label * label) {
     VIXL_ASSERT(label->IsBound());
     VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
     return GetOffsetAddress<T>(label->location());
@@ -860,14 +879,14 @@
 
   // Return the address of the cursor.
   template <typename T>
-  inline T GetCursorAddress() {
+  T GetCursorAddress() {
     VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
     return GetOffsetAddress<T>(CursorOffset());
   }
 
   // Return the address of the start of the buffer.
   template <typename T>
-  inline T GetStartAddress() {
+  T GetStartAddress() {
     VIXL_STATIC_ASSERT(sizeof(T) >= sizeof(uintptr_t));
     return GetOffsetAddress<T>(0);
   }
@@ -1074,20 +1093,20 @@
 
   // Bfm aliases.
   // Bitfield insert.
-  inline void bfi(const Register& rd,
-                  const Register& rn,
-                  unsigned lsb,
-                  unsigned width) {
+  void bfi(const Register& rd,
+           const Register& rn,
+           unsigned lsb,
+           unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     bfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1);
   }
 
   // Bitfield extract and insert low.
-  inline void bfxil(const Register& rd,
-                    const Register& rn,
-                    unsigned lsb,
-                    unsigned width) {
+  void bfxil(const Register& rd,
+             const Register& rn,
+             unsigned lsb,
+             unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     bfm(rd, rn, lsb, lsb + width - 1);
@@ -1095,92 +1114,92 @@
 
   // Sbfm aliases.
   // Arithmetic shift right.
-  inline void asr(const Register& rd, const Register& rn, unsigned shift) {
+  void asr(const Register& rd, const Register& rn, unsigned shift) {
     VIXL_ASSERT(shift < rd.size());
     sbfm(rd, rn, shift, rd.size() - 1);
   }
 
   // Signed bitfield insert with zero at right.
-  inline void sbfiz(const Register& rd,
-                    const Register& rn,
-                    unsigned lsb,
-                    unsigned width) {
+  void sbfiz(const Register& rd,
+             const Register& rn,
+             unsigned lsb,
+             unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     sbfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1);
   }
 
   // Signed bitfield extract.
-  inline void sbfx(const Register& rd,
-                   const Register& rn,
-                   unsigned lsb,
-                   unsigned width) {
+  void sbfx(const Register& rd,
+            const Register& rn,
+            unsigned lsb,
+            unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     sbfm(rd, rn, lsb, lsb + width - 1);
   }
 
   // Signed extend byte.
-  inline void sxtb(const Register& rd, const Register& rn) {
+  void sxtb(const Register& rd, const Register& rn) {
     sbfm(rd, rn, 0, 7);
   }
 
   // Signed extend halfword.
-  inline void sxth(const Register& rd, const Register& rn) {
+  void sxth(const Register& rd, const Register& rn) {
     sbfm(rd, rn, 0, 15);
   }
 
   // Signed extend word.
-  inline void sxtw(const Register& rd, const Register& rn) {
+  void sxtw(const Register& rd, const Register& rn) {
     sbfm(rd, rn, 0, 31);
   }
 
   // Ubfm aliases.
   // Logical shift left.
-  inline void lsl(const Register& rd, const Register& rn, unsigned shift) {
+  void lsl(const Register& rd, const Register& rn, unsigned shift) {
     unsigned reg_size = rd.size();
     VIXL_ASSERT(shift < reg_size);
     ubfm(rd, rn, (reg_size - shift) % reg_size, reg_size - shift - 1);
   }
 
   // Logical shift right.
-  inline void lsr(const Register& rd, const Register& rn, unsigned shift) {
+  void lsr(const Register& rd, const Register& rn, unsigned shift) {
     VIXL_ASSERT(shift < rd.size());
     ubfm(rd, rn, shift, rd.size() - 1);
   }
 
   // Unsigned bitfield insert with zero at right.
-  inline void ubfiz(const Register& rd,
-                    const Register& rn,
-                    unsigned lsb,
-                    unsigned width) {
+  void ubfiz(const Register& rd,
+             const Register& rn,
+             unsigned lsb,
+             unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     ubfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1);
   }
 
   // Unsigned bitfield extract.
-  inline void ubfx(const Register& rd,
-                   const Register& rn,
-                   unsigned lsb,
-                   unsigned width) {
+  void ubfx(const Register& rd,
+            const Register& rn,
+            unsigned lsb,
+            unsigned width) {
     VIXL_ASSERT(width >= 1);
     VIXL_ASSERT(lsb + width <= rn.size());
     ubfm(rd, rn, lsb, lsb + width - 1);
   }
 
   // Unsigned extend byte.
-  inline void uxtb(const Register& rd, const Register& rn) {
+  void uxtb(const Register& rd, const Register& rn) {
     ubfm(rd, rn, 0, 7);
   }
 
   // Unsigned extend halfword.
-  inline void uxth(const Register& rd, const Register& rn) {
+  void uxth(const Register& rd, const Register& rn) {
     ubfm(rd, rn, 0, 15);
   }
 
   // Unsigned extend word.
-  inline void uxtw(const Register& rd, const Register& rn) {
+  void uxtw(const Register& rd, const Register& rn) {
     ubfm(rd, rn, 0, 31);
   }
 
@@ -1230,7 +1249,7 @@
   void cneg(const Register& rd, const Register& rn, Condition cond);
 
   // Rotate right.
-  inline void ror(const Register& rd, const Register& rs, unsigned shift) {
+  void ror(const Register& rd, const Register& rs, unsigned shift) {
     extr(rd, rs, rs, shift);
   }
 
@@ -1495,6 +1514,19 @@
   // Load-acquire register.
   void ldar(const Register& rt, const MemOperand& src);
 
+  // Prefetch memory.
+  void prfm(PrefetchOperation op, const MemOperand& addr,
+            LoadStoreScalingOption option = PreferScaledOffset);
+
+  // Prefetch memory (with unscaled offset).
+  void prfum(PrefetchOperation op, const MemOperand& addr,
+             LoadStoreScalingOption option = PreferUnscaledOffset);
+
+  // Prefetch memory in the literal pool.
+  void prfm(PrefetchOperation op, RawLiteral* literal);
+
+  // Prefetch from pc + imm19 << 2.
+  void prfm(PrefetchOperation op, int imm19);
 
   // Move instructions. The default shift of -1 indicates that the move
   // instruction will calculate an appropriate 16-bit immediate and left shift
@@ -1638,12 +1670,21 @@
   // FP round to integer (nearest with ties to away).
   void frinta(const FPRegister& fd, const FPRegister& fn);
 
+  // FP round to integer (implicit rounding).
+  void frinti(const FPRegister& fd, const FPRegister& fn);
+
   // FP round to integer (toward minus infinity).
   void frintm(const FPRegister& fd, const FPRegister& fn);
 
   // FP round to integer (nearest with ties to even).
   void frintn(const FPRegister& fd, const FPRegister& fn);
 
+  // FP round to integer (toward plus infinity).
+  void frintp(const FPRegister& fd, const FPRegister& fn);
+
+  // FP round to integer (exact, implicit rounding).
+  void frintx(const FPRegister& fd, const FPRegister& fn);
+
   // FP round to integer (towards zero).
   void frintz(const FPRegister& fd, const FPRegister& fn);
 
@@ -1705,16 +1746,16 @@
 
   // Emit generic instructions.
   // Emit raw instructions into the instruction stream.
-  inline void dci(Instr raw_inst) { Emit(raw_inst); }
+  void dci(Instr raw_inst) { Emit(raw_inst); }
 
   // Emit 32 bits of data into the instruction stream.
-  inline void dc32(uint32_t data) {
+  void dc32(uint32_t data) {
     VIXL_ASSERT(buffer_monitor_ > 0);
     buffer_->Emit32(data);
   }
 
   // Emit 64 bits of data into the instruction stream.
-  inline void dc64(uint64_t data) {
+  void dc64(uint64_t data) {
     VIXL_ASSERT(buffer_monitor_ > 0);
     buffer_->Emit64(data);
   }
@@ -1849,14 +1890,14 @@
     }
   }
 
-  static inline Instr ImmS(unsigned imms, unsigned reg_size) {
+  static Instr ImmS(unsigned imms, unsigned reg_size) {
     VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(imms)) ||
            ((reg_size == kWRegSize) && is_uint5(imms)));
     USE(reg_size);
     return imms << ImmS_offset;
   }
 
-  static inline Instr ImmR(unsigned immr, unsigned reg_size) {
+  static Instr ImmR(unsigned immr, unsigned reg_size) {
     VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(immr)) ||
            ((reg_size == kWRegSize) && is_uint5(immr)));
     USE(reg_size);
@@ -1864,7 +1905,7 @@
     return immr << ImmR_offset;
   }
 
-  static inline Instr ImmSetBits(unsigned imms, unsigned reg_size) {
+  static Instr ImmSetBits(unsigned imms, unsigned reg_size) {
     VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize));
     VIXL_ASSERT(is_uint6(imms));
     VIXL_ASSERT((reg_size == kXRegSize) || is_uint6(imms + 3));
@@ -1872,7 +1913,7 @@
     return imms << ImmSetBits_offset;
   }
 
-  static inline Instr ImmRotate(unsigned immr, unsigned reg_size) {
+  static Instr ImmRotate(unsigned immr, unsigned reg_size) {
     VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize));
     VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(immr)) ||
            ((reg_size == kWRegSize) && is_uint5(immr)));
@@ -1880,12 +1921,12 @@
     return immr << ImmRotate_offset;
   }
 
-  static inline Instr ImmLLiteral(int imm19) {
+  static Instr ImmLLiteral(int imm19) {
     VIXL_ASSERT(is_int19(imm19));
     return truncate_to_int19(imm19) << ImmLLiteral_offset;
   }
 
-  static inline Instr BitN(unsigned bitn, unsigned reg_size) {
+  static Instr BitN(unsigned bitn, unsigned reg_size) {
     VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize));
     VIXL_ASSERT((reg_size == kXRegSize) || (bitn == 0));
     USE(reg_size);
@@ -1943,6 +1984,11 @@
     return shift_amount << ImmShiftLS_offset;
   }
 
+  static Instr ImmPrefetchOperation(int imm5) {
+    VIXL_ASSERT(is_uint5(imm5));
+    return imm5 << ImmPrefetchOperation_offset;
+  }
+
   static Instr ImmException(int imm16) {
     VIXL_ASSERT(is_uint16(imm16));
     return imm16 << ImmException_offset;
@@ -2003,12 +2049,32 @@
     return scale << FPScale_offset;
   }
 
+  // Immediate field checking helpers.
+  static bool IsImmAddSub(int64_t immediate);
+  static bool IsImmConditionalCompare(int64_t immediate);
+  static bool IsImmFP32(float imm);
+  static bool IsImmFP64(double imm);
+  static bool IsImmLogical(uint64_t value,
+                           unsigned width,
+                           unsigned* n = NULL,
+                           unsigned* imm_s = NULL,
+                           unsigned* imm_r = NULL);
+  static bool IsImmLSPair(int64_t offset, LSDataSize size);
+  static bool IsImmLSScaled(int64_t offset, LSDataSize size);
+  static bool IsImmLSUnscaled(int64_t offset);
+  static bool IsImmMovn(uint64_t imm, unsigned reg_size);
+  static bool IsImmMovz(uint64_t imm, unsigned reg_size);
+
   // Size of the code generated since label to the current position.
   size_t SizeOfCodeGeneratedSince(Label* label) const {
     VIXL_ASSERT(label->IsBound());
     return buffer_->OffsetFrom(label->location());
   }
 
+  size_t SizeOfCodeGenerated() const {
+    return buffer_->CursorOffset();
+  }
+
   size_t BufferCapacity() const { return buffer_->capacity(); }
 
   size_t RemainingBufferSpace() const { return buffer_->RemainingBytes(); }
@@ -2025,7 +2091,7 @@
     }
   }
 
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
   void AcquireBuffer() {
     VIXL_ASSERT(buffer_monitor_ >= 0);
     buffer_monitor_++;
@@ -2037,16 +2103,16 @@
   }
 #endif
 
-  inline PositionIndependentCodeOption pic() {
+  PositionIndependentCodeOption pic() const {
     return pic_;
   }
 
-  inline bool AllowPageOffsetDependentCode() {
+  bool AllowPageOffsetDependentCode() const {
     return (pic() == PageOffsetDependentCode) ||
            (pic() == PositionDependentCode);
   }
 
-  static inline const Register& AppropriateZeroRegFor(const CPURegister& reg) {
+  static const Register& AppropriateZeroRegFor(const CPURegister& reg) {
     return reg.Is64Bits() ? xzr : wzr;
   }
 
@@ -2056,14 +2122,15 @@
                  const MemOperand& addr,
                  LoadStoreOp op,
                  LoadStoreScalingOption option = PreferScaledOffset);
-  static bool IsImmLSUnscaled(int64_t offset);
-  static bool IsImmLSScaled(int64_t offset, LSDataSize size);
 
   void LoadStorePair(const CPURegister& rt,
                      const CPURegister& rt2,
                      const MemOperand& addr,
                      LoadStorePairOp op);
-  static bool IsImmLSPair(int64_t offset, LSDataSize size);
+
+  void Prefetch(PrefetchOperation op,
+                const MemOperand& addr,
+                LoadStoreScalingOption option = PreferScaledOffset);
 
   // TODO(all): The third parameter should be passed by reference but gcc 4.8.2
   // reports a bogus uninitialised warning then.
@@ -2077,18 +2144,12 @@
                         unsigned imm_s,
                         unsigned imm_r,
                         LogicalOp op);
-  static bool IsImmLogical(uint64_t value,
-                           unsigned width,
-                           unsigned* n = NULL,
-                           unsigned* imm_s = NULL,
-                           unsigned* imm_r = NULL);
 
   void ConditionalCompare(const Register& rn,
                           const Operand& operand,
                           StatusFlags nzcv,
                           Condition cond,
                           ConditionalCompareOp op);
-  static bool IsImmConditionalCompare(int64_t immediate);
 
   void AddSubWithCarry(const Register& rd,
                        const Register& rn,
@@ -2096,8 +2157,6 @@
                        FlagsUpdate S,
                        AddSubWithCarryOp op);
 
-  static bool IsImmFP32(float imm);
-  static bool IsImmFP64(double imm);
 
   // Functions for emulating operands not directly supported by the instruction
   // set.
@@ -2115,7 +2174,6 @@
               const Operand& operand,
               FlagsUpdate S,
               AddSubOp op);
-  static bool IsImmAddSub(int64_t immediate);
 
   // Find an appropriate LoadStoreOp or LoadStorePairOp for the specified
   // registers. Only simple loads are supported; sign- and zero-extension (such
@@ -2180,6 +2238,12 @@
                                const FPRegister& fa,
                                FPDataProcessing3SourceOp op);
 
+  // Encode the specified MemOperand for the specified access size and scaling
+  // preference.
+  Instr LoadStoreMemOperand(const MemOperand& addr,
+                            LSDataSize size,
+                            LoadStoreScalingOption option);
+
   // Link the current (not-yet-emitted) instruction to the specified label, then
   // return an offset to be encoded in the instruction. If the label is not yet
   // bound, an offset of 0 is returned.
@@ -2205,7 +2269,7 @@
   CodeBuffer* buffer_;
   PositionIndependentCodeOption pic_;
 
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
   int64_t buffer_monitor_;
 #endif
 };
@@ -2239,7 +2303,7 @@
                        AssertPolicy assert_policy = kMaximumSize)
       : assm_(assm) {
     if (check_policy == kCheck) assm->EnsureSpaceFor(size);
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
     assm->bind(&start_);
     size_ = size;
     assert_policy_ = assert_policy;
@@ -2251,7 +2315,7 @@
 
   // This is a shortcut for CodeBufferCheckScope(assm, 0, kNoCheck, kNoAssert).
   explicit CodeBufferCheckScope(Assembler* assm) : assm_(assm) {
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
     size_ = 0;
     assert_policy_ = kNoAssert;
     assm->AcquireBuffer();
@@ -2259,7 +2323,7 @@
   }
 
   ~CodeBufferCheckScope() {
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
     assm_->ReleaseBuffer();
     switch (assert_policy_) {
       case kNoAssert: break;
@@ -2277,7 +2341,7 @@
 
  protected:
   Assembler* assm_;
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
   Label start_;
   size_t size_;
   AssertPolicy assert_policy_;
diff --git a/disas/libvixl/a64/constants-a64.h b/disas/libvixl/a64/constants-a64.h
index 7a14f85..bc1a2c4 100644
--- a/disas/libvixl/a64/constants-a64.h
+++ b/disas/libvixl/a64/constants-a64.h
@@ -31,12 +31,6 @@
 
 const unsigned kNumberOfRegisters = 32;
 const unsigned kNumberOfFPRegisters = 32;
-// Callee saved registers are x21-x30(lr).
-const int kNumberOfCalleeSavedRegisters = 10;
-const int kFirstCalleeSavedRegisterIndex = 21;
-// Callee saved FP registers are d8-d15.
-const int kNumberOfCalleeSavedFPRegisters = 8;
-const int kFirstCalleeSavedFPRegisterIndex = 8;
 
 #define REGISTER_CODE_LIST(R)                                                  \
 R(0)  R(1)  R(2)  R(3)  R(4)  R(5)  R(6)  R(7)                                 \
@@ -53,7 +47,6 @@
 V_(Rt, 4, 0, Bits)                        /* Load/store register.         */   \
 V_(Rt2, 14, 10, Bits)                     /* Load/store second register.  */   \
 V_(Rs, 20, 16, Bits)                      /* Exclusive access status.     */   \
-V_(PrefetchMode, 4, 0, Bits)                                                   \
                                                                                \
 /* Common bits */                                                              \
 V_(SixtyFourBits, 31, 31, Bits)                                                \
@@ -109,6 +102,10 @@
 V_(ImmLSPair, 21, 15, SignedBits)                                              \
 V_(SizeLS, 31, 30, Bits)                                                       \
 V_(ImmShiftLS, 12, 12, Bits)                                                   \
+V_(ImmPrefetchOperation, 4, 0, Bits)                                           \
+V_(PrefetchHint, 4, 3, Bits)                                                   \
+V_(PrefetchTarget, 2, 1, Bits)                                                 \
+V_(PrefetchStream, 0, 0, Bits)                                                 \
                                                                                \
 /* Other immediates */                                                         \
 V_(ImmUncondBranch, 25, 0, SignedBits)                                         \
@@ -269,6 +266,29 @@
   BarrierAll    = 3
 };
 
+enum PrefetchOperation {
+  PLDL1KEEP = 0x00,
+  PLDL1STRM = 0x01,
+  PLDL2KEEP = 0x02,
+  PLDL2STRM = 0x03,
+  PLDL3KEEP = 0x04,
+  PLDL3STRM = 0x05,
+
+  PLIL1KEEP = 0x08,
+  PLIL1STRM = 0x09,
+  PLIL2KEEP = 0x0a,
+  PLIL2STRM = 0x0b,
+  PLIL3KEEP = 0x0c,
+  PLIL3STRM = 0x0d,
+
+  PSTL1KEEP = 0x10,
+  PSTL1STRM = 0x11,
+  PSTL2KEEP = 0x12,
+  PSTL2STRM = 0x13,
+  PSTL3KEEP = 0x14,
+  PSTL3STRM = 0x15
+};
+
 // System/special register names.
 // This information is not encoded as one field but as the concatenation of
 // multiple fields (Op0<0>, Op1, Crn, Crm, Op2).
@@ -605,6 +625,12 @@
   LoadStoreAnyFixed = 0x08000000
 };
 
+// Any load pair or store pair.
+enum LoadStorePairAnyOp {
+  LoadStorePairAnyFMask = 0x3a000000,
+  LoadStorePairAnyFixed = 0x28000000
+};
+
 #define LOAD_STORE_PAIR_OP_LIST(V)  \
   V(STP, w,   0x00000000),          \
   V(LDP, w,   0x00400000),          \
@@ -703,17 +729,6 @@
   V(LD, R, d,   0xC4400000)
 
 
-// Load/store unscaled offset.
-enum LoadStoreUnscaledOffsetOp {
-  LoadStoreUnscaledOffsetFixed = 0x38000000,
-  LoadStoreUnscaledOffsetFMask = 0x3B200C00,
-  LoadStoreUnscaledOffsetMask  = 0xFFE00C00,
-  #define LOAD_STORE_UNSCALED(A, B, C, D)  \
-  A##U##B##_##C = LoadStoreUnscaledOffsetFixed | D
-  LOAD_STORE_OP_LIST(LOAD_STORE_UNSCALED)
-  #undef LOAD_STORE_UNSCALED
-};
-
 // Load/store (post, pre, offset and unsigned.)
 enum LoadStoreOp {
   LoadStoreOpMask = 0xC4C00000,
@@ -724,6 +739,18 @@
   PRFM = 0xC0800000
 };
 
+// Load/store unscaled offset.
+enum LoadStoreUnscaledOffsetOp {
+  LoadStoreUnscaledOffsetFixed = 0x38000000,
+  LoadStoreUnscaledOffsetFMask = 0x3B200C00,
+  LoadStoreUnscaledOffsetMask  = 0xFFE00C00,
+  PRFUM                        = LoadStoreUnscaledOffsetFixed | PRFM,
+  #define LOAD_STORE_UNSCALED(A, B, C, D)  \
+  A##U##B##_##C = LoadStoreUnscaledOffsetFixed | D
+  LOAD_STORE_OP_LIST(LOAD_STORE_UNSCALED)
+  #undef LOAD_STORE_UNSCALED
+};
+
 // Load/store post index.
 enum LoadStorePostIndex {
   LoadStorePostIndexFixed = 0x38000400,
diff --git a/disas/libvixl/a64/decoder-a64.h b/disas/libvixl/a64/decoder-a64.h
index 172594c..fd08d6c 100644
--- a/disas/libvixl/a64/decoder-a64.h
+++ b/disas/libvixl/a64/decoder-a64.h
@@ -108,7 +108,7 @@
   }
 
  private:
-  VisitorConstness constness_;
+  const VisitorConstness constness_;
 };
 
 
diff --git a/disas/libvixl/a64/disasm-a64.cc b/disas/libvixl/a64/disasm-a64.cc
index e4a74aa..f7bc246 100644
--- a/disas/libvixl/a64/disasm-a64.cc
+++ b/disas/libvixl/a64/disasm-a64.cc
@@ -34,6 +34,7 @@
   buffer_ = reinterpret_cast<char*>(malloc(buffer_size_));
   buffer_pos_ = 0;
   own_buffer_ = true;
+  code_address_offset_ = 0;
 }
 
 
@@ -42,6 +43,7 @@
   buffer_ = text_buffer;
   buffer_pos_ = 0;
   own_buffer_ = false;
+  code_address_offset_ = 0;
 }
 
 
@@ -739,9 +741,25 @@
   // shift calculation.
   switch (instr->Mask(MoveWideImmediateMask)) {
     case MOVN_w:
-    case MOVN_x: mnemonic = "movn"; break;
+    case MOVN_x:
+      if ((instr->ImmMoveWide()) || (instr->ShiftMoveWide() == 0)) {
+        if ((instr->SixtyFourBits() == 0) && (instr->ImmMoveWide() == 0xffff)) {
+          mnemonic = "movn";
+        } else {
+          mnemonic = "mov";
+          form = "'Rd, 'IMoveNeg";
+        }
+      } else {
+        mnemonic = "movn";
+      }
+      break;
     case MOVZ_w:
-    case MOVZ_x: mnemonic = "movz"; break;
+    case MOVZ_x:
+      if ((instr->ImmMoveWide()) || (instr->ShiftMoveWide() == 0))
+        mnemonic = "mov";
+      else
+        mnemonic = "movz";
+      break;
     case MOVK_w:
     case MOVK_x: mnemonic = "movk"; form = "'Rd, 'IMoveLSL"; break;
     default: VIXL_UNREACHABLE();
@@ -806,7 +824,7 @@
     case A##_unsigned: mnemonic = B; form = C ", ['Xns'ILU]"; break;
     LOAD_STORE_LIST(LS_UNSIGNEDOFFSET)
     #undef LS_UNSIGNEDOFFSET
-    case PRFM_unsigned: mnemonic = "prfm"; form = "'PrefOp, ['Xn'ILU]";
+    case PRFM_unsigned: mnemonic = "prfm"; form = "'PrefOp, ['Xns'ILU]";
   }
   Format(instr, mnemonic, form);
 }
@@ -833,6 +851,7 @@
   const char *form_x = "'Xt, ['Xns'ILS]";
   const char *form_s = "'St, ['Xns'ILS]";
   const char *form_d = "'Dt, ['Xns'ILS]";
+  const char *form_prefetch = "'PrefOp, ['Xns'ILS]";
 
   switch (instr->Mask(LoadStoreUnscaledOffsetMask)) {
     case STURB_w:  mnemonic = "sturb"; break;
@@ -852,6 +871,7 @@
     case LDURSH_x: form = form_x;  // Fall through.
     case LDURSH_w: mnemonic = "ldursh"; break;
     case LDURSW_x: mnemonic = "ldursw"; form = form_x; break;
+    case PRFUM:    mnemonic = "prfum"; form = form_prefetch; break;
     default: form = "(LoadStoreUnscaledOffset)";
   }
   Format(instr, mnemonic, form);
@@ -872,6 +892,11 @@
       form = "'Xt, 'ILLiteral 'LValue";
       break;
     }
+    case PRFM_lit: {
+      mnemonic = "prfm";
+      form = "'PrefOp, 'ILLiteral 'LValue";
+      break;
+    }
     default: mnemonic = "unimplemented";
   }
   Format(instr, mnemonic, form);
@@ -1344,7 +1369,7 @@
 void Disassembler::AppendAddressToOutput(const Instruction* instr,
                                          const void* addr) {
   USE(instr);
-  AppendToOutput("(addr %p)", addr);
+  AppendToOutput("(addr 0x%" PRIxPTR ")", reinterpret_cast<uintptr_t>(addr));
 }
 
 
@@ -1360,6 +1385,40 @@
 }
 
 
+void Disassembler::AppendCodeRelativeAddressToOutput(const Instruction* instr,
+                                                     const void* addr) {
+  USE(instr);
+  int64_t rel_addr = CodeRelativeAddress(addr);
+  if (rel_addr >= 0) {
+    AppendToOutput("(addr 0x%" PRIx64 ")", rel_addr);
+  } else {
+    AppendToOutput("(addr -0x%" PRIx64 ")", -rel_addr);
+  }
+}
+
+
+void Disassembler::AppendCodeRelativeCodeAddressToOutput(
+    const Instruction* instr, const void* addr) {
+  AppendCodeRelativeAddressToOutput(instr, addr);
+}
+
+
+void Disassembler::AppendCodeRelativeDataAddressToOutput(
+    const Instruction* instr, const void* addr) {
+  AppendCodeRelativeAddressToOutput(instr, addr);
+}
+
+
+void Disassembler::MapCodeAddress(int64_t base_address,
+                                  const Instruction* instr_address) {
+  set_code_address_offset(
+      base_address - reinterpret_cast<intptr_t>(instr_address));
+}
+int64_t Disassembler::CodeRelativeAddress(const void* addr) {
+  return reinterpret_cast<intptr_t>(addr) + code_address_offset();
+}
+
+
 void Disassembler::Format(const Instruction* instr, const char* mnemonic,
                           const char* format) {
   VIXL_ASSERT(mnemonic != NULL);
@@ -1486,16 +1545,20 @@
   VIXL_ASSERT(format[0] == 'I');
 
   switch (format[1]) {
-    case 'M': {  // IMoveImm or IMoveLSL.
-      if (format[5] == 'I') {
-        uint64_t imm = instr->ImmMoveWide() << (16 * instr->ShiftMoveWide());
-        AppendToOutput("#0x%" PRIx64, imm);
-      } else {
-        VIXL_ASSERT(format[5] == 'L');
+    case 'M': {  // IMoveImm, IMoveNeg or IMoveLSL.
+      if (format[5] == 'L') {
         AppendToOutput("#0x%" PRIx64, instr->ImmMoveWide());
         if (instr->ShiftMoveWide() > 0) {
           AppendToOutput(", lsl #%" PRId64, 16 * instr->ShiftMoveWide());
         }
+      } else {
+        VIXL_ASSERT((format[5] == 'I') || (format[5] == 'N'));
+        uint64_t imm = instr->ImmMoveWide() << (16 * instr->ShiftMoveWide());
+        if (format[5] == 'N')
+          imm = ~imm;
+        if (!instr->SixtyFourBits())
+          imm &= UINT64_C(0xffffffff);
+        AppendToOutput("#0x%" PRIx64, imm);
       }
       return 8;
     }
@@ -1634,14 +1697,31 @@
   VIXL_ASSERT(strncmp(format, "LValue", 6) == 0);
   USE(format);
 
+  const void * address = instr->LiteralAddress<const void *>();
   switch (instr->Mask(LoadLiteralMask)) {
     case LDR_w_lit:
     case LDR_x_lit:
     case LDRSW_x_lit:
     case LDR_s_lit:
     case LDR_d_lit:
-      AppendDataAddressToOutput(instr, instr->LiteralAddress());
+      AppendCodeRelativeDataAddressToOutput(instr, address);
       break;
+    case PRFM_lit: {
+      // Use the prefetch hint to decide how to print the address.
+      switch (instr->PrefetchHint()) {
+        case 0x0:     // PLD: prefetch for load.
+        case 0x2:     // PST: prepare for store.
+          AppendCodeRelativeDataAddressToOutput(instr, address);
+          break;
+        case 0x1:     // PLI: preload instructions.
+          AppendCodeRelativeCodeAddressToOutput(instr, address);
+          break;
+        case 0x3:     // Unallocated hint.
+          AppendCodeRelativeAddressToOutput(instr, address);
+          break;
+      }
+      break;
+    }
     default:
       VIXL_UNREACHABLE();
   }
@@ -1701,17 +1781,22 @@
               (strcmp(format, "AddrPCRelPage") == 0));    // Used by `adrp`.
 
   int64_t offset = instr->ImmPCRel();
-  const Instruction * base = instr;
 
+  // Compute the target address based on the effective address (after applying
+  // code_address_offset). This is required for correct behaviour of adrp.
+  const Instruction* base = instr + code_address_offset();
   if (format[9] == 'P') {
     offset *= kPageSize;
     base = AlignDown(base, kPageSize);
   }
+  // Strip code_address_offset before printing, so we can use the
+  // semantically-correct AppendCodeRelativeAddressToOutput.
+  const void* target =
+      reinterpret_cast<const void*>(base + offset - code_address_offset());
 
-  const void* target = reinterpret_cast<const void*>(base + offset);
   AppendPCRelativeOffsetToOutput(instr, offset);
   AppendToOutput(" ");
-  AppendAddressToOutput(instr, target);
+  AppendCodeRelativeAddressToOutput(instr, target);
   return 13;
 }
 
@@ -1738,7 +1823,7 @@
 
   AppendPCRelativeOffsetToOutput(instr, offset);
   AppendToOutput(" ");
-  AppendCodeAddressToOutput(instr, target_address);
+  AppendCodeRelativeCodeAddressToOutput(instr, target_address);
 
   return 8;
 }
@@ -1805,13 +1890,26 @@
   VIXL_ASSERT(format[0] == 'P');
   USE(format);
 
-  int prefetch_mode = instr->PrefetchMode();
+  static const char* hints[] = {"ld", "li", "st"};
+  static const char* stream_options[] = {"keep", "strm"};
 
-  const char* ls = (prefetch_mode & 0x10) ? "st" : "ld";
-  int level = (prefetch_mode >> 1) + 1;
-  const char* ks = (prefetch_mode & 1) ? "strm" : "keep";
+  unsigned hint = instr->PrefetchHint();
+  unsigned target = instr->PrefetchTarget() + 1;
+  unsigned stream = instr->PrefetchStream();
 
-  AppendToOutput("p%sl%d%s", ls, level, ks);
+  if ((hint >= (sizeof(hints) / sizeof(hints[0]))) || (target > 3)) {
+    // Unallocated prefetch operations.
+    int prefetch_mode = instr->ImmPrefetchOperation();
+    AppendToOutput("#0b%c%c%c%c%c",
+                   (prefetch_mode & (1 << 4)) ? '1' : '0',
+                   (prefetch_mode & (1 << 3)) ? '1' : '0',
+                   (prefetch_mode & (1 << 2)) ? '1' : '0',
+                   (prefetch_mode & (1 << 1)) ? '1' : '0',
+                   (prefetch_mode & (1 << 0)) ? '1' : '0');
+  } else {
+    VIXL_ASSERT(stream < (sizeof(stream_options) / sizeof(stream_options[0])));
+    AppendToOutput("p%sl%d%s", hints[hint], target, stream_options[stream]);
+  }
   return 6;
 }
 
diff --git a/disas/libvixl/a64/disasm-a64.h b/disas/libvixl/a64/disasm-a64.h
index db04337..ddfe98b 100644
--- a/disas/libvixl/a64/disasm-a64.h
+++ b/disas/libvixl/a64/disasm-a64.h
@@ -43,7 +43,7 @@
   char* GetOutput();
 
   // Declare all Visitor functions.
-  #define DECLARE(A)  void Visit##A(const Instruction* instr);
+  #define DECLARE(A) virtual void Visit##A(const Instruction* instr);
   VISITOR_LIST(DECLARE)
   #undef DECLARE
 
@@ -65,23 +65,45 @@
 
   // Prints an address, in the general case. It can be code or data. This is
   // used for example to print the target address of an ADR instruction.
-  virtual void AppendAddressToOutput(const Instruction* instr,
-                                     const void* addr);
+  virtual void AppendCodeRelativeAddressToOutput(const Instruction* instr,
+                                                 const void* addr);
 
   // Prints the address of some code.
   // This is used for example to print the target address of a branch to an
   // immediate offset.
   // A sub-class can for example override this method to lookup the address and
   // print an appropriate name.
-  virtual void AppendCodeAddressToOutput(const Instruction* instr,
-                                         const void* addr);
+  virtual void AppendCodeRelativeCodeAddressToOutput(const Instruction* instr,
+                                                     const void* addr);
 
   // Prints the address of some data.
   // This is used for example to print the source address of a load literal
   // instruction.
+  virtual void AppendCodeRelativeDataAddressToOutput(const Instruction* instr,
+                                                     const void* addr);
+
+  // Same as the above, but for addresses that are not relative to the code
+  // buffer. They are currently not used by VIXL.
+  virtual void AppendAddressToOutput(const Instruction* instr,
+                                     const void* addr);
+  virtual void AppendCodeAddressToOutput(const Instruction* instr,
+                                         const void* addr);
   virtual void AppendDataAddressToOutput(const Instruction* instr,
                                          const void* addr);
 
+ public:
+  // Get/Set the offset that should be added to code addresses when printing
+  // code-relative addresses in the AppendCodeRelative<Type>AddressToOutput()
+  // helpers.
+  // Below is an example of how a branch immediate instruction in memory at
+  // address 0xb010200 would disassemble with different offsets.
+  // Base address | Disassembly
+  //          0x0 | 0xb010200:  b #+0xcc  (addr 0xb0102cc)
+  //      0x10000 | 0xb000200:  b #+0xcc  (addr 0xb0002cc)
+  //    0xb010200 |       0x0:  b #+0xcc  (addr 0xcc)
+  void MapCodeAddress(int64_t base_address, const Instruction* instr_address);
+  int64_t CodeRelativeAddress(const void* instr);
+
  private:
   void Format(
       const Instruction* instr, const char* mnemonic, const char* format);
@@ -101,32 +123,40 @@
   int SubstitutePrefetchField(const Instruction* instr, const char* format);
   int SubstituteBarrierField(const Instruction* instr, const char* format);
 
-  inline bool RdIsZROrSP(const Instruction* instr) const {
+  bool RdIsZROrSP(const Instruction* instr) const {
     return (instr->Rd() == kZeroRegCode);
   }
 
-  inline bool RnIsZROrSP(const Instruction* instr) const {
+  bool RnIsZROrSP(const Instruction* instr) const {
     return (instr->Rn() == kZeroRegCode);
   }
 
-  inline bool RmIsZROrSP(const Instruction* instr) const {
+  bool RmIsZROrSP(const Instruction* instr) const {
     return (instr->Rm() == kZeroRegCode);
   }
 
-  inline bool RaIsZROrSP(const Instruction* instr) const {
+  bool RaIsZROrSP(const Instruction* instr) const {
     return (instr->Ra() == kZeroRegCode);
   }
 
   bool IsMovzMovnImm(unsigned reg_size, uint64_t value);
 
+  int64_t code_address_offset() const { return code_address_offset_; }
+
  protected:
   void ResetOutput();
   void AppendToOutput(const char* string, ...) PRINTF_CHECK(2, 3);
 
+  void set_code_address_offset(int64_t code_address_offset) {
+    code_address_offset_ = code_address_offset;
+  }
+
   char* buffer_;
   uint32_t buffer_pos_;
   uint32_t buffer_size_;
   bool own_buffer_;
+
+  int64_t code_address_offset_;
 };
 
 
diff --git a/disas/libvixl/a64/instructions-a64.cc b/disas/libvixl/a64/instructions-a64.cc
index 1f08c78..b091886 100644
--- a/disas/libvixl/a64/instructions-a64.cc
+++ b/disas/libvixl/a64/instructions-a64.cc
@@ -30,6 +30,20 @@
 namespace vixl {
 
 
+// Floating-point infinity values.
+const float kFP32PositiveInfinity = rawbits_to_float(0x7f800000);
+const float kFP32NegativeInfinity = rawbits_to_float(0xff800000);
+const double kFP64PositiveInfinity =
+    rawbits_to_double(UINT64_C(0x7ff0000000000000));
+const double kFP64NegativeInfinity =
+    rawbits_to_double(UINT64_C(0xfff0000000000000));
+
+
+// The default NaN values (for FPCR.DN=1).
+const double kFP64DefaultNaN = rawbits_to_double(UINT64_C(0x7ff8000000000000));
+const float kFP32DefaultNaN = rawbits_to_float(0x7fc00000);
+
+
 static uint64_t RotateRight(uint64_t value,
                             unsigned int rotate,
                             unsigned int width) {
@@ -54,6 +68,55 @@
 }
 
 
+bool Instruction::IsLoad() const {
+  if (Mask(LoadStoreAnyFMask) != LoadStoreAnyFixed) {
+    return false;
+  }
+
+  if (Mask(LoadStorePairAnyFMask) == LoadStorePairAnyFixed) {
+    return Mask(LoadStorePairLBit) != 0;
+  } else {
+    LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreOpMask));
+    switch (op) {
+      case LDRB_w:
+      case LDRH_w:
+      case LDR_w:
+      case LDR_x:
+      case LDRSB_w:
+      case LDRSB_x:
+      case LDRSH_w:
+      case LDRSH_x:
+      case LDRSW_x:
+      case LDR_s:
+      case LDR_d: return true;
+      default: return false;
+    }
+  }
+}
+
+
+bool Instruction::IsStore() const {
+  if (Mask(LoadStoreAnyFMask) != LoadStoreAnyFixed) {
+    return false;
+  }
+
+  if (Mask(LoadStorePairAnyFMask) == LoadStorePairAnyFixed) {
+    return Mask(LoadStorePairLBit) == 0;
+  } else {
+    LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreOpMask));
+    switch (op) {
+      case STRB_w:
+      case STRH_w:
+      case STR_w:
+      case STR_x:
+      case STR_s:
+      case STR_d: return true;
+      default: return false;
+    }
+  }
+}
+
+
 // Logical immediates can't encode zero, so a return value of zero is used to
 // indicate a failure case. Specifically, where the constraints on imm_s are
 // not met.
diff --git a/disas/libvixl/a64/instructions-a64.h b/disas/libvixl/a64/instructions-a64.h
index 29f9722..f1d883c 100644
--- a/disas/libvixl/a64/instructions-a64.h
+++ b/disas/libvixl/a64/instructions-a64.h
@@ -96,6 +96,17 @@
 const unsigned kFloatMantissaBits = 23;
 const unsigned kFloatExponentBits = 8;
 
+// Floating-point infinity values.
+extern const float kFP32PositiveInfinity;
+extern const float kFP32NegativeInfinity;
+extern const double kFP64PositiveInfinity;
+extern const double kFP64NegativeInfinity;
+
+// The default NaN values (for FPCR.DN=1).
+extern const double kFP64DefaultNaN;
+extern const float kFP32DefaultNaN;
+
+
 enum LSDataSize {
   LSByte        = 0,
   LSHalfword    = 1,
@@ -140,33 +151,33 @@
 
 class Instruction {
  public:
-  inline Instr InstructionBits() const {
+  Instr InstructionBits() const {
     return *(reinterpret_cast<const Instr*>(this));
   }
 
-  inline void SetInstructionBits(Instr new_instr) {
+  void SetInstructionBits(Instr new_instr) {
     *(reinterpret_cast<Instr*>(this)) = new_instr;
   }
 
-  inline int Bit(int pos) const {
+  int Bit(int pos) const {
     return (InstructionBits() >> pos) & 1;
   }
 
-  inline uint32_t Bits(int msb, int lsb) const {
+  uint32_t Bits(int msb, int lsb) const {
     return unsigned_bitextract_32(msb, lsb, InstructionBits());
   }
 
-  inline int32_t SignedBits(int msb, int lsb) const {
+  int32_t SignedBits(int msb, int lsb) const {
     int32_t bits = *(reinterpret_cast<const int32_t*>(this));
     return signed_bitextract_32(msb, lsb, bits);
   }
 
-  inline Instr Mask(uint32_t mask) const {
+  Instr Mask(uint32_t mask) const {
     return InstructionBits() & mask;
   }
 
   #define DEFINE_GETTER(Name, HighBit, LowBit, Func)             \
-  inline int64_t Name() const { return Func(HighBit, LowBit); }
+  int64_t Name() const { return Func(HighBit, LowBit); }
   INSTRUCTION_FIELDS_LIST(DEFINE_GETTER)
   #undef DEFINE_GETTER
 
@@ -182,56 +193,64 @@
   float ImmFP32() const;
   double ImmFP64() const;
 
-  inline LSDataSize SizeLSPair() const {
+  LSDataSize SizeLSPair() const {
     return CalcLSPairDataSize(
              static_cast<LoadStorePairOp>(Mask(LoadStorePairMask)));
   }
 
   // Helpers.
-  inline bool IsCondBranchImm() const {
+  bool IsCondBranchImm() const {
     return Mask(ConditionalBranchFMask) == ConditionalBranchFixed;
   }
 
-  inline bool IsUncondBranchImm() const {
+  bool IsUncondBranchImm() const {
     return Mask(UnconditionalBranchFMask) == UnconditionalBranchFixed;
   }
 
-  inline bool IsCompareBranch() const {
+  bool IsCompareBranch() const {
     return Mask(CompareBranchFMask) == CompareBranchFixed;
   }
 
-  inline bool IsTestBranch() const {
+  bool IsTestBranch() const {
     return Mask(TestBranchFMask) == TestBranchFixed;
   }
 
-  inline bool IsPCRelAddressing() const {
+  bool IsPCRelAddressing() const {
     return Mask(PCRelAddressingFMask) == PCRelAddressingFixed;
   }
 
-  inline bool IsLogicalImmediate() const {
+  bool IsLogicalImmediate() const {
     return Mask(LogicalImmediateFMask) == LogicalImmediateFixed;
   }
 
-  inline bool IsAddSubImmediate() const {
+  bool IsAddSubImmediate() const {
     return Mask(AddSubImmediateFMask) == AddSubImmediateFixed;
   }
 
-  inline bool IsAddSubExtended() const {
+  bool IsAddSubExtended() const {
     return Mask(AddSubExtendedFMask) == AddSubExtendedFixed;
   }
 
-  inline bool IsLoadOrStore() const {
+  bool IsLoadOrStore() const {
     return Mask(LoadStoreAnyFMask) == LoadStoreAnyFixed;
   }
 
-  inline bool IsMovn() const {
+  bool IsLoad() const;
+  bool IsStore() const;
+
+  bool IsLoadLiteral() const {
+    // This includes PRFM_lit.
+    return Mask(LoadLiteralFMask) == LoadLiteralFixed;
+  }
+
+  bool IsMovn() const {
     return (Mask(MoveWideImmediateMask) == MOVN_x) ||
            (Mask(MoveWideImmediateMask) == MOVN_w);
   }
 
   // Indicate whether Rd can be the stack pointer or the zero register. This
   // does not check that the instruction actually has an Rd field.
-  inline Reg31Mode RdMode() const {
+  Reg31Mode RdMode() const {
     // The following instructions use sp or wsp as Rd:
     //  Add/sub (immediate) when not setting the flags.
     //  Add/sub (extended) when not setting the flags.
@@ -260,7 +279,7 @@
 
   // Indicate whether Rn can be the stack pointer or the zero register. This
   // does not check that the instruction actually has an Rn field.
-  inline Reg31Mode RnMode() const {
+  Reg31Mode RnMode() const {
     // The following instructions use sp or wsp as Rn:
     //  All loads and stores.
     //  Add/sub (immediate).
@@ -272,7 +291,7 @@
     return Reg31IsZeroRegister;
   }
 
-  inline ImmBranchType BranchType() const {
+  ImmBranchType BranchType() const {
     if (IsCondBranchImm()) {
       return CondBranchType;
     } else if (IsUncondBranchImm()) {
@@ -296,55 +315,66 @@
   // Patch a literal load instruction to load from 'source'.
   void SetImmLLiteral(const Instruction* source);
 
-  inline uint8_t* LiteralAddress() const {
-    int offset = ImmLLiteral() << kLiteralEntrySizeLog2;
-    const uint8_t* address = reinterpret_cast<const uint8_t*>(this) + offset;
-    // Note that the result is safely mutable only if the backing buffer is
-    // safely mutable.
-    return const_cast<uint8_t*>(address);
+  // Calculate the address of a literal referred to by a load-literal
+  // instruction, and return it as the specified type.
+  //
+  // The literal itself is safely mutable only if the backing buffer is safely
+  // mutable.
+  template <typename T>
+  T LiteralAddress() const {
+    uint64_t base_raw = reinterpret_cast<uintptr_t>(this);
+    ptrdiff_t offset = ImmLLiteral() << kLiteralEntrySizeLog2;
+    uint64_t address_raw = base_raw + offset;
+
+    // Cast the address using a C-style cast. A reinterpret_cast would be
+    // appropriate, but it can't cast one integral type to another.
+    T address = (T)(address_raw);
+
+    // Assert that the address can be represented by the specified type.
+    VIXL_ASSERT((uint64_t)(address) == address_raw);
+
+    return address;
   }
 
-  inline uint32_t Literal32() const {
+  uint32_t Literal32() const {
     uint32_t literal;
-    memcpy(&literal, LiteralAddress(), sizeof(literal));
-
+    memcpy(&literal, LiteralAddress<const void*>(), sizeof(literal));
     return literal;
   }
 
-  inline uint64_t Literal64() const {
+  uint64_t Literal64() const {
     uint64_t literal;
-    memcpy(&literal, LiteralAddress(), sizeof(literal));
-
+    memcpy(&literal, LiteralAddress<const void*>(), sizeof(literal));
     return literal;
   }
 
-  inline float LiteralFP32() const {
+  float LiteralFP32() const {
     return rawbits_to_float(Literal32());
   }
 
-  inline double LiteralFP64() const {
+  double LiteralFP64() const {
     return rawbits_to_double(Literal64());
   }
 
-  inline const Instruction* NextInstruction() const {
+  const Instruction* NextInstruction() const {
     return this + kInstructionSize;
   }
 
-  inline const Instruction* InstructionAtOffset(int64_t offset) const {
+  const Instruction* InstructionAtOffset(int64_t offset) const {
     VIXL_ASSERT(IsWordAligned(this + offset));
     return this + offset;
   }
 
-  template<typename T> static inline Instruction* Cast(T src) {
+  template<typename T> static Instruction* Cast(T src) {
     return reinterpret_cast<Instruction*>(src);
   }
 
-  template<typename T> static inline const Instruction* CastConst(T src) {
+  template<typename T> static const Instruction* CastConst(T src) {
     return reinterpret_cast<const Instruction*>(src);
   }
 
  private:
-  inline int ImmBranch() const;
+  int ImmBranch() const;
 
   void SetPCRelImmTarget(const Instruction* target);
   void SetBranchImmTarget(const Instruction* target);
diff --git a/disas/libvixl/globals.h b/disas/libvixl/globals.h
index e28dc66..0c24931 100644
--- a/disas/libvixl/globals.h
+++ b/disas/libvixl/globals.h
@@ -58,7 +58,7 @@
 const int MBytes = 1024 * KBytes;
 
 #define VIXL_ABORT() printf("in %s, line %i", __FILE__, __LINE__); abort()
-#ifdef DEBUG
+#ifdef VIXL_DEBUG
   #define VIXL_ASSERT(condition) assert(condition)
   #define VIXL_CHECK(condition) VIXL_ASSERT(condition)
   #define VIXL_UNIMPLEMENTED() printf("UNIMPLEMENTED\t"); VIXL_ABORT()
diff --git a/disas/libvixl/utils.cc b/disas/libvixl/utils.cc
index 21965d7..80b132a 100644
--- a/disas/libvixl/utils.cc
+++ b/disas/libvixl/utils.cc
@@ -135,4 +135,17 @@
   return (value != 0) && ((value & (value - 1)) == 0);
 }
 
+
+unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) {
+  VIXL_ASSERT((reg_size % 8) == 0);
+  int count = 0;
+  for (unsigned i = 0; i < (reg_size / 16); i++) {
+    if ((imm & 0xffff) == 0) {
+      count++;
+    }
+    imm >>= 16;
+  }
+  return count;
+}
+
 }  // namespace vixl
diff --git a/disas/libvixl/utils.h b/disas/libvixl/utils.h
index 1540c30..b440626 100644
--- a/disas/libvixl/utils.h
+++ b/disas/libvixl/utils.h
@@ -166,6 +166,8 @@
 uint64_t LowestSetBit(uint64_t value);
 bool IsPowerOf2(int64_t value);
 
+unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size);
+
 // Pointer alignment
 // TODO: rename/refactor to make it specific to instructions.
 template<typename T>
@@ -174,14 +176,14 @@
   return ((intptr_t)(pointer) & 3) == 0;
 }
 
-// Increment a pointer until it has the specified alignment.
+// Increment a pointer (up to 64 bits) until it has the specified alignment.
 template<class T>
 T AlignUp(T pointer, size_t alignment) {
   // Use C-style casts to get static_cast behaviour for integral types (T), and
   // reinterpret_cast behaviour for other types.
 
-  uintptr_t pointer_raw = (uintptr_t)pointer;
-  VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(pointer_raw));
+  uint64_t pointer_raw = (uint64_t)pointer;
+  VIXL_STATIC_ASSERT(sizeof(pointer) <= sizeof(pointer_raw));
 
   size_t align_step = (alignment - pointer_raw) % alignment;
   VIXL_ASSERT((pointer_raw + align_step) % alignment == 0);
@@ -189,14 +191,14 @@
   return (T)(pointer_raw + align_step);
 }
 
-// Decrement a pointer until it has the specified alignment.
+// Decrement a pointer (up to 64 bits) until it has the specified alignment.
 template<class T>
 T AlignDown(T pointer, size_t alignment) {
   // Use C-style casts to get static_cast behaviour for integral types (T), and
   // reinterpret_cast behaviour for other types.
 
-  uintptr_t pointer_raw = (uintptr_t)pointer;
-  VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(pointer_raw));
+  uint64_t pointer_raw = (uint64_t)pointer;
+  VIXL_STATIC_ASSERT(sizeof(pointer) <= sizeof(pointer_raw));
 
   size_t align_step = pointer_raw % alignment;
   VIXL_ASSERT((pointer_raw - align_step) % alignment == 0);
diff --git a/disas/s390.c b/disas/s390.c
index 25499ba..974460c 100644
--- a/disas/s390.c
+++ b/disas/s390.c
@@ -106,10 +106,6 @@
 static const struct s390_opcode s390_opcodes[];
 static const int                s390_num_opcodes;
 
-/* A opcode format table for the .insn pseudo mnemonic.  */
-static const struct s390_opcode s390_opformats[];
-static const int                s390_num_opformats;
-
 /* Values defined for the flags field of a struct powerpc_opcode.  */
 
 /* The operands table is an array of struct s390_operand.  */
@@ -844,37 +840,6 @@
 #define MASK_SIY_DRI     { 0xff, 0x00, 0x00, 0x00, 0x00, 0xff }
 /* QEMU-END */
 
-/* The opcode formats table (blueprints for .insn pseudo mnemonic).  */
-
-static const struct s390_opcode s390_opformats[] =
-  {
-  { "e",	OP8(0x00LL),	MASK_E,		INSTR_E,	3, 0 },
-  { "ri",	OP8(0x00LL),	MASK_RI_RI,	INSTR_RI_RI,	3, 0 },
-  { "rie",	OP8(0x00LL),	MASK_RIE_RRP,	INSTR_RIE_RRP,	3, 0 },
-  { "ril",	OP8(0x00LL),	MASK_RIL_RP,	INSTR_RIL_RP,	3, 0 },
-  { "rilu",	OP8(0x00LL),	MASK_RIL_RU,	INSTR_RIL_RU,	3, 0 },
-  { "rr",	OP8(0x00LL),	MASK_RR_RR,	INSTR_RR_RR,	3, 0 },
-  { "rre",	OP8(0x00LL),	MASK_RRE_RR,	INSTR_RRE_RR,	3, 0 },
-  { "rrf",	OP8(0x00LL),	MASK_RRF_RURR,	INSTR_RRF_RURR,	3, 0 },
-  { "rs",	OP8(0x00LL),	MASK_RS_RRRD,	INSTR_RS_RRRD,	3, 0 },
-  { "rse",	OP8(0x00LL),	MASK_RSE_RRRD,	INSTR_RSE_RRRD,	3, 0 },
-  { "rsi",	OP8(0x00LL),	MASK_RSI_RRP,	INSTR_RSI_RRP,	3, 0 },
-  { "rsy",	OP8(0x00LL),	MASK_RSY_RRRD,	INSTR_RSY_RRRD,	3, 3 },
-  { "rx",	OP8(0x00LL),	MASK_RX_RRRD,	INSTR_RX_RRRD,	3, 0 },
-  { "rxe",	OP8(0x00LL),	MASK_RXE_RRRD,	INSTR_RXE_RRRD,	3, 0 },
-  { "rxf",	OP8(0x00LL),	MASK_RXF_RRRDR,	INSTR_RXF_RRRDR,3, 0 },
-  { "rxy",	OP8(0x00LL),	MASK_RXY_RRRD,	INSTR_RXY_RRRD,	3, 3 },
-  { "s",	OP8(0x00LL),	MASK_S_RD,	INSTR_S_RD,	3, 0 },
-  { "si",	OP8(0x00LL),	MASK_SI_URD,	INSTR_SI_URD,	3, 0 },
-  { "siy",	OP8(0x00LL),	MASK_SIY_URD,	INSTR_SIY_URD,	3, 3 },
-  { "ss",	OP8(0x00LL),	MASK_SS_RRRDRD,	INSTR_SS_RRRDRD,3, 0 },
-  { "sse",	OP8(0x00LL),	MASK_SSE_RDRD,	INSTR_SSE_RDRD,	3, 0 },
-  { "ssf",	OP8(0x00LL),	MASK_SSF_RRDRD,	INSTR_SSF_RRDRD,3, 0 },
-};
-
-static const int s390_num_opformats =
-  sizeof (s390_opformats) / sizeof (s390_opformats[0]);
-
 /* include "s390-opc.tab" generated from opcodes/s390-opc.txt rev 1.17 */
 /* The opcode table. This file was generated by s390-mkopc.
 
diff --git a/docs/rcu.txt b/docs/rcu.txt
new file mode 100644
index 0000000..61752b9
--- /dev/null
+++ b/docs/rcu.txt
@@ -0,0 +1,387 @@
+Using RCU (Read-Copy-Update) for synchronization
+================================================
+
+Read-copy update (RCU) is a synchronization mechanism that is used to
+protect read-mostly data structures.  RCU is very efficient and scalable
+on the read side (it is wait-free), and thus can make the read paths
+extremely fast.
+
+RCU supports concurrency between a single writer and multiple readers,
+thus it is not used alone.  Typically, the write-side will use a lock to
+serialize multiple updates, but other approaches are possible (e.g.,
+restricting updates to a single task).  In QEMU, when a lock is used,
+this will often be the "iothread mutex", also known as the "big QEMU
+lock" (BQL).  Also, restricting updates to a single task is done in
+QEMU using the "bottom half" API.
+
+RCU is fundamentally a "wait-to-finish" mechanism.  The read side marks
+sections of code with "critical sections", and the update side will wait
+for the execution of all *currently running* critical sections before
+proceeding, or before asynchronously executing a callback.
+
+The key point here is that only the currently running critical sections
+are waited for; critical sections that are started _after_ the beginning
+of the wait do not extend the wait, despite running concurrently with
+the updater.  This is the reason why RCU is more scalable than,
+for example, reader-writer locks.  It is so much more scalable that
+the system will have a single instance of the RCU mechanism; a single
+mechanism can be used for an arbitrary number of "things", without
+having to worry about things such as contention or deadlocks.
+
+How is this possible?  The basic idea is to split updates in two phases,
+"removal" and "reclamation".  During removal, we ensure that subsequent
+readers will not be able to get a reference to the old data.  After
+removal has completed, a critical section will not be able to access
+the old data.  Therefore, critical sections that begin after removal
+do not matter; as soon as all previous critical sections have finished,
+there cannot be any readers who hold references to the data structure,
+and these can now be safely reclaimed (e.g., freed or unref'ed).
+
+Here is a picutre:
+
+        thread 1                  thread 2                  thread 3
+    -------------------    ------------------------    -------------------
+    enter RCU crit.sec.
+           |                finish removal phase
+           |                begin wait
+           |                      |                    enter RCU crit.sec.
+    exit RCU crit.sec             |                           |
+                            complete wait                     |
+                            begin reclamation phase           |
+                                                       exit RCU crit.sec.
+
+
+Note how thread 3 is still executing its critical section when thread 2
+starts reclaiming data.  This is possible, because the old version of the
+data structure was not accessible at the time thread 3 began executing
+that critical section.
+
+
+RCU API
+=======
+
+The core RCU API is small:
+
+     void rcu_read_lock(void);
+
+        Used by a reader to inform the reclaimer that the reader is
+        entering an RCU read-side critical section.
+
+     void rcu_read_unlock(void);
+
+        Used by a reader to inform the reclaimer that the reader is
+        exiting an RCU read-side critical section.  Note that RCU
+        read-side critical sections may be nested and/or overlapping.
+
+     void synchronize_rcu(void);
+
+        Blocks until all pre-existing RCU read-side critical sections
+        on all threads have completed.  This marks the end of the removal
+        phase and the beginning of reclamation phase.
+
+        Note that it would be valid for another update to come while
+        synchronize_rcu is running.  Because of this, it is better that
+        the updater releases any locks it may hold before calling
+        synchronize_rcu.  If this is not possible (for example, because
+        the updater is protected by the BQL), you can use call_rcu.
+
+     void call_rcu1(struct rcu_head * head,
+                    void (*func)(struct rcu_head *head));
+
+        This function invokes func(head) after all pre-existing RCU
+        read-side critical sections on all threads have completed.  This
+        marks the end of the removal phase, with func taking care
+        asynchronously of the reclamation phase.
+
+        The foo struct needs to have an rcu_head structure added,
+        perhaps as follows:
+
+            struct foo {
+                struct rcu_head rcu;
+                int a;
+                char b;
+                long c;
+            };
+
+        so that the reclaimer function can fetch the struct foo address
+        and free it:
+
+            call_rcu1(&foo.rcu, foo_reclaim);
+
+            void foo_reclaim(struct rcu_head *rp)
+            {
+                struct foo *fp = container_of(rp, struct foo, rcu);
+                g_free(fp);
+            }
+
+        For the common case where the rcu_head member is the first of the
+        struct, you can use the following macro.
+
+     void call_rcu(T *p,
+                   void (*func)(T *p),
+                   field-name);
+
+        call_rcu1 is typically used through this macro, in the common case
+        where the "struct rcu_head" is the first field in the struct.  In
+        the above case, one could have written simply:
+
+            call_rcu(foo_reclaim, g_free, rcu);
+
+     typeof(*p) atomic_rcu_read(p);
+
+        atomic_rcu_read() is similar to atomic_mb_read(), but it makes
+        some assumptions on the code that calls it.  This allows a more
+        optimized implementation.
+
+        atomic_rcu_read assumes that whenever a single RCU critical
+        section reads multiple shared data, these reads are either
+        data-dependent or need no ordering.  This is almost always the
+        case when using RCU, because read-side critical sections typically
+        navigate one or more pointers (the pointers that are changed on
+        every update) until reaching a data structure of interest,
+        and then read from there.
+
+        RCU read-side critical sections must use atomic_rcu_read() to
+        read data, unless concurrent writes are presented by another
+        synchronization mechanism.
+
+        Furthermore, RCU read-side critical sections should traverse the
+        data structure in a single direction, opposite to the direction
+        in which the updater initializes it.
+
+     void atomic_rcu_set(p, typeof(*p) v);
+
+        atomic_rcu_set() is also similar to atomic_mb_set(), and it also
+        makes assumptions on the code that calls it in order to allow a more
+        optimized implementation.
+
+        In particular, atomic_rcu_set() suffices for synchronization
+        with readers, if the updater never mutates a field within a
+        data item that is already accessible to readers.  This is the
+        case when initializing a new copy of the RCU-protected data
+        structure; just ensure that initialization of *p is carried out
+        before atomic_rcu_set() makes the data item visible to readers.
+        If this rule is observed, writes will happen in the opposite
+        order as reads in the RCU read-side critical sections (or if
+        there is just one update), and there will be no need for other
+        synchronization mechanism to coordinate the accesses.
+
+The following APIs must be used before RCU is used in a thread:
+
+     void rcu_register_thread(void);
+
+        Mark a thread as taking part in the RCU mechanism.  Such a thread
+        will have to report quiescent points regularly, either manually
+        or through the QemuCond/QemuSemaphore/QemuEvent APIs.
+
+     void rcu_unregister_thread(void);
+
+        Mark a thread as not taking part anymore in the RCU mechanism.
+        It is not a problem if such a thread reports quiescent points,
+        either manually or by using the QemuCond/QemuSemaphore/QemuEvent
+        APIs.
+
+Note that these APIs are relatively heavyweight, and should _not_ be
+nested.
+
+
+DIFFERENCES WITH LINUX
+======================
+
+- Waiting on a mutex is possible, though discouraged, within an RCU critical
+  section.  This is because spinlocks are rarely (if ever) used in userspace
+  programming; not allowing this would prevent upgrading an RCU read-side
+  critical section to become an updater.
+
+- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and
+  rcu_assign_pointer.  They take a _pointer_ to the variable being accessed.
+
+- call_rcu is a macro that has an extra argument (the name of the first
+  field in the struct, which must be a struct rcu_head), and expects the
+  type of the callback's argument to be the type of the first argument.
+  call_rcu1 is the same as Linux's call_rcu.
+
+
+RCU PATTERNS
+============
+
+Many patterns using read-writer locks translate directly to RCU, with
+the advantages of higher scalability and deadlock immunity.
+
+In general, RCU can be used whenever it is possible to create a new
+"version" of a data structure every time the updater runs.  This may
+sound like a very strict restriction, however:
+
+- the updater does not mean "everything that writes to a data structure",
+  but rather "everything that involves a reclamation step".  See the
+  array example below
+
+- in some cases, creating a new version of a data structure may actually
+  be very cheap.  For example, modifying the "next" pointer of a singly
+  linked list is effectively creating a new version of the list.
+
+Here are some frequently-used RCU idioms that are worth noting.
+
+
+RCU list processing
+-------------------
+
+TBD (not yet used in QEMU)
+
+
+RCU reference counting
+----------------------
+
+Because grace periods are not allowed to complete while there is an RCU
+read-side critical section in progress, the RCU read-side primitives
+may be used as a restricted reference-counting mechanism.  For example,
+consider the following code fragment:
+
+    rcu_read_lock();
+    p = atomic_rcu_read(&foo);
+    /* do something with p. */
+    rcu_read_unlock();
+
+The RCU read-side critical section ensures that the value of "p" remains
+valid until after the rcu_read_unlock().  In some sense, it is acquiring
+a reference to p that is later released when the critical section ends.
+The write side looks simply like this (with appropriate locking):
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    synchronize_rcu();
+    free(old);
+
+If the processing cannot be done purely within the critical section, it
+is possible to combine this idiom with a "real" reference count:
+
+    rcu_read_lock();
+    p = atomic_rcu_read(&foo);
+    foo_ref(p);
+    rcu_read_unlock();
+    /* do something with p. */
+    foo_unref(p);
+
+The write side can be like this:
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    synchronize_rcu();
+    foo_unref(old);
+
+or with call_rcu:
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    call_rcu(foo_unref, old, rcu);
+
+In both cases, the write side only performs removal.  Reclamation
+happens when the last reference to a "foo" object is dropped.
+Using synchronize_rcu() is undesirably expensive, because the
+last reference may be dropped on the read side.  Hence you can
+use call_rcu() instead:
+
+     foo_unref(struct foo *p) {
+        if (atomic_fetch_dec(&p->refcount) == 1) {
+            call_rcu(foo_destroy, p, rcu);
+        }
+    }
+
+
+Note that the same idioms would be possible with reader/writer
+locks:
+
+    read_lock(&foo_rwlock);         write_mutex_lock(&foo_rwlock);
+    p = foo;                        p = foo;
+    /* do something with p. */      foo = new;
+    read_unlock(&foo_rwlock);       free(p);
+                                    write_mutex_unlock(&foo_rwlock);
+                                    free(p);
+
+    ------------------------------------------------------------------
+
+    read_lock(&foo_rwlock);         write_mutex_lock(&foo_rwlock);
+    p = foo;                        old = foo;
+    foo_ref(p);                     foo = new;
+    read_unlock(&foo_rwlock);       foo_unref(old);
+    /* do something with p. */      write_mutex_unlock(&foo_rwlock);
+    read_lock(&foo_rwlock);
+    foo_unref(p);
+    read_unlock(&foo_rwlock);
+
+foo_unref could use a mechanism such as bottom halves to move deallocation
+out of the write-side critical section.
+
+
+RCU resizable arrays
+--------------------
+
+Resizable arrays can be used with RCU.  The expensive RCU synchronization
+(or call_rcu) only needs to take place when the array is resized.
+The two items to take care of are:
+
+- ensuring that the old version of the array is available between removal
+  and reclamation;
+
+- avoiding mismatches in the read side between the array data and the
+  array size.
+
+The first problem is avoided simply by not using realloc.  Instead,
+each resize will allocate a new array and copy the old data into it.
+The second problem would arise if the size and the data pointers were
+two members of a larger struct:
+
+    struct mystuff {
+        ...
+        int data_size;
+        int data_alloc;
+        T   *data;
+        ...
+    };
+
+Instead, we store the size of the array with the array itself:
+
+    struct arr {
+        int size;
+        int alloc;
+        T   data[];
+    };
+    struct arr *global_array;
+
+    read side:
+        rcu_read_lock();
+        struct arr *array = atomic_rcu_read(&global_array);
+        x = i < array->size ? array->data[i] : -1;
+        rcu_read_unlock();
+        return x;
+
+    write side (running under a lock):
+        if (global_array->size == global_array->alloc) {
+            /* Creating a new version.  */
+            new_array = g_malloc(sizeof(struct arr) +
+                                 global_array->alloc * 2 * sizeof(T));
+            new_array->size = global_array->size;
+            new_array->alloc = global_array->alloc * 2;
+            memcpy(new_array->data, global_array->data,
+                   global_array->alloc * sizeof(T));
+
+            /* Removal phase.  */
+            old_array = global_array;
+            atomic_rcu_set(&new_array->data, new_array);
+            synchronize_rcu();
+
+            /* Reclamation phase.  */
+            free(old_array);
+        }
+
+
+SOURCES
+=======
+
+* Documentation/RCU/ from the Linux kernel
diff --git a/hmp.c b/hmp.c
index 481be80..a42c5c0 100644
--- a/hmp.c
+++ b/hmp.c
@@ -535,6 +535,7 @@
     qapi_free_VncInfo(info);
 }
 
+#ifdef CONFIG_SPICE
 void hmp_info_spice(Monitor *mon, const QDict *qdict)
 {
     SpiceChannelList *chan;
@@ -581,6 +582,7 @@
 out:
     qapi_free_SpiceInfo(info);
 }
+#endif
 
 void hmp_info_balloon(Monitor *mon, const QDict *qdict)
 {
diff --git a/hw/9pfs/virtio-9p-synth.c b/hw/9pfs/virtio-9p-synth.c
index 71262bc..e75aa87 100644
--- a/hw/9pfs/virtio-9p-synth.c
+++ b/hw/9pfs/virtio-9p-synth.c
@@ -17,6 +17,7 @@
 #include "virtio-9p-xattr.h"
 #include "fsdev/qemu-fsdev.h"
 #include "virtio-9p-synth.h"
+#include "qemu/rcu.h"
 
 #include <sys/stat.h>
 
diff --git a/hw/arm/armv7m.c b/hw/arm/armv7m.c
index ef24ca4..c6eab6d 100644
--- a/hw/arm/armv7m.c
+++ b/hw/arm/armv7m.c
@@ -163,30 +163,23 @@
 }
 
 /* Init CPU and memory for a v7-M based board.
-   flash_size and sram_size are in kb.
+   mem_size is in bytes.
    Returns the NVIC array.  */
 
-qemu_irq *armv7m_init(MemoryRegion *system_memory,
-                      int flash_size, int sram_size,
+qemu_irq *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,
                       const char *kernel_filename, const char *cpu_model)
 {
     ARMCPU *cpu;
     CPUARMState *env;
     DeviceState *nvic;
-    /* FIXME: make this local state.  */
-    static qemu_irq pic[64];
+    qemu_irq *pic = g_new(qemu_irq, num_irq);
     int image_size;
     uint64_t entry;
     uint64_t lowaddr;
     int i;
     int big_endian;
-    MemoryRegion *sram = g_new(MemoryRegion, 1);
-    MemoryRegion *flash = g_new(MemoryRegion, 1);
     MemoryRegion *hack = g_new(MemoryRegion, 1);
 
-    flash_size *= 1024;
-    sram_size *= 1024;
-
     if (cpu_model == NULL) {
 	cpu_model = "cortex-m3";
     }
@@ -197,35 +190,15 @@
     }
     env = &cpu->env;
 
-#if 0
-    /* > 32Mb SRAM gets complicated because it overlaps the bitband area.
-       We don't have proper commandline options, so allocate half of memory
-       as SRAM, up to a maximum of 32Mb, and the rest as code.  */
-    if (ram_size > (512 + 32) * 1024 * 1024)
-        ram_size = (512 + 32) * 1024 * 1024;
-    sram_size = (ram_size / 2) & TARGET_PAGE_MASK;
-    if (sram_size > 32 * 1024 * 1024)
-        sram_size = 32 * 1024 * 1024;
-    code_size = ram_size - sram_size;
-#endif
-
-    /* Flash programming is done via the SCU, so pretend it is ROM.  */
-    memory_region_init_ram(flash, NULL, "armv7m.flash", flash_size,
-                           &error_abort);
-    vmstate_register_ram_global(flash);
-    memory_region_set_readonly(flash, true);
-    memory_region_add_subregion(system_memory, 0, flash);
-    memory_region_init_ram(sram, NULL, "armv7m.sram", sram_size, &error_abort);
-    vmstate_register_ram_global(sram);
-    memory_region_add_subregion(system_memory, 0x20000000, sram);
     armv7m_bitband_init();
 
     nvic = qdev_create(NULL, "armv7m_nvic");
+    qdev_prop_set_uint32(nvic, "num-irq", num_irq);
     env->nvic = nvic;
     qdev_init_nofail(nvic);
     sysbus_connect_irq(SYS_BUS_DEVICE(nvic), 0,
                        qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ));
-    for (i = 0; i < 64; i++) {
+    for (i = 0; i < num_irq; i++) {
         pic[i] = qdev_get_gpio_in(nvic, i);
     }
 
@@ -244,7 +217,7 @@
         image_size = load_elf(kernel_filename, NULL, NULL, &entry, &lowaddr,
                               NULL, big_endian, ELF_MACHINE, 1);
         if (image_size < 0) {
-            image_size = load_image_targphys(kernel_filename, 0, flash_size);
+            image_size = load_image_targphys(kernel_filename, 0, mem_size);
             lowaddr = 0;
         }
         if (image_size < 0) {
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 52ebd8b..a48d1b2 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -463,8 +463,26 @@
              * (SCR.NS = 0), we change that here if non-secure boot has been
              * requested.
              */
-            if (arm_feature(env, ARM_FEATURE_EL3) && !info->secure_boot) {
-                env->cp15.scr_el3 |= SCR_NS;
+            if (arm_feature(env, ARM_FEATURE_EL3)) {
+                /* AArch64 is defined to come out of reset into EL3 if enabled.
+                 * If we are booting Linux then we need to adjust our EL as
+                 * Linux expects us to be in EL2 or EL1.  AArch32 resets into
+                 * SVC, which Linux expects, so no privilege/exception level to
+                 * adjust.
+                 */
+                if (env->aarch64) {
+                    if (arm_feature(env, ARM_FEATURE_EL2)) {
+                        env->pstate = PSTATE_MODE_EL2h;
+                    } else {
+                        env->pstate = PSTATE_MODE_EL1h;
+                    }
+                }
+
+                /* Set to non-secure if not a secure boot */
+                if (!info->secure_boot) {
+                    /* Linux expects non-secure state */
+                    env->cp15.scr_el3 |= SCR_NS;
+                }
             }
 
             if (CPU(cpu) == first_cpu) {
diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c
index ccc3b18..cb515ec 100644
--- a/hw/arm/stellaris.c
+++ b/hw/arm/stellaris.c
@@ -29,6 +29,8 @@
 #define BP_OLED_SSI  0x02
 #define BP_GAMEPAD   0x04
 
+#define NUM_IRQ_LINES 64
+
 typedef const struct {
     const char *name;
     uint32_t did0;
@@ -1220,10 +1222,27 @@
     int i;
     int j;
 
-    flash_size = ((board->dc0 & 0xffff) + 1) << 1;
-    sram_size = (board->dc0 >> 18) + 1;
-    pic = armv7m_init(get_system_memory(),
-                      flash_size, sram_size, kernel_filename, cpu_model);
+    MemoryRegion *sram = g_new(MemoryRegion, 1);
+    MemoryRegion *flash = g_new(MemoryRegion, 1);
+    MemoryRegion *system_memory = get_system_memory();
+
+    flash_size = (((board->dc0 & 0xffff) + 1) << 1) * 1024;
+    sram_size = ((board->dc0 >> 18) + 1) * 1024;
+
+    /* Flash programming is done via the SCU, so pretend it is ROM.  */
+    memory_region_init_ram(flash, NULL, "stellaris.flash", flash_size,
+                           &error_abort);
+    vmstate_register_ram_global(flash);
+    memory_region_set_readonly(flash, true);
+    memory_region_add_subregion(system_memory, 0, flash);
+
+    memory_region_init_ram(sram, NULL, "stellaris.sram", sram_size,
+                           &error_abort);
+    vmstate_register_ram_global(sram);
+    memory_region_add_subregion(system_memory, 0x20000000, sram);
+
+    pic = armv7m_init(system_memory, flash_size, NUM_IRQ_LINES,
+                      kernel_filename, cpu_model);
 
     if (board->dc1 & (1 << 16)) {
         dev = sysbus_create_varargs(TYPE_STELLARIS_ADC, 0x40038000,
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 2353440..34d9379 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -441,10 +441,32 @@
     int i;
     hwaddr size = vbi->memmap[VIRT_MMIO].size;
 
-    /* Note that we have to create the transports in forwards order
-     * so that command line devices are inserted lowest address first,
-     * and then add dtb nodes in reverse order so that they appear in
-     * the finished device tree lowest address first.
+    /* We create the transports in forwards order. Since qbus_realize()
+     * prepends (not appends) new child buses, the incrementing loop below will
+     * create a list of virtio-mmio buses with decreasing base addresses.
+     *
+     * When a -device option is processed from the command line,
+     * qbus_find_recursive() picks the next free virtio-mmio bus in forwards
+     * order. The upshot is that -device options in increasing command line
+     * order are mapped to virtio-mmio buses with decreasing base addresses.
+     *
+     * When this code was originally written, that arrangement ensured that the
+     * guest Linux kernel would give the lowest "name" (/dev/vda, eth0, etc) to
+     * the first -device on the command line. (The end-to-end order is a
+     * function of this loop, qbus_realize(), qbus_find_recursive(), and the
+     * guest kernel's name-to-address assignment strategy.)
+     *
+     * Meanwhile, the kernel's traversal seems to have been reversed; see eg.
+     * the message, if not necessarily the code, of commit 70161ff336.
+     * Therefore the loop now establishes the inverse of the original intent.
+     *
+     * Unfortunately, we can't counteract the kernel change by reversing the
+     * loop; it would break existing command lines.
+     *
+     * In any case, the kernel makes no guarantee about the stability of
+     * enumeration order of virtio devices (as demonstrated by it changing
+     * between kernel versions). For reliable and stable identification
+     * of disks users must use UUIDs or similar mechanisms.
      */
     for (i = 0; i < NUM_VIRTIO_TRANSPORTS; i++) {
         int irq = vbi->irqmap[VIRT_MMIO] + i;
@@ -453,6 +475,13 @@
         sysbus_create_simple("virtio-mmio", base, pic[irq]);
     }
 
+    /* We add dtb nodes in reverse order so that they appear in the finished
+     * device tree lowest address first.
+     *
+     * Note that this mapping is independent of the loop above. The previous
+     * loop influences virtio device to virtio transport assignment, whereas
+     * this loop controls how virtio transports are laid out in the dtb.
+     */
     for (i = NUM_VIRTIO_TRANSPORTS - 1; i >= 0; i--) {
         char *nodename;
         int irq = vbi->irqmap[VIRT_MMIO] + i;
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index d5e0e41..d508930 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -513,7 +513,7 @@
      * This makes us compatible with old devices
      * which never set or clear this bit. */
     s->config[PCI_STATUS] &= ~PCI_STATUS_INTERRUPT;
-    vmstate_save_state(f, pci_get_vmstate(s), s);
+    vmstate_save_state(f, pci_get_vmstate(s), s, NULL);
     /* Restore the interrupt status bit. */
     pci_update_irq_status(s);
 }
diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
index 3b77c9a..4ba8409 100644
--- a/hw/s390x/ipl.c
+++ b/hw/s390x/ipl.c
@@ -62,6 +62,7 @@
 static int s390_ipl_init(SysBusDevice *dev)
 {
     S390IPLState *ipl = S390_IPL(dev);
+    uint64_t pentry = KERN_IMAGE_START;
     int kernel_size;
 
     if (!ipl->kernel) {
@@ -94,31 +95,31 @@
             hw_error("could not load bootloader '%s'\n", bios_name);
         }
         return 0;
-    } else {
-        uint64_t pentry = KERN_IMAGE_START;
-        kernel_size = load_elf(ipl->kernel, NULL, NULL, &pentry, NULL,
-                               NULL, 1, ELF_MACHINE, 0);
-        if (kernel_size < 0) {
-            kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
-        }
-        if (kernel_size < 0) {
-            fprintf(stderr, "could not load kernel '%s'\n", ipl->kernel);
-            return -1;
-        }
-        /*
-         * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
-         * kernel parameters here as well. Note: For old kernels (up to 3.2)
-         * we can not rely on the ELF entry point - it was 0x800 (the SALIPL
-         * loader) and it won't work. For this case we force it to 0x10000, too.
-         */
-        if (pentry == KERN_IMAGE_START || pentry == 0x800) {
-            ipl->start_addr = KERN_IMAGE_START;
-            /* Overwrite parameters in the kernel image, which are "rom" */
-            strcpy(rom_ptr(KERN_PARM_AREA), ipl->cmdline);
-        } else {
-            ipl->start_addr = pentry;
-        }
     }
+
+    kernel_size = load_elf(ipl->kernel, NULL, NULL, &pentry, NULL,
+                           NULL, 1, ELF_MACHINE, 0);
+    if (kernel_size < 0) {
+        kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
+    }
+    if (kernel_size < 0) {
+        fprintf(stderr, "could not load kernel '%s'\n", ipl->kernel);
+        return -1;
+    }
+    /*
+     * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
+     * kernel parameters here as well. Note: For old kernels (up to 3.2)
+     * we can not rely on the ELF entry point - it was 0x800 (the SALIPL
+     * loader) and it won't work. For this case we force it to 0x10000, too.
+     */
+    if (pentry == KERN_IMAGE_START || pentry == 0x800) {
+        ipl->start_addr = KERN_IMAGE_START;
+        /* Overwrite parameters in the kernel image, which are "rom" */
+        strcpy(rom_ptr(KERN_PARM_AREA), ipl->cmdline);
+    } else {
+        ipl->start_addr = pentry;
+    }
+
     if (ipl->initrd) {
         ram_addr_t initrd_offset;
         int initrd_size;
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 1201b8d..dc455a2 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -170,7 +170,7 @@
     S390pciState *s = S390_PCI_HOST_BRIDGE(
         object_resolve_path(TYPE_S390_PCI_HOST_BRIDGE, NULL));
 
-    if (!s) {
+    if (!s || !fh) {
         return NULL;
     }
 
@@ -187,7 +187,7 @@
 static void s390_pci_generate_event(uint8_t cc, uint16_t pec, uint32_t fh,
                                     uint32_t fid, uint64_t faddr, uint32_t e)
 {
-    SeiContainer *sei_cont = g_malloc0(sizeof(SeiContainer));
+    SeiContainer *sei_cont;
     S390pciState *s = S390_PCI_HOST_BRIDGE(
         object_resolve_path(TYPE_S390_PCI_HOST_BRIDGE, NULL));
 
@@ -195,6 +195,7 @@
         return;
     }
 
+    sei_cont = g_malloc0(sizeof(SeiContainer));
     sei_cont->fh = fh;
     sei_cont->fid = fid;
     sei_cont->cc = cc;
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 5ea13e5..9e5bc5b 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -487,7 +487,7 @@
     CPUS390XState *env = &cpu->env;
     uint32_t fh;
     S390PCIBusDevice *pbdev;
-    ram_addr_t size;
+    hwaddr start, end;
     IOMMUTLBEntry entry;
     MemoryRegion *mr;
 
@@ -504,7 +504,8 @@
     }
 
     fh = env->regs[r1] >> 32;
-    size = env->regs[r2 + 1];
+    start = env->regs[r2];
+    end = start + env->regs[r2 + 1];
 
     pbdev = s390_pci_find_dev_by_fh(fh);
 
@@ -515,15 +516,18 @@
     }
 
     mr = pci_device_iommu_address_space(pbdev->pdev)->root;
-    entry = mr->iommu_ops->translate(mr, env->regs[r2], 0);
+    while (start < end) {
+        entry = mr->iommu_ops->translate(mr, start, 0);
 
-    if (!entry.translated_addr) {
-        setcc(cpu, ZPCI_PCI_LS_ERR);
-        goto out;
+        if (!entry.translated_addr) {
+            setcc(cpu, ZPCI_PCI_LS_ERR);
+            goto out;
+        }
+
+        memory_region_notify_iommu(mr, entry);
+        start += entry.addr_mask + 1;
     }
 
-    entry.addr_mask = size - 1;
-    memory_region_notify_iommu(mr, entry);
     setcc(cpu, ZPCI_PCI_LS_OK);
 out:
     return 0;
@@ -784,10 +788,10 @@
     stq_p(&fib.aisb, pbdev->routes.adapter.summary_addr);
     stq_p(&fib.fmb_addr, pbdev->fmb_addr);
 
-    data = (pbdev->isc << 28) | (pbdev->noi << 16) |
-           (pbdev->routes.adapter.ind_offset << 8) | (pbdev->sum << 7) |
-           pbdev->routes.adapter.summary_offset;
-    stw_p(&fib.data, data);
+    data = ((uint32_t)pbdev->isc << 28) | ((uint32_t)pbdev->noi << 16) |
+           ((uint32_t)pbdev->routes.adapter.ind_offset << 8) |
+           ((uint32_t)pbdev->sum << 7) | pbdev->routes.adapter.summary_offset;
+    stl_p(&fib.data, data);
 
     if (pbdev->fh >> ENABLE_BIT_OFFSET) {
         fib.fc |= 0x80;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 9b740a3..db39ae0 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1756,6 +1756,8 @@
     req->io_canceled = true;
     if (req->aiocb) {
         blk_aio_cancel_async(req->aiocb);
+    } else {
+        scsi_req_cancel_complete(req);
     }
 }
 
diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c
index 20b20f0..3639235 100644
--- a/hw/scsi/spapr_vscsi.c
+++ b/hw/scsi/spapr_vscsi.c
@@ -630,7 +630,7 @@
     vscsi_req *req = sreq->hba_private;
     assert(req->active);
 
-    vmstate_save_state(f, &vmstate_spapr_vscsi_req, req);
+    vmstate_save_state(f, &vmstate_spapr_vscsi_req, req, NULL);
 
     DPRINTF("VSCSI: saving tag=%u, current desc#%d, offset=%x\n",
             req->qtag, req->cur_desc_num, req->cur_desc_offset);
diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c
index 5a107fa..0600c9a 100644
--- a/hw/timer/mc146818rtc.c
+++ b/hw/timer/mc146818rtc.c
@@ -734,7 +734,7 @@
 }
 
 static const VMStateDescription vmstate_rtc_irq_reinject_on_ack_count = {
-    .name = "irq_reinject_on_ack_count",
+    .name = "mc146818rtc/irq_reinject_on_ack_count",
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index cf483ff..e71385e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -32,7 +32,7 @@
 #include "trace.h"
 
 struct vfio_group_head vfio_group_list =
-    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+    QLIST_HEAD_INITIALIZER(vfio_group_list);
 struct vfio_as_head vfio_address_spaces =
     QLIST_HEAD_INITIALIZER(vfio_address_spaces);
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 014a92c..29caabc 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3065,6 +3065,7 @@
 {
     g_free(vdev->vbasedev.name);
     if (vdev->msix) {
+        object_unparent(OBJECT(&vdev->msix->mmap_mem));
         g_free(vdev->msix);
         vdev->msix = NULL;
     }
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 013979a..d735343 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -955,7 +955,7 @@
     }
 
     /* Subsections */
-    vmstate_save_state(f, &vmstate_virtio, vdev);
+    vmstate_save_state(f, &vmstate_virtio, vdev, NULL);
 }
 
 int virtio_set_features(VirtIODevice *vdev, uint32_t val)
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 0e825ea..1673287 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -244,9 +244,31 @@
 #undef MEMSUFFIX
 #endif /* (NB_MMU_MODES >= 6) */
 
-#if (NB_MMU_MODES > 6)
-#error "NB_MMU_MODES > 6 is not supported for now"
-#endif /* (NB_MMU_MODES > 6) */
+#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX)
+
+#define CPU_MMU_INDEX 6
+#define MEMSUFFIX MMU_MODE6_SUFFIX
+#define DATA_SIZE 1
+#include "exec/cpu_ldst_template.h"
+
+#define DATA_SIZE 2
+#include "exec/cpu_ldst_template.h"
+
+#define DATA_SIZE 4
+#include "exec/cpu_ldst_template.h"
+
+#define DATA_SIZE 8
+#include "exec/cpu_ldst_template.h"
+#undef CPU_MMU_INDEX
+#undef MEMSUFFIX
+#endif /* (NB_MMU_MODES >= 7) */
+
+#if (NB_MMU_MODES > 7)
+/* Note that supporting NB_MMU_MODES == 9 would require
+ * changes to at least the ARM TCG backend.
+ */
+#error "NB_MMU_MODES > 7 is not supported for now"
+#endif /* (NB_MMU_MODES > 7) */
 
 /* these access are slower, they must be as rare as possible */
 #define CPU_MMU_INDEX (cpu_mmu_index(env))
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 0cd96b1..06ffa1d 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -33,6 +33,7 @@
 #include "qemu/notify.h"
 #include "qapi/error.h"
 #include "qom/object.h"
+#include "qemu/rcu.h"
 
 #define MAX_PHYS_ADDR_SPACE_BITS 62
 #define MAX_PHYS_ADDR            (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1)
@@ -207,9 +208,13 @@
  */
 struct AddressSpace {
     /* All fields are private. */
+    struct rcu_head rcu;
     char *name;
     MemoryRegion *root;
+
+    /* Accessed via RCU.  */
     struct FlatView *current_map;
+
     int ioeventfd_nb;
     struct MemoryRegionIoeventfd *ioeventfds;
     struct AddressSpaceDispatch *dispatch;
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index c4bf56d..5c940eb 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -15,8 +15,7 @@
 #include "hw/irq.h"
 
 /* armv7m.c */
-qemu_irq *armv7m_init(MemoryRegion *system_memory,
-                      int flash_size, int sram_size,
+qemu_irq *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq,
                       const char *kernel_filename, const char *cpu_model);
 
 /* arm_boot.c */
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 3cb5ba8..f37348b 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -33,6 +33,7 @@
 #define QEMU_VM_SECTION_END          0x03
 #define QEMU_VM_SECTION_FULL         0x04
 #define QEMU_VM_SUBSECTION           0x05
+#define QEMU_VM_VMDESCRIPTION        0x06
 
 struct MigrationParams {
     bool blk;
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index d843c00..a923cec 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -121,6 +121,7 @@
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
+int64_t qemu_ftell_fast(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
 /*
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index fa307a6..0b26bc6 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -29,6 +29,7 @@
 #ifndef CONFIG_USER_ONLY
 #include <migration/qemu-file.h>
 #endif
+#include <qjson.h>
 
 typedef void SaveStateHandler(QEMUFile *f, void *opaque);
 typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
@@ -801,7 +802,7 @@
 int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
                        void *opaque, int version_id);
 void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
-                        void *opaque);
+                        void *opaque, QJSON *vmdesc);
 
 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
                                    const VMStateDescription *vmsd,
diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h
index 0ca6cbd..eeaf0cb 100644
--- a/include/qapi/qmp/qerror.h
+++ b/include/qapi/qmp/qerror.h
@@ -52,9 +52,6 @@
 #define QERR_BUS_NOT_FOUND \
     ERROR_CLASS_GENERIC_ERROR, "Bus '%s' not found"
 
-#define QERR_COMMAND_NOT_FOUND \
-    ERROR_CLASS_COMMAND_NOT_FOUND, "The command %s has not been found"
-
 #define QERR_DEVICE_ENCRYPTED \
     ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted"
 
@@ -73,9 +70,6 @@
 #define QERR_DEVICE_NO_HOTPLUG \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging"
 
-#define QERR_DEVICE_NOT_ACTIVE \
-    ERROR_CLASS_DEVICE_NOT_ACTIVE, "No %s device has been activated"
-
 #define QERR_DEVICE_NOT_ENCRYPTED \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' is not encrypted"
 
@@ -112,9 +106,6 @@
 #define QERR_JSON_PARSING \
     ERROR_CLASS_GENERIC_ERROR, "Invalid JSON syntax"
 
-#define QERR_KVM_MISSING_CAP \
-    ERROR_CLASS_KVM_MISSING_CAP, "Using KVM without %s, %s unavailable"
-
 #define QERR_MIGRATION_ACTIVE \
     ERROR_CLASS_GENERIC_ERROR, "There's a migration process in progress"
 
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 93c2ae2..98e05ca 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -129,6 +129,67 @@
 #define atomic_set(ptr, i)     ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
 #endif
 
+/**
+ * atomic_rcu_read - reads a RCU-protected pointer to a local variable
+ * into a RCU read-side critical section. The pointer can later be safely
+ * dereferenced within the critical section.
+ *
+ * This ensures that the pointer copy is invariant thorough the whole critical
+ * section.
+ *
+ * Inserts memory barriers on architectures that require them (currently only
+ * Alpha) and documents which pointers are protected by RCU.
+ *
+ * Unless the __ATOMIC_CONSUME memory order is available, atomic_rcu_read also
+ * includes a compiler barrier to ensure that value-speculative optimizations
+ * (e.g. VSS: Value Speculation Scheduling) does not perform the data read
+ * before the pointer read by speculating the value of the pointer.  On new
+ * enough compilers, atomic_load takes care of such concern about
+ * dependency-breaking optimizations.
+ *
+ * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
+ */
+#ifndef atomic_rcu_read
+#ifdef __ATOMIC_CONSUME
+#define atomic_rcu_read(ptr)    ({                \
+    typeof(*ptr) _val;                            \
+     __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
+    _val;                                         \
+})
+#else
+#define atomic_rcu_read(ptr)    ({                \
+    typeof(*ptr) _val = atomic_read(ptr);         \
+    smp_read_barrier_depends();                   \
+    _val;                                         \
+})
+#endif
+#endif
+
+/**
+ * atomic_rcu_set - assigns (publicizes) a pointer to a new data structure
+ * meant to be read by RCU read-side critical sections.
+ *
+ * Documents which pointers will be dereferenced by RCU read-side critical
+ * sections and adds the required memory barriers on architectures requiring
+ * them. It also makes sure the compiler does not reorder code initializing the
+ * data structure before its publication.
+ *
+ * Should match atomic_rcu_read().
+ */
+#ifndef atomic_rcu_set
+#ifdef __ATOMIC_RELEASE
+#define atomic_rcu_set(ptr, i)  do {              \
+    typeof(*ptr) _val = (i);                      \
+    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
+} while(0)
+#else
+#define atomic_rcu_set(ptr, i)  do {              \
+    smp_wmb();                                    \
+    atomic_set(ptr, i);                           \
+} while (0)
+#endif
+#endif
+
 /* These have the same semantics as Java volatile variables.
  * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
  * "1. Issue a StoreStore barrier (wmb) before each volatile store."
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index a98eb3a..c602797 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -104,6 +104,19 @@
         (head)->lh_first = NULL;                                        \
 } while (/*CONSTCOND*/0)
 
+#define QLIST_SWAP(dstlist, srclist, field) do {                        \
+        void *tmplist;                                                  \
+        tmplist = (srclist)->lh_first;                                  \
+        (srclist)->lh_first = (dstlist)->lh_first;                      \
+        if ((srclist)->lh_first != NULL) {                              \
+            (srclist)->lh_first->field.le_prev = &(srclist)->lh_first;  \
+        }                                                               \
+        (dstlist)->lh_first = tmplist;                                  \
+        if ((dstlist)->lh_first != NULL) {                              \
+            (dstlist)->lh_first->field.le_prev = &(dstlist)->lh_first;  \
+        }                                                               \
+} while (/*CONSTCOND*/0)
+
 #define QLIST_INSERT_AFTER(listelm, elm, field) do {                    \
         if (((elm)->field.le_next = (listelm)->field.le_next) != NULL)  \
                 (listelm)->field.le_next->field.le_prev =               \
diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
new file mode 100644
index 0000000..068a279
--- /dev/null
+++ b/include/qemu/rcu.h
@@ -0,0 +1,147 @@
+#ifndef QEMU_RCU_H
+#define QEMU_RCU_H
+
+/*
+ * urcu-mb.h
+ *
+ * Userspace RCU header with explicit memory barrier.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <limits.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <glib.h>
+
+#include "qemu/compiler.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/atomic.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Important !
+ *
+ * Each thread containing read-side critical sections must be registered
+ * with rcu_register_thread() before calling rcu_read_lock().
+ * rcu_unregister_thread() should be called before the thread exits.
+ */
+
+#ifdef DEBUG_RCU
+#define rcu_assert(args...)    assert(args)
+#else
+#define rcu_assert(args...)
+#endif
+
+/*
+ * Global quiescent period counter with low-order bits unused.
+ * Using a int rather than a char to eliminate false register dependencies
+ * causing stalls on some architectures.
+ */
+extern unsigned long rcu_gp_ctr;
+
+extern QemuEvent rcu_gp_event;
+
+struct rcu_reader_data {
+    /* Data used by both reader and synchronize_rcu() */
+    unsigned long ctr;
+    bool waiting;
+
+    /* Data used by reader only */
+    unsigned depth;
+
+    /* Data used for registry, protected by rcu_gp_lock */
+    QLIST_ENTRY(rcu_reader_data) node;
+};
+
+extern __thread struct rcu_reader_data rcu_reader;
+
+static inline void rcu_read_lock(void)
+{
+    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+    unsigned ctr;
+
+    if (p_rcu_reader->depth++ > 0) {
+        return;
+    }
+
+    ctr = atomic_read(&rcu_gp_ctr);
+    atomic_xchg(&p_rcu_reader->ctr, ctr);
+    if (atomic_read(&p_rcu_reader->waiting)) {
+        atomic_set(&p_rcu_reader->waiting, false);
+        qemu_event_set(&rcu_gp_event);
+    }
+}
+
+static inline void rcu_read_unlock(void)
+{
+    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+
+    assert(p_rcu_reader->depth != 0);
+    if (--p_rcu_reader->depth > 0) {
+        return;
+    }
+
+    atomic_xchg(&p_rcu_reader->ctr, 0);
+    if (atomic_read(&p_rcu_reader->waiting)) {
+        atomic_set(&p_rcu_reader->waiting, false);
+        qemu_event_set(&rcu_gp_event);
+    }
+}
+
+extern void synchronize_rcu(void);
+
+/*
+ * Reader thread registration.
+ */
+extern void rcu_register_thread(void);
+extern void rcu_unregister_thread(void);
+
+struct rcu_head;
+typedef void RCUCBFunc(struct rcu_head *head);
+
+struct rcu_head {
+    struct rcu_head *next;
+    RCUCBFunc *func;
+};
+
+extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
+
+/* The operands of the minus operator must have the same type,
+ * which must be the one that we specify in the cast.
+ */
+#define call_rcu(head, func, field)                                      \
+    call_rcu1(({                                                         \
+         char __attribute__((unused))                                    \
+            offset_must_be_zero[-offsetof(typeof(*(head)), field)],      \
+            func_type_invalid = (func) - (void (*)(typeof(head)))(func); \
+         &(head)->field;                                                 \
+      }),                                                                \
+      (RCUCBFunc *)(func))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QEMU_RCU_H */
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index e89fdc9..5114ec8 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,9 +25,6 @@
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
-#define rcu_read_lock() do { } while (0)
-#define rcu_read_unlock() do { } while (0)
-
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index ca5befb..eba8b21 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -838,7 +838,6 @@
 int64_t cpu_get_icount_raw(void);
 int64_t cpu_get_icount(void);
 int64_t cpu_get_clock(void);
-int64_t cpu_get_clock_offset(void);
 int64_t cpu_icount_to_ns(int64_t icount);
 
 /*******************************************/
diff --git a/include/qjson.h b/include/qjson.h
new file mode 100644
index 0000000..7c54fdf
--- /dev/null
+++ b/include/qjson.h
@@ -0,0 +1,29 @@
+/*
+ * QEMU JSON writer
+ *
+ * Copyright Alexander Graf
+ *
+ * Authors:
+ *  Alexander Graf <agraf@suse.de>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+#ifndef QEMU_QJSON_H
+#define QEMU_QJSON_H
+
+#define TYPE_QJSON "QJSON"
+typedef struct QJSON QJSON;
+
+QJSON *qjson_new(void);
+void json_prop_str(QJSON *json, const char *name, const char *str);
+void json_prop_int(QJSON *json, const char *name, int64_t val);
+void json_end_array(QJSON *json);
+void json_start_array(QJSON *json, const char *name);
+void json_end_object(QJSON *json);
+void json_start_object(QJSON *json, const char *name);
+const char *qjson_get_str(QJSON *json);
+void qjson_finish(QJSON *json);
+
+#endif /* QEMU_QJSON_H */
diff --git a/include/ui/qemu-spice.h b/include/ui/qemu-spice.h
index a93b4b2..762e063 100644
--- a/include/ui/qemu-spice.h
+++ b/include/ui/qemu-spice.h
@@ -88,4 +88,14 @@
 
 #endif /* CONFIG_SPICE */
 
+static inline bool qemu_using_spice(Error **errp)
+{
+    if (!using_spice) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
+                  "SPICE is not in use");
+        return false;
+    }
+    return true;
+}
+
 #endif /* QEMU_SPICE_H */
diff --git a/memory.c b/memory.c
index c343bf3..9b91243 100644
--- a/memory.c
+++ b/memory.c
@@ -33,26 +33,12 @@
 static bool ioeventfd_update_pending;
 static bool global_dirty_log = false;
 
-/* flat_view_mutex is taken around reading as->current_map; the critical
- * section is extremely short, so I'm using a single mutex for every AS.
- * We could also RCU for the read-side.
- *
- * The BQL is taken around transaction commits, hence both locks are taken
- * while writing to as->current_map (with the BQL taken outside).
- */
-static QemuMutex flat_view_mutex;
-
 static QTAILQ_HEAD(memory_listeners, MemoryListener) memory_listeners
     = QTAILQ_HEAD_INITIALIZER(memory_listeners);
 
 static QTAILQ_HEAD(, AddressSpace) address_spaces
     = QTAILQ_HEAD_INITIALIZER(address_spaces);
 
-static void memory_init(void)
-{
-    qemu_mutex_init(&flat_view_mutex);
-}
-
 typedef struct AddrRange AddrRange;
 
 /*
@@ -242,6 +228,7 @@
  * order.
  */
 struct FlatView {
+    struct rcu_head rcu;
     unsigned ref;
     FlatRange *ranges;
     unsigned nr;
@@ -654,10 +641,10 @@
 {
     FlatView *view;
 
-    qemu_mutex_lock(&flat_view_mutex);
-    view = as->current_map;
+    rcu_read_lock();
+    view = atomic_rcu_read(&as->current_map);
     flatview_ref(view);
-    qemu_mutex_unlock(&flat_view_mutex);
+    rcu_read_unlock();
     return view;
 }
 
@@ -766,10 +753,9 @@
     address_space_update_topology_pass(as, old_view, new_view, false);
     address_space_update_topology_pass(as, old_view, new_view, true);
 
-    qemu_mutex_lock(&flat_view_mutex);
-    flatview_unref(as->current_map);
-    as->current_map = new_view;
-    qemu_mutex_unlock(&flat_view_mutex);
+    /* Writes are protected by the BQL.  */
+    atomic_rcu_set(&as->current_map, new_view);
+    call_rcu(old_view, flatview_unref, rcu);
 
     /* Note that all the old MemoryRegions are still alive up to this
      * point.  This relieves most MemoryListeners from the need to
@@ -1263,7 +1249,6 @@
     MemoryRegion *mr = MEMORY_REGION(obj);
 
     assert(QTAILQ_EMPTY(&mr->subregions));
-    assert(memory_region_transaction_depth == 0);
     mr->destructor(mr);
     memory_region_clear_coalescing(mr);
     g_free((char *)mr->name);
@@ -1843,11 +1828,11 @@
     }
     range = addrrange_make(int128_make64(addr), int128_make64(size));
 
-    view = address_space_get_flatview(as);
+    rcu_read_lock();
+    view = atomic_rcu_read(&as->current_map);
     fr = flatview_lookup(view, range);
     if (!fr) {
-        flatview_unref(view);
-        return ret;
+        goto out;
     }
 
     while (fr > view->ranges && addrrange_intersects(fr[-1].addr, range)) {
@@ -1864,8 +1849,8 @@
     ret.offset_within_address_space = int128_get64(range.start);
     ret.readonly = fr->readonly;
     memory_region_ref(ret.mr);
-
-    flatview_unref(view);
+out:
+    rcu_read_unlock();
     return ret;
 }
 
@@ -1958,10 +1943,6 @@
 
 void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
 {
-    if (QTAILQ_EMPTY(&address_spaces)) {
-        memory_init();
-    }
-
     memory_region_transaction_begin();
     as->root = root;
     as->current_map = g_new(FlatView, 1);
@@ -1975,15 +1956,10 @@
     memory_region_transaction_commit();
 }
 
-void address_space_destroy(AddressSpace *as)
+static void do_address_space_destroy(AddressSpace *as)
 {
     MemoryListener *listener;
 
-    /* Flush out anything from MemoryListeners listening in on this */
-    memory_region_transaction_begin();
-    as->root = NULL;
-    memory_region_transaction_commit();
-    QTAILQ_REMOVE(&address_spaces, as, address_spaces_link);
     address_space_destroy_dispatch(as);
 
     QTAILQ_FOREACH(listener, &memory_listeners, link) {
@@ -1995,6 +1971,21 @@
     g_free(as->ioeventfds);
 }
 
+void address_space_destroy(AddressSpace *as)
+{
+    /* Flush out anything from MemoryListeners listening in on this */
+    memory_region_transaction_begin();
+    as->root = NULL;
+    memory_region_transaction_commit();
+    QTAILQ_REMOVE(&address_spaces, as, address_spaces_link);
+
+    /* At this point, as->dispatch and as->current_map are dummy
+     * entries that the guest should never use.  Wait for the old
+     * values to expire before freeing the data.
+     */
+    call_rcu(as, do_address_space_destroy, rcu);
+}
+
 bool io_mem_read(MemoryRegion *mr, hwaddr addr, uint64_t *pval, unsigned size)
 {
     return memory_region_dispatch_read(mr, addr, pval, size);
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index edc2830..e66e557 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -452,6 +452,22 @@
     return result;
 }
 
+int64_t qemu_ftell_fast(QEMUFile *f)
+{
+    int64_t ret = f->pos;
+    int i;
+
+    if (f->ops->writev_buffer) {
+        for (i = 0; i < f->iovcnt; i++) {
+            ret += f->iov[i].iov_len;
+        }
+    } else {
+        ret += f->buf_index;
+    }
+
+    return ret;
+}
+
 int64_t qemu_ftell(QEMUFile *f)
 {
     qemu_fflush(f);
diff --git a/migration/rdma.c b/migration/rdma.c
index b32dbdf..fc351ea 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -26,34 +26,7 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <rdma/rdma_cma.h>
-
-//#define DEBUG_RDMA
-//#define DEBUG_RDMA_VERBOSE
-//#define DEBUG_RDMA_REALLY_VERBOSE
-
-#ifdef DEBUG_RDMA
-#define DPRINTF(fmt, ...) \
-    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-#ifdef DEBUG_RDMA_VERBOSE
-#define DDPRINTF(fmt, ...) \
-    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DDPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-#ifdef DEBUG_RDMA_REALLY_VERBOSE
-#define DDDPRINTF(fmt, ...) \
-    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DDDPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
+#include "trace.h"
 
 /*
  * Print and error on both the Monitor and the Log file.
@@ -104,8 +77,8 @@
     do { \
         if (rdma->error_state) { \
             if (!rdma->error_reported) { \
-                fprintf(stderr, "RDMA is in an error state waiting migration" \
-                                " to abort!\n"); \
+                error_report("RDMA is in an error state waiting migration" \
+                                " to abort!"); \
                 rdma->error_reported = 1; \
             } \
             return rdma->error_state; \
@@ -578,12 +551,13 @@
 
     g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
 
-    DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
-           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
-            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
-            block->length, (uint64_t) (block->local_host_addr + block->length),
-                BITS_TO_LONGS(block->nb_chunks) *
-                    sizeof(unsigned long) * 8, block->nb_chunks);
+    trace___qemu_rdma_add_block(local->nb_blocks,
+                           (uint64_t) block->local_host_addr, block->offset,
+                           block->length,
+                           (uint64_t) (block->local_host_addr + block->length),
+                           BITS_TO_LONGS(block->nb_chunks) *
+                               sizeof(unsigned long) * 8,
+                           block->nb_chunks);
 
     local->nb_blocks++;
 
@@ -614,7 +588,7 @@
     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
     memset(local, 0, sizeof *local);
     qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
-    DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
+    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
     rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
                         rdma->local_ram_blocks.nb_blocks);
     local->init = true;
@@ -683,12 +657,12 @@
         local->block = NULL;
     }
 
-    DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
-           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
-            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
-            block->length, (uint64_t) (block->local_host_addr + block->length),
-                BITS_TO_LONGS(block->nb_chunks) *
-                    sizeof(unsigned long) * 8, block->nb_chunks);
+    trace___qemu_rdma_delete_block(local->nb_blocks,
+                           (uint64_t)block->local_host_addr,
+                           block->offset, block->length,
+                           (uint64_t)(block->local_host_addr + block->length),
+                           BITS_TO_LONGS(block->nb_chunks) *
+                               sizeof(unsigned long) * 8, block->nb_chunks);
 
     g_free(old);
 
@@ -713,7 +687,7 @@
     struct ibv_port_attr port;
 
     if (ibv_query_port(verbs, 1, &port)) {
-        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
+        error_report("Failed to query port information");
         return;
     }
 
@@ -744,7 +718,7 @@
     char dgid[33];
     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
-    DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
+    trace_qemu_rdma_dump_gid(who, sgid, dgid);
 }
 
 /*
@@ -918,7 +892,7 @@
     for (e = res; e != NULL; e = e->ai_next) {
         inet_ntop(e->ai_family,
             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
-        DPRINTF("Trying %s => %s\n", rdma->host, ip);
+        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
 
         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                 RDMA_RESOLVE_TIMEOUT_MS);
@@ -997,14 +971,14 @@
     /* allocate pd */
     rdma->pd = ibv_alloc_pd(rdma->verbs);
     if (!rdma->pd) {
-        fprintf(stderr, "failed to allocate protection domain\n");
+        error_report("failed to allocate protection domain");
         return -1;
     }
 
     /* create completion channel */
     rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
     if (!rdma->comp_channel) {
-        fprintf(stderr, "failed to allocate completion channel\n");
+        error_report("failed to allocate completion channel");
         goto err_alloc_pd_cq;
     }
 
@@ -1015,7 +989,7 @@
     rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
             NULL, rdma->comp_channel, 0);
     if (!rdma->cq) {
-        fprintf(stderr, "failed to allocate completion queue\n");
+        error_report("failed to allocate completion queue");
         goto err_alloc_pd_cq;
     }
 
@@ -1160,8 +1134,7 @@
     if (!block->pmr[chunk]) {
         uint64_t len = chunk_end - chunk_start;
 
-        DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
-                 len, chunk_start);
+        trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
         block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                 chunk_start, len,
@@ -1204,7 +1177,7 @@
         rdma->total_registrations++;
         return 0;
     }
-    fprintf(stderr, "qemu_rdma_reg_control failed!\n");
+    error_report("qemu_rdma_reg_control failed");
     return -1;
 }
 
@@ -1270,8 +1243,8 @@
                                    .repeat = 1,
                                  };
 
-        DDPRINTF("Processing unregister for chunk: %" PRIu64
-                 " at position %d\n", chunk, rdma->unregister_current);
+        trace_qemu_rdma_unregister_waiting_proc(chunk,
+                                                rdma->unregister_current);
 
         rdma->unregistrations[rdma->unregister_current] = 0;
         rdma->unregister_current++;
@@ -1291,11 +1264,11 @@
         clear_bit(chunk, block->unregister_bitmap);
 
         if (test_bit(chunk, block->transit_bitmap)) {
-            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
+            trace_qemu_rdma_unregister_waiting_inflight(chunk);
             continue;
         }
 
-        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
+        trace_qemu_rdma_unregister_waiting_send(chunk);
 
         ret = ibv_dereg_mr(block->pmr[chunk]);
         block->pmr[chunk] = NULL;
@@ -1315,7 +1288,7 @@
             return ret;
         }
 
-        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
+        trace_qemu_rdma_unregister_waiting_complete(chunk);
     }
 
     return 0;
@@ -1340,13 +1313,13 @@
                                         uint64_t chunk, uint64_t wr_id)
 {
     if (rdma->unregistrations[rdma->unregister_next] != 0) {
-        fprintf(stderr, "rdma migration: queue is full!\n");
+        error_report("rdma migration: queue is full");
     } else {
         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
 
         if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
-            DDPRINTF("Appending unregister chunk %" PRIu64
-                    " at position %d\n", chunk, rdma->unregister_next);
+            trace_qemu_rdma_signal_unregister_append(chunk,
+                                                     rdma->unregister_next);
 
             rdma->unregistrations[rdma->unregister_next++] =
                     qemu_rdma_make_wrid(wr_id, index, chunk);
@@ -1355,8 +1328,7 @@
                 rdma->unregister_next = 0;
             }
         } else {
-            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
-                    chunk);
+            trace_qemu_rdma_signal_unregister_already(chunk);
         }
     }
 }
@@ -1381,7 +1353,7 @@
     }
 
     if (ret < 0) {
-        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
+        error_report("ibv_poll_cq return %d", ret);
         return ret;
     }
 
@@ -1397,8 +1369,7 @@
 
     if (rdma->control_ready_expected &&
         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
-        DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
-                  " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
+        trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
                   wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
         rdma->control_ready_expected = 0;
     }
@@ -1410,9 +1381,8 @@
             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
 
-        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
-                 "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
-                 print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
+        trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
+                 index, chunk,
                  block->local_host_addr, (void *)block->remote_host_addr);
 
         clear_bit(chunk, block->transit_bitmap);
@@ -1433,8 +1403,7 @@
 #endif
         }
     } else {
-        DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
-            print_wrid(wr_id), wr_id, rdma->nb_sent);
+        trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
     }
 
     *wr_id_out = wc.wr_id;
@@ -1482,9 +1451,8 @@
             break;
         }
         if (wr_id != wrid_requested) {
-            DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
-                print_wrid(wrid_requested),
-                wrid_requested, print_wrid(wr_id), wr_id);
+            trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
+                       wrid_requested, print_wrid(wr_id), wr_id);
         }
     }
 
@@ -1524,9 +1492,8 @@
                 break;
             }
             if (wr_id != wrid_requested) {
-                DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
-                    print_wrid(wrid_requested), wrid_requested,
-                    print_wrid(wr_id), wr_id);
+                trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
+                                   wrid_requested, print_wrid(wr_id), wr_id);
             }
         }
 
@@ -1571,7 +1538,7 @@
                                    .num_sge = 1,
                                 };
 
-    DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);
+    trace_qemu_rdma_post_send_control(control_desc[head->type]);
 
     /*
      * We don't actually need to do a memcpy() in here if we used
@@ -1593,13 +1560,13 @@
     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
 
     if (ret > 0) {
-        fprintf(stderr, "Failed to use post IB SEND for control!\n");
+        error_report("Failed to use post IB SEND for control");
         return -ret;
     }
 
     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
     if (ret < 0) {
-        fprintf(stderr, "rdma migration: send polling control error!\n");
+        error_report("rdma migration: send polling control error");
     }
 
     return ret;
@@ -1643,32 +1610,31 @@
                                        &byte_len);
 
     if (ret < 0) {
-        fprintf(stderr, "rdma migration: recv polling control error!\n");
+        error_report("rdma migration: recv polling control error!");
         return ret;
     }
 
     network_to_control((void *) rdma->wr_data[idx].control);
     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
 
-    DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);
+    trace_qemu_rdma_exchange_get_response_start(control_desc[expecting]);
 
     if (expecting == RDMA_CONTROL_NONE) {
-        DDDPRINTF("Surprise: got %s (%d)\n",
-                  control_desc[head->type], head->type);
+        trace_qemu_rdma_exchange_get_response_none(control_desc[head->type],
+                                             head->type);
     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
-        fprintf(stderr, "Was expecting a %s (%d) control message"
-                ", but got: %s (%d), length: %d\n",
+        error_report("Was expecting a %s (%d) control message"
+                ", but got: %s (%d), length: %d",
                 control_desc[expecting], expecting,
                 control_desc[head->type], head->type, head->len);
         return -EIO;
     }
     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
-        fprintf(stderr, "too long length: %d\n", head->len);
+        error_report("too long length: %d\n", head->len);
         return -EINVAL;
     }
     if (sizeof(*head) + head->len != byte_len) {
-        fprintf(stderr, "Malformed length: %d byte_len %d\n",
-                head->len, byte_len);
+        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
         return -EINVAL;
     }
 
@@ -1730,7 +1696,7 @@
     if (resp) {
         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
         if (ret) {
-            fprintf(stderr, "rdma migration: error posting"
+            error_report("rdma migration: error posting"
                     " extra control recv for anticipated result!");
             return ret;
         }
@@ -1741,7 +1707,7 @@
      */
     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
     if (ret) {
-        fprintf(stderr, "rdma migration: error posting first control recv!");
+        error_report("rdma migration: error posting first control recv!");
         return ret;
     }
 
@@ -1751,7 +1717,7 @@
     ret = qemu_rdma_post_send_control(rdma, data, head);
 
     if (ret < 0) {
-        fprintf(stderr, "Failed to send control buffer!\n");
+        error_report("Failed to send control buffer!");
         return ret;
     }
 
@@ -1760,14 +1726,14 @@
      */
     if (resp) {
         if (callback) {
-            DDPRINTF("Issuing callback before receiving response...\n");
+            trace_qemu_rdma_exchange_send_issue_callback();
             ret = callback(rdma);
             if (ret < 0) {
                 return ret;
             }
         }
 
-        DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
+        trace_qemu_rdma_exchange_send_waiting(control_desc[resp->type]);
         ret = qemu_rdma_exchange_get_response(rdma, resp,
                                               resp->type, RDMA_WRID_DATA);
 
@@ -1779,7 +1745,7 @@
         if (resp_idx) {
             *resp_idx = RDMA_WRID_DATA;
         }
-        DDPRINTF("Response %s received.\n", control_desc[resp->type]);
+        trace_qemu_rdma_exchange_send_received(control_desc[resp->type]);
     }
 
     rdma->control_ready_expected = 1;
@@ -1807,7 +1773,7 @@
     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
 
     if (ret < 0) {
-        fprintf(stderr, "Failed to send control buffer!\n");
+        error_report("Failed to send control buffer!");
         return ret;
     }
 
@@ -1828,7 +1794,7 @@
      */
     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
     if (ret) {
-        fprintf(stderr, "rdma migration: error posting second control recv!");
+        error_report("rdma migration: error posting second control recv!");
         return ret;
     }
 
@@ -1882,8 +1848,9 @@
         }
     }
 
-    DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
-        chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
+    trace_qemu_rdma_write_one_top(chunks + 1,
+                                  (chunks + 1) *
+                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
 
     chunk_end = ram_chunk_end(block, chunk + chunks);
 
@@ -1895,17 +1862,15 @@
 
     while (test_bit(chunk, block->transit_bitmap)) {
         (void)count;
-        DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
-                " current %" PRIu64 " len %" PRIu64 " %d %d\n",
-                count++, current_index, chunk,
+        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
 
         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
 
         if (ret < 0) {
-            fprintf(stderr, "Failed to Wait for previous write to complete "
+            error_report("Failed to Wait for previous write to complete "
                     "block %d chunk %" PRIu64
-                    " current %" PRIu64 " len %" PRIu64 " %d\n",
+                    " current %" PRIu64 " len %" PRIu64 " %d",
                     current_index, chunk, sge.addr, length, rdma->nb_sent);
             return ret;
         }
@@ -1932,10 +1897,8 @@
                 head.len = sizeof(comp);
                 head.type = RDMA_CONTROL_COMPRESS;
 
-                DDPRINTF("Entire chunk is zero, sending compress: %"
-                    PRIu64 " for %d "
-                    "bytes, index: %d, offset: %" PRId64 "...\n",
-                    chunk, sge.length, current_index, current_addr);
+                trace_qemu_rdma_write_one_zero(chunk, sge.length,
+                                               current_index, current_addr);
 
                 compress_to_network(&comp);
                 ret = qemu_rdma_exchange_send(rdma, &head,
@@ -1961,9 +1924,8 @@
             }
             reg.chunks = chunks;
 
-            DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
-                    "bytes, index: %d, offset: %" PRId64 "...\n",
-                    chunk, sge.length, current_index, current_addr);
+            trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
+                                              current_addr);
 
             register_to_network(&reg);
             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
@@ -1977,7 +1939,7 @@
                                                 (uint8_t *) sge.addr,
                                                 &sge.lkey, NULL, chunk,
                                                 chunk_start, chunk_end)) {
-                fprintf(stderr, "cannot get lkey!\n");
+                error_report("cannot get lkey");
                 return -EINVAL;
             }
 
@@ -1986,9 +1948,8 @@
 
             network_to_result(reg_result);
 
-            DDPRINTF("Received registration result:"
-                    " my key: %x their key %x, chunk %" PRIu64 "\n",
-                    block->remote_keys[chunk], reg_result->rkey, chunk);
+            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
+                                                 reg_result->rkey, chunk);
 
             block->remote_keys[chunk] = reg_result->rkey;
             block->remote_host_addr = reg_result->host_addr;
@@ -1998,7 +1959,7 @@
                                                 (uint8_t *)sge.addr,
                                                 &sge.lkey, NULL, chunk,
                                                 chunk_start, chunk_end)) {
-                fprintf(stderr, "cannot get lkey!\n");
+                error_report("cannot get lkey!");
                 return -EINVAL;
             }
         }
@@ -2010,7 +1971,7 @@
         if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
                                                      &sge.lkey, NULL, chunk,
                                                      chunk_start, chunk_end)) {
-            fprintf(stderr, "cannot get lkey!\n");
+            error_report("cannot get lkey!");
             return -EINVAL;
         }
     }
@@ -2031,10 +1992,8 @@
     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                 (current_addr - block->offset);
 
-    DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
-              " remote: %lx, bytes %" PRIu32 "\n",
-              chunk, sge.addr, send_wr.wr.rdma.remote_addr,
-              sge.length);
+    trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
+                                   sge.length);
 
     /*
      * ibv_post_send() does not return negative error numbers,
@@ -2043,11 +2002,11 @@
     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
 
     if (ret == ENOMEM) {
-        DDPRINTF("send queue is full. wait a little....\n");
+        trace_qemu_rdma_write_one_queue_full();
         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
         if (ret < 0) {
-            fprintf(stderr, "rdma migration: failed to make "
-                            "room in full send queue! %d\n", ret);
+            error_report("rdma migration: failed to make "
+                         "room in full send queue! %d", ret);
             return ret;
         }
 
@@ -2088,7 +2047,7 @@
 
     if (ret == 0) {
         rdma->nb_sent++;
-        DDDPRINTF("sent total: %d\n", rdma->nb_sent);
+        trace_qemu_rdma_write_flush(rdma->nb_sent);
     }
 
     rdma->current_length = 0;
@@ -2173,7 +2132,7 @@
         ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                          offset, len, &index, &chunk);
         if (ret) {
-            fprintf(stderr, "ram block search failed\n");
+            error_report("ram block search failed");
             return ret;
         }
         rdma->current_index = index;
@@ -2202,19 +2161,19 @@
                                        .type = RDMA_CONTROL_ERROR,
                                        .repeat = 1,
                                      };
-            fprintf(stderr, "Early error. Sending error.\n");
+            error_report("Early error. Sending error.");
             qemu_rdma_post_send_control(rdma, NULL, &head);
         }
 
         ret = rdma_disconnect(rdma->cm_id);
         if (!ret) {
-            DDPRINTF("waiting for disconnect\n");
+            trace_qemu_rdma_cleanup_waiting_for_disconnect();
             ret = rdma_get_cm_event(rdma->channel, &cm_event);
             if (!ret) {
                 rdma_ack_cm_event(cm_event);
             }
         }
-        DDPRINTF("Disconnected.\n");
+        trace_qemu_rdma_cleanup_disconnect();
         rdma->connected = false;
     }
 
@@ -2341,7 +2300,7 @@
      * on the source first requested the capability.
      */
     if (rdma->pin_all) {
-        DPRINTF("Server pin-all memory requested.\n");
+        trace_qemu_rdma_connect_pin_all_requested();
         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
     }
 
@@ -2389,7 +2348,7 @@
         rdma->pin_all = false;
     }
 
-    DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");
+    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
 
     rdma_ack_cm_event(cm_event);
 
@@ -2456,7 +2415,7 @@
         for (e = res; e != NULL; e = e->ai_next) {
             inet_ntop(e->ai_family,
                 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
-            DPRINTF("Trying %s => %s\n", rdma->host, ip);
+            trace_qemu_rdma_dest_init_trying(rdma->host, ip);
             ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
             if (!ret) {
                 if (e->ai_family == AF_INET6) {
@@ -2575,8 +2534,7 @@
     size_t len = 0;
 
     if (rdma->wr_data[idx].control_len) {
-        DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
-                    rdma->wr_data[idx].control_len, size);
+        trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
 
         len = MIN(size, rdma->wr_data[idx].control_len);
         memcpy(buf, rdma->wr_data[idx].control_curr, len);
@@ -2643,7 +2601,7 @@
     while (rdma->nb_sent) {
         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
         if (ret < 0) {
-            fprintf(stderr, "rdma migration: complete polling error!\n");
+            error_report("rdma migration: complete polling error!");
             return -EIO;
         }
     }
@@ -2655,7 +2613,7 @@
 
 static int qemu_rdma_close(void *opaque)
 {
-    DPRINTF("Shutting down connection.\n");
+    trace_qemu_rdma_close();
     QEMUFileRDMA *r = opaque;
     if (r->rdma) {
         qemu_rdma_cleanup(r->rdma);
@@ -2719,7 +2677,7 @@
          */
         ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
         if (ret < 0) {
-            fprintf(stderr, "rdma migration: write error! %d\n", ret);
+            error_report("rdma migration: write error! %d", ret);
             goto err;
         }
 
@@ -2752,7 +2710,7 @@
                                          offset, size, &index, &chunk);
 
         if (ret) {
-            fprintf(stderr, "ram block search failed\n");
+            error_report("ram block search failed");
             goto err;
         }
 
@@ -2779,7 +2737,7 @@
         uint64_t wr_id, wr_id_in;
         int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
         if (ret < 0) {
-            fprintf(stderr, "rdma migration: polling error! %d\n", ret);
+            error_report("rdma migration: polling error! %d", ret);
             goto err;
         }
 
@@ -2824,7 +2782,7 @@
     network_to_caps(&cap);
 
     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
-            fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
+            error_report("Unknown source RDMA version: %d, bailing...",
                             cap.version);
             rdma_ack_cm_event(cm_event);
             goto err_rdma_dest_wait;
@@ -2848,17 +2806,17 @@
 
     rdma_ack_cm_event(cm_event);
 
-    DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");
+    trace_qemu_rdma_accept_pin_state(rdma->pin_all);
 
     caps_to_network(&cap);
 
-    DPRINTF("verbs context after listen: %p\n", verbs);
+    trace_qemu_rdma_accept_pin_verbsc(verbs);
 
     if (!rdma->verbs) {
         rdma->verbs = verbs;
     } else if (rdma->verbs != verbs) {
-            fprintf(stderr, "ibv context not matching %p, %p!\n",
-                    rdma->verbs, verbs);
+            error_report("ibv context not matching %p, %p!", rdma->verbs,
+                         verbs);
             goto err_rdma_dest_wait;
     }
 
@@ -2866,26 +2824,26 @@
 
     ret = qemu_rdma_alloc_pd_cq(rdma);
     if (ret) {
-        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
+        error_report("rdma migration: error allocating pd and cq!");
         goto err_rdma_dest_wait;
     }
 
     ret = qemu_rdma_alloc_qp(rdma);
     if (ret) {
-        fprintf(stderr, "rdma migration: error allocating qp!\n");
+        error_report("rdma migration: error allocating qp!");
         goto err_rdma_dest_wait;
     }
 
     ret = qemu_rdma_init_ram_blocks(rdma);
     if (ret) {
-        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
+        error_report("rdma migration: error initializing ram blocks!");
         goto err_rdma_dest_wait;
     }
 
     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
         ret = qemu_rdma_reg_control(rdma, idx);
         if (ret) {
-            fprintf(stderr, "rdma: error registering %d control!\n", idx);
+            error_report("rdma: error registering %d control", idx);
             goto err_rdma_dest_wait;
         }
     }
@@ -2894,18 +2852,18 @@
 
     ret = rdma_accept(rdma->cm_id, &conn_param);
     if (ret) {
-        fprintf(stderr, "rdma_accept returns %d!\n", ret);
+        error_report("rdma_accept returns %d", ret);
         goto err_rdma_dest_wait;
     }
 
     ret = rdma_get_cm_event(rdma->channel, &cm_event);
     if (ret) {
-        fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
+        error_report("rdma_accept get_cm_event failed %d", ret);
         goto err_rdma_dest_wait;
     }
 
     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
-        fprintf(stderr, "rdma_accept not event established!\n");
+        error_report("rdma_accept not event established");
         rdma_ack_cm_event(cm_event);
         goto err_rdma_dest_wait;
     }
@@ -2915,7 +2873,7 @@
 
     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
     if (ret) {
-        fprintf(stderr, "rdma migration: error posting second control recv!\n");
+        error_report("rdma migration: error posting second control recv");
         goto err_rdma_dest_wait;
     }
 
@@ -2969,7 +2927,7 @@
     CHECK_ERROR_STATE();
 
     do {
-        DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);
+        trace_qemu_rdma_registration_handle_wait(flags);
 
         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
 
@@ -2978,8 +2936,8 @@
         }
 
         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
-            fprintf(stderr, "rdma: Too many requests in this message (%d)."
-                            "Bailing.\n", head.repeat);
+            error_report("rdma: Too many requests in this message (%d)."
+                            "Bailing.", head.repeat);
             ret = -EIO;
             break;
         }
@@ -2989,9 +2947,9 @@
             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
             network_to_compress(comp);
 
-            DDPRINTF("Zapping zero chunk: %" PRId64
-                    " bytes, index %d, offset %" PRId64 "\n",
-                    comp->length, comp->block_idx, comp->offset);
+            trace_qemu_rdma_registration_handle_compress(comp->length,
+                                                         comp->block_idx,
+                                                         comp->offset);
             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
 
             host_addr = block->local_host_addr +
@@ -3001,17 +2959,17 @@
             break;
 
         case RDMA_CONTROL_REGISTER_FINISHED:
-            DDDPRINTF("Current registrations complete.\n");
+            trace_qemu_rdma_registration_handle_finished();
             goto out;
 
         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
-            DPRINTF("Initial setup info requested.\n");
+            trace_qemu_rdma_registration_handle_ram_blocks();
 
             if (rdma->pin_all) {
                 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                 if (ret) {
-                    fprintf(stderr, "rdma migration: error dest "
-                                    "registering ram blocks!\n");
+                    error_report("rdma migration: error dest "
+                                    "registering ram blocks");
                     goto out;
                 }
             }
@@ -3044,13 +3002,13 @@
                                         (uint8_t *) rdma->block, &blocks);
 
             if (ret < 0) {
-                fprintf(stderr, "rdma migration: error sending remote info!\n");
+                error_report("rdma migration: error sending remote info");
                 goto out;
             }
 
             break;
         case RDMA_CONTROL_REGISTER_REQUEST:
-            DDPRINTF("There are %d registration requests\n", head.repeat);
+            trace_qemu_rdma_registration_handle_register(head.repeat);
 
             reg_resp.repeat = head.repeat;
             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
@@ -3064,8 +3022,7 @@
 
                 reg_result = &results[count];
 
-                DDPRINTF("Registration request (%d): index %d, current_addr %"
-                         PRIu64 " chunks: %" PRIu64 "\n", count,
+                trace_qemu_rdma_registration_handle_register_loop(count,
                          reg->current_index, reg->key.current_addr, reg->chunks);
 
                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
@@ -3084,15 +3041,15 @@
                 if (qemu_rdma_register_and_get_keys(rdma, block,
                             (uint8_t *)host_addr, NULL, &reg_result->rkey,
                             chunk, chunk_start, chunk_end)) {
-                    fprintf(stderr, "cannot get rkey!\n");
+                    error_report("cannot get rkey");
                     ret = -EINVAL;
                     goto out;
                 }
 
                 reg_result->host_addr = (uint64_t) block->local_host_addr;
 
-                DDPRINTF("Registered rkey for this request: %x\n",
-                                reg_result->rkey);
+                trace_qemu_rdma_registration_handle_register_rkey(
+                                                           reg_result->rkey);
 
                 result_to_network(reg_result);
             }
@@ -3101,12 +3058,12 @@
                             (uint8_t *) results, &reg_resp);
 
             if (ret < 0) {
-                fprintf(stderr, "Failed to send control buffer!\n");
+                error_report("Failed to send control buffer");
                 goto out;
             }
             break;
         case RDMA_CONTROL_UNREGISTER_REQUEST:
-            DDPRINTF("There are %d unregistration requests\n", head.repeat);
+            trace_qemu_rdma_registration_handle_unregister(head.repeat);
             unreg_resp.repeat = head.repeat;
             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
 
@@ -3114,9 +3071,8 @@
                 reg = &registers[count];
                 network_to_register(reg);
 
-                DDPRINTF("Unregistration request (%d): "
-                         " index %d, chunk %" PRIu64 "\n",
-                         count, reg->current_index, reg->key.chunk);
+                trace_qemu_rdma_registration_handle_unregister_loop(count,
+                           reg->current_index, reg->key.chunk);
 
                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
 
@@ -3131,24 +3087,23 @@
 
                 rdma->total_registrations--;
 
-                DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
-                            reg->key.chunk);
+                trace_qemu_rdma_registration_handle_unregister_success(
+                                                       reg->key.chunk);
             }
 
             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
 
             if (ret < 0) {
-                fprintf(stderr, "Failed to send control buffer!\n");
+                error_report("Failed to send control buffer");
                 goto out;
             }
             break;
         case RDMA_CONTROL_REGISTER_RESULT:
-            fprintf(stderr, "Invalid RESULT message at dest.\n");
+            error_report("Invalid RESULT message at dest.");
             ret = -EIO;
             goto out;
         default:
-            fprintf(stderr, "Unknown control message %s\n",
-                                control_desc[head.type]);
+            error_report("Unknown control message %s", control_desc[head.type]);
             ret = -EIO;
             goto out;
         }
@@ -3168,7 +3123,7 @@
 
     CHECK_ERROR_STATE();
 
-    DDDPRINTF("start section: %" PRIu64 "\n", flags);
+    trace_qemu_rdma_registration_start(flags);
     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
     qemu_fflush(f);
 
@@ -3203,7 +3158,7 @@
         int reg_result_idx, i, j, nb_remote_blocks;
 
         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
-        DPRINTF("Sending registration setup for ram blocks...\n");
+        trace_qemu_rdma_registration_stop_ram();
 
         /*
          * Make sure that we parallelize the pinning on both sides.
@@ -3275,7 +3230,7 @@
         }
     }
 
-    DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);
+    trace_qemu_rdma_registration_stop(flags);
 
     head.type = RDMA_CONTROL_REGISTER_FINISHED;
     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
@@ -3339,7 +3294,7 @@
     QEMUFile *f;
     Error *local_err = NULL, **errp = &local_err;
 
-    DPRINTF("Accepting rdma connection...\n");
+    trace_qemu_dma_accept_incoming_migration();
     ret = qemu_rdma_accept(rdma);
 
     if (ret) {
@@ -3347,7 +3302,7 @@
         return;
     }
 
-    DPRINTF("Accepted migration\n");
+    trace_qemu_dma_accept_incoming_migration_accepted();
 
     f = qemu_fopen_rdma(rdma, "rb");
     if (f == NULL) {
@@ -3366,7 +3321,7 @@
     RDMAContext *rdma;
     Error *local_err = NULL;
 
-    DPRINTF("Starting RDMA-based incoming migration\n");
+    trace_rdma_start_incoming_migration();
     rdma = qemu_rdma_data_init(host_port, &local_err);
 
     if (rdma == NULL) {
@@ -3379,7 +3334,7 @@
         goto err;
     }
 
-    DPRINTF("qemu_rdma_dest_init success\n");
+    trace_rdma_start_incoming_migration_after_dest_init();
 
     ret = rdma_listen(rdma->listen_id, 5);
 
@@ -3388,7 +3343,7 @@
         goto err;
     }
 
-    DPRINTF("rdma_listen success\n");
+    trace_rdma_start_incoming_migration_after_rdma_listen();
 
     qemu_set_fd_handler2(rdma->channel->fd, NULL,
                          rdma_accept_incoming_migration, NULL,
@@ -3419,14 +3374,14 @@
         goto err;
     }
 
-    DPRINTF("qemu_rdma_source_init success\n");
+    trace_rdma_start_outgoing_migration_after_rdma_source_init();
     ret = qemu_rdma_connect(rdma, &local_err);
 
     if (ret) {
         goto err;
     }
 
-    DPRINTF("qemu_rdma_source_connect success\n");
+    trace_rdma_start_outgoing_migration_after_rdma_connect();
 
     s->file = qemu_fopen_rdma(rdma, "wb");
     migrate_fd_connect(s);
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 3dde574..e5388f0 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -3,10 +3,12 @@
 #include "migration/qemu-file.h"
 #include "migration/vmstate.h"
 #include "qemu/bitops.h"
+#include "qemu/error-report.h"
 #include "trace.h"
+#include "qjson.h"
 
 static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
-                                    void *opaque);
+                                    void *opaque, QJSON *vmdesc);
 static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
                                    void *opaque);
 
@@ -72,16 +74,21 @@
                        void *opaque, int version_id)
 {
     VMStateField *field = vmsd->fields;
-    int ret;
+    int ret = 0;
 
+    trace_vmstate_load_state(vmsd->name, version_id);
     if (version_id > vmsd->version_id) {
+        trace_vmstate_load_state_end(vmsd->name, "too new", -EINVAL);
         return -EINVAL;
     }
     if  (version_id < vmsd->minimum_version_id) {
         if (vmsd->load_state_old &&
             version_id >= vmsd->minimum_version_id_old) {
-            return vmsd->load_state_old(f, opaque, version_id);
+            ret = vmsd->load_state_old(f, opaque, version_id);
+            trace_vmstate_load_state_end(vmsd->name, "old path", ret);
+            return ret;
         }
+        trace_vmstate_load_state_end(vmsd->name, "too old", -EINVAL);
         return -EINVAL;
     }
     if (vmsd->pre_load) {
@@ -91,6 +98,7 @@
         }
     }
     while (field->name) {
+        trace_vmstate_load_state_field(vmsd->name, field->name);
         if ((field->field_exists &&
              field->field_exists(opaque, version_id)) ||
             (!field->field_exists &&
@@ -122,8 +130,8 @@
                 }
             }
         } else if (field->flags & VMS_MUST_EXIST) {
-            fprintf(stderr, "Input validation failed: %s/%s\n",
-                    vmsd->name, field->name);
+            error_report("Input validation failed: %s/%s",
+                         vmsd->name, field->name);
             return -1;
         }
         field++;
@@ -133,48 +141,203 @@
         return ret;
     }
     if (vmsd->post_load) {
-        return vmsd->post_load(opaque, version_id);
+        ret = vmsd->post_load(opaque, version_id);
     }
-    return 0;
+    trace_vmstate_load_state_end(vmsd->name, "end", ret);
+    return ret;
+}
+
+static int vmfield_name_num(VMStateField *start, VMStateField *search)
+{
+    VMStateField *field;
+    int found = 0;
+
+    for (field = start; field->name; field++) {
+        if (!strcmp(field->name, search->name)) {
+            if (field == search) {
+                return found;
+            }
+            found++;
+        }
+    }
+
+    return -1;
+}
+
+static bool vmfield_name_is_unique(VMStateField *start, VMStateField *search)
+{
+    VMStateField *field;
+    int found = 0;
+
+    for (field = start; field->name; field++) {
+        if (!strcmp(field->name, search->name)) {
+            found++;
+            /* name found more than once, so it's not unique */
+            if (found > 1) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+static const char *vmfield_get_type_name(VMStateField *field)
+{
+    const char *type = "unknown";
+
+    if (field->flags & VMS_STRUCT) {
+        type = "struct";
+    } else if (field->info->name) {
+        type = field->info->name;
+    }
+
+    return type;
+}
+
+static bool vmsd_can_compress(VMStateField *field)
+{
+    if (field->field_exists) {
+        /* Dynamically existing fields mess up compression */
+        return false;
+    }
+
+    if (field->flags & VMS_STRUCT) {
+        VMStateField *sfield = field->vmsd->fields;
+        while (sfield->name) {
+            if (!vmsd_can_compress(sfield)) {
+                /* Child elements can't compress, so can't we */
+                return false;
+            }
+            sfield++;
+        }
+
+        if (field->vmsd->subsections) {
+            /* Subsections may come and go, better don't compress */
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void vmsd_desc_field_start(const VMStateDescription *vmsd, QJSON *vmdesc,
+                                  VMStateField *field, int i, int max)
+{
+    char *name, *old_name;
+    bool is_array = max > 1;
+    bool can_compress = vmsd_can_compress(field);
+
+    if (!vmdesc) {
+        return;
+    }
+
+    name = g_strdup(field->name);
+
+    /* Field name is not unique, need to make it unique */
+    if (!vmfield_name_is_unique(vmsd->fields, field)) {
+        int num = vmfield_name_num(vmsd->fields, field);
+        old_name = name;
+        name = g_strdup_printf("%s[%d]", name, num);
+        g_free(old_name);
+    }
+
+    json_start_object(vmdesc, NULL);
+    json_prop_str(vmdesc, "name", name);
+    if (is_array) {
+        if (can_compress) {
+            json_prop_int(vmdesc, "array_len", max);
+        } else {
+            json_prop_int(vmdesc, "index", i);
+        }
+    }
+    json_prop_str(vmdesc, "type", vmfield_get_type_name(field));
+
+    if (field->flags & VMS_STRUCT) {
+        json_start_object(vmdesc, "struct");
+    }
+
+    g_free(name);
+}
+
+static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc,
+                                VMStateField *field, size_t size, int i)
+{
+    if (!vmdesc) {
+        return;
+    }
+
+    if (field->flags & VMS_STRUCT) {
+        /* We printed a struct in between, close its child object */
+        json_end_object(vmdesc);
+    }
+
+    json_prop_int(vmdesc, "size", size);
+    json_end_object(vmdesc);
 }
 
 void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
-                        void *opaque)
+                        void *opaque, QJSON *vmdesc)
 {
     VMStateField *field = vmsd->fields;
 
     if (vmsd->pre_save) {
         vmsd->pre_save(opaque);
     }
+
+    if (vmdesc) {
+        json_prop_str(vmdesc, "vmsd_name", vmsd->name);
+        json_prop_int(vmdesc, "version", vmsd->version_id);
+        json_start_array(vmdesc, "fields");
+    }
+
     while (field->name) {
         if (!field->field_exists ||
             field->field_exists(opaque, vmsd->version_id)) {
             void *base_addr = vmstate_base_addr(opaque, field, false);
             int i, n_elems = vmstate_n_elems(opaque, field);
             int size = vmstate_size(opaque, field);
+            int64_t old_offset, written_bytes;
+            QJSON *vmdesc_loop = vmdesc;
 
             for (i = 0; i < n_elems; i++) {
                 void *addr = base_addr + size * i;
 
+                vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems);
+                old_offset = qemu_ftell_fast(f);
+
                 if (field->flags & VMS_ARRAY_OF_POINTER) {
                     addr = *(void **)addr;
                 }
                 if (field->flags & VMS_STRUCT) {
-                    vmstate_save_state(f, field->vmsd, addr);
+                    vmstate_save_state(f, field->vmsd, addr, vmdesc_loop);
                 } else {
                     field->info->put(f, addr, size);
                 }
+
+                written_bytes = qemu_ftell_fast(f) - old_offset;
+                vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i);
+
+                /* Compressed arrays only care about the first element */
+                if (vmdesc_loop && vmsd_can_compress(field)) {
+                    vmdesc_loop = NULL;
+                }
             }
         } else {
             if (field->flags & VMS_MUST_EXIST) {
-                fprintf(stderr, "Output state validation failed: %s/%s\n",
+                error_report("Output state validation failed: %s/%s",
                         vmsd->name, field->name);
                 assert(!(field->flags & VMS_MUST_EXIST));
             }
         }
         field++;
     }
-    vmstate_subsection_save(f, vmsd, opaque);
+
+    if (vmdesc) {
+        json_end_array(vmdesc);
+    }
+
+    vmstate_subsection_save(f, vmsd, opaque, vmdesc);
 }
 
 static const VMStateDescription *
@@ -192,6 +355,8 @@
 static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
                                    void *opaque)
 {
+    trace_vmstate_subsection_load(vmsd->name);
+
     while (qemu_peek_byte(f, 0) == QEMU_VM_SUBSECTION) {
         char idstr[256];
         int ret;
@@ -201,20 +366,24 @@
         len = qemu_peek_byte(f, 1);
         if (len < strlen(vmsd->name) + 1) {
             /* subsection name has be be "section_name/a" */
+            trace_vmstate_subsection_load_bad(vmsd->name, "(short)");
             return 0;
         }
         size = qemu_peek_buffer(f, (uint8_t *)idstr, len, 2);
         if (size != len) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(peek fail)");
             return 0;
         }
         idstr[size] = 0;
 
         if (strncmp(vmsd->name, idstr, strlen(vmsd->name)) != 0) {
+            trace_vmstate_subsection_load_bad(vmsd->name, idstr);
             /* it don't have a valid subsection name */
             return 0;
         }
         sub_vmsd = vmstate_get_subsection(vmsd->subsections, idstr);
         if (sub_vmsd == NULL) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(lookup)");
             return -ENOENT;
         }
         qemu_file_skip(f, 1); /* subsection */
@@ -224,31 +393,53 @@
 
         ret = vmstate_load_state(f, sub_vmsd, opaque, version_id);
         if (ret) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(child)");
             return ret;
         }
     }
+
+    trace_vmstate_subsection_load_good(vmsd->name);
     return 0;
 }
 
 static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
-                                    void *opaque)
+                                    void *opaque, QJSON *vmdesc)
 {
     const VMStateSubsection *sub = vmsd->subsections;
+    bool subsection_found = false;
 
     while (sub && sub->needed) {
         if (sub->needed(opaque)) {
             const VMStateDescription *vmsd = sub->vmsd;
             uint8_t len;
 
+            if (vmdesc) {
+                /* Only create subsection array when we have any */
+                if (!subsection_found) {
+                    json_start_array(vmdesc, "subsections");
+                    subsection_found = true;
+                }
+
+                json_start_object(vmdesc, NULL);
+            }
+
             qemu_put_byte(f, QEMU_VM_SUBSECTION);
             len = strlen(vmsd->name);
             qemu_put_byte(f, len);
             qemu_put_buffer(f, (uint8_t *)vmsd->name, len);
             qemu_put_be32(f, vmsd->version_id);
-            vmstate_save_state(f, vmsd, opaque);
+            vmstate_save_state(f, vmsd, opaque, vmdesc);
+
+            if (vmdesc) {
+                json_end_object(vmdesc);
+            }
         }
         sub++;
     }
+
+    if (vmdesc && subsection_found) {
+        json_end_array(vmdesc);
+    }
 }
 
 /* bool */
diff --git a/monitor.c b/monitor.c
index e6dc50a..5a24311 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1095,11 +1095,12 @@
     const char *subject  = qdict_get_try_str(qdict, "cert-subject");
     int port             = qdict_get_try_int(qdict, "port", -1);
     int tls_port         = qdict_get_try_int(qdict, "tls-port", -1);
+    Error *err;
     int ret;
 
     if (strcmp(protocol, "spice") == 0) {
-        if (!using_spice) {
-            qerror_report(QERR_DEVICE_NOT_ACTIVE, "spice");
+        if (!qemu_using_spice(&err)) {
+            qerror_report_err(err);
             return -1;
         }
 
@@ -4787,9 +4788,9 @@
     return (mon->suspend_cnt == 0) ? 1 : 0;
 }
 
-static int invalid_qmp_mode(const Monitor *mon, const char *cmd_name)
+static int invalid_qmp_mode(const Monitor *mon, const mon_cmd_t *cmd)
 {
-    int is_cap = compare_cmd(cmd_name, "qmp_capabilities");
+    int is_cap = cmd->mhandler.cmd_new == do_qmp_capabilities;
     return (qmp_cmd_mode(mon) ? is_cap : !is_cap);
 }
 
@@ -5083,14 +5084,10 @@
 
     cmd_name = qdict_get_str(input, "execute");
     trace_handle_qmp_command(mon, cmd_name);
-    if (invalid_qmp_mode(mon, cmd_name)) {
-        qerror_report(QERR_COMMAND_NOT_FOUND, cmd_name);
-        goto err_out;
-    }
-
     cmd = qmp_find_cmd(cmd_name);
-    if (!cmd) {
-        qerror_report(QERR_COMMAND_NOT_FOUND, cmd_name);
+    if (!cmd || invalid_qmp_mode(mon, cmd)) {
+        qerror_report(ERROR_CLASS_COMMAND_NOT_FOUND,
+                      "The command %s has not been found", cmd_name);
         goto err_out;
     }
 
diff --git a/pc-bios/s390-ccw.img b/pc-bios/s390-ccw.img
index 44873ad..dbe5a38 100644
--- a/pc-bios/s390-ccw.img
+++ b/pc-bios/s390-ccw.img
Binary files differ
diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index 115d8bb..b678d5e 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -33,7 +33,7 @@
     uint32_t ipl_continue;
 } ResetInfo;
 
-ResetInfo save;
+static ResetInfo save;
 
 static void jump_to_IPL_2(void)
 {
@@ -80,7 +80,7 @@
  */
 
 static unsigned char _bprs[8*1024]; /* guessed "max" ECKD sector size */
-const int max_bprs_entries = sizeof(_bprs) / sizeof(ExtEckdBlockPtr);
+static const int max_bprs_entries = sizeof(_bprs) / sizeof(ExtEckdBlockPtr);
 
 static inline void verify_boot_info(BootInfo *bip)
 {
diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h
index 6a4823d..ab132e3 100644
--- a/pc-bios/s390-ccw/bootmap.h
+++ b/pc-bios/s390-ccw/bootmap.h
@@ -15,7 +15,7 @@
 #include "virtio.h"
 
 typedef uint64_t block_number_t;
-#define NULL_BLOCK_NR 0xffffffffffffffff
+#define NULL_BLOCK_NR 0xffffffffffffffffULL
 
 #define FREE_SPACE_FILLER '\xAA'
 
diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
index f9ec215..6f707bb 100644
--- a/pc-bios/s390-ccw/main.c
+++ b/pc-bios/s390-ccw/main.c
@@ -13,7 +13,7 @@
 
 char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
 uint64_t boot_value;
-struct subchannel_id blk_schid = { .one = 1 };
+static struct subchannel_id blk_schid = { .one = 1 };
 
 /*
  * Priniciples of Operations (SA22-7832-09) chapter 17 requires that
diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
index 2b773de..ceb7418 100644
--- a/pc-bios/s390-ccw/s390-ccw.h
+++ b/pc-bios/s390-ccw/s390-ccw.h
@@ -51,6 +51,8 @@
 /* main.c */
 void virtio_panic(const char *string);
 void write_subsystem_identification(void);
+extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
+extern uint64_t boot_value;
 
 /* sclp-ascii.c */
 void sclp_print(const char *string);
diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c
index c0540d1..4dc91a7 100644
--- a/pc-bios/s390-ccw/virtio.c
+++ b/pc-bios/s390-ccw/virtio.c
@@ -11,7 +11,7 @@
 #include "s390-ccw.h"
 #include "virtio.h"
 
-struct vring block;
+static struct vring block;
 
 static char chsc_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
 
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 168b083..2227420 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -76,7 +76,8 @@
     command = qdict_get_str(dict, "execute");
     cmd = qmp_find_command(command);
     if (cmd == NULL) {
-        error_set(errp, QERR_COMMAND_NOT_FOUND, command);
+        error_set(errp, ERROR_CLASS_COMMAND_NOT_FOUND,
+                  "The command %s has not been found", command);
         return NULL;
     }
     if (!cmd->enabled) {
diff --git a/qjson.c b/qjson.c
new file mode 100644
index 0000000..b242222
--- /dev/null
+++ b/qjson.c
@@ -0,0 +1,129 @@
+/*
+ * QEMU JSON writer
+ *
+ * Copyright Alexander Graf
+ *
+ * Authors:
+ *  Alexander Graf <agraf@suse.de
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include <qapi/qmp/qstring.h>
+#include <stdbool.h>
+#include <glib.h>
+#include <qjson.h>
+#include <qemu/module.h>
+#include <qom/object.h>
+
+struct QJSON {
+    Object obj;
+    QString *str;
+    bool omit_comma;
+};
+
+static void json_emit_element(QJSON *json, const char *name)
+{
+    /* Check whether we need to print a , before an element */
+    if (json->omit_comma) {
+        json->omit_comma = false;
+    } else {
+        qstring_append(json->str, ", ");
+    }
+
+    if (name) {
+        qstring_append(json->str, "\"");
+        qstring_append(json->str, name);
+        qstring_append(json->str, "\" : ");
+    }
+}
+
+void json_start_object(QJSON *json, const char *name)
+{
+    json_emit_element(json, name);
+    qstring_append(json->str, "{ ");
+    json->omit_comma = true;
+}
+
+void json_end_object(QJSON *json)
+{
+    qstring_append(json->str, " }");
+    json->omit_comma = false;
+}
+
+void json_start_array(QJSON *json, const char *name)
+{
+    json_emit_element(json, name);
+    qstring_append(json->str, "[ ");
+    json->omit_comma = true;
+}
+
+void json_end_array(QJSON *json)
+{
+    qstring_append(json->str, " ]");
+    json->omit_comma = false;
+}
+
+void json_prop_int(QJSON *json, const char *name, int64_t val)
+{
+    json_emit_element(json, name);
+    qstring_append_int(json->str, val);
+}
+
+void json_prop_str(QJSON *json, const char *name, const char *str)
+{
+    json_emit_element(json, name);
+    qstring_append_chr(json->str, '"');
+    qstring_append(json->str, str);
+    qstring_append_chr(json->str, '"');
+}
+
+const char *qjson_get_str(QJSON *json)
+{
+    return qstring_get_str(json->str);
+}
+
+QJSON *qjson_new(void)
+{
+    QJSON *json = (QJSON *)object_new(TYPE_QJSON);
+    return json;
+}
+
+void qjson_finish(QJSON *json)
+{
+    json_end_object(json);
+}
+
+static void qjson_initfn(Object *obj)
+{
+    QJSON *json = (QJSON *)object_dynamic_cast(obj, TYPE_QJSON);
+    assert(json);
+
+    json->str = qstring_from_str("{ ");
+    json->omit_comma = true;
+}
+
+static void qjson_finalizefn(Object *obj)
+{
+    QJSON *json = (QJSON *)object_dynamic_cast(obj, TYPE_QJSON);
+
+    assert(json);
+    qobject_decref(QOBJECT(json->str));
+}
+
+static const TypeInfo qjson_type_info = {
+    .name = TYPE_QJSON,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(QJSON),
+    .instance_init = qjson_initfn,
+    .instance_finalize = qjson_finalizefn,
+};
+
+static void qjson_register_types(void)
+{
+    type_register_static(&qjson_type_info);
+}
+
+type_init(qjson_register_types)
diff --git a/qmp.c b/qmp.c
index 963305c..7f2d25a 100644
--- a/qmp.c
+++ b/qmp.c
@@ -137,14 +137,18 @@
 #endif
 
 #ifndef CONFIG_SPICE
-/* If SPICE support is enabled, the "true" query-spice command is
-   defined in the SPICE subsystem. Also note that we use a small
-   trick to maintain query-spice's original behavior, which is not
-   to be available in the namespace if SPICE is not compiled in */
+/*
+ * qmp-commands.hx ensures that QMP command query-spice exists only
+ * #ifdef CONFIG_SPICE.  Necessary for an accurate query-commands
+ * result.  However, the QAPI schema is blissfully unaware of that,
+ * and the QAPI code generator happily generates a dead
+ * qmp_marshal_input_query_spice() that calls qmp_query_spice().
+ * Provide it one, or else linking fails.
+ * FIXME Educate the QAPI schema on CONFIG_SPICE.
+ */
 SpiceInfo *qmp_query_spice(Error **errp)
 {
-    error_set(errp, QERR_COMMAND_NOT_FOUND, "query-spice");
-    return NULL;
+    abort();
 };
 #endif
 
@@ -287,9 +291,7 @@
     }
 
     if (strcmp(protocol, "spice") == 0) {
-        if (!using_spice) {
-            /* correct one? spice isn't a device ,,, */
-            error_set(errp, QERR_DEVICE_NOT_ACTIVE, "spice");
+        if (!qemu_using_spice(errp)) {
             return;
         }
         rc = qemu_spice_set_passwd(password, fail_if_connected,
@@ -335,9 +337,7 @@
     }
 
     if (strcmp(protocol, "spice") == 0) {
-        if (!using_spice) {
-            /* correct one? spice isn't a device ,,, */
-            error_set(errp, QERR_DEVICE_NOT_ACTIVE, "spice");
+        if (!qemu_using_spice(errp)) {
             return;
         }
         rc = qemu_spice_set_pw_expire(when);
@@ -575,8 +575,7 @@
     }
 
     if (strcmp(protocol, "spice") == 0) {
-        if (!using_spice) {
-            error_set(errp, QERR_DEVICE_NOT_ACTIVE, "spice");
+        if (!qemu_using_spice(errp)) {
             close(fd);
             return;
         }
diff --git a/savevm.c b/savevm.c
index 08ec678..8040766 100644
--- a/savevm.c
+++ b/savevm.c
@@ -572,14 +572,34 @@
     return vmstate_load_state(f, se->vmsd, se->opaque, version_id);
 }
 
-static void vmstate_save(QEMUFile *f, SaveStateEntry *se)
+static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
+{
+    int64_t old_offset, size;
+
+    old_offset = qemu_ftell_fast(f);
+    se->ops->save_state(f, se->opaque);
+    size = qemu_ftell_fast(f) - old_offset;
+
+    if (vmdesc) {
+        json_prop_int(vmdesc, "size", size);
+        json_start_array(vmdesc, "fields");
+        json_start_object(vmdesc, NULL);
+        json_prop_str(vmdesc, "name", "data");
+        json_prop_int(vmdesc, "size", size);
+        json_prop_str(vmdesc, "type", "buffer");
+        json_end_object(vmdesc);
+        json_end_array(vmdesc);
+    }
+}
+
+static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
 {
     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
-    if (!se->vmsd) {         /* Old style */
-        se->ops->save_state(f, se->opaque);
+    if (!se->vmsd) {
+        vmstate_save_old_style(f, se, vmdesc);
         return;
     }
-    vmstate_save_state(f, se->vmsd, se->opaque);
+    vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 }
 
 bool qemu_savevm_state_blocked(Error **errp)
@@ -674,7 +694,7 @@
         qemu_put_be32(f, se->section_id);
 
         ret = se->ops->save_live_iterate(f, se->opaque);
-        trace_savevm_section_end(se->idstr, se->section_id);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
 
         if (ret < 0) {
             qemu_file_set_error(f, ret);
@@ -692,6 +712,8 @@
 
 void qemu_savevm_state_complete(QEMUFile *f)
 {
+    QJSON *vmdesc;
+    int vmdesc_len;
     SaveStateEntry *se;
     int ret;
 
@@ -714,13 +736,16 @@
         qemu_put_be32(f, se->section_id);
 
         ret = se->ops->save_live_complete(f, se->opaque);
-        trace_savevm_section_end(se->idstr, se->section_id);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
         if (ret < 0) {
             qemu_file_set_error(f, ret);
             return;
         }
     }
 
+    vmdesc = qjson_new();
+    json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE);
+    json_start_array(vmdesc, "devices");
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         int len;
 
@@ -728,6 +753,11 @@
             continue;
         }
         trace_savevm_section_start(se->idstr, se->section_id);
+
+        json_start_object(vmdesc, NULL);
+        json_prop_str(vmdesc, "name", se->idstr);
+        json_prop_int(vmdesc, "instance_id", se->instance_id);
+
         /* Section type */
         qemu_put_byte(f, QEMU_VM_SECTION_FULL);
         qemu_put_be32(f, se->section_id);
@@ -740,11 +770,23 @@
         qemu_put_be32(f, se->instance_id);
         qemu_put_be32(f, se->version_id);
 
-        vmstate_save(f, se);
-        trace_savevm_section_end(se->idstr, se->section_id);
+        vmstate_save(f, se, vmdesc);
+
+        json_end_object(vmdesc);
+        trace_savevm_section_end(se->idstr, se->section_id, 0);
     }
 
     qemu_put_byte(f, QEMU_VM_EOF);
+
+    json_end_array(vmdesc);
+    qjson_finish(vmdesc);
+    vmdesc_len = strlen(qjson_get_str(vmdesc));
+
+    qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
+    qemu_put_be32(f, vmdesc_len);
+    qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
+    object_unref(OBJECT(vmdesc));
+
     qemu_fflush(f);
 }
 
@@ -843,7 +885,7 @@
         qemu_put_be32(f, se->instance_id);
         qemu_put_be32(f, se->version_id);
 
-        vmstate_save(f, se);
+        vmstate_save(f, se, NULL);
     }
 
     qemu_put_byte(f, QEMU_VM_EOF);
@@ -883,25 +925,30 @@
     QLIST_HEAD(, LoadStateEntry) loadvm_handlers =
         QLIST_HEAD_INITIALIZER(loadvm_handlers);
     LoadStateEntry *le, *new_le;
+    Error *local_err = NULL;
     uint8_t section_type;
     unsigned int v;
     int ret;
 
-    if (qemu_savevm_state_blocked(NULL)) {
+    if (qemu_savevm_state_blocked(&local_err)) {
+        error_report("%s", error_get_pretty(local_err));
+        error_free(local_err);
         return -EINVAL;
     }
 
     v = qemu_get_be32(f);
     if (v != QEMU_VM_FILE_MAGIC) {
+        error_report("Not a migration stream");
         return -EINVAL;
     }
 
     v = qemu_get_be32(f);
     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
-        fprintf(stderr, "SaveVM v2 format is obsolete and don't work anymore\n");
+        error_report("SaveVM v2 format is obsolete and don't work anymore");
         return -ENOTSUP;
     }
     if (v != QEMU_VM_FILE_VERSION) {
+        error_report("Unsupported migration stream version");
         return -ENOTSUP;
     }
 
@@ -911,6 +958,7 @@
         char idstr[257];
         int len;
 
+        trace_qemu_loadvm_state_section(section_type);
         switch (section_type) {
         case QEMU_VM_SECTION_START:
         case QEMU_VM_SECTION_FULL:
@@ -922,18 +970,21 @@
             instance_id = qemu_get_be32(f);
             version_id = qemu_get_be32(f);
 
+            trace_qemu_loadvm_state_section_startfull(section_id, idstr,
+                                                      instance_id, version_id);
             /* Find savevm section */
             se = find_se(idstr, instance_id);
             if (se == NULL) {
-                fprintf(stderr, "Unknown savevm section or instance '%s' %d\n", idstr, instance_id);
+                error_report("Unknown savevm section or instance '%s' %d",
+                             idstr, instance_id);
                 ret = -EINVAL;
                 goto out;
             }
 
             /* Validate version */
             if (version_id > se->version_id) {
-                fprintf(stderr, "savevm: unsupported version %d for '%s' v%d\n",
-                        version_id, idstr, se->version_id);
+                error_report("savevm: unsupported version %d for '%s' v%d",
+                             version_id, idstr, se->version_id);
                 ret = -EINVAL;
                 goto out;
             }
@@ -948,8 +999,8 @@
 
             ret = vmstate_load(f, le->se, le->version_id);
             if (ret < 0) {
-                fprintf(stderr, "qemu: warning: error while loading state for instance 0x%x of device '%s'\n",
-                        instance_id, idstr);
+                error_report("error while loading state for instance 0x%x of"
+                             " device '%s'", instance_id, idstr);
                 goto out;
             }
             break;
@@ -957,26 +1008,27 @@
         case QEMU_VM_SECTION_END:
             section_id = qemu_get_be32(f);
 
+            trace_qemu_loadvm_state_section_partend(section_id);
             QLIST_FOREACH(le, &loadvm_handlers, entry) {
                 if (le->section_id == section_id) {
                     break;
                 }
             }
             if (le == NULL) {
-                fprintf(stderr, "Unknown savevm section %d\n", section_id);
+                error_report("Unknown savevm section %d", section_id);
                 ret = -EINVAL;
                 goto out;
             }
 
             ret = vmstate_load(f, le->se, le->version_id);
             if (ret < 0) {
-                fprintf(stderr, "qemu: warning: error while loading state section id %d\n",
-                        section_id);
+                error_report("error while loading state section id %d(%s)",
+                             section_id, le->se->idstr);
                 goto out;
             }
             break;
         default:
-            fprintf(stderr, "Unknown savevm section type %d\n", section_type);
+            error_report("Unknown savevm section type %d", section_type);
             ret = -EINVAL;
             goto out;
         }
diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
new file mode 100755
index 0000000..b8b9968
--- /dev/null
+++ b/scripts/analyze-migration.py
@@ -0,0 +1,592 @@
+#!/usr/bin/env python
+#
+#  Migration Stream Analyzer
+#
+#  Copyright (c) 2015 Alexander Graf <agraf@suse.de>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+import json
+import os
+import argparse
+import collections
+import pprint
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError:
+        pass
+
+class MigrationFile(object):
+    def __init__(self, filename):
+        self.filename = filename
+        self.file = open(self.filename, "rb")
+
+    def read64(self):
+        return np.asscalar(np.fromfile(self.file, count=1, dtype='>i8')[0])
+
+    def read32(self):
+        return np.asscalar(np.fromfile(self.file, count=1, dtype='>i4')[0])
+
+    def read16(self):
+        return np.asscalar(np.fromfile(self.file, count=1, dtype='>i2')[0])
+
+    def read8(self):
+        return np.asscalar(np.fromfile(self.file, count=1, dtype='>i1')[0])
+
+    def readstr(self, len = None):
+        if len is None:
+            len = self.read8()
+        if len == 0:
+            return ""
+        return np.fromfile(self.file, count=1, dtype=('S%d' % len))[0]
+
+    def readvar(self, size = None):
+        if size is None:
+            size = self.read8()
+        if size == 0:
+            return ""
+        value = self.file.read(size)
+        if len(value) != size:
+            raise Exception("Unexpected end of %s at 0x%x" % (self.filename, self.file.tell()))
+        return value
+
+    def tell(self):
+        return self.file.tell()
+
+    # The VMSD description is at the end of the file, after EOF. Look for
+    # the last NULL byte, then for the beginning brace of JSON.
+    def read_migration_debug_json(self):
+        QEMU_VM_VMDESCRIPTION = 0x06
+
+        # Remember the offset in the file when we started
+        entrypos = self.file.tell()
+
+        # Read the last 10MB
+        self.file.seek(0, os.SEEK_END)
+        endpos = self.file.tell()
+        self.file.seek(max(-endpos, -10 * 1024 * 1024), os.SEEK_END)
+        datapos = self.file.tell()
+        data = self.file.read()
+        # The full file read closed the file as well, reopen it
+        self.file = open(self.filename, "rb")
+
+        # Find the last NULL byte, then the first brace after that. This should
+        # be the beginning of our JSON data.
+        nulpos = data.rfind("\0")
+        jsonpos = data.find("{", nulpos)
+
+        # Check backwards from there and see whether we guessed right
+        self.file.seek(datapos + jsonpos - 5, 0)
+        if self.read8() != QEMU_VM_VMDESCRIPTION:
+            raise Exception("No Debug Migration device found")
+
+        jsonlen = self.read32()
+
+        # Seek back to where we were at the beginning
+        self.file.seek(entrypos, 0)
+
+        return data[jsonpos:jsonpos + jsonlen]
+
+    def close(self):
+        self.file.close()
+
+class RamSection(object):
+    RAM_SAVE_FLAG_COMPRESS = 0x02
+    RAM_SAVE_FLAG_MEM_SIZE = 0x04
+    RAM_SAVE_FLAG_PAGE     = 0x08
+    RAM_SAVE_FLAG_EOS      = 0x10
+    RAM_SAVE_FLAG_CONTINUE = 0x20
+    RAM_SAVE_FLAG_XBZRLE   = 0x40
+    RAM_SAVE_FLAG_HOOK     = 0x80
+
+    def __init__(self, file, version_id, ramargs, section_key):
+        if version_id != 4:
+            raise Exception("Unknown RAM version %d" % version_id)
+
+        self.file = file
+        self.section_key = section_key
+        self.TARGET_PAGE_SIZE = ramargs['page_size']
+        self.dump_memory = ramargs['dump_memory']
+        self.write_memory = ramargs['write_memory']
+        self.sizeinfo = collections.OrderedDict()
+        self.data = collections.OrderedDict()
+        self.data['section sizes'] = self.sizeinfo
+        self.name = ''
+        if self.write_memory:
+            self.files = { }
+        if self.dump_memory:
+            self.memory = collections.OrderedDict()
+            self.data['memory'] = self.memory
+
+    def __repr__(self):
+        return self.data.__repr__()
+
+    def __str__(self):
+        return self.data.__str__()
+
+    def getDict(self):
+        return self.data
+
+    def read(self):
+        # Read all RAM sections
+        while True:
+            addr = self.file.read64()
+            flags = addr & (self.TARGET_PAGE_SIZE - 1)
+            addr &= ~(self.TARGET_PAGE_SIZE - 1)
+
+            if flags & self.RAM_SAVE_FLAG_MEM_SIZE:
+                while True:
+                    namelen = self.file.read8()
+                    # We assume that no RAM chunk is big enough to ever
+                    # hit the first byte of the address, so when we see
+                    # a zero here we know it has to be an address, not the
+                    # length of the next block.
+                    if namelen == 0:
+                        self.file.file.seek(-1, 1)
+                        break
+                    self.name = self.file.readstr(len = namelen)
+                    len = self.file.read64()
+                    self.sizeinfo[self.name] = '0x%016x' % len
+                    if self.write_memory:
+                        print self.name
+                        mkdir_p('./' + os.path.dirname(self.name))
+                        f = open('./' + self.name, "wb")
+                        f.truncate(0)
+                        f.truncate(len)
+                        self.files[self.name] = f
+                flags &= ~self.RAM_SAVE_FLAG_MEM_SIZE
+
+            if flags & self.RAM_SAVE_FLAG_COMPRESS:
+                if flags & self.RAM_SAVE_FLAG_CONTINUE:
+                    flags &= ~self.RAM_SAVE_FLAG_CONTINUE
+                else:
+                    self.name = self.file.readstr()
+                fill_char = self.file.read8()
+                # The page in question is filled with fill_char now
+                if self.write_memory and fill_char != 0:
+                    self.files[self.name].seek(addr, os.SEEK_SET)
+                    self.files[self.name].write(chr(fill_char) * self.TARGET_PAGE_SIZE)
+                if self.dump_memory:
+                    self.memory['%s (0x%016x)' % (self.name, addr)] = 'Filled with 0x%02x' % fill_char
+                flags &= ~self.RAM_SAVE_FLAG_COMPRESS
+            elif flags & self.RAM_SAVE_FLAG_PAGE:
+                if flags & self.RAM_SAVE_FLAG_CONTINUE:
+                    flags &= ~self.RAM_SAVE_FLAG_CONTINUE
+                else:
+                    self.name = self.file.readstr()
+
+                if self.write_memory or self.dump_memory:
+                    data = self.file.readvar(size = self.TARGET_PAGE_SIZE)
+                else: # Just skip RAM data
+                    self.file.file.seek(self.TARGET_PAGE_SIZE, 1)
+
+                if self.write_memory:
+                    self.files[self.name].seek(addr, os.SEEK_SET)
+                    self.files[self.name].write(data)
+                if self.dump_memory:
+                    hexdata = " ".join("{0:02x}".format(ord(c)) for c in data)
+                    self.memory['%s (0x%016x)' % (self.name, addr)] = hexdata
+
+                flags &= ~self.RAM_SAVE_FLAG_PAGE
+            elif flags & self.RAM_SAVE_FLAG_XBZRLE:
+                raise Exception("XBZRLE RAM compression is not supported yet")
+            elif flags & self.RAM_SAVE_FLAG_HOOK:
+                raise Exception("RAM hooks don't make sense with files")
+
+            # End of RAM section
+            if flags & self.RAM_SAVE_FLAG_EOS:
+                break
+
+            if flags != 0:
+                raise Exception("Unknown RAM flags: %x" % flags)
+
+    def __del__(self):
+        if self.write_memory:
+            for key in self.files:
+                self.files[key].close()
+
+
+class HTABSection(object):
+    HASH_PTE_SIZE_64       = 16
+
+    def __init__(self, file, version_id, device, section_key):
+        if version_id != 1:
+            raise Exception("Unknown HTAB version %d" % version_id)
+
+        self.file = file
+        self.section_key = section_key
+
+    def read(self):
+
+        header = self.file.read32()
+
+        if (header > 0):
+            # First section, just the hash shift
+            return
+
+        # Read until end marker
+        while True:
+            index = self.file.read32()
+            n_valid = self.file.read16()
+            n_invalid = self.file.read16()
+
+            if index == 0 and n_valid == 0 and n_invalid == 0:
+                break
+
+            self.file.readvar(n_valid * HASH_PTE_SIZE_64)
+
+    def getDict(self):
+        return ""
+
+class VMSDFieldGeneric(object):
+    def __init__(self, desc, file):
+        self.file = file
+        self.desc = desc
+        self.data = ""
+
+    def __repr__(self):
+        return str(self.__str__())
+
+    def __str__(self):
+        return " ".join("{0:02x}".format(ord(c)) for c in self.data)
+
+    def getDict(self):
+        return self.__str__()
+
+    def read(self):
+        size = int(self.desc['size'])
+        self.data = self.file.readvar(size)
+        return self.data
+
+class VMSDFieldInt(VMSDFieldGeneric):
+    def __init__(self, desc, file):
+        super(VMSDFieldInt, self).__init__(desc, file)
+        self.size = int(desc['size'])
+        self.format = '0x%%0%dx' % (self.size * 2)
+        self.sdtype = '>i%d' % self.size
+        self.udtype = '>u%d' % self.size
+
+    def __repr__(self):
+        if self.data < 0:
+            return ('%s (%d)' % ((self.format % self.udata), self.data))
+        else:
+            return self.format % self.data
+
+    def __str__(self):
+        return self.__repr__()
+
+    def getDict(self):
+        return self.__str__()
+
+    def read(self):
+        super(VMSDFieldInt, self).read()
+        self.sdata = np.fromstring(self.data, count=1, dtype=(self.sdtype))[0]
+        self.udata = np.fromstring(self.data, count=1, dtype=(self.udtype))[0]
+        self.data = self.sdata
+        return self.data
+
+class VMSDFieldUInt(VMSDFieldInt):
+    def __init__(self, desc, file):
+        super(VMSDFieldUInt, self).__init__(desc, file)
+
+    def read(self):
+        super(VMSDFieldUInt, self).read()
+        self.data = self.udata
+        return self.data
+
+class VMSDFieldIntLE(VMSDFieldInt):
+    def __init__(self, desc, file):
+        super(VMSDFieldIntLE, self).__init__(desc, file)
+        self.dtype = '<i%d' % self.size
+
+class VMSDFieldBool(VMSDFieldGeneric):
+    def __init__(self, desc, file):
+        super(VMSDFieldBool, self).__init__(desc, file)
+
+    def __repr__(self):
+        return self.data.__repr__()
+
+    def __str__(self):
+        return self.data.__str__()
+
+    def getDict(self):
+        return self.data
+
+    def read(self):
+        super(VMSDFieldBool, self).read()
+        if self.data[0] == 0:
+            self.data = False
+        else:
+            self.data = True
+        return self.data
+
+class VMSDFieldStruct(VMSDFieldGeneric):
+    QEMU_VM_SUBSECTION    = 0x05
+
+    def __init__(self, desc, file):
+        super(VMSDFieldStruct, self).__init__(desc, file)
+        self.data = collections.OrderedDict()
+
+        # When we see compressed array elements, unfold them here
+        new_fields = []
+        for field in self.desc['struct']['fields']:
+            if not 'array_len' in field:
+                new_fields.append(field)
+                continue
+            array_len = field.pop('array_len')
+            field['index'] = 0
+            new_fields.append(field)
+            for i in xrange(1, array_len):
+                c = field.copy()
+                c['index'] = i
+                new_fields.append(c)
+
+        self.desc['struct']['fields'] = new_fields
+
+    def __repr__(self):
+        return self.data.__repr__()
+
+    def __str__(self):
+        return self.data.__str__()
+
+    def read(self):
+        for field in self.desc['struct']['fields']:
+            try:
+                reader = vmsd_field_readers[field['type']]
+            except:
+                reader = VMSDFieldGeneric
+
+            field['data'] = reader(field, self.file)
+            field['data'].read()
+
+            if 'index' in field:
+                if field['name'] not in self.data:
+                    self.data[field['name']] = []
+                a = self.data[field['name']]
+                if len(a) != int(field['index']):
+                    raise Exception("internal index of data field unmatched (%d/%d)" % (len(a), int(field['index'])))
+                a.append(field['data'])
+            else:
+                self.data[field['name']] = field['data']
+
+        if 'subsections' in self.desc['struct']:
+            for subsection in self.desc['struct']['subsections']:
+                if self.file.read8() != self.QEMU_VM_SUBSECTION:
+                    raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell()))
+                name = self.file.readstr()
+                version_id = self.file.read32()
+                self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0))
+                self.data[name].read()
+
+    def getDictItem(self, value):
+       # Strings would fall into the array category, treat
+       # them specially
+       if value.__class__ is ''.__class__:
+           return value
+
+       try:
+           return self.getDictOrderedDict(value)
+       except:
+           try:
+               return self.getDictArray(value)
+           except:
+               try:
+                   return value.getDict()
+               except:
+                   return value
+
+    def getDictArray(self, array):
+        r = []
+        for value in array:
+           r.append(self.getDictItem(value))
+        return r
+
+    def getDictOrderedDict(self, dict):
+        r = collections.OrderedDict()
+        for (key, value) in dict.items():
+            r[key] = self.getDictItem(value)
+        return r
+
+    def getDict(self):
+        return self.getDictOrderedDict(self.data)
+
+vmsd_field_readers = {
+    "bool" : VMSDFieldBool,
+    "int8" : VMSDFieldInt,
+    "int16" : VMSDFieldInt,
+    "int32" : VMSDFieldInt,
+    "int32 equal" : VMSDFieldInt,
+    "int32 le" : VMSDFieldIntLE,
+    "int64" : VMSDFieldInt,
+    "uint8" : VMSDFieldUInt,
+    "uint16" : VMSDFieldUInt,
+    "uint32" : VMSDFieldUInt,
+    "uint32 equal" : VMSDFieldUInt,
+    "uint64" : VMSDFieldUInt,
+    "int64 equal" : VMSDFieldInt,
+    "uint8 equal" : VMSDFieldInt,
+    "uint16 equal" : VMSDFieldInt,
+    "float64" : VMSDFieldGeneric,
+    "timer" : VMSDFieldGeneric,
+    "buffer" : VMSDFieldGeneric,
+    "unused_buffer" : VMSDFieldGeneric,
+    "bitmap" : VMSDFieldGeneric,
+    "struct" : VMSDFieldStruct,
+    "unknown" : VMSDFieldGeneric,
+}
+
+class VMSDSection(VMSDFieldStruct):
+    def __init__(self, file, version_id, device, section_key):
+        self.file = file
+        self.data = ""
+        self.vmsd_name = ""
+        self.section_key = section_key
+        desc = device
+        if 'vmsd_name' in device:
+            self.vmsd_name = device['vmsd_name']
+
+        # A section really is nothing but a FieldStruct :)
+        super(VMSDSection, self).__init__({ 'struct' : desc }, file)
+
+###############################################################################
+
+class MigrationDump(object):
+    QEMU_VM_FILE_MAGIC    = 0x5145564d
+    QEMU_VM_FILE_VERSION  = 0x00000003
+    QEMU_VM_EOF           = 0x00
+    QEMU_VM_SECTION_START = 0x01
+    QEMU_VM_SECTION_PART  = 0x02
+    QEMU_VM_SECTION_END   = 0x03
+    QEMU_VM_SECTION_FULL  = 0x04
+    QEMU_VM_SUBSECTION    = 0x05
+    QEMU_VM_VMDESCRIPTION = 0x06
+
+    def __init__(self, filename):
+        self.section_classes = { ( 'ram', 0 ) : [ RamSection, None ],
+                                 ( 'spapr/htab', 0) : ( HTABSection, None ) }
+        self.filename = filename
+        self.vmsd_desc = None
+
+    def read(self, desc_only = False, dump_memory = False, write_memory = False):
+        # Read in the whole file
+        file = MigrationFile(self.filename)
+
+        # File magic
+        data = file.read32()
+        if data != self.QEMU_VM_FILE_MAGIC:
+            raise Exception("Invalid file magic %x" % data)
+
+        # Version (has to be v3)
+        data = file.read32()
+        if data != self.QEMU_VM_FILE_VERSION:
+            raise Exception("Invalid version number %d" % data)
+
+        self.load_vmsd_json(file)
+
+        # Read sections
+        self.sections = collections.OrderedDict()
+
+        if desc_only:
+            return
+
+        ramargs = {}
+        ramargs['page_size'] = self.vmsd_desc['page_size']
+        ramargs['dump_memory'] = dump_memory
+        ramargs['write_memory'] = write_memory
+        self.section_classes[('ram',0)][1] = ramargs
+
+        while True:
+            section_type = file.read8()
+            if section_type == self.QEMU_VM_EOF:
+                break
+            elif section_type == self.QEMU_VM_SECTION_START or section_type == self.QEMU_VM_SECTION_FULL:
+                section_id = file.read32()
+                name = file.readstr()
+                instance_id = file.read32()
+                version_id = file.read32()
+                section_key = (name, instance_id)
+                classdesc = self.section_classes[section_key]
+                section = classdesc[0](file, version_id, classdesc[1], section_key)
+                self.sections[section_id] = section
+                section.read()
+            elif section_type == self.QEMU_VM_SECTION_PART or section_type == self.QEMU_VM_SECTION_END:
+                section_id = file.read32()
+                self.sections[section_id].read()
+            else:
+                raise Exception("Unknown section type: %d" % section_type)
+        file.close()
+
+    def load_vmsd_json(self, file):
+        vmsd_json = file.read_migration_debug_json()
+        self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict)
+        for device in self.vmsd_desc['devices']:
+            key = (device['name'], device['instance_id'])
+            value = ( VMSDSection, device )
+            self.section_classes[key] = value
+
+    def getDict(self):
+        r = collections.OrderedDict()
+        for (key, value) in self.sections.items():
+           key = "%s (%d)" % ( value.section_key[0], key )
+           r[key] = value.getDict()
+        return r
+
+###############################################################################
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, VMSDFieldGeneric):
+            return str(o)
+        return json.JSONEncoder.default(self, o)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-f", "--file", help='migration dump to read from', required=True)
+parser.add_argument("-m", "--memory", help='dump RAM contents as well', action='store_true')
+parser.add_argument("-d", "--dump", help='what to dump ("state" or "desc")', default='state')
+parser.add_argument("-x", "--extract", help='extract contents into individual files', action='store_true')
+args = parser.parse_args()
+
+jsonenc = JSONEncoder(indent=4, separators=(',', ': '))
+
+if args.extract:
+    dump = MigrationDump(args.file)
+
+    dump.read(desc_only = True)
+    print "desc.json"
+    f = open("desc.json", "wb")
+    f.truncate()
+    f.write(jsonenc.encode(dump.vmsd_desc))
+    f.close()
+
+    dump.read(write_memory = True)
+    dict = dump.getDict()
+    print "state.json"
+    f = open("state.json", "wb")
+    f.truncate()
+    f.write(jsonenc.encode(dict))
+    f.close()
+elif args.dump == "state":
+    dump = MigrationDump(args.file)
+    dump.read(dump_memory = args.memory)
+    dict = dump.getDict()
+    print jsonenc.encode(dict)
+elif args.dump == "desc":
+    dump = MigrationDump(args.file)
+    dump.read(desc_only = True)
+    print jsonenc.encode(dump.vmsd_desc)
+else:
+    raise Exception("Please specify either -x, -d state or -d dump")
diff --git a/scripts/coverity-model.c b/scripts/coverity-model.c
index 4c99a85..58356af 100644
--- a/scripts/coverity-model.c
+++ b/scripts/coverity-model.c
@@ -40,6 +40,8 @@
 typedef long long int64_t;
 typedef _Bool bool;
 
+typedef struct va_list_str *va_list;
+
 /* exec.c */
 
 typedef struct AddressSpace AddressSpace;
@@ -90,7 +92,8 @@
     }
 }
 
-/* glib memory allocation functions.
+/*
+ * GLib memory allocation functions.
  *
  * Note that we ignore the fact that g_malloc of 0 bytes returns NULL,
  * and g_realloc of 0 bytes frees the pointer.
@@ -107,60 +110,215 @@
  * we'll get a buffer overflow reported anyway.
  */
 
-void *malloc(size_t);
-void *calloc(size_t, size_t);
-void *realloc(void *, size_t);
-void free(void *);
+/*
+ * Allocation primitives, cannot return NULL
+ * See also Coverity's library/generic/libc/all/all.c
+ */
 
-void *
-g_malloc(size_t n_bytes)
+void *g_malloc_n(size_t nmemb, size_t size)
 {
-    void *mem;
-    __coverity_negative_sink__(n_bytes);
-    mem = malloc(n_bytes == 0 ? 1 : n_bytes);
-    if (!mem) __coverity_panic__();
-    return mem;
+    size_t sz;
+    void *ptr;
+
+    __coverity_negative_sink__(nmemb);
+    __coverity_negative_sink__(size);
+    sz = nmemb * size;
+    ptr = __coverity_alloc__(size);
+    __coverity_mark_as_uninitialized_buffer__(ptr);
+    __coverity_mark_as_afm_allocated__(ptr, "g_free");
+    return ptr;
 }
 
-void *
-g_malloc0(size_t n_bytes)
+void *g_malloc0_n(size_t nmemb, size_t size)
 {
-    void *mem;
-    __coverity_negative_sink__(n_bytes);
-    mem = calloc(1, n_bytes == 0 ? 1 : n_bytes);
-    if (!mem) __coverity_panic__();
-    return mem;
+    size_t sz;
+    void *ptr;
+
+    __coverity_negative_sink__(nmemb);
+    __coverity_negative_sink__(size);
+    sz = nmemb * size;
+    ptr = __coverity_alloc__(size);
+    __coverity_writeall0__(ptr);
+    __coverity_mark_as_afm_allocated__(ptr, "g_free");
+    return ptr;
 }
 
-void g_free(void *mem)
+void *g_realloc_n(void *ptr, size_t nmemb, size_t size)
 {
-    free(mem);
+    size_t sz;
+
+    __coverity_negative_sink__(nmemb);
+    __coverity_negative_sink__(size);
+    sz = nmemb * size;
+    __coverity_escape__(ptr);
+    ptr = __coverity_alloc__(size);
+    /*
+     * Memory beyond the old size isn't actually initialized.  Can't
+     * model that.  See Coverity's realloc() model
+     */
+    __coverity_writeall__(ptr);
+    __coverity_mark_as_afm_allocated__(ptr, "g_free");
+    return ptr;
 }
 
-void *g_realloc(void * mem, size_t n_bytes)
+void g_free(void *ptr)
 {
-    __coverity_negative_sink__(n_bytes);
-    mem = realloc(mem, n_bytes == 0 ? 1 : n_bytes);
-    if (!mem) __coverity_panic__();
-    return mem;
+    __coverity_free__(ptr);
+    __coverity_mark_as_afm_freed__(ptr, "g_free");
 }
 
-void *g_try_malloc(size_t n_bytes)
+/*
+ * Derive the g_try_FOO_n() from the g_FOO_n() by adding indeterminate
+ * out of memory conditions
+ */
+
+void *g_try_malloc_n(size_t nmemb, size_t size)
 {
-    __coverity_negative_sink__(n_bytes);
-    return malloc(n_bytes == 0 ? 1 : n_bytes);
+    int nomem;
+
+    if (nomem) {
+        return NULL;
+    }
+    return g_malloc_n(nmemb, size);
 }
 
-void *g_try_malloc0(size_t n_bytes)
+void *g_try_malloc0_n(size_t nmemb, size_t size)
 {
-    __coverity_negative_sink__(n_bytes);
-    return calloc(1, n_bytes == 0 ? 1 : n_bytes);
+    int nomem;
+
+    if (nomem) {
+        return NULL;
+    }
+    return g_malloc0_n(nmemb, size);
 }
 
-void *g_try_realloc(void *mem, size_t n_bytes)
+void *g_try_realloc_n(void *ptr, size_t nmemb, size_t size)
 {
-    __coverity_negative_sink__(n_bytes);
-    return realloc(mem, n_bytes == 0 ? 1 : n_bytes);
+    int nomem;
+
+    if (nomem) {
+        return NULL;
+    }
+    return g_realloc_n(ptr, nmemb, size);
+}
+
+/* Trivially derive the g_FOO() from the g_FOO_n() */
+
+void *g_malloc(size_t size)
+{
+    return g_malloc_n(1, size);
+}
+
+void *g_malloc0(size_t size)
+{
+    return g_malloc0_n(1, size);
+}
+
+void *g_realloc(void *ptr, size_t size)
+{
+    return g_realloc_n(ptr, 1, size);
+}
+
+void *g_try_malloc(size_t size)
+{
+    return g_try_malloc_n(1, size);
+}
+
+void *g_try_malloc0(size_t size)
+{
+    return g_try_malloc0_n(1, size);
+}
+
+void *g_try_realloc(void *ptr, size_t size)
+{
+    return g_try_realloc_n(ptr, 1, size);
+}
+
+/*
+ * GLib string allocation functions
+ */
+
+char *g_strdup(const char *s)
+{
+    char *dup;
+    size_t i;
+
+    if (!s) {
+        return NULL;
+    }
+
+    __coverity_string_null_sink__(s);
+    __coverity_string_size_sink__(s);
+    dup = __coverity_alloc_nosize__();
+    __coverity_mark_as_afm_allocated__(dup, "g_free");
+    for (i = 0; (dup[i] = s[i]); i++) ;
+    return dup;
+}
+
+char *g_strndup(const char *s, size_t n)
+{
+    char *dup;
+    size_t i;
+
+    __coverity_negative_sink__(n);
+
+    if (!s) {
+        return NULL;
+    }
+
+    dup = g_malloc(n + 1);
+    for (i = 0; i < n && (dup[i] = s[i]); i++) ;
+    dup[i] = 0;
+    return dup;
+}
+
+char *g_strdup_printf(const char *format, ...)
+{
+    char ch, *s;
+    size_t len;
+
+    __coverity_string_null_sink__(format);
+    __coverity_string_size_sink__(format);
+
+    ch = *format;
+
+    s = __coverity_alloc_nosize__();
+    __coverity_writeall__(s);
+    __coverity_mark_as_afm_allocated__(s, "g_free");
+    return s;
+}
+
+char *g_strdup_vprintf(const char *format, va_list ap)
+{
+    char ch, *s;
+    size_t len;
+
+    __coverity_string_null_sink__(format);
+    __coverity_string_size_sink__(format);
+
+    ch = *format;
+    ch = *(char *)ap;
+
+    s = __coverity_alloc_nosize__();
+    __coverity_writeall__(s);
+    __coverity_mark_as_afm_allocated__(s, "g_free");
+
+    return len;
+}
+
+char *g_strconcat(const char *s, ...)
+{
+    char *s;
+
+    /*
+     * Can't model: last argument must be null, the others
+     * null-terminated strings
+     */
+
+    s = __coverity_alloc_nosize__();
+    __coverity_writeall__(s);
+    __coverity_mark_as_afm_allocated__(s, "g_free");
+    return s;
 }
 
 /* Other glib functions */
diff --git a/scripts/tracetool/backend/stderr.py b/scripts/tracetool/backend/stderr.py
index 2a1e906..ca58054 100644
--- a/scripts/tracetool/backend/stderr.py
+++ b/scripts/tracetool/backend/stderr.py
@@ -21,6 +21,9 @@
 
 def generate_h_begin(events):
     out('#include <stdio.h>',
+        '#include <sys/time.h>',
+        '#include <sys/types.h>',
+        '#include <unistd.h>',
         '#include "trace/control.h"',
         '')
 
@@ -31,7 +34,12 @@
         argnames = ", " + argnames
 
     out('    if (trace_event_get_state(%(event_id)s)) {',
-        '        fprintf(stderr, "%(name)s " %(fmt)s "\\n" %(argnames)s);',
+        '        struct timeval _now;',
+        '        gettimeofday(&_now, NULL);',
+        '        fprintf(stderr, "%%d@%%zd.%%06zd:%(name)s " %(fmt)s "\\n",',
+        '                        getpid(),',
+        '                        (size_t)_now.tv_sec, (size_t)_now.tv_usec',
+        '                        %(argnames)s);',
         '    }',
         event_id="TRACE_" + event.name.upper(),
         name=event.name,
diff --git a/scripts/vmstate-static-checker.py b/scripts/vmstate-static-checker.py
index f7ce3fc..b6c0bbe 100755
--- a/scripts/vmstate-static-checker.py
+++ b/scripts/vmstate-static-checker.py
@@ -53,6 +53,8 @@
                                        'parent_obj.parent_obj.parent_obj',
                                        'port.br.dev.exp.aer_log',
                                 'parent_obj.parent_obj.parent_obj.exp.aer_log'],
+        'cirrus_vga': ['hw_cursor_x', 'vga.hw_cursor_x',
+                       'hw_cursor_y', 'vga.hw_cursor_y'],
         'lsiscsi': ['dev', 'parent_obj'],
         'mch': ['d', 'parent_obj'],
         'pci_bridge': ['bridge.dev', 'parent_obj', 'bridge.dev.shpc', 'shpc'],
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 285947f..d38af74 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -113,7 +113,14 @@
         /* and to the FP/Neon instructions */
         env->cp15.c1_coproc = deposit64(env->cp15.c1_coproc, 20, 2, 3);
 #else
-        env->pstate = PSTATE_MODE_EL1h;
+        /* Reset into the highest available EL */
+        if (arm_feature(env, ARM_FEATURE_EL3)) {
+            env->pstate = PSTATE_MODE_EL3h;
+        } else if (arm_feature(env, ARM_FEATURE_EL2)) {
+            env->pstate = PSTATE_MODE_EL2h;
+        } else {
+            env->pstate = PSTATE_MODE_EL1h;
+        }
         env->pc = cpu->rvbar;
 #endif
     } else {
@@ -320,6 +327,29 @@
     kvm_set_irq(kvm_state, kvm_irq, level ? 1 : 0);
 #endif
 }
+
+static bool arm_cpu_is_big_endian(CPUState *cs)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+    int cur_el;
+
+    cpu_synchronize_state(cs);
+
+    /* In 32bit guest endianness is determined by looking at CPSR's E bit */
+    if (!is_a64(env)) {
+        return (env->uncached_cpsr & CPSR_E) ? 1 : 0;
+    }
+
+    cur_el = arm_current_el(env);
+
+    if (cur_el == 0) {
+        return (env->cp15.sctlr_el[1] & SCTLR_E0E) != 0;
+    }
+
+    return (env->cp15.sctlr_el[cur_el] & SCTLR_EE) != 0;
+}
+
 #endif
 
 static inline void set_feature(CPUARMState *env, int feature)
@@ -1189,6 +1219,7 @@
     cc->do_interrupt = arm_cpu_do_interrupt;
     cc->get_phys_page_debug = arm_cpu_get_phys_page_debug;
     cc->vmsd = &vmstate_arm_cpu;
+    cc->virtio_is_big_endian = arm_cpu_is_big_endian;
 #endif
     cc->gdb_num_core_regs = 26;
     cc->gdb_core_xml_file = "arm-core.xml";
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index cd7a9e8..1830a12 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -32,6 +32,8 @@
 #  define ELF_MACHINE EM_ARM
 #endif
 
+#define TARGET_IS_BIENDIAN 1
+
 #define CPUArchState struct CPUARMState
 
 #include "qemu-common.h"
@@ -98,7 +100,7 @@
 
 struct arm_boot_info;
 
-#define NB_MMU_MODES 4
+#define NB_MMU_MODES 7
 
 /* We currently assume float and double are IEEE single and double
    precision respectively.
@@ -1110,8 +1112,14 @@
  * a register definition to override a previous definition for the
  * same (cp, is64, crn, crm, opc1, opc2) tuple: either the new or the
  * old must have the OVERRIDE bit set.
- * NO_MIGRATE indicates that this register should be ignored for migration;
- * (eg because any state is accessed via some other coprocessor register).
+ * ALIAS indicates that this register is an alias view of some underlying
+ * state which is also visible via another register, and that the other
+ * register is handling migration; registers marked ALIAS will not be migrated
+ * but may have their state set by syncing of register state from KVM.
+ * NO_RAW indicates that this register has no underlying state and does not
+ * support raw access for state saving/loading; it will not be used for either
+ * migration or KVM state synchronization. (Typically this is for "registers"
+ * which are actually used as instructions for cache maintenance and so on.)
  * IO indicates that this register does I/O and therefore its accesses
  * need to be surrounded by gen_io_start()/gen_io_end(). In particular,
  * registers which implement clocks or timers require this.
@@ -1121,8 +1129,9 @@
 #define ARM_CP_64BIT 4
 #define ARM_CP_SUPPRESS_TB_END 8
 #define ARM_CP_OVERRIDE 16
-#define ARM_CP_NO_MIGRATE 32
+#define ARM_CP_ALIAS 32
 #define ARM_CP_IO 64
+#define ARM_CP_NO_RAW 128
 #define ARM_CP_NOP (ARM_CP_SPECIAL | (1 << 8))
 #define ARM_CP_WFI (ARM_CP_SPECIAL | (2 << 8))
 #define ARM_CP_NZCV (ARM_CP_SPECIAL | (3 << 8))
@@ -1132,7 +1141,7 @@
 /* Used only as a terminator for ARMCPRegInfo lists */
 #define ARM_CP_SENTINEL 0xffff
 /* Mask of only the flag bits in a type field */
-#define ARM_CP_FLAG_MASK 0x7f
+#define ARM_CP_FLAG_MASK 0xff
 
 /* Valid values for ARMCPRegInfo state field, indicating which of
  * the AArch32 and AArch64 execution states this register is visible in.
@@ -1211,6 +1220,10 @@
  */
 static inline int arm_current_el(CPUARMState *env)
 {
+    if (arm_feature(env, ARM_FEATURE_M)) {
+        return !((env->v7m.exception == 0) && (env->v7m.control & 1));
+    }
+
     if (is_a64(env)) {
         return extract32(env->pstate, 2, 2);
     }
@@ -1568,13 +1581,90 @@
 #define cpu_signal_handler cpu_arm_signal_handler
 #define cpu_list arm_cpu_list
 
-/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _user
-#define MMU_MODE1_SUFFIX _kernel
+/* ARM has the following "translation regimes" (as the ARM ARM calls them):
+ *
+ * If EL3 is 64-bit:
+ *  + NonSecure EL1 & 0 stage 1
+ *  + NonSecure EL1 & 0 stage 2
+ *  + NonSecure EL2
+ *  + Secure EL1 & EL0
+ *  + Secure EL3
+ * If EL3 is 32-bit:
+ *  + NonSecure PL1 & 0 stage 1
+ *  + NonSecure PL1 & 0 stage 2
+ *  + NonSecure PL2
+ *  + Secure PL0 & PL1
+ * (reminder: for 32 bit EL3, Secure PL1 is *EL3*, not EL1.)
+ *
+ * For QEMU, an mmu_idx is not quite the same as a translation regime because:
+ *  1. we need to split the "EL1 & 0" regimes into two mmu_idxes, because they
+ *     may differ in access permissions even if the VA->PA map is the same
+ *  2. we want to cache in our TLB the full VA->IPA->PA lookup for a stage 1+2
+ *     translation, which means that we have one mmu_idx that deals with two
+ *     concatenated translation regimes [this sort of combined s1+2 TLB is
+ *     architecturally permitted]
+ *  3. we don't need to allocate an mmu_idx to translations that we won't be
+ *     handling via the TLB. The only way to do a stage 1 translation without
+ *     the immediate stage 2 translation is via the ATS or AT system insns,
+ *     which can be slow-pathed and always do a page table walk.
+ *  4. we can also safely fold together the "32 bit EL3" and "64 bit EL3"
+ *     translation regimes, because they map reasonably well to each other
+ *     and they can't both be active at the same time.
+ * This gives us the following list of mmu_idx values:
+ *
+ * NS EL0 (aka NS PL0) stage 1+2
+ * NS EL1 (aka NS PL1) stage 1+2
+ * NS EL2 (aka NS PL2)
+ * S EL3 (aka S PL1)
+ * S EL0 (aka S PL0)
+ * S EL1 (not used if EL3 is 32 bit)
+ * NS EL0+1 stage 2
+ *
+ * (The last of these is an mmu_idx because we want to be able to use the TLB
+ * for the accesses done as part of a stage 1 page table walk, rather than
+ * having to walk the stage 2 page table over and over.)
+ *
+ * Our enumeration includes at the end some entries which are not "true"
+ * mmu_idx values in that they don't have corresponding TLBs and are only
+ * valid for doing slow path page table walks.
+ *
+ * The constant names here are patterned after the general style of the names
+ * of the AT/ATS operations.
+ * The values used are carefully arranged to make mmu_idx => EL lookup easy.
+ */
+typedef enum ARMMMUIdx {
+    ARMMMUIdx_S12NSE0 = 0,
+    ARMMMUIdx_S12NSE1 = 1,
+    ARMMMUIdx_S1E2 = 2,
+    ARMMMUIdx_S1E3 = 3,
+    ARMMMUIdx_S1SE0 = 4,
+    ARMMMUIdx_S1SE1 = 5,
+    ARMMMUIdx_S2NS = 6,
+    /* Indexes below here don't have TLBs and are used only for AT system
+     * instructions or for the first stage of an S12 page table walk.
+     */
+    ARMMMUIdx_S1NSE0 = 7,
+    ARMMMUIdx_S1NSE1 = 8,
+} ARMMMUIdx;
+
 #define MMU_USER_IDX 0
-static inline int cpu_mmu_index (CPUARMState *env)
+
+/* Return the exception level we're running at if this is our mmu_idx */
+static inline int arm_mmu_idx_to_el(ARMMMUIdx mmu_idx)
 {
-    return arm_current_el(env);
+    assert(mmu_idx < ARMMMUIdx_S2NS);
+    return mmu_idx & 3;
+}
+
+/* Determine the current mmu_idx to use for normal loads/stores */
+static inline int cpu_mmu_index(CPUARMState *env)
+{
+    int el = arm_current_el(env);
+
+    if (el < 2 && arm_is_secure_below_el3(env)) {
+        return ARMMMUIdx_S1SE0 + el;
+    }
+    return el;
 }
 
 /* Return the Exception Level targeted by debug exceptions;
@@ -1641,9 +1731,13 @@
 
 /* Bit usage in the TB flags field: bit 31 indicates whether we are
  * in 32 or 64 bit mode. The meaning of the other bits depends on that.
+ * We put flags which are shared between 32 and 64 bit mode at the top
+ * of the word, and flags which apply to only one mode at the bottom.
  */
 #define ARM_TBFLAG_AARCH64_STATE_SHIFT 31
 #define ARM_TBFLAG_AARCH64_STATE_MASK  (1U << ARM_TBFLAG_AARCH64_STATE_SHIFT)
+#define ARM_TBFLAG_MMUIDX_SHIFT 28
+#define ARM_TBFLAG_MMUIDX_MASK (0x7 << ARM_TBFLAG_MMUIDX_SHIFT)
 
 /* Bit usage when in AArch32 state: */
 #define ARM_TBFLAG_THUMB_SHIFT      0
@@ -1652,8 +1746,6 @@
 #define ARM_TBFLAG_VECLEN_MASK      (0x7 << ARM_TBFLAG_VECLEN_SHIFT)
 #define ARM_TBFLAG_VECSTRIDE_SHIFT  4
 #define ARM_TBFLAG_VECSTRIDE_MASK   (0x3 << ARM_TBFLAG_VECSTRIDE_SHIFT)
-#define ARM_TBFLAG_PRIV_SHIFT       6
-#define ARM_TBFLAG_PRIV_MASK        (1 << ARM_TBFLAG_PRIV_SHIFT)
 #define ARM_TBFLAG_VFPEN_SHIFT      7
 #define ARM_TBFLAG_VFPEN_MASK       (1 << ARM_TBFLAG_VFPEN_SHIFT)
 #define ARM_TBFLAG_CONDEXEC_SHIFT   8
@@ -1679,8 +1771,6 @@
 #define ARM_TBFLAG_NS_MASK          (1 << ARM_TBFLAG_NS_SHIFT)
 
 /* Bit usage when in AArch64 state */
-#define ARM_TBFLAG_AA64_EL_SHIFT    0
-#define ARM_TBFLAG_AA64_EL_MASK     (0x3 << ARM_TBFLAG_AA64_EL_SHIFT)
 #define ARM_TBFLAG_AA64_FPEN_SHIFT  2
 #define ARM_TBFLAG_AA64_FPEN_MASK   (1 << ARM_TBFLAG_AA64_FPEN_SHIFT)
 #define ARM_TBFLAG_AA64_SS_ACTIVE_SHIFT 3
@@ -1691,14 +1781,14 @@
 /* some convenience accessor macros */
 #define ARM_TBFLAG_AARCH64_STATE(F) \
     (((F) & ARM_TBFLAG_AARCH64_STATE_MASK) >> ARM_TBFLAG_AARCH64_STATE_SHIFT)
+#define ARM_TBFLAG_MMUIDX(F) \
+    (((F) & ARM_TBFLAG_MMUIDX_MASK) >> ARM_TBFLAG_MMUIDX_SHIFT)
 #define ARM_TBFLAG_THUMB(F) \
     (((F) & ARM_TBFLAG_THUMB_MASK) >> ARM_TBFLAG_THUMB_SHIFT)
 #define ARM_TBFLAG_VECLEN(F) \
     (((F) & ARM_TBFLAG_VECLEN_MASK) >> ARM_TBFLAG_VECLEN_SHIFT)
 #define ARM_TBFLAG_VECSTRIDE(F) \
     (((F) & ARM_TBFLAG_VECSTRIDE_MASK) >> ARM_TBFLAG_VECSTRIDE_SHIFT)
-#define ARM_TBFLAG_PRIV(F) \
-    (((F) & ARM_TBFLAG_PRIV_MASK) >> ARM_TBFLAG_PRIV_SHIFT)
 #define ARM_TBFLAG_VFPEN(F) \
     (((F) & ARM_TBFLAG_VFPEN_MASK) >> ARM_TBFLAG_VFPEN_SHIFT)
 #define ARM_TBFLAG_CONDEXEC(F) \
@@ -1713,8 +1803,6 @@
     (((F) & ARM_TBFLAG_PSTATE_SS_MASK) >> ARM_TBFLAG_PSTATE_SS_SHIFT)
 #define ARM_TBFLAG_XSCALE_CPAR(F) \
     (((F) & ARM_TBFLAG_XSCALE_CPAR_MASK) >> ARM_TBFLAG_XSCALE_CPAR_SHIFT)
-#define ARM_TBFLAG_AA64_EL(F) \
-    (((F) & ARM_TBFLAG_AA64_EL_MASK) >> ARM_TBFLAG_AA64_EL_SHIFT)
 #define ARM_TBFLAG_AA64_FPEN(F) \
     (((F) & ARM_TBFLAG_AA64_FPEN_MASK) >> ARM_TBFLAG_AA64_FPEN_SHIFT)
 #define ARM_TBFLAG_AA64_SS_ACTIVE(F) \
@@ -1738,8 +1826,7 @@
 
     if (is_a64(env)) {
         *pc = env->pc;
-        *flags = ARM_TBFLAG_AARCH64_STATE_MASK
-            | (arm_current_el(env) << ARM_TBFLAG_AA64_EL_SHIFT);
+        *flags = ARM_TBFLAG_AARCH64_STATE_MASK;
         if (fpen == 3 || (fpen == 1 && arm_current_el(env) != 0)) {
             *flags |= ARM_TBFLAG_AA64_FPEN_MASK;
         }
@@ -1757,21 +1844,12 @@
             }
         }
     } else {
-        int privmode;
         *pc = env->regs[15];
         *flags = (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
             | (env->vfp.vec_len << ARM_TBFLAG_VECLEN_SHIFT)
             | (env->vfp.vec_stride << ARM_TBFLAG_VECSTRIDE_SHIFT)
             | (env->condexec_bits << ARM_TBFLAG_CONDEXEC_SHIFT)
             | (env->bswap_code << ARM_TBFLAG_BSWAP_CODE_SHIFT);
-        if (arm_feature(env, ARM_FEATURE_M)) {
-            privmode = !((env->v7m.exception == 0) && (env->v7m.control & 1));
-        } else {
-            privmode = (env->uncached_cpsr & CPSR_M) != ARM_CPU_MODE_USR;
-        }
-        if (privmode) {
-            *flags |= ARM_TBFLAG_PRIV_MASK;
-        }
         if (!(access_secure_reg(env))) {
             *flags |= ARM_TBFLAG_NS_MASK;
         }
@@ -1799,6 +1877,8 @@
                    << ARM_TBFLAG_XSCALE_CPAR_SHIFT);
     }
 
+    *flags |= (cpu_mmu_index(env) << ARM_TBFLAG_MMUIDX_SHIFT);
+
     *cs_base = 0;
 }
 
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 81066ca..8aa40e9 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -135,6 +135,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float32_squash_input_denormal(a, fpst);
+    b = float32_squash_input_denormal(b, fpst);
+
     if ((float32_is_zero(a) && float32_is_infinity(b)) ||
         (float32_is_infinity(a) && float32_is_zero(b))) {
         /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
@@ -148,6 +151,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float64_squash_input_denormal(a, fpst);
+    b = float64_squash_input_denormal(b, fpst);
+
     if ((float64_is_zero(a) && float64_is_infinity(b)) ||
         (float64_is_infinity(a) && float64_is_zero(b))) {
         /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
@@ -223,6 +229,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float32_squash_input_denormal(a, fpst);
+    b = float32_squash_input_denormal(b, fpst);
+
     a = float32_chs(a);
     if ((float32_is_infinity(a) && float32_is_zero(b)) ||
         (float32_is_infinity(b) && float32_is_zero(a))) {
@@ -235,6 +244,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float64_squash_input_denormal(a, fpst);
+    b = float64_squash_input_denormal(b, fpst);
+
     a = float64_chs(a);
     if ((float64_is_infinity(a) && float64_is_zero(b)) ||
         (float64_is_infinity(b) && float64_is_zero(a))) {
@@ -247,6 +259,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float32_squash_input_denormal(a, fpst);
+    b = float32_squash_input_denormal(b, fpst);
+
     a = float32_chs(a);
     if ((float32_is_infinity(a) && float32_is_zero(b)) ||
         (float32_is_infinity(b) && float32_is_zero(a))) {
@@ -259,6 +274,9 @@
 {
     float_status *fpst = fpstp;
 
+    a = float64_squash_input_denormal(a, fpst);
+    b = float64_squash_input_denormal(b, fpst);
+
     a = float64_chs(a);
     if ((float64_is_infinity(a) && float64_is_zero(b)) ||
         (float64_is_infinity(b) && float64_is_zero(a))) {
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 1a5e067..1a1a005 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -13,7 +13,7 @@
 
 #ifndef CONFIG_USER_ONLY
 static inline int get_phys_addr(CPUARMState *env, target_ulong address,
-                                int access_type, int is_user,
+                                int access_type, ARMMMUIdx mmu_idx,
                                 hwaddr *phys_ptr, int *prot,
                                 target_ulong *page_size);
 
@@ -119,6 +119,7 @@
 
 static uint64_t raw_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
+    assert(ri->fieldoffset);
     if (cpreg_field_is_64bit(ri)) {
         return CPREG_FIELD64(env, ri);
     } else {
@@ -129,6 +130,7 @@
 static void raw_write(CPUARMState *env, const ARMCPRegInfo *ri,
                       uint64_t value)
 {
+    assert(ri->fieldoffset);
     if (cpreg_field_is_64bit(ri)) {
         CPREG_FIELD64(env, ri) = value;
     } else {
@@ -174,6 +176,27 @@
     }
 }
 
+static bool raw_accessors_invalid(const ARMCPRegInfo *ri)
+{
+   /* Return true if the regdef would cause an assertion if you called
+    * read_raw_cp_reg() or write_raw_cp_reg() on it (ie if it is a
+    * program bug for it not to have the NO_RAW flag).
+    * NB that returning false here doesn't necessarily mean that calling
+    * read/write_raw_cp_reg() is safe, because we can't distinguish "has
+    * read/write access functions which are safe for raw use" from "has
+    * read/write access functions which have side effects but has forgotten
+    * to provide raw access functions".
+    * The tests here line up with the conditions in read/write_raw_cp_reg()
+    * and assertions in raw_read()/raw_write().
+    */
+    if ((ri->type & ARM_CP_CONST) ||
+        ri->fieldoffset ||
+        ((ri->raw_writefn || ri->writefn) && (ri->raw_readfn || ri->readfn))) {
+        return false;
+    }
+    return true;
+}
+
 bool write_cpustate_to_list(ARMCPU *cpu)
 {
     /* Write the coprocessor state from cpu->env to the (index,value) list. */
@@ -189,7 +212,7 @@
             ok = false;
             continue;
         }
-        if (ri->type & ARM_CP_NO_MIGRATE) {
+        if (ri->type & ARM_CP_NO_RAW) {
             continue;
         }
         cpu->cpreg_values[i] = read_raw_cp_reg(&cpu->env, ri);
@@ -212,7 +235,7 @@
             ok = false;
             continue;
         }
-        if (ri->type & ARM_CP_NO_MIGRATE) {
+        if (ri->type & ARM_CP_NO_RAW) {
             continue;
         }
         /* Write value and confirm it reads back as written
@@ -236,7 +259,7 @@
     regidx = *(uint32_t *)key;
     ri = get_arm_cp_reginfo(cpu->cp_regs, regidx);
 
-    if (!(ri->type & ARM_CP_NO_MIGRATE)) {
+    if (!(ri->type & (ARM_CP_NO_RAW|ARM_CP_ALIAS))) {
         cpu->cpreg_indexes[cpu->cpreg_array_len] = cpreg_to_kvm_id(regidx);
         /* The value array need not be initialized at this point */
         cpu->cpreg_array_len++;
@@ -252,7 +275,7 @@
     regidx = *(uint32_t *)key;
     ri = get_arm_cp_reginfo(cpu->cp_regs, regidx);
 
-    if (!(ri->type & ARM_CP_NO_MIGRATE)) {
+    if (!(ri->type & (ARM_CP_NO_RAW|ARM_CP_ALIAS))) {
         cpu->cpreg_array_len++;
     }
 }
@@ -508,7 +531,7 @@
       .resetvalue = 0 },
     /* v6 doesn't have the cache ID registers but Linux reads them anyway */
     { .name = "DUMMY", .cp = 15, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = CP_ANY,
-      .access = PL1_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL1_R, .type = ARM_CP_CONST | ARM_CP_NO_RAW,
       .resetvalue = 0 },
     /* We don't implement pre-v7 debug but most CPUs had at least a DBGDIDR;
      * implementing it as RAZ means the "debug architecture version" bits
@@ -522,16 +545,16 @@
      */
     { .name = "TLBIALL", .cp = 15, .crn = 8, .crm = CP_ANY,
       .opc1 = CP_ANY, .opc2 = 0, .access = PL1_W, .writefn = tlbiall_write,
-      .type = ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_NO_RAW },
     { .name = "TLBIMVA", .cp = 15, .crn = 8, .crm = CP_ANY,
       .opc1 = CP_ANY, .opc2 = 1, .access = PL1_W, .writefn = tlbimva_write,
-      .type = ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_NO_RAW },
     { .name = "TLBIASID", .cp = 15, .crn = 8, .crm = CP_ANY,
       .opc1 = CP_ANY, .opc2 = 2, .access = PL1_W, .writefn = tlbiasid_write,
-      .type = ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_NO_RAW },
     { .name = "TLBIMVAA", .cp = 15, .crn = 8, .crm = CP_ANY,
       .opc1 = CP_ANY, .opc2 = 3, .access = PL1_W, .writefn = tlbimvaa_write,
-      .type = ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_NO_RAW },
     REGINFO_SENTINEL
 };
 
@@ -854,7 +877,7 @@
      * or PL0_RO as appropriate and then check PMUSERENR in the helper fn.
      */
     { .name = "PMCNTENSET", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 1,
-      .access = PL0_RW, .type = ARM_CP_NO_MIGRATE,
+      .access = PL0_RW, .type = ARM_CP_ALIAS,
       .fieldoffset = offsetoflow32(CPUARMState, cp15.c9_pmcnten),
       .writefn = pmcntenset_write,
       .accessfn = pmreg_access,
@@ -869,11 +892,11 @@
       .fieldoffset = offsetoflow32(CPUARMState, cp15.c9_pmcnten),
       .accessfn = pmreg_access,
       .writefn = pmcntenclr_write,
-      .type = ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_ALIAS },
     { .name = "PMCNTENCLR_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .crn = 9, .crm = 12, .opc2 = 2,
       .access = PL0_RW, .accessfn = pmreg_access,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcnten),
       .writefn = pmcntenclr_write },
     { .name = "PMOVSR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 3,
@@ -928,7 +951,7 @@
       .resetvalue = 0,
       .writefn = pmintenset_write, .raw_writefn = raw_write },
     { .name = "PMINTENCLR", .cp = 15, .crn = 9, .crm = 14, .opc1 = 0, .opc2 = 2,
-      .access = PL1_RW, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.c9_pminten),
       .resetvalue = 0, .writefn = pmintenclr_write, },
     { .name = "VBAR", .state = ARM_CP_STATE_BOTH,
@@ -939,7 +962,7 @@
       .resetvalue = 0 },
     { .name = "CCSIDR", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = 0,
-      .access = PL1_R, .readfn = ccsidr_read, .type = ARM_CP_NO_MIGRATE },
+      .access = PL1_R, .readfn = ccsidr_read, .type = ARM_CP_NO_RAW },
     { .name = "CSSELR", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 2, .opc2 = 0,
       .access = PL1_RW, .writefn = csselr_write, .resetvalue = 0,
@@ -988,44 +1011,44 @@
       .resetfn = arm_cp_reset_ignore },
     { .name = "ISR_EL1", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .opc1 = 0, .crn = 12, .crm = 1, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_R, .readfn = isr_read },
+      .type = ARM_CP_NO_RAW, .access = PL1_R, .readfn = isr_read },
     /* 32 bit ITLB invalidates */
     { .name = "ITLBIALL", .cp = 15, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiall_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiall_write },
     { .name = "ITLBIMVA", .cp = 15, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 1,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_write },
     { .name = "ITLBIASID", .cp = 15, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 2,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiasid_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiasid_write },
     /* 32 bit DTLB invalidates */
     { .name = "DTLBIALL", .cp = 15, .opc1 = 0, .crn = 8, .crm = 6, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiall_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiall_write },
     { .name = "DTLBIMVA", .cp = 15, .opc1 = 0, .crn = 8, .crm = 6, .opc2 = 1,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_write },
     { .name = "DTLBIASID", .cp = 15, .opc1 = 0, .crn = 8, .crm = 6, .opc2 = 2,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiasid_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiasid_write },
     /* 32 bit TLB invalidates */
     { .name = "TLBIALL", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiall_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiall_write },
     { .name = "TLBIMVA", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 1,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_write },
     { .name = "TLBIASID", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 2,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiasid_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiasid_write },
     { .name = "TLBIMVAA", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 3,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimvaa_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimvaa_write },
     REGINFO_SENTINEL
 };
 
 static const ARMCPRegInfo v7mp_cp_reginfo[] = {
     /* 32 bit TLB invalidates, Inner Shareable */
     { .name = "TLBIALLIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbiall_is_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbiall_is_write },
     { .name = "TLBIMVAIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 1,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_is_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_is_write },
     { .name = "TLBIASIDIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 2,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W,
+      .type = ARM_CP_NO_RAW, .access = PL1_W,
       .writefn = tlbiasid_is_write },
     { .name = "TLBIMVAAIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 3,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W,
+      .type = ARM_CP_NO_RAW, .access = PL1_W,
       .writefn = tlbimvaa_is_write },
     REGINFO_SENTINEL
 };
@@ -1268,7 +1291,7 @@
      * Our reset value matches the fixed frequency we implement the timer at.
      */
     { .name = "CNTFRQ", .cp = 15, .crn = 14, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .access = PL1_RW | PL0_R, .accessfn = gt_cntfrq_access,
       .fieldoffset = offsetoflow32(CPUARMState, cp15.c14_cntfrq),
       .resetfn = arm_cp_reset_ignore,
@@ -1288,7 +1311,7 @@
     },
     /* per-timer control */
     { .name = "CNTP_CTL", .cp = 15, .crn = 14, .crm = 2, .opc1 = 0, .opc2 = 1,
-      .type = ARM_CP_IO | ARM_CP_NO_MIGRATE, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_IO | ARM_CP_ALIAS, .access = PL1_RW | PL0_R,
       .accessfn = gt_ptimer_access,
       .fieldoffset = offsetoflow32(CPUARMState,
                                    cp15.c14_timer[GTIMER_PHYS].ctl),
@@ -1304,7 +1327,7 @@
       .writefn = gt_ctl_write, .raw_writefn = raw_write,
     },
     { .name = "CNTV_CTL", .cp = 15, .crn = 14, .crm = 3, .opc1 = 0, .opc2 = 1,
-      .type = ARM_CP_IO | ARM_CP_NO_MIGRATE, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_IO | ARM_CP_ALIAS, .access = PL1_RW | PL0_R,
       .accessfn = gt_vtimer_access,
       .fieldoffset = offsetoflow32(CPUARMState,
                                    cp15.c14_timer[GTIMER_VIRT].ctl),
@@ -1321,52 +1344,52 @@
     },
     /* TimerValue views: a 32 bit downcounting view of the underlying state */
     { .name = "CNTP_TVAL", .cp = 15, .crn = 14, .crm = 2, .opc1 = 0, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_NO_RAW | ARM_CP_IO, .access = PL1_RW | PL0_R,
       .accessfn = gt_ptimer_access,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     { .name = "CNTP_TVAL_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 2, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_NO_RAW | ARM_CP_IO, .access = PL1_RW | PL0_R,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     { .name = "CNTV_TVAL", .cp = 15, .crn = 14, .crm = 3, .opc1 = 0, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_NO_RAW | ARM_CP_IO, .access = PL1_RW | PL0_R,
       .accessfn = gt_vtimer_access,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     { .name = "CNTV_TVAL_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 3, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE | ARM_CP_IO, .access = PL1_RW | PL0_R,
+      .type = ARM_CP_NO_RAW | ARM_CP_IO, .access = PL1_RW | PL0_R,
       .readfn = gt_tval_read, .writefn = gt_tval_write,
     },
     /* The counter itself */
     { .name = "CNTPCT", .cp = 15, .crm = 14, .opc1 = 0,
-      .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_RAW | ARM_CP_IO,
       .accessfn = gt_pct_access,
       .readfn = gt_cnt_read, .resetfn = arm_cp_reset_ignore,
     },
     { .name = "CNTPCT_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 1,
-      .access = PL0_R, .type = ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .access = PL0_R, .type = ARM_CP_NO_RAW | ARM_CP_IO,
       .accessfn = gt_pct_access,
       .readfn = gt_cnt_read, .resetfn = gt_cnt_reset,
     },
     { .name = "CNTVCT", .cp = 15, .crm = 14, .opc1 = 1,
-      .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .access = PL0_R, .type = ARM_CP_64BIT | ARM_CP_NO_RAW | ARM_CP_IO,
       .accessfn = gt_vct_access,
       .readfn = gt_cnt_read, .resetfn = arm_cp_reset_ignore,
     },
     { .name = "CNTVCT_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 2,
-      .access = PL0_R, .type = ARM_CP_NO_MIGRATE | ARM_CP_IO,
+      .access = PL0_R, .type = ARM_CP_NO_RAW | ARM_CP_IO,
       .accessfn = gt_vct_access,
       .readfn = gt_cnt_read, .resetfn = gt_cnt_reset,
     },
     /* Comparison value, indicating when the timer goes off */
     { .name = "CNTP_CVAL", .cp = 15, .crm = 14, .opc1 = 2,
       .access = PL1_RW | PL0_R,
-      .type = ARM_CP_64BIT | ARM_CP_IO | ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_64BIT | ARM_CP_IO | ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_PHYS].cval),
       .accessfn = gt_ptimer_access, .resetfn = arm_cp_reset_ignore,
       .writefn = gt_cval_write, .raw_writefn = raw_write,
@@ -1381,7 +1404,7 @@
     },
     { .name = "CNTV_CVAL", .cp = 15, .crm = 14, .opc1 = 3,
       .access = PL1_RW | PL0_R,
-      .type = ARM_CP_64BIT | ARM_CP_IO | ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_64BIT | ARM_CP_IO | ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.c14_timer[GTIMER_VIRT].cval),
       .accessfn = gt_vtimer_access, .resetfn = arm_cp_reset_ignore,
       .writefn = gt_cval_write, .raw_writefn = raw_write,
@@ -1428,23 +1451,23 @@
         /* Other states are only available with TrustZone; in
          * a non-TZ implementation these registers don't exist
          * at all, which is an Uncategorized trap. This underdecoding
-         * is safe because the reginfo is NO_MIGRATE.
+         * is safe because the reginfo is NO_RAW.
          */
         return CP_ACCESS_TRAP_UNCATEGORIZED;
     }
     return CP_ACCESS_OK;
 }
 
-static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+static uint64_t do_ats_write(CPUARMState *env, uint64_t value,
+                             int access_type, ARMMMUIdx mmu_idx)
 {
     hwaddr phys_addr;
     target_ulong page_size;
     int prot;
-    int ret, is_user = ri->opc2 & 2;
-    int access_type = ri->opc2 & 1;
+    int ret;
     uint64_t par64;
 
-    ret = get_phys_addr(env, value, access_type, is_user,
+    ret = get_phys_addr(env, value, access_type, mmu_idx,
                         &phys_addr, &prot, &page_size);
     if (extended_addresses_enabled(env)) {
         /* ret is a DFSR/IFSR value for the long descriptor
@@ -1481,9 +1504,105 @@
                     ((ret & 0xf) << 1) | 1;
         }
     }
+    return par64;
+}
+
+static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
+{
+    int access_type = ri->opc2 & 1;
+    uint64_t par64;
+    ARMMMUIdx mmu_idx;
+    int el = arm_current_el(env);
+    bool secure = arm_is_secure_below_el3(env);
+
+    switch (ri->opc2 & 6) {
+    case 0:
+        /* stage 1 current state PL1: ATS1CPR, ATS1CPW */
+        switch (el) {
+        case 3:
+            mmu_idx = ARMMMUIdx_S1E3;
+            break;
+        case 2:
+            mmu_idx = ARMMMUIdx_S1NSE1;
+            break;
+        case 1:
+            mmu_idx = secure ? ARMMMUIdx_S1SE1 : ARMMMUIdx_S1NSE1;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+    case 2:
+        /* stage 1 current state PL0: ATS1CUR, ATS1CUW */
+        switch (el) {
+        case 3:
+            mmu_idx = ARMMMUIdx_S1SE0;
+            break;
+        case 2:
+            mmu_idx = ARMMMUIdx_S1NSE0;
+            break;
+        case 1:
+            mmu_idx = secure ? ARMMMUIdx_S1SE0 : ARMMMUIdx_S1NSE0;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+    case 4:
+        /* stage 1+2 NonSecure PL1: ATS12NSOPR, ATS12NSOPW */
+        mmu_idx = ARMMMUIdx_S12NSE1;
+        break;
+    case 6:
+        /* stage 1+2 NonSecure PL0: ATS12NSOUR, ATS12NSOUW */
+        mmu_idx = ARMMMUIdx_S12NSE0;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    par64 = do_ats_write(env, value, access_type, mmu_idx);
 
     A32_BANKED_CURRENT_REG_SET(env, par, par64);
 }
+
+static void ats_write64(CPUARMState *env, const ARMCPRegInfo *ri,
+                        uint64_t value)
+{
+    int access_type = ri->opc2 & 1;
+    ARMMMUIdx mmu_idx;
+    int secure = arm_is_secure_below_el3(env);
+
+    switch (ri->opc2 & 6) {
+    case 0:
+        switch (ri->opc1) {
+        case 0: /* AT S1E1R, AT S1E1W */
+            mmu_idx = secure ? ARMMMUIdx_S1SE1 : ARMMMUIdx_S1NSE1;
+            break;
+        case 4: /* AT S1E2R, AT S1E2W */
+            mmu_idx = ARMMMUIdx_S1E2;
+            break;
+        case 6: /* AT S1E3R, AT S1E3W */
+            mmu_idx = ARMMMUIdx_S1E3;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+    case 2: /* AT S1E0R, AT S1E0W */
+        mmu_idx = secure ? ARMMMUIdx_S1SE0 : ARMMMUIdx_S1NSE0;
+        break;
+    case 4: /* AT S12E1R, AT S12E1W */
+        mmu_idx = ARMMMUIdx_S12NSE1;
+        break;
+    case 6: /* AT S12E0R, AT S12E0W */
+        mmu_idx = ARMMMUIdx_S12NSE0;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    env->cp15.par_el[1] = do_ats_write(env, value, access_type, mmu_idx);
+}
 #endif
 
 static const ARMCPRegInfo vapa_cp_reginfo[] = {
@@ -1495,7 +1614,7 @@
 #ifndef CONFIG_USER_ONLY
     { .name = "ATS", .cp = 15, .crn = 7, .crm = 8, .opc1 = 0, .opc2 = CP_ANY,
       .access = PL1_W, .accessfn = ats_access,
-      .writefn = ats_write, .type = ARM_CP_NO_MIGRATE },
+      .writefn = ats_write, .type = ARM_CP_NO_RAW },
 #endif
     REGINFO_SENTINEL
 };
@@ -1554,12 +1673,12 @@
 
 static const ARMCPRegInfo pmsav5_cp_reginfo[] = {
     { .name = "DATA_AP", .cp = 15, .crn = 5, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .access = PL1_RW, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.pmsav5_data_ap),
       .resetvalue = 0,
       .readfn = pmsav5_data_ap_read, .writefn = pmsav5_data_ap_write, },
     { .name = "INSN_AP", .cp = 15, .crn = 5, .crm = 0, .opc1 = 0, .opc2 = 1,
-      .access = PL1_RW, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, cp15.pmsav5_insn_ap),
       .resetvalue = 0,
       .readfn = pmsav5_insn_ap_read, .writefn = pmsav5_insn_ap_write, },
@@ -1691,7 +1810,7 @@
 
 static const ARMCPRegInfo vmsa_cp_reginfo[] = {
     { .name = "DFSR", .cp = 15, .crn = 5, .crm = 0, .opc1 = 0, .opc2 = 0,
-      .access = PL1_RW, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_ALIAS,
       .bank_fieldoffsets = { offsetoflow32(CPUARMState, cp15.dfsr_s),
                              offsetoflow32(CPUARMState, cp15.dfsr_ns) },
       .resetfn = arm_cp_reset_ignore, },
@@ -1719,7 +1838,7 @@
       .resetfn = vmsa_ttbcr_reset, .raw_writefn = raw_write,
       .fieldoffset = offsetof(CPUARMState, cp15.tcr_el[1]) },
     { .name = "TTBCR", .cp = 15, .crn = 2, .crm = 0, .opc1 = 0, .opc2 = 2,
-      .access = PL1_RW, .type = ARM_CP_NO_MIGRATE, .writefn = vmsa_ttbcr_write,
+      .access = PL1_RW, .type = ARM_CP_ALIAS, .writefn = vmsa_ttbcr_write,
       .resetfn = arm_cp_reset_ignore, .raw_writefn = vmsa_ttbcr_raw_write,
       .bank_fieldoffsets = { offsetoflow32(CPUARMState, cp15.tcr_el[3]),
                              offsetoflow32(CPUARMState, cp15.tcr_el[1])} },
@@ -1789,7 +1908,7 @@
       .writefn = omap_threadid_write },
     { .name = "TI925T_STATUS", .cp = 15, .crn = 15,
       .crm = 8, .opc1 = 0, .opc2 = 0, .access = PL1_RW,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_NO_RAW,
       .readfn = arm_cp_read_zero, .writefn = omap_wfi_write, },
     /* TODO: Peripheral port remap register:
      * On OMAP2 mcr p15, 0, rn, c15, c2, 4 sets up the interrupt controller
@@ -1798,7 +1917,7 @@
      */
     { .name = "OMAP_CACHEMAINT", .cp = 15, .crn = 7, .crm = CP_ANY,
       .opc1 = 0, .opc2 = CP_ANY, .access = PL1_W,
-      .type = ARM_CP_OVERRIDE | ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_OVERRIDE | ARM_CP_NO_RAW,
       .writefn = omap_cachemaint_write },
     { .name = "C9", .cp = 15, .crn = 9,
       .crm = CP_ANY, .opc1 = CP_ANY, .opc2 = CP_ANY, .access = PL1_RW,
@@ -1848,7 +1967,7 @@
     { .name = "C15_IMPDEF", .cp = 15, .crn = 15,
       .crm = CP_ANY, .opc1 = CP_ANY, .opc2 = CP_ANY,
       .access = PL1_RW,
-      .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE | ARM_CP_OVERRIDE,
+      .type = ARM_CP_CONST | ARM_CP_NO_RAW | ARM_CP_OVERRIDE,
       .resetvalue = 0 },
     REGINFO_SENTINEL
 };
@@ -1856,7 +1975,7 @@
 static const ARMCPRegInfo cache_dirty_status_cp_reginfo[] = {
     /* Cache status: RAZ because we have no cache so it's always clean */
     { .name = "CDSR", .cp = 15, .crn = 7, .crm = 10, .opc1 = 0, .opc2 = 6,
-      .access = PL1_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL1_R, .type = ARM_CP_CONST | ARM_CP_NO_RAW,
       .resetvalue = 0 },
     REGINFO_SENTINEL
 };
@@ -1864,7 +1983,7 @@
 static const ARMCPRegInfo cache_block_ops_cp_reginfo[] = {
     /* We never have a a block transfer operation in progress */
     { .name = "BXSR", .cp = 15, .crn = 7, .crm = 12, .opc1 = 0, .opc2 = 4,
-      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_RAW,
       .resetvalue = 0 },
     /* The cache ops themselves: these all NOP for QEMU */
     { .name = "IICR", .cp = 15, .crm = 5, .opc1 = 0,
@@ -1887,10 +2006,10 @@
      * to indicate that there are no dirty cache lines.
      */
     { .name = "TC_DCACHE", .cp = 15, .crn = 7, .crm = 10, .opc1 = 0, .opc2 = 3,
-      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_RAW,
       .resetvalue = (1 << 30) },
     { .name = "TCI_DCACHE", .cp = 15, .crn = 7, .crm = 14, .opc1 = 0, .opc2 = 3,
-      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_NO_RAW,
       .resetvalue = (1 << 30) },
     REGINFO_SENTINEL
 };
@@ -1900,7 +2019,7 @@
     { .name = "C9_READBUFFER", .cp = 15, .crn = 9,
       .crm = CP_ANY, .opc1 = CP_ANY, .opc2 = CP_ANY,
       .access = PL1_RW, .resetvalue = 0,
-      .type = ARM_CP_CONST | ARM_CP_OVERRIDE | ARM_CP_NO_MIGRATE },
+      .type = ARM_CP_CONST | ARM_CP_OVERRIDE | ARM_CP_NO_RAW },
     REGINFO_SENTINEL
 };
 
@@ -1926,7 +2045,7 @@
 static const ARMCPRegInfo mpidr_cp_reginfo[] = {
     { .name = "MPIDR", .state = ARM_CP_STATE_BOTH,
       .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 0, .opc2 = 5,
-      .access = PL1_R, .readfn = mpidr_read, .type = ARM_CP_NO_MIGRATE },
+      .access = PL1_R, .readfn = mpidr_read, .type = ARM_CP_NO_RAW },
     REGINFO_SENTINEL
 };
 
@@ -1947,12 +2066,12 @@
       .bank_fieldoffsets = { offsetof(CPUARMState, cp15.par_s),
                              offsetof(CPUARMState, cp15.par_ns)} },
     { .name = "TTBR0", .cp = 15, .crm = 2, .opc1 = 0,
-      .access = PL1_RW, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_64BIT | ARM_CP_ALIAS,
       .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr0_s),
                              offsetof(CPUARMState, cp15.ttbr0_ns) },
       .writefn = vmsa_ttbr_write, .resetfn = arm_cp_reset_ignore },
     { .name = "TTBR1", .cp = 15, .crm = 2, .opc1 = 1,
-      .access = PL1_RW, .type = ARM_CP_64BIT | ARM_CP_NO_MIGRATE,
+      .access = PL1_RW, .type = ARM_CP_64BIT | ARM_CP_ALIAS,
       .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr1_s),
                              offsetof(CPUARMState, cp15.ttbr1_ns) },
       .writefn = vmsa_ttbr_write, .resetfn = arm_cp_reset_ignore },
@@ -2144,7 +2263,7 @@
       .access = PL0_RW, .type = ARM_CP_NZCV },
     { .name = "DAIF", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 1, .crn = 4, .crm = 2,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_NO_RAW,
       .access = PL0_RW, .accessfn = aa64_daif_access,
       .fieldoffset = offsetof(CPUARMState, daif),
       .writefn = aa64_daif_write, .resetfn = arm_cp_reset_ignore },
@@ -2156,7 +2275,7 @@
       .access = PL0_RW, .readfn = aa64_fpsr_read, .writefn = aa64_fpsr_write },
     { .name = "DCZID_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 7, .crn = 0, .crm = 0,
-      .access = PL0_R, .type = ARM_CP_NO_MIGRATE,
+      .access = PL0_R, .type = ARM_CP_NO_RAW,
       .readfn = aa64_dczid_read },
     { .name = "DC_ZVA", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 1,
@@ -2207,77 +2326,77 @@
     /* TLBI operations */
     { .name = "TLBI_VMALLE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbiall_is_write },
     { .name = "TLBI_VAE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 1,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_va_is_write },
     { .name = "TLBI_ASIDE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 2,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_asid_is_write },
     { .name = "TLBI_VAAE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 3,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_vaa_is_write },
     { .name = "TLBI_VALE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_va_is_write },
     { .name = "TLBI_VAALE1IS", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 7,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_vaa_is_write },
     { .name = "TLBI_VMALLE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 0,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbiall_write },
     { .name = "TLBI_VAE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 1,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_va_write },
     { .name = "TLBI_ASIDE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 2,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_asid_write },
     { .name = "TLBI_VAAE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 3,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_vaa_write },
     { .name = "TLBI_VALE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 5,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_va_write },
     { .name = "TLBI_VAALE1", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 7,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE,
+      .access = PL1_W, .type = ARM_CP_NO_RAW,
       .writefn = tlbi_aa64_vaa_write },
 #ifndef CONFIG_USER_ONLY
     /* 64 bit address translation operations */
     { .name = "AT_S1E1R", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 0,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE, .writefn = ats_write },
+      .access = PL1_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
     { .name = "AT_S1E1W", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 1,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE, .writefn = ats_write },
+      .access = PL1_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
     { .name = "AT_S1E0R", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 2,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE, .writefn = ats_write },
+      .access = PL1_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
     { .name = "AT_S1E0W", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 3,
-      .access = PL1_W, .type = ARM_CP_NO_MIGRATE, .writefn = ats_write },
+      .access = PL1_W, .type = ARM_CP_NO_RAW, .writefn = ats_write64 },
 #endif
     /* TLB invalidate last level of translation table walk */
     { .name = "TLBIMVALIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_is_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_is_write },
     { .name = "TLBIMVAALIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 7,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W,
+      .type = ARM_CP_NO_RAW, .access = PL1_W,
       .writefn = tlbimvaa_is_write },
     { .name = "TLBIMVAL", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 5,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimva_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimva_write },
     { .name = "TLBIMVAAL", .cp = 15, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 7,
-      .type = ARM_CP_NO_MIGRATE, .access = PL1_W, .writefn = tlbimvaa_write },
+      .type = ARM_CP_NO_RAW, .access = PL1_W, .writefn = tlbimvaa_write },
     /* 32 bit cache operations */
     { .name = "ICIALLUIS", .cp = 15, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
       .type = ARM_CP_NOP, .access = PL1_W },
@@ -2312,12 +2431,12 @@
       .bank_fieldoffsets = { offsetoflow32(CPUARMState, cp15.dacr_s),
                              offsetoflow32(CPUARMState, cp15.dacr_ns) } },
     { .name = "ELR_EL1", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 0, .opc2 = 1,
       .access = PL1_RW,
       .fieldoffset = offsetof(CPUARMState, elr_el[1]) },
     { .name = "SPSR_EL1", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 0, .opc2 = 0,
       .access = PL1_RW, .fieldoffset = offsetof(CPUARMState, banked_spsr[0]) },
     /* We rely on the access checks not allowing the guest to write to the
@@ -2327,11 +2446,15 @@
     { .name = "SP_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 1, .opc2 = 0,
       .access = PL1_RW, .accessfn = sp_el0_access,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .fieldoffset = offsetof(CPUARMState, sp_el[0]) },
+    { .name = "SP_EL1", .state = ARM_CP_STATE_AA64,
+      .opc0 = 3, .opc1 = 4, .crn = 4, .crm = 1, .opc2 = 0,
+      .access = PL2_RW, .type = ARM_CP_ALIAS,
+      .fieldoffset = offsetof(CPUARMState, sp_el[1]) },
     { .name = "SPSel", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 0, .crn = 4, .crm = 2, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_NO_RAW,
       .access = PL1_RW, .readfn = spsel_read, .writefn = spsel_write },
     REGINFO_SENTINEL
 };
@@ -2343,7 +2466,7 @@
       .access = PL2_RW,
       .readfn = arm_cp_read_zero, .writefn = arm_cp_write_ignore },
     { .name = "HCR_EL2", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_NO_RAW,
       .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 0,
       .access = PL2_RW,
       .readfn = arm_cp_read_zero, .writefn = arm_cp_write_ignore },
@@ -2386,12 +2509,12 @@
       .writefn = dacr_write, .raw_writefn = raw_write,
       .fieldoffset = offsetof(CPUARMState, cp15.dacr32_el2) },
     { .name = "ELR_EL2", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 4, .crn = 4, .crm = 0, .opc2 = 1,
       .access = PL2_RW,
       .fieldoffset = offsetof(CPUARMState, elr_el[2]) },
     { .name = "ESR_EL2", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 2, .opc2 = 0,
       .access = PL2_RW, .fieldoffset = offsetof(CPUARMState, cp15.esr_el[2]) },
     { .name = "IFSR32_EL2", .state = ARM_CP_STATE_AA64,
@@ -2402,7 +2525,7 @@
       .opc0 = 3, .opc1 = 4, .crn = 6, .crm = 0, .opc2 = 0,
       .access = PL2_RW, .fieldoffset = offsetof(CPUARMState, cp15.far_el[2]) },
     { .name = "SPSR_EL2", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 4, .crn = 4, .crm = 0, .opc2 = 0,
       .access = PL2_RW, .fieldoffset = offsetof(CPUARMState, banked_spsr[6]) },
     { .name = "VBAR_EL2", .state = ARM_CP_STATE_AA64,
@@ -2410,6 +2533,10 @@
       .access = PL2_RW, .writefn = vbar_write,
       .fieldoffset = offsetof(CPUARMState, cp15.vbar_el[2]),
       .resetvalue = 0 },
+    { .name = "SP_EL2", .state = ARM_CP_STATE_AA64,
+      .opc0 = 3, .opc1 = 6, .crn = 4, .crm = 1, .opc2 = 0,
+      .access = PL3_RW, .type = ARM_CP_ALIAS,
+      .fieldoffset = offsetof(CPUARMState, sp_el[2]) },
     REGINFO_SENTINEL
 };
 
@@ -2418,7 +2545,7 @@
       .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 1, .opc2 = 0,
       .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.scr_el3),
       .resetvalue = 0, .writefn = scr_write },
-    { .name = "SCR",  .type = ARM_CP_NO_MIGRATE,
+    { .name = "SCR",  .type = ARM_CP_ALIAS,
       .cp = 15, .opc1 = 0, .crn = 1, .crm = 1, .opc2 = 0,
       .access = PL3_RW, .fieldoffset = offsetoflow32(CPUARMState, cp15.scr_el3),
       .resetfn = arm_cp_reset_ignore, .writefn = scr_write },
@@ -2451,19 +2578,19 @@
       .resetfn = vmsa_ttbcr_reset, .raw_writefn = raw_write,
       .fieldoffset = offsetof(CPUARMState, cp15.tcr_el[3]) },
     { .name = "ELR_EL3", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 6, .crn = 4, .crm = 0, .opc2 = 1,
       .access = PL3_RW,
       .fieldoffset = offsetof(CPUARMState, elr_el[3]) },
     { .name = "ESR_EL3", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 6, .crn = 5, .crm = 2, .opc2 = 0,
       .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.esr_el[3]) },
     { .name = "FAR_EL3", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 6, .crn = 6, .crm = 0, .opc2 = 0,
       .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.far_el[3]) },
     { .name = "SPSR_EL3", .state = ARM_CP_STATE_AA64,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .opc0 = 3, .opc1 = 6, .crn = 4, .crm = 0, .opc2 = 0,
       .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, banked_spsr[7]) },
     { .name = "VBAR_EL3", .state = ARM_CP_STATE_AA64,
@@ -2510,7 +2637,7 @@
      */
     { .name = "MDCCSR_EL0", .state = ARM_CP_STATE_BOTH,
       .cp = 14, .opc0 = 2, .opc1 = 0, .crn = 0, .crm = 1, .opc2 = 0,
-      .type = ARM_CP_NO_MIGRATE,
+      .type = ARM_CP_ALIAS,
       .access = PL1_R,
       .fieldoffset = offsetof(CPUARMState, cp15.mdscr_el1),
       .resetfn = arm_cp_reset_ignore },
@@ -2963,7 +3090,7 @@
         ARMCPRegInfo pmcr = {
             .name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0,
             .access = PL0_RW,
-            .type = ARM_CP_IO | ARM_CP_NO_MIGRATE,
+            .type = ARM_CP_IO | ARM_CP_ALIAS,
             .fieldoffset = offsetoflow32(CPUARMState, cp15.c9_pmcr),
             .accessfn = pmreg_access, .writefn = pmcr_write,
             .raw_writefn = raw_write,
@@ -3053,17 +3180,30 @@
               .resetvalue = cpu->mvfr2 },
             REGINFO_SENTINEL
         };
-        ARMCPRegInfo rvbar = {
-            .name = "RVBAR_EL1", .state = ARM_CP_STATE_AA64,
-            .opc0 = 3, .opc1 = 0, .crn = 12, .crm = 0, .opc2 = 2,
-            .type = ARM_CP_CONST, .access = PL1_R, .resetvalue = cpu->rvbar
-        };
-        define_one_arm_cp_reg(cpu, &rvbar);
+        /* RVBAR_EL1 is only implemented if EL1 is the highest EL */
+        if (!arm_feature(env, ARM_FEATURE_EL3) &&
+            !arm_feature(env, ARM_FEATURE_EL2)) {
+            ARMCPRegInfo rvbar = {
+                .name = "RVBAR_EL1", .state = ARM_CP_STATE_AA64,
+                .opc0 = 3, .opc1 = 0, .crn = 12, .crm = 0, .opc2 = 1,
+                .type = ARM_CP_CONST, .access = PL1_R, .resetvalue = cpu->rvbar
+            };
+            define_one_arm_cp_reg(cpu, &rvbar);
+        }
         define_arm_cp_regs(cpu, v8_idregs);
         define_arm_cp_regs(cpu, v8_cp_reginfo);
     }
     if (arm_feature(env, ARM_FEATURE_EL2)) {
         define_arm_cp_regs(cpu, v8_el2_cp_reginfo);
+        /* RVBAR_EL2 is only implemented if EL2 is the highest EL */
+        if (!arm_feature(env, ARM_FEATURE_EL3)) {
+            ARMCPRegInfo rvbar = {
+                .name = "RVBAR_EL2", .state = ARM_CP_STATE_AA64,
+                .opc0 = 3, .opc1 = 4, .crn = 12, .crm = 0, .opc2 = 1,
+                .type = ARM_CP_CONST, .access = PL2_R, .resetvalue = cpu->rvbar
+            };
+            define_one_arm_cp_reg(cpu, &rvbar);
+        }
     } else {
         /* If EL2 is missing but higher ELs are enabled, we need to
          * register the no_el2 reginfos.
@@ -3074,6 +3214,12 @@
     }
     if (arm_feature(env, ARM_FEATURE_EL3)) {
         define_arm_cp_regs(cpu, el3_cp_reginfo);
+        ARMCPRegInfo rvbar = {
+            .name = "RVBAR_EL3", .state = ARM_CP_STATE_AA64,
+            .opc0 = 3, .opc1 = 6, .crn = 12, .crm = 0, .opc2 = 1,
+            .type = ARM_CP_CONST, .access = PL3_R, .resetvalue = cpu->rvbar
+        };
+        define_one_arm_cp_reg(cpu, &rvbar);
     }
     if (arm_feature(env, ARM_FEATURE_MPU)) {
         /* These are the MPU registers prior to PMSAv6. Any new
@@ -3440,14 +3586,14 @@
              */
             if ((r->state == ARM_CP_STATE_BOTH && ns) ||
                 (arm_feature(&cpu->env, ARM_FEATURE_V8) && !ns)) {
-                r2->type |= ARM_CP_NO_MIGRATE;
+                r2->type |= ARM_CP_ALIAS;
                 r2->resetfn = arm_cp_reset_ignore;
             }
         } else if ((secstate != r->secure) && !ns) {
             /* The register is not banked so we only want to allow migration of
              * the non-secure instance.
              */
-            r2->type |= ARM_CP_NO_MIGRATE;
+            r2->type |= ARM_CP_ALIAS;
             r2->resetfn = arm_cp_reset_ignore;
         }
 
@@ -3496,15 +3642,25 @@
     r2->opc2 = opc2;
     /* By convention, for wildcarded registers only the first
      * entry is used for migration; the others are marked as
-     * NO_MIGRATE so we don't try to transfer the register
+     * ALIAS so we don't try to transfer the register
      * multiple times. Special registers (ie NOP/WFI) are
-     * never migratable.
+     * never migratable and not even raw-accessible.
      */
-    if ((r->type & ARM_CP_SPECIAL) ||
-        ((r->crm == CP_ANY) && crm != 0) ||
+    if ((r->type & ARM_CP_SPECIAL)) {
+        r2->type |= ARM_CP_NO_RAW;
+    }
+    if (((r->crm == CP_ANY) && crm != 0) ||
         ((r->opc1 == CP_ANY) && opc1 != 0) ||
         ((r->opc2 == CP_ANY) && opc2 != 0)) {
-        r2->type |= ARM_CP_NO_MIGRATE;
+        r2->type |= ARM_CP_ALIAS;
+    }
+
+    /* Check that raw accesses are either forbidden or handled. Note that
+     * we can't assert this earlier because the setup of fieldoffset for
+     * banked registers has to be done first.
+     */
+    if (!(r2->type & ARM_CP_NO_RAW)) {
+        assert(!raw_accessors_invalid(r2));
     }
 
     /* Overriding of an existing definition must be explicitly
@@ -4460,91 +4616,170 @@
     cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
 }
 
+
+/* Return the exception level which controls this address translation regime */
+static inline uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+    switch (mmu_idx) {
+    case ARMMMUIdx_S2NS:
+    case ARMMMUIdx_S1E2:
+        return 2;
+    case ARMMMUIdx_S1E3:
+        return 3;
+    case ARMMMUIdx_S1SE0:
+        return arm_el_is_aa64(env, 3) ? 1 : 3;
+    case ARMMMUIdx_S1SE1:
+    case ARMMMUIdx_S1NSE0:
+    case ARMMMUIdx_S1NSE1:
+        return 1;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/* Return the SCTLR value which controls this address translation regime */
+static inline uint32_t regime_sctlr(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+    return env->cp15.sctlr_el[regime_el(env, mmu_idx)];
+}
+
+/* Return true if the specified stage of address translation is disabled */
+static inline bool regime_translation_disabled(CPUARMState *env,
+                                               ARMMMUIdx mmu_idx)
+{
+    if (mmu_idx == ARMMMUIdx_S2NS) {
+        return (env->cp15.hcr_el2 & HCR_VM) == 0;
+    }
+    return (regime_sctlr(env, mmu_idx) & SCTLR_M) == 0;
+}
+
+/* Return the TCR controlling this translation regime */
+static inline TCR *regime_tcr(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+    if (mmu_idx == ARMMMUIdx_S2NS) {
+        /* TODO: return VTCR_EL2 */
+        g_assert_not_reached();
+    }
+    return &env->cp15.tcr_el[regime_el(env, mmu_idx)];
+}
+
+/* Return true if the translation regime is using LPAE format page tables */
+static inline bool regime_using_lpae_format(CPUARMState *env,
+                                            ARMMMUIdx mmu_idx)
+{
+    int el = regime_el(env, mmu_idx);
+    if (el == 2 || arm_el_is_aa64(env, el)) {
+        return true;
+    }
+    if (arm_feature(env, ARM_FEATURE_LPAE)
+        && (regime_tcr(env, mmu_idx)->raw_tcr & TTBCR_EAE)) {
+        return true;
+    }
+    return false;
+}
+
+static inline bool regime_is_user(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+    switch (mmu_idx) {
+    case ARMMMUIdx_S1SE0:
+    case ARMMMUIdx_S1NSE0:
+        return true;
+    default:
+        return false;
+    case ARMMMUIdx_S12NSE0:
+    case ARMMMUIdx_S12NSE1:
+        g_assert_not_reached();
+    }
+}
+
 /* Check section/page access permissions.
    Returns the page protection flags, or zero if the access is not
    permitted.  */
-static inline int check_ap(CPUARMState *env, int ap, int domain_prot,
-                           int access_type, int is_user)
+static inline int check_ap(CPUARMState *env, ARMMMUIdx mmu_idx,
+                           int ap, int domain_prot,
+                           int access_type)
 {
-  int prot_ro;
+    int prot_ro;
+    bool is_user = regime_is_user(env, mmu_idx);
 
-  if (domain_prot == 3) {
-    return PAGE_READ | PAGE_WRITE;
-  }
+    if (domain_prot == 3) {
+        return PAGE_READ | PAGE_WRITE;
+    }
 
-  if (access_type == 1)
-      prot_ro = 0;
-  else
-      prot_ro = PAGE_READ;
+    if (access_type == 1) {
+        prot_ro = 0;
+    } else {
+        prot_ro = PAGE_READ;
+    }
 
-  switch (ap) {
-  case 0:
-      if (arm_feature(env, ARM_FEATURE_V7)) {
-          return 0;
-      }
-      if (access_type == 1)
-          return 0;
-      switch (A32_BANKED_CURRENT_REG_GET(env, sctlr) & (SCTLR_S | SCTLR_R)) {
-      case SCTLR_S:
-          return is_user ? 0 : PAGE_READ;
-      case SCTLR_R:
-          return PAGE_READ;
-      default:
-          return 0;
-      }
-  case 1:
-      return is_user ? 0 : PAGE_READ | PAGE_WRITE;
-  case 2:
-      if (is_user)
-          return prot_ro;
-      else
-          return PAGE_READ | PAGE_WRITE;
-  case 3:
-      return PAGE_READ | PAGE_WRITE;
-  case 4: /* Reserved.  */
-      return 0;
-  case 5:
-      return is_user ? 0 : prot_ro;
-  case 6:
-      return prot_ro;
-  case 7:
-      if (!arm_feature (env, ARM_FEATURE_V6K))
-          return 0;
-      return prot_ro;
-  default:
-      abort();
-  }
+    switch (ap) {
+    case 0:
+        if (arm_feature(env, ARM_FEATURE_V7)) {
+            return 0;
+        }
+        if (access_type == 1) {
+            return 0;
+        }
+        switch (regime_sctlr(env, mmu_idx) & (SCTLR_S | SCTLR_R)) {
+        case SCTLR_S:
+            return is_user ? 0 : PAGE_READ;
+        case SCTLR_R:
+            return PAGE_READ;
+        default:
+            return 0;
+        }
+    case 1:
+        return is_user ? 0 : PAGE_READ | PAGE_WRITE;
+    case 2:
+        if (is_user) {
+            return prot_ro;
+        } else {
+            return PAGE_READ | PAGE_WRITE;
+        }
+    case 3:
+        return PAGE_READ | PAGE_WRITE;
+    case 4: /* Reserved.  */
+        return 0;
+    case 5:
+        return is_user ? 0 : prot_ro;
+    case 6:
+        return prot_ro;
+    case 7:
+        if (!arm_feature(env, ARM_FEATURE_V6K)) {
+            return 0;
+        }
+        return prot_ro;
+    default:
+        abort();
+    }
 }
 
-static bool get_level1_table_address(CPUARMState *env, uint32_t *table,
-                                         uint32_t address)
+static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx,
+                                     uint32_t *table, uint32_t address)
 {
-    /* Get the TCR bank based on our security state */
-    TCR *tcr = &env->cp15.tcr_el[arm_is_secure(env) ? 3 : 1];
+    /* Note that we can only get here for an AArch32 PL0/PL1 lookup */
+    int el = regime_el(env, mmu_idx);
+    TCR *tcr = regime_tcr(env, mmu_idx);
 
-    /* We only get here if EL1 is running in AArch32. If EL3 is running in
-     * AArch32 there is a secure and non-secure instance of the translation
-     * table registers.
-     */
     if (address & tcr->mask) {
         if (tcr->raw_tcr & TTBCR_PD1) {
             /* Translation table walk disabled for TTBR1 */
             return false;
         }
-        *table = A32_BANKED_CURRENT_REG_GET(env, ttbr1) & 0xffffc000;
+        *table = env->cp15.ttbr1_el[el] & 0xffffc000;
     } else {
         if (tcr->raw_tcr & TTBCR_PD0) {
             /* Translation table walk disabled for TTBR0 */
             return false;
         }
-        *table = A32_BANKED_CURRENT_REG_GET(env, ttbr0) & tcr->base_mask;
+        *table = env->cp15.ttbr0_el[el] & tcr->base_mask;
     }
     *table |= (address >> 18) & 0x3ffc;
     return true;
 }
 
 static int get_phys_addr_v5(CPUARMState *env, uint32_t address, int access_type,
-                            int is_user, hwaddr *phys_ptr,
+                            ARMMMUIdx mmu_idx, hwaddr *phys_ptr,
                             int *prot, target_ulong *page_size)
 {
     CPUState *cs = CPU(arm_env_get_cpu(env));
@@ -4556,10 +4791,11 @@
     int domain = 0;
     int domain_prot;
     hwaddr phys_addr;
+    uint32_t dacr;
 
     /* Pagetable walk.  */
     /* Lookup l1 descriptor.  */
-    if (!get_level1_table_address(env, &table, address)) {
+    if (!get_level1_table_address(env, mmu_idx, &table, address)) {
         /* Section translation fault if page walk is disabled by PD0 or PD1 */
         code = 5;
         goto do_fault;
@@ -4567,7 +4803,12 @@
     desc = ldl_phys(cs->as, table);
     type = (desc & 3);
     domain = (desc >> 5) & 0x0f;
-    domain_prot = (A32_BANKED_CURRENT_REG_GET(env, dacr) >> (domain * 2)) & 3;
+    if (regime_el(env, mmu_idx) == 1) {
+        dacr = env->cp15.dacr_ns;
+    } else {
+        dacr = env->cp15.dacr_s;
+    }
+    domain_prot = (dacr >> (domain * 2)) & 3;
     if (type == 0) {
         /* Section translation fault.  */
         code = 5;
@@ -4588,13 +4829,13 @@
         *page_size = 1024 * 1024;
     } else {
         /* Lookup l2 entry.  */
-	if (type == 1) {
-	    /* Coarse pagetable.  */
-	    table = (desc & 0xfffffc00) | ((address >> 10) & 0x3fc);
-	} else {
-	    /* Fine pagetable.  */
-	    table = (desc & 0xfffff000) | ((address >> 8) & 0xffc);
-	}
+        if (type == 1) {
+            /* Coarse pagetable.  */
+            table = (desc & 0xfffffc00) | ((address >> 10) & 0x3fc);
+        } else {
+            /* Fine pagetable.  */
+            table = (desc & 0xfffff000) | ((address >> 8) & 0xffc);
+        }
         desc = ldl_phys(cs->as, table);
         switch (desc & 3) {
         case 0: /* Page translation fault.  */
@@ -4611,17 +4852,17 @@
             *page_size = 0x1000;
             break;
         case 3: /* 1k page.  */
-	    if (type == 1) {
-		if (arm_feature(env, ARM_FEATURE_XSCALE)) {
-		    phys_addr = (desc & 0xfffff000) | (address & 0xfff);
-		} else {
-		    /* Page translation fault.  */
-		    code = 7;
-		    goto do_fault;
-		}
-	    } else {
-		phys_addr = (desc & 0xfffffc00) | (address & 0x3ff);
-	    }
+            if (type == 1) {
+                if (arm_feature(env, ARM_FEATURE_XSCALE)) {
+                    phys_addr = (desc & 0xfffff000) | (address & 0xfff);
+                } else {
+                    /* Page translation fault.  */
+                    code = 7;
+                    goto do_fault;
+                }
+            } else {
+                phys_addr = (desc & 0xfffffc00) | (address & 0x3ff);
+            }
             ap = (desc >> 4) & 3;
             *page_size = 0x400;
             break;
@@ -4631,7 +4872,7 @@
         }
         code = 15;
     }
-    *prot = check_ap(env, ap, domain_prot, access_type, is_user);
+    *prot = check_ap(env, mmu_idx, ap, domain_prot, access_type);
     if (!*prot) {
         /* Access permission fault.  */
         goto do_fault;
@@ -4644,7 +4885,7 @@
 }
 
 static int get_phys_addr_v6(CPUARMState *env, uint32_t address, int access_type,
-                            int is_user, hwaddr *phys_ptr,
+                            ARMMMUIdx mmu_idx, hwaddr *phys_ptr,
                             int *prot, target_ulong *page_size)
 {
     CPUState *cs = CPU(arm_env_get_cpu(env));
@@ -4658,10 +4899,11 @@
     int domain = 0;
     int domain_prot;
     hwaddr phys_addr;
+    uint32_t dacr;
 
     /* Pagetable walk.  */
     /* Lookup l1 descriptor.  */
-    if (!get_level1_table_address(env, &table, address)) {
+    if (!get_level1_table_address(env, mmu_idx, &table, address)) {
         /* Section translation fault if page walk is disabled by PD0 or PD1 */
         code = 5;
         goto do_fault;
@@ -4679,7 +4921,12 @@
         /* Page or Section.  */
         domain = (desc >> 5) & 0x0f;
     }
-    domain_prot = (A32_BANKED_CURRENT_REG_GET(env, dacr) >> (domain * 2)) & 3;
+    if (regime_el(env, mmu_idx) == 1) {
+        dacr = env->cp15.dacr_ns;
+    } else {
+        dacr = env->cp15.dacr_s;
+    }
+    domain_prot = (dacr >> (domain * 2)) & 3;
     if (domain_prot == 0 || domain_prot == 2) {
         if (type != 1) {
             code = 9; /* Section domain fault.  */
@@ -4733,20 +4980,20 @@
     if (domain_prot == 3) {
         *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
     } else {
-        if (pxn && !is_user) {
+        if (pxn && !regime_is_user(env, mmu_idx)) {
             xn = 1;
         }
         if (xn && access_type == 2)
             goto do_fault;
 
         /* The simplified model uses AP[0] as an access control bit.  */
-        if ((A32_BANKED_CURRENT_REG_GET(env, sctlr) & SCTLR_AFE)
+        if ((regime_sctlr(env, mmu_idx) & SCTLR_AFE)
                 && (ap & 1) == 0) {
             /* Access flag fault.  */
             code = (code == 15) ? 6 : 3;
             goto do_fault;
         }
-        *prot = check_ap(env, ap, domain_prot, access_type, is_user);
+        *prot = check_ap(env, mmu_idx, ap, domain_prot, access_type);
         if (!*prot) {
             /* Access permission fault.  */
             goto do_fault;
@@ -4771,7 +5018,7 @@
 } MMUFaultType;
 
 static int get_phys_addr_lpae(CPUARMState *env, target_ulong address,
-                              int access_type, int is_user,
+                              int access_type, ARMMMUIdx mmu_idx,
                               hwaddr *phys_ptr, int *prot,
                               target_ulong *page_size_ptr)
 {
@@ -4791,9 +5038,17 @@
     int32_t granule_sz = 9;
     int32_t va_size = 32;
     int32_t tbi = 0;
-    TCR *tcr = &env->cp15.tcr_el[arm_is_secure(env) ? 3 : 1];
+    bool is_user;
+    TCR *tcr = regime_tcr(env, mmu_idx);
 
-    if (arm_el_is_aa64(env, 1)) {
+    /* TODO:
+     * This code assumes we're either a 64-bit EL1 or a 32-bit PL1;
+     * it doesn't handle the different format TCR for TCR_EL2, TCR_EL3,
+     * and VTCR_EL2, or the fact that those regimes don't have a split
+     * TTBR0/TTBR1. Attribute and permission bit handling should also
+     * be checked when adding support for those page table walks.
+     */
+    if (arm_el_is_aa64(env, regime_el(env, mmu_idx))) {
         va_size = 64;
         if (extract64(address, 55, 1))
             tbi = extract64(tcr->raw_tcr, 38, 1);
@@ -4808,12 +5063,12 @@
      * TTBCR/TTBR0/TTBR1 in accordance with ARM ARM DDI0406C table B-32:
      */
     uint32_t t0sz = extract32(tcr->raw_tcr, 0, 6);
-    if (arm_el_is_aa64(env, 1)) {
+    if (va_size == 64) {
         t0sz = MIN(t0sz, 39);
         t0sz = MAX(t0sz, 16);
     }
     uint32_t t1sz = extract32(tcr->raw_tcr, 16, 6);
-    if (arm_el_is_aa64(env, 1)) {
+    if (va_size == 64) {
         t1sz = MIN(t1sz, 39);
         t1sz = MAX(t1sz, 16);
     }
@@ -4868,6 +5123,10 @@
         }
     }
 
+    /* Here we should have set up all the parameters for the translation:
+     * va_size, ttbr, epd, tsz, granule_sz, tbi
+     */
+
     if (epd) {
         /* Translation table walk disabled => Translation fault on TLB miss */
         goto do_fault;
@@ -4953,6 +5212,7 @@
         goto do_fault;
     }
     fault_type = permission_fault;
+    is_user = regime_is_user(env, mmu_idx);
     if (is_user && !(attrs & (1 << 4))) {
         /* Unprivileged access not enabled */
         goto do_fault;
@@ -4987,27 +5247,31 @@
 }
 
 static int get_phys_addr_mpu(CPUARMState *env, uint32_t address,
-                             int access_type, int is_user,
+                             int access_type, ARMMMUIdx mmu_idx,
                              hwaddr *phys_ptr, int *prot)
 {
     int n;
     uint32_t mask;
     uint32_t base;
+    bool is_user = regime_is_user(env, mmu_idx);
 
     *phys_ptr = address;
     for (n = 7; n >= 0; n--) {
-	base = env->cp15.c6_region[n];
-	if ((base & 1) == 0)
-	    continue;
-	mask = 1 << ((base >> 1) & 0x1f);
-	/* Keep this shift separate from the above to avoid an
-	   (undefined) << 32.  */
-	mask = (mask << 1) - 1;
-	if (((base ^ address) & ~mask) == 0)
-	    break;
+        base = env->cp15.c6_region[n];
+        if ((base & 1) == 0) {
+            continue;
+        }
+        mask = 1 << ((base >> 1) & 0x1f);
+        /* Keep this shift separate from the above to avoid an
+           (undefined) << 32.  */
+        mask = (mask << 1) - 1;
+        if (((base ^ address) & ~mask) == 0) {
+            break;
+        }
     }
-    if (n < 0)
-	return 2;
+    if (n < 0) {
+        return 2;
+    }
 
     if (access_type == 2) {
         mask = env->cp15.pmsav5_insn_ap;
@@ -5017,31 +5281,34 @@
     mask = (mask >> (n * 4)) & 0xf;
     switch (mask) {
     case 0:
-	return 1;
+        return 1;
     case 1:
-	if (is_user)
-	  return 1;
-	*prot = PAGE_READ | PAGE_WRITE;
-	break;
+        if (is_user) {
+            return 1;
+        }
+        *prot = PAGE_READ | PAGE_WRITE;
+        break;
     case 2:
-	*prot = PAGE_READ;
-	if (!is_user)
-	    *prot |= PAGE_WRITE;
-	break;
+        *prot = PAGE_READ;
+        if (!is_user) {
+            *prot |= PAGE_WRITE;
+        }
+        break;
     case 3:
-	*prot = PAGE_READ | PAGE_WRITE;
-	break;
+        *prot = PAGE_READ | PAGE_WRITE;
+        break;
     case 5:
-	if (is_user)
-	    return 1;
-	*prot = PAGE_READ;
-	break;
+        if (is_user) {
+            return 1;
+        }
+        *prot = PAGE_READ;
+        break;
     case 6:
-	*prot = PAGE_READ;
-	break;
+        *prot = PAGE_READ;
+        break;
     default:
-	/* Bad permission.  */
-	return 1;
+        /* Bad permission.  */
+        return 1;
     }
     *prot |= PAGE_EXEC;
     return 0;
@@ -5065,44 +5332,60 @@
  * @env: CPUARMState
  * @address: virtual address to get physical address for
  * @access_type: 0 for read, 1 for write, 2 for execute
- * @is_user: 0 for privileged access, 1 for user
+ * @mmu_idx: MMU index indicating required translation regime
  * @phys_ptr: set to the physical address corresponding to the virtual address
  * @prot: set to the permissions for the page containing phys_ptr
  * @page_size: set to the size of the page containing phys_ptr
  */
 static inline int get_phys_addr(CPUARMState *env, target_ulong address,
-                                int access_type, int is_user,
+                                int access_type, ARMMMUIdx mmu_idx,
                                 hwaddr *phys_ptr, int *prot,
                                 target_ulong *page_size)
 {
-    /* This is not entirely correct as get_phys_addr() can also be called
-     * from ats_write() for an address translation of a specific regime.
-     */
-    uint32_t sctlr = A32_BANKED_CURRENT_REG_GET(env, sctlr);
-
-    /* Fast Context Switch Extension.  */
-    if (address < 0x02000000) {
-        address += A32_BANKED_CURRENT_REG_GET(env, fcseidr);
+    if (mmu_idx == ARMMMUIdx_S12NSE0 || mmu_idx == ARMMMUIdx_S12NSE1) {
+        /* TODO: when we support EL2 we should here call ourselves recursively
+         * to do the stage 1 and then stage 2 translations. The ldl_phys
+         * calls for stage 1 will also need changing.
+         * For non-EL2 CPUs a stage1+stage2 translation is just stage 1.
+         */
+        assert(!arm_feature(env, ARM_FEATURE_EL2));
+        mmu_idx += ARMMMUIdx_S1NSE0;
     }
 
-    if ((sctlr & SCTLR_M) == 0) {
+    /* Fast Context Switch Extension. This doesn't exist at all in v8.
+     * In v7 and earlier it affects all stage 1 translations.
+     */
+    if (address < 0x02000000 && mmu_idx != ARMMMUIdx_S2NS
+        && !arm_feature(env, ARM_FEATURE_V8)) {
+        if (regime_el(env, mmu_idx) == 3) {
+            address += env->cp15.fcseidr_s;
+        } else {
+            address += env->cp15.fcseidr_ns;
+        }
+    }
+
+    if (regime_translation_disabled(env, mmu_idx)) {
         /* MMU/MPU disabled.  */
         *phys_ptr = address;
         *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
         *page_size = TARGET_PAGE_SIZE;
         return 0;
-    } else if (arm_feature(env, ARM_FEATURE_MPU)) {
+    }
+
+    if (arm_feature(env, ARM_FEATURE_MPU)) {
         *page_size = TARGET_PAGE_SIZE;
-	return get_phys_addr_mpu(env, address, access_type, is_user, phys_ptr,
-				 prot);
-    } else if (extended_addresses_enabled(env)) {
-        return get_phys_addr_lpae(env, address, access_type, is_user, phys_ptr,
+        return get_phys_addr_mpu(env, address, access_type, mmu_idx, phys_ptr,
+                                 prot);
+    }
+
+    if (regime_using_lpae_format(env, mmu_idx)) {
+        return get_phys_addr_lpae(env, address, access_type, mmu_idx, phys_ptr,
                                   prot, page_size);
-    } else if (sctlr & SCTLR_XP) {
-        return get_phys_addr_v6(env, address, access_type, is_user, phys_ptr,
+    } else if (regime_sctlr(env, mmu_idx) & SCTLR_XP) {
+        return get_phys_addr_v6(env, address, access_type, mmu_idx, phys_ptr,
                                 prot, page_size);
     } else {
-        return get_phys_addr_v5(env, address, access_type, is_user, phys_ptr,
+        return get_phys_addr_v5(env, address, access_type, mmu_idx, phys_ptr,
                                 prot, page_size);
     }
 }
@@ -5115,12 +5398,11 @@
     hwaddr phys_addr;
     target_ulong page_size;
     int prot;
-    int ret, is_user;
+    int ret;
     uint32_t syn;
     bool same_el = (arm_current_el(env) != 0);
 
-    is_user = mmu_idx == MMU_USER_IDX;
-    ret = get_phys_addr(env, address, access_type, is_user, &phys_addr, &prot,
+    ret = get_phys_addr(env, address, access_type, mmu_idx, &phys_addr, &prot,
                         &page_size);
     if (ret == 0) {
         /* Map a single [sub]page.  */
@@ -5156,12 +5438,14 @@
 hwaddr arm_cpu_get_phys_page_debug(CPUState *cs, vaddr addr)
 {
     ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
     hwaddr phys_addr;
     target_ulong page_size;
     int prot;
     int ret;
 
-    ret = get_phys_addr(&cpu->env, addr, 0, 0, &phys_addr, &prot, &page_size);
+    ret = get_phys_addr(env, addr, 0, cpu_mmu_index(env), &phys_addr,
+                        &prot, &page_size);
 
     if (ret != 0) {
         return -1;
@@ -6242,7 +6526,7 @@
         } else {
             return float64_set_sign(float64_maxnorm, float64_is_neg(f64));
         }
-    } else if (f64_exp >= 1023 && fpst->flush_to_zero) {
+    } else if (f64_exp >= 2045 && fpst->flush_to_zero) {
         float_raise(float_flag_underflow, fpst);
         return float64_set_sign(float64_zero, float64_is_neg(f64));
     }
diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
index ba16821..033babf 100644
--- a/target-arm/kvm64.c
+++ b/target-arm/kvm64.c
@@ -193,9 +193,12 @@
         }
     }
 
+    if (!write_list_to_kvmstate(cpu)) {
+        return EINVAL;
+    }
+
     /* TODO:
      * FP state
-     * system registers
      */
     return ret;
 }
@@ -269,6 +272,14 @@
         }
     }
 
+    if (!write_kvmstate_to_list(cpu)) {
+        return EINVAL;
+    }
+    /* Note that it's OK to have registers which aren't in CPUState,
+     * so we can ignore a failure return here.
+     */
+    write_list_to_cpustate(cpu);
+
     /* TODO: other registers */
     return ret;
 }
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 80d2359..acf4b16 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -123,6 +123,23 @@
 #endif
 }
 
+static inline ARMMMUIdx get_a64_user_mem_index(DisasContext *s)
+{
+    /* Return the mmu_idx to use for A64 "unprivileged load/store" insns:
+     *  if EL1, access as if EL0; otherwise access at current EL
+     */
+    switch (s->mmu_idx) {
+    case ARMMMUIdx_S12NSE1:
+        return ARMMMUIdx_S12NSE0;
+    case ARMMMUIdx_S1SE1:
+        return ARMMMUIdx_S1SE0;
+    case ARMMMUIdx_S2NS:
+        g_assert_not_reached();
+    default:
+        return s->mmu_idx;
+    }
+}
+
 void aarch64_cpu_dump_state(CPUState *cs, FILE *f,
                             fprintf_function cpu_fprintf, int flags)
 {
@@ -2107,7 +2124,7 @@
         }
     } else {
         TCGv_i64 tcg_rt = cpu_reg(s, rt);
-        int memidx = is_unpriv ? 1 : get_mem_index(s);
+        int memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s);
 
         if (is_store) {
             do_gpr_st_memidx(s, tcg_rt, tcg_addr, size, memidx);
@@ -10922,14 +10939,15 @@
     dc->bswap_code = 0;
     dc->condexec_mask = 0;
     dc->condexec_cond = 0;
+    dc->mmu_idx = ARM_TBFLAG_MMUIDX(tb->flags);
+    dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx);
 #if !defined(CONFIG_USER_ONLY)
-    dc->user = (ARM_TBFLAG_AA64_EL(tb->flags) == 0);
+    dc->user = (dc->current_el == 0);
 #endif
     dc->cpacr_fpen = ARM_TBFLAG_AA64_FPEN(tb->flags);
     dc->vec_len = 0;
     dc->vec_stride = 0;
     dc->cp_regs = cpu->cp_regs;
-    dc->current_el = arm_current_el(env);
     dc->features = env->features;
 
     /* Single step state. The code-generation logic here is:
diff --git a/target-arm/translate.c b/target-arm/translate.c
index bdfcdf1..1c36b8b 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -113,6 +113,28 @@
     a64_translate_init();
 }
 
+static inline ARMMMUIdx get_a32_user_mem_index(DisasContext *s)
+{
+    /* Return the mmu_idx to use for A32/T32 "unprivileged load/store"
+     * insns:
+     *  if PL2, UNPREDICTABLE (we choose to implement as if PL0)
+     *  otherwise, access as if at PL0.
+     */
+    switch (s->mmu_idx) {
+    case ARMMMUIdx_S1E2:        /* this one is UNPREDICTABLE */
+    case ARMMMUIdx_S12NSE0:
+    case ARMMMUIdx_S12NSE1:
+        return ARMMMUIdx_S12NSE0;
+    case ARMMMUIdx_S1E3:
+    case ARMMMUIdx_S1SE0:
+    case ARMMMUIdx_S1SE1:
+        return ARMMMUIdx_S1SE0;
+    case ARMMMUIdx_S2NS:
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static inline TCGv_i32 load_cpu_offset(int offset)
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
@@ -8739,6 +8761,10 @@
                         ARCH(6T2);
                         shift = (insn >> 7) & 0x1f;
                         i = (insn >> 16) & 0x1f;
+                        if (i < shift) {
+                            /* UNPREDICTABLE; we choose to UNDEF */
+                            goto illegal_op;
+                        }
                         i = i + 1 - shift;
                         if (rm == 15) {
                             tmp = tcg_temp_new_i32();
@@ -8793,7 +8819,7 @@
             tmp2 = load_reg(s, rn);
             if ((insn & 0x01200000) == 0x00200000) {
                 /* ldrt/strt */
-                i = MMU_USER_IDX;
+                i = get_a32_user_mem_index(s);
             } else {
                 i = get_mem_index(s);
             }
@@ -10173,7 +10199,7 @@
                     break;
                 case 0xe: /* User privilege.  */
                     tcg_gen_addi_i32(addr, addr, imm);
-                    memidx = MMU_USER_IDX;
+                    memidx = get_a32_user_mem_index(s);
                     break;
                 case 0x9: /* Post-decrement.  */
                     imm = -imm;
@@ -11032,8 +11058,10 @@
     dc->bswap_code = ARM_TBFLAG_BSWAP_CODE(tb->flags);
     dc->condexec_mask = (ARM_TBFLAG_CONDEXEC(tb->flags) & 0xf) << 1;
     dc->condexec_cond = ARM_TBFLAG_CONDEXEC(tb->flags) >> 4;
+    dc->mmu_idx = ARM_TBFLAG_MMUIDX(tb->flags);
+    dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx);
 #if !defined(CONFIG_USER_ONLY)
-    dc->user = (ARM_TBFLAG_PRIV(tb->flags) == 0);
+    dc->user = (dc->current_el == 0);
 #endif
     dc->ns = ARM_TBFLAG_NS(tb->flags);
     dc->cpacr_fpen = ARM_TBFLAG_CPACR_FPEN(tb->flags);
@@ -11042,7 +11070,6 @@
     dc->vec_stride = ARM_TBFLAG_VECSTRIDE(tb->flags);
     dc->c15_cpar = ARM_TBFLAG_XSCALE_CPAR(tb->flags);
     dc->cp_regs = cpu->cp_regs;
-    dc->current_el = arm_current_el(env);
     dc->features = env->features;
 
     /* Single step state. The code-generation logic here is:
diff --git a/target-arm/translate.h b/target-arm/translate.h
index f6ee789..a1eb5b5 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -20,6 +20,7 @@
 #if !defined(CONFIG_USER_ONLY)
     int user;
 #endif
+    ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */
     bool ns;        /* Use non-secure CPREG bank on access */
     bool cpacr_fpen; /* FP enabled via CPACR.FPEN */
     bool vfp_enabled; /* FP enabled via FPSCR.EN */
@@ -69,7 +70,7 @@
 
 static inline int get_mem_index(DisasContext *s)
 {
-    return s->current_el;
+    return s->mmu_idx;
 }
 
 /* target-specific extra values for is_jmp */
diff --git a/target-s390x/cc_helper.c b/target-s390x/cc_helper.c
index 373eb17..00bc883 100644
--- a/target-s390x/cc_helper.c
+++ b/target-s390x/cc_helper.c
@@ -179,16 +179,11 @@
 
 static uint32_t cc_calc_subb_64(uint64_t a1, uint64_t a2, uint64_t ar)
 {
-    /* We had borrow-in if normal subtraction isn't equal.  */
-    int borrow_in = ar - (a1 - a2);
     int borrow_out;
 
-    /* If a2 was ULONG_MAX, and borrow_in, then a2 is logically 65 bits,
-       and we must have had borrow out.  */
-    if (borrow_in && a2 == (uint64_t)-1) {
-        borrow_out = 1;
+    if (ar != a1 - a2) {	/* difference means borrow-in */
+        borrow_out = (a2 >= a1);
     } else {
-        a2 += borrow_in;
         borrow_out = (a2 > a1);
     }
 
@@ -285,16 +280,11 @@
 
 static uint32_t cc_calc_subb_32(uint32_t a1, uint32_t a2, uint32_t ar)
 {
-    /* We had borrow-in if normal subtraction isn't equal.  */
-    int borrow_in = ar - (a1 - a2);
     int borrow_out;
 
-    /* If a2 was UINT_MAX, and borrow_in, then a2 is logically 65 bits,
-       and we must have had borrow out.  */
-    if (borrow_in && a2 == (uint32_t)-1) {
-        borrow_out = 1;
+    if (ar != a1 - a2) {	/* difference means borrow-in */
+        borrow_out = (a2 >= a1);
     } else {
-        a2 += borrow_in;
         borrow_out = (a2 > a1);
     }
 
diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h
index c123b6f..2e2554c 100644
--- a/target-s390x/cpu.h
+++ b/target-s390x/cpu.h
@@ -133,7 +133,9 @@
 
     /* reset does memset(0) up to here */
 
-    int cpu_num;
+    uint32_t cpu_num;
+    uint32_t machine_type;
+
     uint8_t *storage_keys;
 
     uint64_t tod_offset;
diff --git a/target-s390x/helper.h b/target-s390x/helper.h
index faebfd9..8d2c859 100644
--- a/target-s390x/helper.h
+++ b/target-s390x/helper.h
@@ -111,5 +111,8 @@
 DEF_HELPER_FLAGS_3(ipte, TCG_CALL_NO_RWG, void, env, i64, i64)
 DEF_HELPER_FLAGS_1(ptlb, TCG_CALL_NO_RWG, void, env)
 DEF_HELPER_2(lra, i64, env, i64)
+DEF_HELPER_FLAGS_2(lura, TCG_CALL_NO_WG, i64, env, i64)
+DEF_HELPER_FLAGS_2(lurag, TCG_CALL_NO_WG, i64, env, i64)
 DEF_HELPER_FLAGS_3(stura, TCG_CALL_NO_WG, void, env, i64, i64)
+DEF_HELPER_FLAGS_3(sturg, TCG_CALL_NO_WG, void, env, i64, i64)
 #endif
diff --git a/target-s390x/insn-data.def b/target-s390x/insn-data.def
index 4d2feb6..8d8e47e 100644
--- a/target-s390x/insn-data.def
+++ b/target-s390x/insn-data.def
@@ -285,8 +285,12 @@
 
 /* EXTRACT ACCESS */
     C(0xb24f, EAR,     RRE,   Z,   0, 0, new, r1_32, ear, 0)
+/* EXTRACT CPU ATTRIBUTE */
+    C(0xeb4c, ECAG,    RSY_a, GIE, 0, a2, r1, 0, ecag, 0)
 /* EXTRACT FPC */
     C(0xb38c, EFPC,    RRE,   Z,   0, 0, new, r1_32, efpc, 0)
+/* EXTRACT PSW */
+    C(0xb98d, EPSW,    RRE,   Z,   0, 0, 0, 0, epsw, 0)
 
 /* FIND LEFTMOST ONE */
     C(0xb983, FLOGR,   RRE,   EI,  0, r2_o, r1_P, 0, flogr, 0)
@@ -566,6 +570,10 @@
 
 /* SET ACCESS */
     C(0xb24e, SAR,     RRE,   Z,   0, r2_o, 0, 0, sar, 0)
+/* SET ADDRESSING MODE */
+    D(0x010c, SAM24,   E,     Z,   0, 0, 0, 0, sam, 0, 0)
+    D(0x010d, SAM31,   E,     Z,   0, 0, 0, 0, sam, 0, 1)
+    D(0x010e, SAM64,   E,     Z,   0, 0, 0, 0, sam, 0, 3)
 /* SET FPC */
     C(0xb384, SFPC,    RRE,   Z,   0, r1_o, 0, 0, sfpc, 0)
 /* SET FPC AND SIGNAL */
@@ -733,6 +741,9 @@
     C(0xb100, LRA,     RX_a,  Z,   0, a2, r1, 0, lra, 0)
     C(0xe313, LRAY,    RXY_a, LD,  0, a2, r1, 0, lra, 0)
     C(0xe303, LRAG,    RXY_a, Z,   0, a2, r1, 0, lra, 0)
+/* LOAD USING REAL ADDRESS */
+    C(0xb24b, LURA,    RRE,   Z,   0, r2, new, r1_32, lura, 0)
+    C(0xb905, LURAG,   RRE,   Z,   0, r2, r1, 0, lurag, 0)
 /* MOVE TO PRIMARY */
     C(0xda00, MVCP,    SS_d,  Z,   la1, a2, 0, 0, mvcp, 0)
 /* MOVE TO SECONDARY */
@@ -743,10 +754,6 @@
     C(0xb22a, RRBE,    RRE,   Z,   0, r2_o, 0, 0, rrbe, 0)
 /* SERVICE CALL LOGICAL PROCESSOR (PV hypercall) */
     C(0xb220, SERVC,   RRE,   Z,   r1_o, r2_o, 0, 0, servc, 0)
-/* SET ADDRESSING MODE */
-    D(0x010c, SAM24,   E,     Z,   0, 0, 0, 0, sam, 0, 0)
-    D(0x010d, SAM31,   E,     Z,   0, 0, 0, 0, sam, 0, 1)
-    D(0x010e, SAM64,   E,     Z,   0, 0, 0, 0, sam, 0, 3)
 /* SET ADDRESS SPACE CONTROL FAST */
     C(0xb279, SACF,    S,     Z,   0, a2, 0, 0, sacf, 0)
 /* SET CLOCK */
@@ -794,6 +801,7 @@
     C(0xad00, STOSM,   SI,    Z,   la1, 0, 0, 0, stnosm, 0)
 /* STORE USING REAL ADDRESS */
     C(0xb246, STURA,   RRE,   Z,   r1_o, r2_o, 0, 0, stura, 0)
+    C(0xb925, STURG,   RRE,   Z,   r1_o, r2_o, 0, 0, sturg, 0)
 /* TEST PROTECTION */
     C(0xe501, TPROT,   SSE,   Z,   la1, a2, 0, 0, tprot, 0)
 
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index dcd7505..6f2d5b4 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -1046,7 +1046,7 @@
     uint64_t r1, r3;
 
     cpu_synchronize_state(CPU(cpu));
-    r1 = (run->s390_sieic.ipa & 0x00f0) >> 8;
+    r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
     r3 = run->s390_sieic.ipa & 0x000f;
     handle_diag_308(&cpu->env, r1, r3);
 }
@@ -1091,7 +1091,7 @@
         break;
     default:
         DPRINTF("KVM: unknown DIAG: 0x%x\n", func_code);
-        r = -1;
+        enter_pgmcheck(cpu, PGM_SPECIFICATION);
         break;
     }
 
diff --git a/target-s390x/mem_helper.c b/target-s390x/mem_helper.c
index 5a55de8..d67b345 100644
--- a/target-s390x/mem_helper.c
+++ b/target-s390x/mem_helper.c
@@ -490,10 +490,18 @@
             helper_mvc(env, l, get_address(env, 0, b1, d1),
                        get_address(env, 0, b2, d2));
             break;
+        case 0x400:
+            cc = helper_nc(env, l, get_address(env, 0, b1, d1),
+                            get_address(env, 0, b2, d2));
+            break;
         case 0x500:
             cc = helper_clc(env, l, get_address(env, 0, b1, d1),
                             get_address(env, 0, b2, d2));
             break;
+        case 0x600:
+            cc = helper_oc(env, l, get_address(env, 0, b1, d1),
+                            get_address(env, 0, b2, d2));
+            break;
         case 0x700:
             cc = helper_xc(env, l, get_address(env, 0, b1, d1),
                            get_address(env, 0, b2, d2));
@@ -1034,12 +1042,34 @@
     tlb_flush(CPU(cpu), 1);
 }
 
+/* load using real address */
+uint64_t HELPER(lura)(CPUS390XState *env, uint64_t addr)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    return (uint32_t)ldl_phys(cs->as, get_address(env, 0, 0, addr));
+}
+
+uint64_t HELPER(lurag)(CPUS390XState *env, uint64_t addr)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    return ldq_phys(cs->as, get_address(env, 0, 0, addr));
+}
+
 /* store using real address */
 void HELPER(stura)(CPUS390XState *env, uint64_t addr, uint64_t v1)
 {
     CPUState *cs = CPU(s390_env_get_cpu(env));
 
-    stw_phys(cs->as, get_address(env, 0, 0, addr), (uint32_t)v1);
+    stl_phys(cs->as, get_address(env, 0, 0, addr), (uint32_t)v1);
+}
+
+void HELPER(sturg)(CPUS390XState *env, uint64_t addr, uint64_t v1)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    stq_phys(cs->as, get_address(env, 0, 0, addr), v1);
 }
 
 /* load real address */
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index ab01bc0..8b36eca 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -317,12 +317,14 @@
     gen_program_exception(s, PGM_SPECIFICATION);
 }
 
-static inline void check_privileged(DisasContext *s)
+#ifndef CONFIG_USER_ONLY
+static void check_privileged(DisasContext *s)
 {
     if (s->tb->flags & (PSW_MASK_PSTATE >> 32)) {
         gen_program_exception(s, PGM_PRIVILEGED);
     }
 }
+#endif
 
 static TCGv_i64 get_address(DisasContext *s, int x2, int b2, int d2)
 {
@@ -2045,12 +2047,37 @@
     return NO_EXIT;
 }
 
+static ExitStatus op_ecag(DisasContext *s, DisasOps *o)
+{
+    /* No cache information provided.  */
+    tcg_gen_movi_i64(o->out, -1);
+    return NO_EXIT;
+}
+
 static ExitStatus op_efpc(DisasContext *s, DisasOps *o)
 {
     tcg_gen_ld32u_i64(o->out, cpu_env, offsetof(CPUS390XState, fpc));
     return NO_EXIT;
 }
 
+static ExitStatus op_epsw(DisasContext *s, DisasOps *o)
+{
+    int r1 = get_field(s->fields, r1);
+    int r2 = get_field(s->fields, r2);
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    /* Note the "subsequently" in the PoO, which implies a defined result
+       if r1 == r2.  Thus we cannot defer these writes to an output hook.  */
+    tcg_gen_shri_i64(t, psw_mask, 32);
+    store_reg32_i64(r1, t);
+    if (r2 != 0) {
+        store_reg32_i64(r2, psw_mask);
+    }
+
+    tcg_temp_free_i64(t);
+    return NO_EXIT;
+}
+
 static ExitStatus op_ex(DisasContext *s, DisasOps *o)
 {
     /* ??? Perhaps a better way to implement EXECUTE is to set a bit in
@@ -2460,6 +2487,24 @@
     return NO_EXIT;
 }
 
+#ifndef CONFIG_USER_ONLY
+static ExitStatus op_lura(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_lura(o->out, cpu_env, o->in2);
+    return NO_EXIT;
+}
+
+static ExitStatus op_lurag(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_lurag(o->out, cpu_env, o->in2);
+    return NO_EXIT;
+}
+#endif
+
 static ExitStatus op_mov2(DisasContext *s, DisasOps *o)
 {
     o->out = o->in2;
@@ -2925,19 +2970,42 @@
     /* Addressing mode has changed, so end the block.  */
     return EXIT_PC_STALE;
 }
+#endif
 
 static ExitStatus op_sam(DisasContext *s, DisasOps *o)
 {
     int sam = s->insn->data;
-    TCGv_i64 tsam = tcg_const_i64(sam);
+    TCGv_i64 tsam;
+    uint64_t mask;
 
-    /* Overwrite PSW_MASK_64 and PSW_MASK_32 */
+    switch (sam) {
+    case 0:
+        mask = 0xffffff;
+        break;
+    case 1:
+        mask = 0x7fffffff;
+        break;
+    default:
+        mask = -1;
+        break;
+    }
+
+    /* Bizzare but true, we check the address of the current insn for the
+       specification exception, not the next to be executed.  Thus the PoO
+       documents that Bad Things Happen two bytes before the end.  */
+    if (s->pc & ~mask) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return EXIT_NORETURN;
+    }
+    s->next_pc &= mask;
+
+    tsam = tcg_const_i64(sam);
     tcg_gen_deposit_i64(psw_mask, psw_mask, tsam, 31, 2);
-
     tcg_temp_free_i64(tsam);
+
+    /* Always exit the TB, since we (may have) changed execution mode.  */
     return EXIT_PC_STALE;
 }
-#endif
 
 static ExitStatus op_sar(DisasContext *s, DisasOps *o)
 {
@@ -3221,8 +3289,14 @@
 
 static ExitStatus op_stidp(DisasContext *s, DisasOps *o)
 {
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
     check_privileged(s);
     tcg_gen_ld32u_i64(o->out, cpu_env, offsetof(CPUS390XState, cpu_num));
+    tcg_gen_ld32u_i64(t1, cpu_env, offsetof(CPUS390XState, machine_type));
+    tcg_gen_deposit_i64(o->out, o->out, t1, 32, 32);
+    tcg_temp_free_i64(t1);
+
     return NO_EXIT;
 }
 
@@ -3317,6 +3391,14 @@
     gen_helper_stura(cpu_env, o->in2, o->in1);
     return NO_EXIT;
 }
+
+static ExitStatus op_sturg(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_sturg(cpu_env, o->in2, o->in1);
+    return NO_EXIT;
+}
 #endif
 
 static ExitStatus op_st8(DisasContext *s, DisasOps *o)
diff --git a/tests/Makefile b/tests/Makefile
index c2e2e52..5caccf7 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -60,6 +60,8 @@
 check-unit-y += tests/test-int128$(EXESUF)
 # all code tested by test-int128 is inside int128.h
 gcov-files-test-int128-y =
+check-unit-y += tests/rcutorture$(EXESUF)
+gcov-files-rcutorture-y = util/rcu.c
 check-unit-y += tests/test-bitops$(EXESUF)
 check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += tests/test-qdev-global-props$(EXESUF)
 check-unit-y += tests/check-qom-interface$(EXESUF)
@@ -223,7 +225,8 @@
 	tests/test-qmp-input-visitor.o tests/test-qmp-input-strict.o \
 	tests/test-qmp-commands.o tests/test-visitor-serialization.o \
 	tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \
-	tests/test-opts-visitor.o tests/test-qmp-event.o
+	tests/test-opts-visitor.o tests/test-qmp-event.o \
+	tests/rcutorture.o
 
 test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
 		  tests/test-qapi-event.o
@@ -252,6 +255,8 @@
 tests/test-xbzrle$(EXESUF): tests/test-xbzrle.o migration/xbzrle.o page_cache.o libqemuutil.a
 tests/test-cutils$(EXESUF): tests/test-cutils.o util/cutils.o
 tests/test-int128$(EXESUF): tests/test-int128.o
+tests/rcutorture$(EXESUF): tests/rcutorture.o libqemuutil.a
+
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
 	hw/core/irq.o \
@@ -261,7 +266,8 @@
 	libqemuutil.a libqemustub.a
 tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
 	migration/vmstate.o migration/qemu-file.o migration/qemu-file-buf.o \
-        migration/qemu-file-unix.o \
+        migration/qemu-file-unix.o qjson.o \
+	$(qom-core-obj) \
 	libqemuutil.a libqemustub.a
 
 tests/test-qapi-types.c tests/test-qapi-types.h :\
diff --git a/tests/rcutorture.c b/tests/rcutorture.c
new file mode 100644
index 0000000..60a2ccf
--- /dev/null
+++ b/tests/rcutorture.c
@@ -0,0 +1,451 @@
+/*
+ * rcutorture.c: simple user-level performance/stress test of RCU.
+ *
+ * Usage:
+ *     ./rcu <nreaders> rperf [ <seconds> ]
+ *         Run a read-side performance test with the specified
+ *         number of readers for <seconds> seconds.
+ *     ./rcu <nupdaters> uperf [ <seconds> ]
+ *         Run an update-side performance test with the specified
+ *         number of updaters and specified duration.
+ *     ./rcu <nreaders> perf [ <seconds> ]
+ *         Run a combined read/update performance test with the specified
+ *         number of readers and one updater and specified duration.
+ *
+ * The above tests produce output as follows:
+ *
+ * n_reads: 46008000  n_updates: 146026  nreaders: 2  nupdaters: 1 duration: 1
+ * ns/read: 43.4707  ns/update: 6848.1
+ *
+ * The first line lists the total number of RCU reads and updates executed
+ * during the test, the number of reader threads, the number of updater
+ * threads, and the duration of the test in seconds.  The second line
+ * lists the average duration of each type of operation in nanoseconds,
+ * or "nan" if the corresponding type of operation was not performed.
+ *
+ *     ./rcu <nreaders> stress [ <seconds> ]
+ *         Run a stress test with the specified number of readers and
+ *         one updater.
+ *
+ * This test produces output as follows:
+ *
+ * n_reads: 114633217  n_updates: 3903415  n_mberror: 0
+ * rcu_stress_count: 114618391 14826 0 0 0 0 0 0 0 0 0
+ *
+ * The first line lists the number of RCU read and update operations
+ * executed, followed by the number of memory-ordering violations
+ * (which will be zero in a correct RCU implementation).  The second
+ * line lists the number of readers observing progressively more stale
+ * data.  A correct RCU implementation will have all but the first two
+ * numbers non-zero.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) 2008 Paul E. McKenney, IBM Corporation.
+ */
+
+/*
+ * Test variables.
+ */
+
+#include <glib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "qemu/atomic.h"
+#include "qemu/rcu.h"
+#include "qemu/compiler.h"
+#include "qemu/thread.h"
+
+long long n_reads = 0LL;
+long n_updates = 0L;
+int nthreadsrunning;
+
+#define GOFLAG_INIT 0
+#define GOFLAG_RUN  1
+#define GOFLAG_STOP 2
+
+static volatile int goflag = GOFLAG_INIT;
+
+#define RCU_READ_RUN 1000
+
+#define NR_THREADS 100
+static QemuThread threads[NR_THREADS];
+static struct rcu_reader_data *data[NR_THREADS];
+static int n_threads;
+
+static void create_thread(void *(*func)(void *))
+{
+    if (n_threads >= NR_THREADS) {
+        fprintf(stderr, "Thread limit of %d exceeded!\n", NR_THREADS);
+        exit(-1);
+    }
+    qemu_thread_create(&threads[n_threads], "test", func, &data[n_threads],
+                       QEMU_THREAD_JOINABLE);
+    n_threads++;
+}
+
+static void wait_all_threads(void)
+{
+    int i;
+
+    for (i = 0; i < n_threads; i++) {
+        qemu_thread_join(&threads[i]);
+    }
+    n_threads = 0;
+}
+
+/*
+ * Performance test.
+ */
+
+static void *rcu_read_perf_test(void *arg)
+{
+    int i;
+    long long n_reads_local = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    atomic_inc(&nthreadsrunning);
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        for (i = 0; i < RCU_READ_RUN; i++) {
+            rcu_read_lock();
+            rcu_read_unlock();
+        }
+        n_reads_local += RCU_READ_RUN;
+    }
+    atomic_add(&n_reads, n_reads_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_update_perf_test(void *arg)
+{
+    long long n_updates_local = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    atomic_inc(&nthreadsrunning);
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        synchronize_rcu();
+        n_updates_local++;
+    }
+    atomic_add(&n_updates, n_updates_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void perftestinit(void)
+{
+    nthreadsrunning = 0;
+}
+
+static void perftestrun(int nthreads, int duration, int nreaders, int nupdaters)
+{
+    while (atomic_read(&nthreadsrunning) < nthreads) {
+        g_usleep(1000);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    printf("n_reads: %lld  n_updates: %ld  nreaders: %d  nupdaters: %d duration: %d\n",
+           n_reads, n_updates, nreaders, nupdaters, duration);
+    printf("ns/read: %g  ns/update: %g\n",
+           ((duration * 1000*1000*1000.*(double)nreaders) /
+        (double)n_reads),
+           ((duration * 1000*1000*1000.*(double)nupdaters) /
+        (double)n_updates));
+    exit(0);
+}
+
+static void perftest(int nreaders, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_perf_test);
+    }
+    create_thread(rcu_update_perf_test);
+    perftestrun(i + 1, duration, nreaders, 1);
+}
+
+static void rperftest(int nreaders, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_perf_test);
+    }
+    perftestrun(i, duration, nreaders, 0);
+}
+
+static void uperftest(int nupdaters, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nupdaters; i++) {
+        create_thread(rcu_update_perf_test);
+    }
+    perftestrun(i, duration, 0, nupdaters);
+}
+
+/*
+ * Stress test.
+ */
+
+#define RCU_STRESS_PIPE_LEN 10
+
+struct rcu_stress {
+    int pipe_count;
+    int mbtest;
+};
+
+struct rcu_stress rcu_stress_array[RCU_STRESS_PIPE_LEN] = { { 0 } };
+struct rcu_stress *rcu_stress_current;
+int rcu_stress_idx;
+
+int n_mberror;
+long long rcu_stress_count[RCU_STRESS_PIPE_LEN + 1];
+
+
+static void *rcu_read_stress_test(void *arg)
+{
+    int i;
+    int itercnt = 0;
+    struct rcu_stress *p;
+    int pc;
+    long long n_reads_local = 0;
+    volatile int garbage = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        rcu_read_lock();
+        p = atomic_rcu_read(&rcu_stress_current);
+        if (p->mbtest == 0) {
+            n_mberror++;
+        }
+        rcu_read_lock();
+        for (i = 0; i < 100; i++) {
+            garbage++;
+        }
+        rcu_read_unlock();
+        pc = p->pipe_count;
+        rcu_read_unlock();
+        if ((pc > RCU_STRESS_PIPE_LEN) || (pc < 0)) {
+            pc = RCU_STRESS_PIPE_LEN;
+        }
+        atomic_inc(&rcu_stress_count[pc]);
+        n_reads_local++;
+        if ((++itercnt % 0x1000) == 0) {
+            synchronize_rcu();
+        }
+    }
+    atomic_add(&n_reads, n_reads_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_update_stress_test(void *arg)
+{
+    int i;
+    struct rcu_stress *p;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        i = rcu_stress_idx + 1;
+        if (i >= RCU_STRESS_PIPE_LEN) {
+            i = 0;
+        }
+        p = &rcu_stress_array[i];
+        p->mbtest = 0;
+        smp_mb();
+        p->pipe_count = 0;
+        p->mbtest = 1;
+        atomic_rcu_set(&rcu_stress_current, p);
+        rcu_stress_idx = i;
+        for (i = 0; i < RCU_STRESS_PIPE_LEN; i++) {
+            if (i != rcu_stress_idx) {
+                rcu_stress_array[i].pipe_count++;
+            }
+        }
+        synchronize_rcu();
+        n_updates++;
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_fake_update_stress_test(void *arg)
+{
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        synchronize_rcu();
+        g_usleep(1000);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void stresstest(int nreaders, int duration)
+{
+    int i;
+
+    rcu_stress_current = &rcu_stress_array[0];
+    rcu_stress_current->pipe_count = 0;
+    rcu_stress_current->mbtest = 1;
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_stress_test);
+    }
+    create_thread(rcu_update_stress_test);
+    for (i = 0; i < 5; i++) {
+        create_thread(rcu_fake_update_stress_test);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    printf("n_reads: %lld  n_updates: %ld  n_mberror: %d\n",
+           n_reads, n_updates, n_mberror);
+    printf("rcu_stress_count:");
+    for (i = 0; i <= RCU_STRESS_PIPE_LEN; i++) {
+        printf(" %lld", rcu_stress_count[i]);
+    }
+    printf("\n");
+    exit(0);
+}
+
+/* GTest interface */
+
+static void gtest_stress(int nreaders, int duration)
+{
+    int i;
+
+    rcu_stress_current = &rcu_stress_array[0];
+    rcu_stress_current->pipe_count = 0;
+    rcu_stress_current->mbtest = 1;
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_stress_test);
+    }
+    create_thread(rcu_update_stress_test);
+    for (i = 0; i < 5; i++) {
+        create_thread(rcu_fake_update_stress_test);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    g_assert_cmpint(n_mberror, ==, 0);
+    for (i = 2; i <= RCU_STRESS_PIPE_LEN; i++) {
+        g_assert_cmpint(rcu_stress_count[i], ==, 0);
+    }
+}
+
+static void gtest_stress_1_1(void)
+{
+    gtest_stress(1, 1);
+}
+
+static void gtest_stress_10_1(void)
+{
+    gtest_stress(10, 1);
+}
+
+static void gtest_stress_1_5(void)
+{
+    gtest_stress(1, 5);
+}
+
+static void gtest_stress_10_5(void)
+{
+    gtest_stress(10, 5);
+}
+
+/*
+ * Mainprogram.
+ */
+
+static void usage(int argc, char *argv[])
+{
+    fprintf(stderr, "Usage: %s [nreaders [ perf | stress ] ]\n", argv[0]);
+    exit(-1);
+}
+
+int main(int argc, char *argv[])
+{
+    int nreaders = 1;
+    int duration = 1;
+
+    if (argc >= 2 && argv[1][0] == '-') {
+        g_test_init(&argc, &argv, NULL);
+        if (g_test_quick()) {
+            g_test_add_func("/rcu/torture/1reader", gtest_stress_1_1);
+            g_test_add_func("/rcu/torture/10readers", gtest_stress_10_1);
+        } else {
+            g_test_add_func("/rcu/torture/1reader", gtest_stress_1_5);
+            g_test_add_func("/rcu/torture/10readers", gtest_stress_10_5);
+        }
+        return g_test_run();
+    }
+
+    if (argc >= 2) {
+        nreaders = strtoul(argv[1], NULL, 0);
+    }
+    if (argc > 3) {
+        duration = strtoul(argv[3], NULL, 0);
+    }
+    if (argc < 3 || strcmp(argv[2], "stress") == 0) {
+        stresstest(nreaders, duration);
+    } else if (strcmp(argv[2], "rperf") == 0) {
+        rperftest(nreaders, duration);
+    } else if (strcmp(argv[2], "uperf") == 0) {
+        uperftest(nreaders, duration);
+    } else if (strcmp(argv[2], "perf") == 0) {
+        perftest(nreaders, duration);
+    }
+    usage(argc, argv);
+    return 0;
+}
diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c
index 39b7b01..1d620e0 100644
--- a/tests/test-vmstate.c
+++ b/tests/test-vmstate.c
@@ -85,7 +85,7 @@
     QEMUFile *f = open_test_file(true);
 
     /* Save file with vmstate */
-    vmstate_save_state(f, desc, obj);
+    vmstate_save_state(f, desc, obj, NULL);
     qemu_put_byte(f, QEMU_VM_EOF);
     g_assert(!qemu_file_get_error(f));
     qemu_fclose(f);
@@ -394,7 +394,7 @@
     QEMUFile *fsave = qemu_bufopen("w", NULL);
     TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6,
                        .skip_c_e = false };
-    vmstate_save_state(fsave, &vmstate_skipping, &obj);
+    vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL);
     g_assert(!qemu_file_get_error(fsave));
 
     uint8_t expected[] = {
@@ -414,7 +414,7 @@
     QEMUFile *fsave = qemu_bufopen("w", NULL);
     TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6,
                        .skip_c_e = true };
-    vmstate_save_state(fsave, &vmstate_skipping, &obj);
+    vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL);
     g_assert(!qemu_file_get_error(fsave));
 
     uint8_t expected[] = {
diff --git a/trace-events b/trace-events
index 04f5df2..907da7e 100644
--- a/trace-events
+++ b/trace-events
@@ -1142,8 +1142,11 @@
 vmware_setmode(uint32_t w, uint32_t h, uint32_t bpp) "%dx%d @ %d bpp"
 
 # savevm.c
+qemu_loadvm_state_section(unsigned int section_type) "%d"
+qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
+qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
 savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
-savevm_section_end(const char *id, unsigned int section_id) "%s, section_id %u"
+savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
 savevm_state_begin(void) ""
 savevm_state_iterate(void) ""
 savevm_state_complete(void) ""
@@ -1154,6 +1157,12 @@
 
 # vmstate.c
 vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
+vmstate_load_state(const char *name, int version_id) "%s v%d"
+vmstate_load_state_end(const char *name, const char *reason, int val) "%s %s/%d"
+vmstate_load_state_field(const char *name, const char *field) "%s:%s"
+vmstate_subsection_load(const char *parent) "%s"
+vmstate_subsection_load_bad(const char *parent,  const char *sub) "%s: %s"
+vmstate_subsection_load_good(const char *parent) "%s"
 
 # qemu-file.c
 qemu_file_fclose(void) ""
@@ -1326,6 +1335,68 @@
 migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
 
+# migration/rdma.c
+__qemu_rdma_add_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
+__qemu_rdma_delete_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
+qemu_dma_accept_incoming_migration(void) ""
+qemu_dma_accept_incoming_migration_accepted(void) ""
+qemu_rdma_accept_pin_state(bool pin) "%d"
+qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p"
+qemu_rdma_block_for_wrid_miss(const char *wcompstr, int wcomp, const char *gcompstr, uint64_t req) "A Wanted wrid %s (%d) but got %s (%" PRIu64 ")"
+qemu_rdma_block_for_wrid_miss_b(const char *wcompstr, int wcomp, const char *gcompstr, uint64_t req) "B Wanted wrid %s (%d) but got %s (%" PRIu64 ")"
+qemu_rdma_cleanup_disconnect(void) ""
+qemu_rdma_cleanup_waiting_for_disconnect(void) ""
+qemu_rdma_close(void) ""
+qemu_rdma_connect_pin_all_requested(void) ""
+qemu_rdma_connect_pin_all_outcome(bool pin) "%d"
+qemu_rdma_dest_init_trying(const char *host, const char *ip) "%s => %s"
+qemu_rdma_dump_gid(const char *who, const char *src, const char *dst) "%s Source GID: %s, Dest GID: %s"
+qemu_rdma_exchange_get_response_start(const char *desc) "CONTROL: %s receiving..."
+qemu_rdma_exchange_get_response_none(const char *desc, int type) "Surprise: got %s (%d)"
+qemu_rdma_exchange_send_issue_callback(void) ""
+qemu_rdma_exchange_send_waiting(const char *desc) "Waiting for response %s"
+qemu_rdma_exchange_send_received(const char *desc) "Response %s received."
+qemu_rdma_fill(int64_t control_len, int size) "RDMA %" PRId64 " of %d bytes already in buffer"
+qemu_rdma_init_ram_blocks(int blocks) "Allocated %d local ram block structures"
+qemu_rdma_poll_recv(const char *compstr, int64_t comp, int64_t id, int sent) "completion %s #%" PRId64 " received (%" PRId64 ") left %d"
+qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block, uint64_t chunk, void *local, void *remote) "completions %s (%" PRId64 ") left %d, block %" PRIu64 ", chunk: %" PRIu64 " %p %p"
+qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
+qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
+qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
+qemu_rdma_registration_handle_finished(void) ""
+qemu_rdma_registration_handle_ram_blocks(void) ""
+qemu_rdma_registration_handle_register(int requests) "%d requests"
+qemu_rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64
+qemu_rdma_registration_handle_register_rkey(int rkey) "%x"
+qemu_rdma_registration_handle_unregister(int requests) "%d requests"
+qemu_rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64
+qemu_rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64
+qemu_rdma_registration_handle_wait(uint64_t flags) "Waiting for next request %" PRIu64
+qemu_rdma_registration_start(uint64_t flags) "%" PRIu64
+qemu_rdma_registration_stop(uint64_t flags) "%" PRIu64
+qemu_rdma_registration_stop_ram(void) ""
+qemu_rdma_resolve_host_trying(const char *host, const char *ip) "Trying %s => %s"
+qemu_rdma_signal_unregister_append(uint64_t chunk, int pos) "Appending unregister chunk %" PRIu64 " at position %d"
+qemu_rdma_signal_unregister_already(uint64_t chunk) "Unregister chunk %" PRIu64 " already in queue"
+qemu_rdma_unregister_waiting_inflight(uint64_t chunk) "Cannot unregister inflight chunk: %" PRIu64
+qemu_rdma_unregister_waiting_proc(uint64_t chunk, int pos) "Processing unregister for chunk: %" PRIu64 " at position %d"
+qemu_rdma_unregister_waiting_send(uint64_t chunk) "Sending unregister for chunk: %" PRIu64
+qemu_rdma_unregister_waiting_complete(uint64_t chunk) "Unregister for chunk: %" PRIu64 " complete."
+qemu_rdma_write_flush(int sent) "sent total: %d"
+qemu_rdma_write_one_block(int count, int block, uint64_t chunk, uint64_t current, uint64_t len, int nb_sent, int nb_chunks) "(%d) Not clobbering: block: %d chunk %" PRIu64 " current %" PRIu64 " len %" PRIu64 " %d %d"
+qemu_rdma_write_one_post(uint64_t chunk, long addr, long remote, uint32_t len) "Posting chunk: %" PRIu64 ", addr: %lx remote: %lx, bytes %" PRIu32
+qemu_rdma_write_one_queue_full(void) ""
+qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Received registration result: my key: %x their key %x, chunk %" PRIu64
+qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
+qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)"
+qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
+rdma_start_incoming_migration(void) ""
+rdma_start_incoming_migration_after_dest_init(void) ""
+rdma_start_incoming_migration_after_rdma_listen(void) ""
+rdma_start_outgoing_migration_after_rdma_connect(void) ""
+rdma_start_outgoing_migration_after_rdma_source_init(void) ""
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 93007e2..ceaba30 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -17,3 +17,4 @@
 util-obj-y += getauxval.o
 util-obj-y += readline.o
 util-obj-y += rfifolock.o
+util-obj-y += rcu.o
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index 41cb23d..50a29d8 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -307,11 +307,13 @@
 #else
 static inline void futex_wake(QemuEvent *ev, int n)
 {
+    pthread_mutex_lock(&ev->lock);
     if (n == 1) {
         pthread_cond_signal(&ev->cond);
     } else {
         pthread_cond_broadcast(&ev->cond);
     }
+    pthread_mutex_unlock(&ev->lock);
 }
 
 static inline void futex_wait(QemuEvent *ev, unsigned val)
diff --git a/util/rcu.c b/util/rcu.c
new file mode 100644
index 0000000..c9c3e6e
--- /dev/null
+++ b/util/rcu.c
@@ -0,0 +1,291 @@
+/*
+ * urcu-mb.c
+ *
+ * Userspace RCU library with explicit memory barriers
+ *
+ * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Ported to QEMU by Paolo Bonzini  <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include "qemu-common.h"
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include "qemu/rcu.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+
+/*
+ * Global grace period counter.  Bit 0 is always one in rcu_gp_ctr.
+ * Bits 1 and above are defined in synchronize_rcu.
+ */
+#define RCU_GP_LOCKED           (1UL << 0)
+#define RCU_GP_CTR              (1UL << 1)
+
+unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
+
+QemuEvent rcu_gp_event;
+static QemuMutex rcu_gp_lock;
+
+/*
+ * Check whether a quiescent state was crossed between the beginning of
+ * update_counter_and_wait and now.
+ */
+static inline int rcu_gp_ongoing(unsigned long *ctr)
+{
+    unsigned long v;
+
+    v = atomic_read(ctr);
+    return v && (v != rcu_gp_ctr);
+}
+
+/* Written to only by each individual reader. Read by both the reader and the
+ * writers.
+ */
+__thread struct rcu_reader_data rcu_reader;
+
+/* Protected by rcu_gp_lock.  */
+typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
+static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
+
+/* Wait for previous parity/grace period to be empty of readers.  */
+static void wait_for_readers(void)
+{
+    ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
+    struct rcu_reader_data *index, *tmp;
+
+    for (;;) {
+        /* We want to be notified of changes made to rcu_gp_ongoing
+         * while we walk the list.
+         */
+        qemu_event_reset(&rcu_gp_event);
+
+        /* Instead of using atomic_mb_set for index->waiting, and
+         * atomic_mb_read for index->ctr, memory barriers are placed
+         * manually since writes to different threads are independent.
+         * atomic_mb_set has a smp_wmb before...
+         */
+        smp_wmb();
+        QLIST_FOREACH(index, &registry, node) {
+            atomic_set(&index->waiting, true);
+        }
+
+        /* ... and a smp_mb after.  */
+        smp_mb();
+
+        QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
+            if (!rcu_gp_ongoing(&index->ctr)) {
+                QLIST_REMOVE(index, node);
+                QLIST_INSERT_HEAD(&qsreaders, index, node);
+
+                /* No need for mb_set here, worst of all we
+                 * get some extra futex wakeups.
+                 */
+                atomic_set(&index->waiting, false);
+            }
+        }
+
+        /* atomic_mb_read has smp_rmb after.  */
+        smp_rmb();
+
+        if (QLIST_EMPTY(&registry)) {
+            break;
+        }
+
+        /* Wait for one thread to report a quiescent state and
+         * try again.
+         */
+        qemu_event_wait(&rcu_gp_event);
+    }
+
+    /* put back the reader list in the registry */
+    QLIST_SWAP(&registry, &qsreaders, node);
+}
+
+void synchronize_rcu(void)
+{
+    qemu_mutex_lock(&rcu_gp_lock);
+
+    if (!QLIST_EMPTY(&registry)) {
+        /* In either case, the atomic_mb_set below blocks stores that free
+         * old RCU-protected pointers.
+         */
+        if (sizeof(rcu_gp_ctr) < 8) {
+            /* For architectures with 32-bit longs, a two-subphases algorithm
+             * ensures we do not encounter overflow bugs.
+             *
+             * Switch parity: 0 -> 1, 1 -> 0.
+             */
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+            wait_for_readers();
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+        } else {
+            /* Increment current grace period.  */
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+        }
+
+        wait_for_readers();
+    }
+
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+
+#define RCU_CALL_MIN_SIZE        30
+
+/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
+ * from liburcu.  Note that head is only used by the consumer.
+ */
+static struct rcu_head dummy;
+static struct rcu_head *head = &dummy, **tail = &dummy.next;
+static int rcu_call_count;
+static QemuEvent rcu_call_ready_event;
+
+static void enqueue(struct rcu_head *node)
+{
+    struct rcu_head **old_tail;
+
+    node->next = NULL;
+    old_tail = atomic_xchg(&tail, &node->next);
+    atomic_mb_set(old_tail, node);
+}
+
+static struct rcu_head *try_dequeue(void)
+{
+    struct rcu_head *node, *next;
+
+retry:
+    /* Test for an empty list, which we do not expect.  Note that for
+     * the consumer head and tail are always consistent.  The head
+     * is consistent because only the consumer reads/writes it.
+     * The tail, because it is the first step in the enqueuing.
+     * It is only the next pointers that might be inconsistent.
+     */
+    if (head == &dummy && atomic_mb_read(&tail) == &dummy.next) {
+        abort();
+    }
+
+    /* If the head node has NULL in its next pointer, the value is
+     * wrong and we need to wait until its enqueuer finishes the update.
+     */
+    node = head;
+    next = atomic_mb_read(&head->next);
+    if (!next) {
+        return NULL;
+    }
+
+    /* Since we are the sole consumer, and we excluded the empty case
+     * above, the queue will always have at least two nodes: the
+     * dummy node, and the one being removed.  So we do not need to update
+     * the tail pointer.
+     */
+    head = next;
+
+    /* If we dequeued the dummy node, add it back at the end and retry.  */
+    if (node == &dummy) {
+        enqueue(node);
+        goto retry;
+    }
+
+    return node;
+}
+
+static void *call_rcu_thread(void *opaque)
+{
+    struct rcu_head *node;
+
+    for (;;) {
+        int tries = 0;
+        int n = atomic_read(&rcu_call_count);
+
+        /* Heuristically wait for a decent number of callbacks to pile up.
+         * Fetch rcu_call_count now, we only must process elements that were
+         * added before synchronize_rcu() starts.
+         */
+        while (n < RCU_CALL_MIN_SIZE && ++tries <= 5) {
+            g_usleep(100000);
+            qemu_event_reset(&rcu_call_ready_event);
+            n = atomic_read(&rcu_call_count);
+            if (n < RCU_CALL_MIN_SIZE) {
+                qemu_event_wait(&rcu_call_ready_event);
+                n = atomic_read(&rcu_call_count);
+            }
+        }
+
+        atomic_sub(&rcu_call_count, n);
+        synchronize_rcu();
+        while (n > 0) {
+            node = try_dequeue();
+            while (!node) {
+                qemu_event_reset(&rcu_call_ready_event);
+                node = try_dequeue();
+                if (!node) {
+                    qemu_event_wait(&rcu_call_ready_event);
+                    node = try_dequeue();
+                }
+            }
+
+            n--;
+            node->func(node);
+        }
+    }
+    abort();
+}
+
+void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
+{
+    node->func = func;
+    enqueue(node);
+    atomic_inc(&rcu_call_count);
+    qemu_event_set(&rcu_call_ready_event);
+}
+
+void rcu_register_thread(void)
+{
+    assert(rcu_reader.ctr == 0);
+    qemu_mutex_lock(&rcu_gp_lock);
+    QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+void rcu_unregister_thread(void)
+{
+    qemu_mutex_lock(&rcu_gp_lock);
+    QLIST_REMOVE(&rcu_reader, node);
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+static void __attribute__((__constructor__)) rcu_init(void)
+{
+    QemuThread thread;
+
+    qemu_mutex_init(&rcu_gp_lock);
+    qemu_event_init(&rcu_gp_event, true);
+
+    qemu_event_init(&rcu_call_ready_event, false);
+    qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
+                       NULL, QEMU_THREAD_DETACHED);
+
+    rcu_register_thread();
+}