Merge remote-tracking branch 'bonzini/nbd-next' into staging

* bonzini/nbd-next:
  nbd: add nbd_export_get_blockdev
  nbd: negotiate with named exports
  nbd: register named exports
  qemu-nbd: rewrite termination conditions to use a state machine
  nbd: add notification for closing an NBDExport
  nbd: track clients into NBDExport
  nbd: add reference counting to NBDExport
  nbd: do not leak nbd_trip coroutines when a connection is torn down
  nbd: make refcount interface public
  nbd: do not close BlockDriverState in nbd_export_close
  nbd: pass NBDClient to nbd_send_negotiate
  nbd: add more constants
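The nbd patches above add reference counting, client tracking and a close notification to NBDExport. A minimal sketch of the get/put pattern such a change relies on; the type and field names here are illustrative stand-ins, not the actual nbd.c definitions:

#include <stdlib.h>

/* Illustrative stand-in for the real NBDExport; names are hypothetical. */
typedef struct Export {
    int refcount;                             /* owner plus attached clients */
    void (*close_notify)(struct Export *exp); /* fired when the export goes away */
} Export;

static Export *export_new(void (*close_notify)(Export *))
{
    Export *exp = calloc(1, sizeof(*exp));
    exp->refcount = 1;               /* the creator holds the first reference */
    exp->close_notify = close_notify;
    return exp;
}

static void export_get(Export *exp)
{
    exp->refcount++;                 /* e.g. taken when a client attaches */
}

static void export_put(Export *exp)
{
    if (--exp->refcount == 0) {      /* last holder dropped it */
        if (exp->close_notify) {
            exp->close_notify(exp);
        }
        free(exp);
    }
}

int main(void)
{
    Export *exp = export_new(NULL);
    export_get(exp);   /* a client attaches */
    export_put(exp);   /* the client disconnects */
    export_put(exp);   /* the owner drops it: refcount hits 0 and it is freed */
    return 0;
}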
diff --git a/Makefile b/Makefile
index 1cd5bc8..971e92f 100644
--- a/Makefile
+++ b/Makefile
@@ -157,7 +157,8 @@
 	iohandler.o cutils.o iov.o async.o
 tools-obj-$(CONFIG_POSIX) += compatfd.o
 
-qemu-img$(EXESUF): qemu-img.o $(tools-obj-y) $(block-obj-y)
+qemu-img$(EXESUF): qemu-img.o $(tools-obj-y) $(block-obj-y) $(qapi-obj-y) \
+                              qapi-visit.o qapi-types.o
 qemu-nbd$(EXESUF): qemu-nbd.o $(tools-obj-y) $(block-obj-y)
 qemu-io$(EXESUF): qemu-io.o cmd.o $(tools-obj-y) $(block-obj-y)
 
diff --git a/arch_init.c b/arch_init.c
index 5a1173e..f849f9b 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -562,7 +562,7 @@
         if ((i & 63) == 0) {
             uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
             if (t1 > MAX_WAIT) {
-                DPRINTF("big wait: " PRIu64 " milliseconds, %d iterations\n",
+                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
                 break;
             }
@@ -587,7 +587,7 @@
 
     expected_time = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
 
-    DPRINTF("ram_save_live: expected(" PRIu64 ") <= max(" PRIu64 ")?\n",
+    DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(%" PRIu64 ")?\n",
             expected_time, migrate_max_downtime());
 
     if (expected_time <= migrate_max_downtime()) {
@@ -799,8 +799,8 @@
     } while (!(flags & RAM_SAVE_FLAG_EOS));
 
 done:
-    DPRINTF("Completed load of VM with exit code %d seq iteration " PRIu64 "\n",
-            ret, seq_iter);
+    DPRINTF("Completed load of VM with exit code %d seq iteration "
+            "%" PRIu64 "\n", ret, seq_iter);
     return ret;
 }
 
@@ -922,11 +922,16 @@
     if (is_help_option(optarg)) {
     show_valid_cards:
 
+#ifdef HAS_AUDIO_CHOICE
         printf("Valid sound card names (comma separated):\n");
         for (c = soundhw; c->name; ++c) {
             printf ("%-11s %s\n", c->name, c->descr);
         }
         printf("\n-soundhw all will enable all of the above\n");
+#else
+        printf("Machine has no user-selectable audio hardware "
+               "(it may or may not have always-present audio hardware).\n");
+#endif
         exit(!is_help_option(optarg));
     }
     else {
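The arch_init.c hunks above fix a common mistake with the <inttypes.h> macros: PRIu64 expands to a conversion specifier without the leading '%', so it must be concatenated after a "%" inside the format string; without it, the format just prints the literal length modifier and the argument is ignored. A tiny standalone illustration:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t t1 = 1234567890123ULL;

    /* Wrong:  "big wait: " PRIu64 " ms"  -> prints the literal "lu"/"llu". */
    /* Right:  put the '%' in the string and let PRIu64 supply the rest.    */
    printf("big wait: %" PRIu64 " milliseconds\n", t1);
    return 0;
}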
diff --git a/audio/audio_template.h b/audio/audio_template.h
index 519432a..16f7880 100644
--- a/audio/audio_template.h
+++ b/audio/audio_template.h
@@ -410,15 +410,15 @@
     SW *old_sw = NULL;
 #endif
 
-    ldebug ("open %s, freq %d, nchannels %d, fmt %d\n",
-            name, as->freq, as->nchannels, as->fmt);
-
     if (audio_bug (AUDIO_FUNC, !card || !name || !callback_fn || !as)) {
         dolog ("card=%p name=%p callback_fn=%p as=%p\n",
                card, name, callback_fn, as);
         goto fail;
     }
 
+    ldebug ("open %s, freq %d, nchannels %d, fmt %d\n",
+            name, as->freq, as->nchannels, as->fmt);
+
     if (audio_bug (AUDIO_FUNC, audio_validate_settings (as))) {
         audio_print_settings (as);
         goto fail;
diff --git a/block.c b/block.c
index 470bdcc..e78039b 100644
--- a/block.c
+++ b/block.c
@@ -433,7 +433,11 @@
         return -EOVERFLOW;
     }
     fd = mkstemp(filename);
-    if (fd < 0 || close(fd)) {
+    if (fd < 0) {
+        return -errno;
+    }
+    if (close(fd) != 0) {
+        unlink(filename);
         return -errno;
     }
     return 0;
@@ -897,10 +901,10 @@
             bdrv_delete(bs->file);
             bs->file = NULL;
         }
-
-        bdrv_dev_change_media_cb(bs, false);
     }
 
+    bdrv_dev_change_media_cb(bs, false);
+
     /*throttling disk I/O limits*/
     if (bs->io_limits_enabled) {
         bdrv_io_limits_disable(bs);
diff --git a/block/curl.c b/block/curl.c
index e7c3634..c1074cd 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -542,8 +542,7 @@
     }
     if (s->multi)
         curl_multi_cleanup(s->multi);
-    if (s->url)
-        free(s->url);
+    g_free(s->url);
 }
 
 static int64_t curl_getlength(BlockDriverState *bs)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index df4f441..e0753ee 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1986,7 +1986,7 @@
         vdi_index = pos / SD_DATA_OBJ_SIZE;
         offset = pos % SD_DATA_OBJ_SIZE;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
 
         vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
 
@@ -2007,6 +2007,7 @@
         }
 
         pos += data_len;
+        data += data_len;
         remaining -= data_len;
     }
     ret = size;
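The sheepdog fix above is the classic chunked-copy bug: the first chunk may start at a non-zero offset inside an object, so each chunk must be capped at object_size - offset, and both the position and the source pointer have to advance. A generic sketch of the corrected loop, with OBJ_SIZE and the write callback as placeholders:

#include <stdint.h>
#include <stdio.h>

#define OBJ_SIZE 4194304u   /* hypothetical fixed object size (4 MiB) */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Write 'size' bytes starting at byte position 'pos', splitting the buffer
 * on object boundaries.  write_obj() stands in for the real I/O call. */
static int copy_in_chunks(const uint8_t *data, size_t size, uint64_t pos,
                          int (*write_obj)(uint64_t index, uint64_t offset,
                                           const uint8_t *buf, size_t len))
{
    size_t remaining = size;

    while (remaining > 0) {
        uint64_t index  = pos / OBJ_SIZE;
        uint64_t offset = pos % OBJ_SIZE;
        /* Cap at the space left in this object, not the whole object size. */
        size_t len = MIN(remaining, OBJ_SIZE - offset);

        int ret = write_obj(index, offset, data, len);
        if (ret < 0) {
            return ret;
        }
        pos       += len;
        data      += len;   /* advance the source pointer too */
        remaining -= len;
    }
    return 0;
}

static int write_obj(uint64_t index, uint64_t offset, const uint8_t *buf,
                     size_t len)
{
    (void)buf;
    printf("obj %llu: write %zu bytes at offset %llu\n",
           (unsigned long long)index, len, (unsigned long long)offset);
    return 0;
}

int main(void)
{
    static uint8_t data[6 * 1024 * 1024];   /* crosses one object boundary */
    return copy_in_chunks(data, sizeof(data), 3 * 1024 * 1024, write_obj);
}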
diff --git a/block/vdi.c b/block/vdi.c
index c4f1529..550cf58 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -628,7 +628,6 @@
     VdiHeader header;
     size_t i;
     size_t bmap_size;
-    uint32_t *bmap;
 
     logout("\n");
 
@@ -693,21 +692,21 @@
         result = -errno;
     }
 
-    bmap = NULL;
     if (bmap_size > 0) {
-        bmap = (uint32_t *)g_malloc0(bmap_size);
-    }
-    for (i = 0; i < blocks; i++) {
-        if (image_type == VDI_TYPE_STATIC) {
-            bmap[i] = i;
-        } else {
-            bmap[i] = VDI_UNALLOCATED;
+        uint32_t *bmap = g_malloc0(bmap_size);
+        for (i = 0; i < blocks; i++) {
+            if (image_type == VDI_TYPE_STATIC) {
+                bmap[i] = i;
+            } else {
+                bmap[i] = VDI_UNALLOCATED;
+            }
         }
+        if (write(fd, bmap, bmap_size) < 0) {
+            result = -errno;
+        }
+        g_free(bmap);
     }
-    if (write(fd, bmap, bmap_size) < 0) {
-        result = -errno;
-    }
-    g_free(bmap);
+
     if (image_type == VDI_TYPE_STATIC) {
         if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
             result = -errno;
diff --git a/configure b/configure
index 7e23309..1b86517 100755
--- a/configure
+++ b/configure
@@ -111,7 +111,6 @@
 cpu=""
 interp_prefix="/usr/gnemul/qemu-%M"
 static="no"
-sparc_cpu=""
 cross_prefix=""
 audio_drv_list=""
 audio_card_list="ac97 es1370 sb16 hda"
@@ -126,7 +125,7 @@
 libs_qga=""
 debug_info="yes"
 
-target_list="DEFAULT"
+target_list=""
 
 # Default value for a variable defining feature "foo".
 #  * foo="no"  feature will only be used if --enable-foo arg is given
@@ -241,21 +240,6 @@
   ;;
   --disable-debug-info) debug_info="no"
   ;;
-  --sparc_cpu=*)
-    sparc_cpu="$optarg"
-    case $sparc_cpu in
-    v7|v8|v8plus|v8plusa)
-      cpu="sparc"
-    ;;
-    v9)
-      cpu="sparc64"
-    ;;
-    *)
-      echo "undefined SPARC architecture. Exiting";
-      exit 1
-    ;;
-    esac
-  ;;
   esac
 done
 # OS specific
@@ -343,8 +327,6 @@
 elif check_define __x86_64__ ; then
   cpu="x86_64"
 elif check_define __sparc__ ; then
-  # We can't check for 64 bit (when gcc is biarch) or V8PLUSA
-  # They must be specified using --sparc_cpu
   if check_define __arch64__ ; then
     cpu="sparc64"
   else
@@ -792,8 +774,6 @@
   ;;
   --enable-uname-release=*) uname_release="$optarg"
   ;;
-  --sparc_cpu=*)
-  ;;
   --enable-werror) werror="yes"
   ;;
   --disable-werror) werror="no"
@@ -881,35 +861,17 @@
   esac
 done
 
-#
-# If cpu ~= sparc and  sparc_cpu hasn't been defined, plug in the right
-# QEMU_CFLAGS/LDFLAGS (assume sparc_v8plus for 32-bit and sparc_v9 for 64-bit)
-#
 host_guest_base="no"
 case "$cpu" in
-    sparc) case $sparc_cpu in
-           v7|v8)
-             QEMU_CFLAGS="-mcpu=${sparc_cpu} -D__sparc_${sparc_cpu}__ $QEMU_CFLAGS"
-           ;;
-           v8plus|v8plusa)
-             QEMU_CFLAGS="-mcpu=ultrasparc -D__sparc_${sparc_cpu}__ $QEMU_CFLAGS"
-           ;;
-           *) # sparc_cpu not defined in the command line
-             QEMU_CFLAGS="-mcpu=ultrasparc -D__sparc_v8plus__ $QEMU_CFLAGS"
-           esac
+    sparc)
            LDFLAGS="-m32 $LDFLAGS"
-           QEMU_CFLAGS="-m32 -ffixed-g2 -ffixed-g3 $QEMU_CFLAGS"
-           if test "$solaris" = "no" ; then
-             QEMU_CFLAGS="-ffixed-g1 -ffixed-g6 $QEMU_CFLAGS"
-           fi
+           QEMU_CFLAGS="-m32 -mcpu=ultrasparc $QEMU_CFLAGS"
+           host_guest_base="yes"
            ;;
     sparc64)
-           QEMU_CFLAGS="-m64 -mcpu=ultrasparc -D__sparc_v9__ $QEMU_CFLAGS"
            LDFLAGS="-m64 $LDFLAGS"
-           QEMU_CFLAGS="-ffixed-g5 -ffixed-g6 -ffixed-g7 $QEMU_CFLAGS"
-           if test "$solaris" != "no" ; then
-             QEMU_CFLAGS="-ffixed-g1 $QEMU_CFLAGS"
-           fi
+           QEMU_CFLAGS="-m64 -mcpu=ultrasparc $QEMU_CFLAGS"
+           host_guest_base="yes"
            ;;
     s390)
            QEMU_CFLAGS="-m31 -march=z990 $QEMU_CFLAGS"
@@ -1319,10 +1281,15 @@
   exit 1
 fi
 
-if test "$target_list" = "DEFAULT" ; then
-    target_list=`echo "$default_target_list" | sed -e 's/,/ /g'`
+if test -z "$target_list" ; then
+    target_list="$default_target_list"
+else
+    target_list=`echo "$target_list" | sed -e 's/,/ /g'`
 fi
-
+if test -z "$target_list" ; then
+    echo "No targets enabled"
+    exit 1
+fi
 # see if system emulation was really requested
 case " $target_list " in
   *"-softmmu "*) softmmu=yes
@@ -1428,10 +1395,10 @@
         LIBS=`$pkg_config --libs libseccomp`
 	seccomp="yes"
     else
-	seccomp="no"
 	if test "$seccomp" = "yes"; then
             feature_not_found "libseccomp"
 	fi
+	seccomp="no"
     fi
 fi
 ##########################################
@@ -2733,6 +2700,9 @@
     if $pkg_config --atleast-version=0.12.0 spice-protocol >/dev/null 2>&1; then
         spice_qxl_io_monitors_config_async="yes"
     fi
+    if $pkg_config --atleast-version=0.12.2 spice-protocol > /dev/null 2>&1; then
+        spice_qxl_client_monitors_config="yes"
+    fi
   else
     if test "$spice" = "yes" ; then
       feature_not_found "spice"
@@ -2787,7 +2757,7 @@
         usb_redir_cflags=$($pkg_config --cflags libusbredirparser 2>/dev/null)
         usb_redir_libs=$($pkg_config --libs libusbredirparser 2>/dev/null)
         QEMU_CFLAGS="$QEMU_CFLAGS $usb_redir_cflags"
-        LIBS="$LIBS $usb_redir_libs"
+        libs_softmmu="$libs_softmmu $usb_redir_libs"
     else
         if test "$usb_redir" = "yes"; then
             feature_not_found "usb-redir"
@@ -3480,6 +3450,10 @@
   echo "CONFIG_QXL_IO_MONITORS_CONFIG_ASYNC=y" >> $config_host_mak
 fi
 
+if test "$spice_qxl_client_monitors_config" = "yes" ; then
+  echo "CONFIG_QXL_CLIENT_MONITORS_CONFIG=y" >> $config_host_mak
+fi
+
 if test "$smartcard" = "yes" ; then
   echo "CONFIG_SMARTCARD=y" >> $config_host_mak
 fi
@@ -4119,10 +4093,6 @@
 
 if test "$target_linux_user" = "yes" -o "$target_bsd_user" = "yes" ; then
   case "$ARCH" in
-  sparc)
-    # -static is used to avoid g1/g3 usage by the dynamic linker
-    ldflags="$linker_script -static $ldflags"
-    ;;
   alpha | s390x)
     # The default placement of the application is fine.
     ;;
diff --git a/console.c b/console.c
index c1ed5e0..a8bcc42 100644
--- a/console.c
+++ b/console.c
@@ -1612,7 +1612,7 @@
     memset(&pf, 0x00, sizeof(PixelFormat));
 
     pf.bits_per_pixel = bpp;
-    pf.bytes_per_pixel = bpp / 8;
+    pf.bytes_per_pixel = DIV_ROUND_UP(bpp, 8);
     pf.depth = bpp == 32 ? 24 : bpp;
 
     switch (bpp) {
@@ -1661,13 +1661,12 @@
     memset(&pf, 0x00, sizeof(PixelFormat));
 
     pf.bits_per_pixel = bpp;
-    pf.bytes_per_pixel = bpp / 8;
+    pf.bytes_per_pixel = DIV_ROUND_UP(bpp, 8);
     pf.depth = bpp == 32 ? 24 : bpp;
 
     switch (bpp) {
         case 15:
             pf.bits_per_pixel = 16;
-            pf.bytes_per_pixel = 2;
             pf.rmask = 0x00007c00;
             pf.gmask = 0x000003E0;
             pf.bmask = 0x0000001F;
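The console.c change replaces "bpp / 8" with DIV_ROUND_UP(bpp, 8) so that 15 bpp maps to 2 bytes per pixel instead of 1, which also lets the 15-bit case drop its explicit bytes_per_pixel override. The macro is the usual ceiling-division idiom; a quick check of the values involved:

#include <assert.h>

/* Round an integer division up; matches the usual QEMU-style definition. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    assert(DIV_ROUND_UP(15, 8) == 2);   /* 15 bpp needs 2 bytes per pixel */
    assert(DIV_ROUND_UP(16, 8) == 2);
    assert(DIV_ROUND_UP(24, 8) == 3);
    assert(DIV_ROUND_UP(32, 8) == 4);
    return 0;
}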
diff --git a/disas.c b/disas.c
index 7b2acc9..b801c8f 100644
--- a/disas.c
+++ b/disas.c
@@ -316,9 +316,7 @@
     print_insn = print_insn_alpha;
 #elif defined(__sparc__)
     print_insn = print_insn_sparc;
-#if defined(__sparc_v8plus__) || defined(__sparc_v8plusa__) || defined(__sparc_v9__)
     disasm_info.mach = bfd_mach_sparc_v9b;
-#endif
 #elif defined(__arm__)
     print_insn = print_insn_arm;
 #elif defined(__MIPSEB__)
diff --git a/docs/specs/ppc-spapr-hcalls.txt b/docs/specs/ppc-spapr-hcalls.txt
index 52ba8d4..667b3fa 100644
--- a/docs/specs/ppc-spapr-hcalls.txt
+++ b/docs/specs/ppc-spapr-hcalls.txt
@@ -31,7 +31,7 @@
 
 Returns:
 
-  H_SUCCESS   : Successully called the RTAS function (RTAS result
+  H_SUCCESS   : Successfully called the RTAS function (RTAS result
                 will have been stored in the parameter block)
   H_PARAMETER : Unknown token
 
diff --git a/docs/usb2.txt b/docs/usb2.txt
index d17e3c0..43dacde 100644
--- a/docs/usb2.txt
+++ b/docs/usb2.txt
@@ -58,11 +58,11 @@
 xhci controller support
 -----------------------
 
-There also is xhci host controller support available.  It got alot
+There is also xhci host controller support available.  It got a lot
 less testing than ehci and there are a bunch of known limitations, so
 ehci may work better for you.  On the other hand the xhci hardware
 design is much more virtualization-friendly, thus xhci emulation uses
-less ressources (especially cpu).  If you wanna give xhci a try
+less resources (especially cpu).  If you want to give xhci a try
 use this to add the host controller ...
 
     qemu -device nec-usb-xhci,id=xhci
diff --git a/exec-all.h b/exec-all.h
index dba9609..6516da0 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -132,9 +132,10 @@
 #define CODE_GEN_AVG_BLOCK_SIZE 64
 #endif
 
-#if defined(_ARCH_PPC) || defined(__x86_64__) || defined(__arm__) || defined(__i386__)
-#define USE_DIRECT_JUMP
-#elif defined(CONFIG_TCG_INTERPRETER)
+#if defined(__arm__) || defined(_ARCH_PPC) \
+    || defined(__x86_64__) || defined(__i386__) \
+    || defined(__sparc__) \
+    || defined(CONFIG_TCG_INTERPRETER)
 #define USE_DIRECT_JUMP
 #endif
 
@@ -244,6 +245,8 @@
     __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" (_flg));
 #endif
 }
+#elif defined(__sparc__)
+void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
 #else
 #error tb_set_jmp_target1 is missing
 #endif
diff --git a/exec.c b/exec.c
index 5834766..bb6aa4a 100644
--- a/exec.c
+++ b/exec.c
@@ -86,7 +86,7 @@
 /* any access to the tbs or the page table must use this lock */
 spinlock_t tb_lock = SPIN_LOCK_UNLOCKED;
 
-#if defined(__arm__) || defined(__sparc_v9__)
+#if defined(__arm__) || defined(__sparc__)
 /* The prologue must be reachable with a direct jump. ARM and Sparc64
  have limited branch ranges (possibly also PPC) so place it in a
  section close to code segment. */
@@ -541,10 +541,9 @@
         /* Cannot map more than that */
         if (code_gen_buffer_size > (800 * 1024 * 1024))
             code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc_v9__)
+#elif defined(__sparc__) && HOST_LONG_BITS == 64
         // Map the buffer below 2G, so we can use direct calls and branches
-        flags |= MAP_FIXED;
-        start = (void *) 0x60000000UL;
+        start = (void *) 0x40000000UL;
         if (code_gen_buffer_size > (512 * 1024 * 1024))
             code_gen_buffer_size = (512 * 1024 * 1024);
 #elif defined(__arm__)
@@ -582,10 +581,9 @@
         /* Cannot map more than that */
         if (code_gen_buffer_size > (800 * 1024 * 1024))
             code_gen_buffer_size = (800 * 1024 * 1024);
-#elif defined(__sparc_v9__)
+#elif defined(__sparc__) && HOST_LONG_BITS == 64
         // Map the buffer below 2G, so we can use direct calls and branches
-        flags |= MAP_FIXED;
-        addr = (void *) 0x60000000UL;
+        addr = (void *) 0x40000000UL;
         if (code_gen_buffer_size > (512 * 1024 * 1024)) {
             code_gen_buffer_size = (512 * 1024 * 1024);
         }
@@ -2525,6 +2523,19 @@
     }
 }
 
+static int memory_try_enable_merging(void *addr, size_t len)
+{
+    QemuOpts *opts;
+
+    opts = qemu_opts_find(qemu_find_opts("machine"), 0);
+    if (opts && !qemu_opt_get_bool(opts, "mem-merge", true)) {
+        /* disabled by the user */
+        return 0;
+    }
+
+    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
+}
+
 ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                    MemoryRegion *mr)
 {
@@ -2544,7 +2555,7 @@
             new_block->host = file_ram_alloc(new_block, size, mem_path);
             if (!new_block->host) {
                 new_block->host = qemu_vmalloc(size);
-                qemu_madvise(new_block->host, size, QEMU_MADV_MERGEABLE);
+                memory_try_enable_merging(new_block->host, size);
             }
 #else
             fprintf(stderr, "-mem-path option unsupported\n");
@@ -2559,7 +2570,7 @@
             } else {
                 new_block->host = qemu_vmalloc(size);
             }
-            qemu_madvise(new_block->host, size, QEMU_MADV_MERGEABLE);
+            memory_try_enable_merging(new_block->host, size);
         }
     }
     new_block->length = size;
@@ -2689,7 +2700,7 @@
                             length, addr);
                     exit(1);
                 }
-                qemu_madvise(vaddr, length, QEMU_MADV_MERGEABLE);
+                memory_try_enable_merging(vaddr, length);
                 qemu_ram_setup_dump(vaddr, length);
             }
             return;
@@ -3523,6 +3534,13 @@
             /* ROM/RAM case */
             ptr = qemu_get_ram_ptr(addr1);
             memcpy(ptr, buf, l);
+            if (!cpu_physical_memory_is_dirty(addr1)) {
+                /* invalidate code */
+                tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
+                /* set dirty bit */
+                cpu_physical_memory_set_dirty_flags(
+                    addr1, (0xff & ~CODE_DIRTY_FLAG));
+            }
             qemu_put_ram_ptr(ptr);
         }
         len -= l;
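memory_try_enable_merging() above makes the KSM madvise call conditional on the machine's mem-merge option. The underlying hint is Linux's MADV_MERGEABLE; a bare-bones illustration outside QEMU, assuming a Linux host (the call is harmless if the KSM thread is not running and typically fails with EINVAL on kernels built without KSM support):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;
    void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Ask the kernel's KSM scanner to consider these pages for merging. */
    if (madvise(addr, len, MADV_MERGEABLE) != 0) {
        perror("madvise(MADV_MERGEABLE)");
    }

    munmap(addr, len);
    return 0;
}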
diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index 4902450..a1d489e 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -41,6 +41,13 @@
 #define SNAN_BIT_IS_ONE		0
 #endif
 
+#if defined(TARGET_XTENSA)
+/* Define for architectures which deviate from IEEE in not supporting
+ * signaling NaNs (so all NaNs are treated as quiet).
+ */
+#define NO_SIGNALING_NANS 1
+#endif
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated half-precision NaN.
 *----------------------------------------------------------------------------*/
@@ -57,7 +64,8 @@
 *----------------------------------------------------------------------------*/
 #if defined(TARGET_SPARC)
 const float32 float32_default_nan = const_float32(0x7FFFFFFF);
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
+#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) || \
+      defined(TARGET_XTENSA)
 const float32 float32_default_nan = const_float32(0x7FC00000);
 #elif SNAN_BIT_IS_ONE
 const float32 float32_default_nan = const_float32(0x7FBFFFFF);
@@ -127,6 +135,17 @@
     uint64_t high, low;
 } commonNaNT;
 
+#ifdef NO_SIGNALING_NANS
+int float16_is_quiet_nan(float16 a_)
+{
+    return float16_is_any_nan(a_);
+}
+
+int float16_is_signaling_nan(float16 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the half-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -156,6 +175,7 @@
     return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x1FF);
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the half-precision floating point value `a' is a
@@ -217,6 +237,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float32_is_quiet_nan(float32 a_)
+{
+    return float32_is_any_nan(a_);
+}
+
+int float32_is_signaling_nan(float32 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the single-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -246,6 +277,7 @@
     return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the single-precision floating point value `a' is a
@@ -372,7 +404,7 @@
         return 1;
     }
 }
-#elif defined(TARGET_PPC)
+#elif defined(TARGET_PPC) || defined(TARGET_XTENSA)
 static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
                    flag aIsLargerSignificand)
 {
@@ -586,6 +618,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float64_is_quiet_nan(float64 a_)
+{
+    return float64_is_any_nan(a_);
+}
+
+int float64_is_signaling_nan(float64 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the double-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -619,6 +662,7 @@
         && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the double-precision floating point value `a' is a
@@ -773,6 +817,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int floatx80_is_quiet_nan(floatx80 a_)
+{
+    return floatx80_is_any_nan(a_);
+}
+
+int floatx80_is_signaling_nan(floatx80 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the extended double-precision floating-point value `a' is a
 | quiet NaN; otherwise returns 0. This slightly differs from the same
@@ -816,6 +871,7 @@
         && ( a.low == aLow );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the extended double-precision floating point value
@@ -929,6 +985,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float128_is_quiet_nan(float128 a_)
+{
+    return float128_is_any_nan(a_);
+}
+
+int float128_is_signaling_nan(float128 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the quadruple-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -964,6 +1031,7 @@
         && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the quadruple-precision floating point value `a' is
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..d8999b3 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -219,7 +219,7 @@
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
-    float_muladd_negate_result = 3,
+    float_muladd_negate_result = 4,
 };
 
 /*----------------------------------------------------------------------------
@@ -251,6 +251,11 @@
 int float16_is_signaling_nan( float16 );
 float16 float16_maybe_silence_nan( float16 );
 
+INLINE int float16_is_any_nan(float16 a)
+{
+    return ((float16_val(a) & ~0x8000) > 0x7c00);
+}
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated half-precision NaN.
 *----------------------------------------------------------------------------*/
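The softfloat.h hunk changes float_muladd_negate_result from 3 to 4: these values are OR-able flags, so each must occupy a distinct bit, and 3 would have aliased negate_c | negate_product. The general pattern:

#include <assert.h>

/* Flags meant to be combined with '|' must be distinct powers of two. */
enum {
    flag_negate_c       = 1,   /* bit 0 */
    flag_negate_product = 2,   /* bit 1 */
    flag_negate_result  = 4,   /* bit 2 -- a value of 3 would equal c|product */
};

int main(void)
{
    int flags = flag_negate_c | flag_negate_product;
    assert(flags & flag_negate_c);
    assert(flags & flag_negate_product);
    assert(!(flags & flag_negate_result));   /* only holds with the value 4 */
    return 0;
}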
diff --git a/gdbstub.c b/gdbstub.c
index 5d37dd9..d02ec75 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -1226,33 +1226,48 @@
 
 static int cpu_gdb_read_register(CPUSH4State *env, uint8_t *mem_buf, int n)
 {
-    if (n < 8) {
+    switch (n) {
+    case 0 ... 7:
         if ((env->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB)) {
             GET_REGL(env->gregs[n + 16]);
         } else {
             GET_REGL(env->gregs[n]);
         }
-    } else if (n < 16) {
+    case 8 ... 15:
         GET_REGL(env->gregs[n]);
-    } else if (n >= 25 && n < 41) {
-	GET_REGL(env->fregs[(n - 25) + ((env->fpscr & FPSCR_FR) ? 16 : 0)]);
-    } else if (n >= 43 && n < 51) {
-	GET_REGL(env->gregs[n - 43]);
-    } else if (n >= 51 && n < 59) {
-	GET_REGL(env->gregs[n - (51 - 16)]);
-    }
-    switch (n) {
-    case 16: GET_REGL(env->pc);
-    case 17: GET_REGL(env->pr);
-    case 18: GET_REGL(env->gbr);
-    case 19: GET_REGL(env->vbr);
-    case 20: GET_REGL(env->mach);
-    case 21: GET_REGL(env->macl);
-    case 22: GET_REGL(env->sr);
-    case 23: GET_REGL(env->fpul);
-    case 24: GET_REGL(env->fpscr);
-    case 41: GET_REGL(env->ssr);
-    case 42: GET_REGL(env->spc);
+    case 16:
+        GET_REGL(env->pc);
+    case 17:
+        GET_REGL(env->pr);
+    case 18:
+        GET_REGL(env->gbr);
+    case 19:
+        GET_REGL(env->vbr);
+    case 20:
+        GET_REGL(env->mach);
+    case 21:
+        GET_REGL(env->macl);
+    case 22:
+        GET_REGL(env->sr);
+    case 23:
+        GET_REGL(env->fpul);
+    case 24:
+        GET_REGL(env->fpscr);
+    case 25 ... 40:
+        if (env->fpscr & FPSCR_FR) {
+            stfl_p(mem_buf, env->fregs[n - 9]);
+        } else {
+            stfl_p(mem_buf, env->fregs[n - 25]);
+        }
+        return 4;
+    case 41:
+        GET_REGL(env->ssr);
+    case 42:
+        GET_REGL(env->spc);
+    case 43 ... 50:
+        GET_REGL(env->gregs[n - 43]);
+    case 51 ... 58:
+        GET_REGL(env->gregs[n - (51 - 16)]);
     }
 
     return 0;
@@ -1260,42 +1275,63 @@
 
 static int cpu_gdb_write_register(CPUSH4State *env, uint8_t *mem_buf, int n)
 {
-    uint32_t tmp;
-
-    tmp = ldl_p(mem_buf);
-
-    if (n < 8) {
-        if ((env->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB)) {
-            env->gregs[n + 16] = tmp;
-        } else {
-            env->gregs[n] = tmp;
-        }
-	return 4;
-    } else if (n < 16) {
-        env->gregs[n] = tmp;
-	return 4;
-    } else if (n >= 25 && n < 41) {
-	env->fregs[(n - 25) + ((env->fpscr & FPSCR_FR) ? 16 : 0)] = tmp;
-	return 4;
-    } else if (n >= 43 && n < 51) {
-	env->gregs[n - 43] = tmp;
-	return 4;
-    } else if (n >= 51 && n < 59) {
-	env->gregs[n - (51 - 16)] = tmp;
-	return 4;
-    }
     switch (n) {
-    case 16: env->pc = tmp; break;
-    case 17: env->pr = tmp; break;
-    case 18: env->gbr = tmp; break;
-    case 19: env->vbr = tmp; break;
-    case 20: env->mach = tmp; break;
-    case 21: env->macl = tmp; break;
-    case 22: env->sr = tmp; break;
-    case 23: env->fpul = tmp; break;
-    case 24: env->fpscr = tmp; break;
-    case 41: env->ssr = tmp; break;
-    case 42: env->spc = tmp; break;
+    case 0 ... 7:
+        if ((env->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB)) {
+            env->gregs[n + 16] = ldl_p(mem_buf);
+        } else {
+            env->gregs[n] = ldl_p(mem_buf);
+        }
+        break;
+    case 8 ... 15:
+        env->gregs[n] = ldl_p(mem_buf);
+        break;
+    case 16:
+        env->pc = ldl_p(mem_buf);
+        break;
+    case 17:
+        env->pr = ldl_p(mem_buf);
+        break;
+    case 18:
+        env->gbr = ldl_p(mem_buf);
+        break;
+    case 19:
+        env->vbr = ldl_p(mem_buf);
+        break;
+    case 20:
+        env->mach = ldl_p(mem_buf);
+        break;
+    case 21:
+        env->macl = ldl_p(mem_buf);
+        break;
+    case 22:
+        env->sr = ldl_p(mem_buf);
+        break;
+    case 23:
+        env->fpul = ldl_p(mem_buf);
+        break;
+    case 24:
+        env->fpscr = ldl_p(mem_buf);
+        break;
+    case 25 ... 40:
+        if (env->fpscr & FPSCR_FR) {
+            env->fregs[n - 9] = ldfl_p(mem_buf);
+        } else {
+            env->fregs[n - 25] = ldfl_p(mem_buf);
+        }
+        break;
+    case 41:
+        env->ssr = ldl_p(mem_buf);
+        break;
+    case 42:
+        env->spc = ldl_p(mem_buf);
+        break;
+    case 43 ... 50:
+        env->gregs[n - 43] = ldl_p(mem_buf);
+        break;
+    case 51 ... 58:
+        env->gregs[n - (51 - 16)] = ldl_p(mem_buf);
+        break;
     default: return 0;
     }
 
@@ -1660,6 +1696,10 @@
         GET_REG32(env->uregs[reg->targno & 0xff]);
         break;
 
+    case 4: /*f*/
+        GET_REG32(float32_val(env->fregs[reg->targno & 0x0f]));
+        break;
+
     case 8: /*a*/
         GET_REG32(env->regs[reg->targno & 0x0f]);
         break;
@@ -1700,6 +1740,10 @@
         env->uregs[reg->targno & 0xff] = tmp;
         break;
 
+    case 4: /*f*/
+        env->fregs[reg->targno & 0x0f] = make_float32(tmp);
+        break;
+
     case 8: /*a*/
         env->regs[reg->targno & 0x0f] = tmp;
         break;
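The gdbstub.c rewrite folds the if/else ladders into a single switch using "case 0 ... 7:" ranges, a GCC/Clang extension (not ISO C). A compact illustration of the construct with a hypothetical register layout:

#include <stdio.h>

/* Map a debugger register number onto a register bank name,
 * using the GNU C case-range extension. */
static const char *reg_bank(int n)
{
    switch (n) {
    case 0 ... 15:
        return "general";
    case 16 ... 24:
        return "control";
    case 25 ... 40:
        return "floating-point";
    default:
        return "unknown";
    }
}

int main(void)
{
    printf("reg 3  -> %s\n", reg_bank(3));
    printf("reg 30 -> %s\n", reg_bank(30));
    printf("reg 99 -> %s\n", reg_bank(99));
    return 0;
}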
diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c
index e8dcc6b..9a0a565 100644
--- a/hw/cirrus_vga.c
+++ b/hw/cirrus_vga.c
@@ -2441,6 +2441,8 @@
     VGACommonState *s = &c->vga;
     int val, index;
 
+    qemu_flush_coalesced_mmio_buffer();
+
     if (vga_ioport_invalid(s, addr)) {
 	val = 0xff;
     } else {
@@ -2534,6 +2536,8 @@
     VGACommonState *s = &c->vga;
     int index;
 
+    qemu_flush_coalesced_mmio_buffer();
+
     /* check port range access depending on color/monochrome mode */
     if (vga_ioport_invalid(s, addr)) {
 	return;
@@ -2854,6 +2858,7 @@
     /* I/O handler for LFB */
     memory_region_init_io(&s->cirrus_linear_io, &cirrus_linear_io_ops, s,
                           "cirrus-linear-io", VGA_RAM_SIZE);
+    memory_region_set_flush_coalesced(&s->cirrus_linear_io);
 
     /* I/O handler for LFB */
     memory_region_init_io(&s->cirrus_linear_bitblt_io,
@@ -2861,10 +2866,12 @@
                           s,
                           "cirrus-bitblt-mmio",
                           0x400000);
+    memory_region_set_flush_coalesced(&s->cirrus_linear_bitblt_io);
 
     /* I/O handler for memory-mapped I/O */
     memory_region_init_io(&s->cirrus_mmio_io, &cirrus_mmio_io_ops, s,
                           "cirrus-mmio", CIRRUS_PNPMMIO_SIZE);
+    memory_region_set_flush_coalesced(&s->cirrus_mmio_io);
 
     s->real_vram_size =
         (s->device_id == CIRRUS_ID_CLGD5446) ? 4096 * 1024 : 2048 * 1024;
diff --git a/hw/e1000.c b/hw/e1000.c
index ae8a6c5..ec3a7c4 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -295,6 +295,7 @@
     s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
     DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
            s->mac_reg[RCTL]);
+    qemu_flush_queued_packets(&s->nic->nc);
 }
 
 static void
@@ -926,6 +927,9 @@
 {
     s->check_rxov = 0;
     s->mac_reg[index] = val & 0xffff;
+    if (e1000_has_rxbufs(s, 1)) {
+        qemu_flush_queued_packets(&s->nic->nc);
+    }
 }
 
 static void
diff --git a/hw/eepro100.c b/hw/eepro100.c
index 50d117e..5b23116 100644
--- a/hw/eepro100.c
+++ b/hw/eepro100.c
@@ -1036,6 +1036,7 @@
         }
         set_ru_state(s, ru_ready);
         s->ru_offset = e100_read_reg4(s, SCBPointer);
+        qemu_flush_queued_packets(&s->nic->nc);
         TRACE(OTHER, logout("val=0x%02x (rx start)\n", val));
         break;
     case RX_RESUME:
@@ -1770,7 +1771,8 @@
     if (rfd_command & COMMAND_EL) {
         /* EL bit is set, so this was the last frame. */
         logout("receive: Running out of frames\n");
-        set_ru_state(s, ru_suspended);
+        set_ru_state(s, ru_no_resources);
+        eepro100_rnr_interrupt(s);
     }
     if (rfd_command & COMMAND_S) {
         /* S bit is set. */
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 5ea3cad..68671bc 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1175,7 +1175,6 @@
         ad->port_no = i;
         ad->port.dma = &ad->dma;
         ad->port.dma->ops = &ahci_dma_ops;
-        ad->port_regs.cmd = PORT_CMD_SPIN_UP | PORT_CMD_POWER_ON;
     }
 }
 
@@ -1199,6 +1198,7 @@
         pr->irq_stat = 0;
         pr->irq_mask = 0;
         pr->scr_ctl = 0;
+        pr->cmd = PORT_CMD_SPIN_UP | PORT_CMD_POWER_ON;
         ahci_reset_port(s, i);
     }
 }
diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index f7f714c..685cbaa 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -875,6 +875,12 @@
     int sense;
     bool start = buf[4] & 1;
     bool loej = buf[4] & 2;     /* load on start, eject on !start */
+    int pwrcnd = buf[4] & 0xf0;
+
+    if (pwrcnd) {
+        /* eject/load only happens for power condition == 0 */
+        return;
+    }
 
     if (loej) {
         if (!start && !s->tray_open && s->tray_locked) {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index d65ef3d..d6fb69c 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -53,8 +53,6 @@
     { 0x0c, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
     /* airflow-temperature-celsius */
     { 190,  0x03, 0x00, 0x45, 0x45, 0x1f, 0x00, 0x1f, 0x1f, 0x00, 0x00, 0x32},
-    /* end of list */
-    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 };
 
 static int ide_handle_rw_error(IDEState *s, int error, int op);
@@ -1468,9 +1466,7 @@
 	case SMART_READ_THRESH:
 		memset(s->io_buffer, 0, 0x200);
 		s->io_buffer[0] = 0x01; /* smart struct version */
-		for (n=0; n<30; n++) {
-		if (smart_attributes[n][0] == 0)
-			break;
+		for (n = 0; n < ARRAY_SIZE(smart_attributes); n++) {
 		s->io_buffer[2+0+(n*12)] = smart_attributes[n][0];
 		s->io_buffer[2+1+(n*12)] = smart_attributes[n][11];
 		}
@@ -1484,10 +1480,7 @@
 	case SMART_READ_DATA:
 		memset(s->io_buffer, 0, 0x200);
 		s->io_buffer[0] = 0x01; /* smart struct version */
-		for (n=0; n<30; n++) {
-		    if (smart_attributes[n][0] == 0) {
-			break;
-		    }
+		for (n = 0; n < ARRAY_SIZE(smart_attributes); n++) {
 		    int i;
 		    for(i = 0; i < 11; i++) {
 			s->io_buffer[2+i+(n*12)] = smart_attributes[n][i];
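The ide/core.c change drops the all-zero terminator row and bounds the loops with ARRAY_SIZE instead of scanning for a sentinel, so the table and the loops can never get out of sync. ARRAY_SIZE is the standard element-count macro; a small self-contained version of the same loop shape:

#include <stdint.h>
#include <stdio.h>

/* Number of elements in a statically sized array (QEMU has the same macro). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

static const uint8_t smart_rows[][3] = {
    { 0x05, 0x03, 0x00 },
    { 0x0c, 0x03, 0x00 },
    { 190,  0x03, 0x00 },
    /* no terminator row needed */
};

int main(void)
{
    for (size_t n = 0; n < ARRAY_SIZE(smart_rows); n++) {
        printf("attribute id 0x%02x\n", smart_rows[n][0]);
    }
    return 0;
}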
diff --git a/hw/imx_avic.c b/hw/imx_avic.c
index 4f010e8..b1a8fe6 100644
--- a/hw/imx_avic.c
+++ b/hw/imx_avic.c
@@ -6,9 +6,9 @@
  *
  * Copyright (c) 2008 OKL
  * Copyright (c) 2011 NICTA Pty Ltd
- * Originally Written by Hans Jiang
+ * Originally written by Hans Jiang
  *
- * This code is licenced under the GPL version 2 or later.  See
+ * This code is licensed under the GPL version 2 or later.  See
  * the COPYING file in the top-level directory.
  *
  * TODO: implement vectors.
diff --git a/hw/imx_timer.c b/hw/imx_timer.c
index 16215cc..c28c537 100644
--- a/hw/imx_timer.c
+++ b/hw/imx_timer.c
@@ -3,10 +3,10 @@
  *
  * Copyright (c) 2008 OK Labs
  * Copyright (c) 2011 NICTA Pty Ltd
- * Originally Written by Hans Jiang
+ * Originally written by Hans Jiang
  * Updated by Peter Chubb
  *
- * This code is licenced under GPL version 2 or later.  See
+ * This code is licensed under GPL version 2 or later.  See
  * the COPYING file in the top-level directory.
  *
  */
diff --git a/hw/kzm.c b/hw/kzm.c
index 6a5e9df..68cd1b4 100644
--- a/hw/kzm.c
+++ b/hw/kzm.c
@@ -5,7 +5,7 @@
  * Written by Hans at OK-Labs
  * Updated by Peter Chubb.
  *
- * This code is licenced under the GPL, version 2 or later.
+ * This code is licensed under the GPL, version 2 or later.
  * See the file `COPYING' in the top level directory.
  *
  * It (partially) emulates a Kyoto Microcomputer
diff --git a/hw/pc.c b/hw/pc.c
index 112739a..7e7e0e2 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -53,9 +53,6 @@
 #include "bitmap.h"
 #include "vga-pci.h"
 
-/* output Bochs bios info messages */
-//#define DEBUG_BIOS
-
 /* debug PC/ISA interrupts */
 //#define DEBUG_IRQ
 
@@ -534,17 +531,6 @@
     static int shutdown_index = 0;
 
     switch(addr) {
-        /* Bochs BIOS messages */
-    case 0x400:
-    case 0x401:
-        /* used to be panic, now unused */
-        break;
-    case 0x402:
-    case 0x403:
-#ifdef DEBUG_BIOS
-        fprintf(stderr, "%c", val);
-#endif
-        break;
     case 0x8900:
         /* same as Bochs power off */
         if (val == shutdown_str[shutdown_index]) {
@@ -558,16 +544,9 @@
         }
         break;
 
-        /* LGPL'ed VGA BIOS messages */
     case 0x501:
     case 0x502:
         exit((val << 1) | 1);
-    case 0x500:
-    case 0x503:
-#ifdef DEBUG_BIOS
-        fprintf(stderr, "%c", val);
-#endif
-        break;
     }
 }
 
@@ -596,17 +575,11 @@
     uint64_t *numa_fw_cfg;
     int i, j;
 
-    register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
-    register_ioport_write(0x401, 1, 2, bochs_bios_write, NULL);
-    register_ioport_write(0x402, 1, 1, bochs_bios_write, NULL);
-    register_ioport_write(0x403, 1, 1, bochs_bios_write, NULL);
     register_ioport_write(0x8900, 1, 1, bochs_bios_write, NULL);
 
     register_ioport_write(0x501, 1, 1, bochs_bios_write, NULL);
     register_ioport_write(0x501, 1, 2, bochs_bios_write, NULL);
     register_ioport_write(0x502, 1, 2, bochs_bios_write, NULL);
-    register_ioport_write(0x500, 1, 1, bochs_bios_write, NULL);
-    register_ioport_write(0x503, 1, 1, bochs_bios_write, NULL);
 
     fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0);
 
diff --git a/hw/pflash_cfi01.c b/hw/pflash_cfi01.c
index d1c7423..9c42d31 100644
--- a/hw/pflash_cfi01.c
+++ b/hw/pflash_cfi01.c
@@ -41,6 +41,7 @@
 #include "block.h"
 #include "qemu-timer.h"
 #include "exec-memory.h"
+#include "host-utils.h"
 
 #define PFLASH_BUG(fmt, ...) \
 do { \
@@ -543,42 +544,6 @@
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-/* Count trailing zeroes of a 32 bits quantity */
-static int ctz32 (uint32_t n)
-{
-    int ret;
-
-    ret = 0;
-    if (!(n & 0xFFFF)) {
-        ret += 16;
-        n = n >> 16;
-    }
-    if (!(n & 0xFF)) {
-        ret += 8;
-        n = n >> 8;
-    }
-    if (!(n & 0xF)) {
-        ret += 4;
-        n = n >> 4;
-    }
-    if (!(n & 0x3)) {
-        ret += 2;
-        n = n >> 2;
-    }
-    if (!(n & 0x1)) {
-        ret++;
-#if 0 /* This is not necessary as n is never 0 */
-        n = n >> 1;
-#endif
-    }
-#if 0 /* This is not necessary as n is never 0 */
-    if (!n)
-        ret++;
-#endif
-
-    return ret;
-}
-
 pflash_t *pflash_cfi01_register(target_phys_addr_t base,
                                 DeviceState *qdev, const char *name,
                                 target_phys_addr_t size,
@@ -711,7 +676,7 @@
     pfl->cfi_table[0x33] = 'I';
 
     pfl->cfi_table[0x34] = '1';
-    pfl->cfi_table[0x35] = '1';
+    pfl->cfi_table[0x35] = '0';
 
     pfl->cfi_table[0x36] = 0x00;
     pfl->cfi_table[0x37] = 0x00;
@@ -723,6 +688,8 @@
     pfl->cfi_table[0x3b] = 0x00;
     pfl->cfi_table[0x3c] = 0x00;
 
+    pfl->cfi_table[0x3f] = 0x01; /* Number of protection fields */
+
     return pfl;
 }
 
diff --git a/hw/pflash_cfi02.c b/hw/pflash_cfi02.c
index 3e2002e..8cb1549 100644
--- a/hw/pflash_cfi02.c
+++ b/hw/pflash_cfi02.c
@@ -40,6 +40,7 @@
 #include "qemu-timer.h"
 #include "block.h"
 #include "exec-memory.h"
+#include "host-utils.h"
 
 //#define PFLASH_DEBUG
 #ifdef PFLASH_DEBUG
@@ -575,42 +576,6 @@
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-/* Count trailing zeroes of a 32 bits quantity */
-static int ctz32 (uint32_t n)
-{
-    int ret;
-
-    ret = 0;
-    if (!(n & 0xFFFF)) {
-        ret += 16;
-        n = n >> 16;
-    }
-    if (!(n & 0xFF)) {
-        ret += 8;
-        n = n >> 8;
-    }
-    if (!(n & 0xF)) {
-        ret += 4;
-        n = n >> 4;
-    }
-    if (!(n & 0x3)) {
-        ret += 2;
-        n = n >> 2;
-    }
-    if (!(n & 0x1)) {
-        ret++;
-#if 0 /* This is not necessary as n is never 0 */
-        n = n >> 1;
-#endif
-    }
-#if 0 /* This is not necessary as n is never 0 */
-    if (!n)
-        ret++;
-#endif
-
-    return ret;
-}
-
 pflash_t *pflash_cfi02_register(target_phys_addr_t base,
                                 DeviceState *qdev, const char *name,
                                 target_phys_addr_t size,
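Both pflash models above drop their private ctz32() copies in favour of the shared helper pulled in via host-utils.h. On GCC/Clang the same count comes from the __builtin_ctz() intrinsic, which is undefined for 0, so callers must guarantee a non-zero argument. A hypothetical wrapper, not QEMU's actual host-utils implementation:

#include <assert.h>
#include <stdint.h>

/* Count trailing zero bits of a non-zero 32-bit value. */
static int ctz32(uint32_t n)
{
    assert(n != 0);             /* __builtin_ctz(0) is undefined behaviour */
    return __builtin_ctz(n);
}

int main(void)
{
    assert(ctz32(0x00010000u) == 16);
    assert(ctz32(0x00000001u) == 0);
    assert(ctz32(0x80000000u) == 31);
    return 0;
}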
diff --git a/hw/qxl.c b/hw/qxl.c
index 5b3f484..33169f3 100644
--- a/hw/qxl.c
+++ b/hw/qxl.c
@@ -18,6 +18,8 @@
  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <zlib.h>
+
 #include "qemu-common.h"
 #include "qemu-timer.h"
 #include "qemu-queue.h"
@@ -141,6 +143,7 @@
 
 void qxl_set_guest_bug(PCIQXLDevice *qxl, const char *msg, ...)
 {
+    trace_qxl_set_guest_bug(qxl->id);
     qxl_send_events(qxl, QXL_INTERRUPT_ERROR);
     qxl->guest_bug = 1;
     if (qxl->guestdebug) {
@@ -201,6 +204,7 @@
         spice_qxl_destroy_surface_async(&qxl->ssd.qxl, id, (uintptr_t)cookie);
     } else {
         qxl->ssd.worker->destroy_surface_wait(qxl->ssd.worker, id);
+        qxl_spice_destroy_surface_wait_complete(qxl, id);
     }
 }
 
@@ -597,9 +601,9 @@
     case QXL_MODE_VGA:
         ret = false;
         qemu_mutex_lock(&qxl->ssd.lock);
-        if (qxl->ssd.update != NULL) {
-            update = qxl->ssd.update;
-            qxl->ssd.update = NULL;
+        update = QTAILQ_FIRST(&qxl->ssd.updates);
+        if (update != NULL) {
+            QTAILQ_REMOVE(&qxl->ssd.updates, update, next);
             *ext = update->ext;
             ret = true;
         }
@@ -953,6 +957,11 @@
 {
     PCIQXLDevice *qxl = container_of(sin, PCIQXLDevice, ssd.qxl);
 
+    if (runstate_check(RUN_STATE_INMIGRATE) ||
+        runstate_check(RUN_STATE_POSTMIGRATE)) {
+        return;
+    }
+
     qxl->shadow_rom.client_present = client_present;
     memcpy(qxl->shadow_rom.client_capabilities, caps, sizeof(caps));
     qxl->rom->client_present = client_present;
@@ -964,6 +973,79 @@
 
 #endif
 
+#if defined(CONFIG_QXL_CLIENT_MONITORS_CONFIG) \
+    && SPICE_SERVER_VERSION >= 0x000b05
+
+static uint32_t qxl_crc32(const uint8_t *p, unsigned len)
+{
+    /*
+     * zlib xors the seed with 0xffffffff, and xors the result
+     * again with 0xffffffff; Both are not done with linux's crc32,
+     * which we want to be compatible with, so undo that.
+     */
+    return crc32(0xffffffff, p, len) ^ 0xffffffff;
+}
+
+/* called from main context only */
+static int interface_client_monitors_config(QXLInstance *sin,
+                                        VDAgentMonitorsConfig *monitors_config)
+{
+    PCIQXLDevice *qxl = container_of(sin, PCIQXLDevice, ssd.qxl);
+    QXLRom *rom = memory_region_get_ram_ptr(&qxl->rom_bar);
+    int i;
+
+    /*
+     * Older windows drivers set int_mask to 0 when their ISR is called,
+     * then later set it to ~0. So it doesn't relate to the actual interrupts
+     * handled. However, they are old, so clearly they don't support this
+     * interrupt
+     */
+    if (qxl->ram->int_mask == 0 || qxl->ram->int_mask == ~0 ||
+        !(qxl->ram->int_mask & QXL_INTERRUPT_CLIENT_MONITORS_CONFIG)) {
+        trace_qxl_client_monitors_config_unsupported_by_guest(qxl->id,
+                                                            qxl->ram->int_mask,
+                                                            monitors_config);
+        return 0;
+    }
+    if (!monitors_config) {
+        return 1;
+    }
+    memset(&rom->client_monitors_config, 0,
+           sizeof(rom->client_monitors_config));
+    rom->client_monitors_config.count = monitors_config->num_of_monitors;
+    /* monitors_config->flags ignored */
+    if (rom->client_monitors_config.count >=
+            ARRAY_SIZE(rom->client_monitors_config.heads)) {
+        trace_qxl_client_monitors_config_capped(qxl->id,
+                                monitors_config->num_of_monitors,
+                                ARRAY_SIZE(rom->client_monitors_config.heads));
+        rom->client_monitors_config.count =
+            ARRAY_SIZE(rom->client_monitors_config.heads);
+    }
+    for (i = 0 ; i < rom->client_monitors_config.count ; ++i) {
+        VDAgentMonConfig *monitor = &monitors_config->monitors[i];
+        QXLURect *rect = &rom->client_monitors_config.heads[i];
+        /* monitor->depth ignored */
+        rect->left = monitor->x;
+        rect->top = monitor->y;
+        rect->right = monitor->x + monitor->width;
+        rect->bottom = monitor->y + monitor->height;
+    }
+    rom->client_monitors_config_crc = qxl_crc32(
+            (const uint8_t *)&rom->client_monitors_config,
+            sizeof(rom->client_monitors_config));
+    trace_qxl_client_monitors_config_crc(qxl->id,
+            sizeof(rom->client_monitors_config),
+            rom->client_monitors_config_crc);
+
+    trace_qxl_interrupt_client_monitors_config(qxl->id,
+                        rom->client_monitors_config.count,
+                        rom->client_monitors_config.heads);
+    qxl_send_events(qxl, QXL_INTERRUPT_CLIENT_MONITORS_CONFIG);
+    return 1;
+}
+#endif
+
 static const QXLInterface qxl_interface = {
     .base.type               = SPICE_INTERFACE_QXL,
     .base.description        = "qxl gpu",
@@ -988,6 +1070,10 @@
 #if SPICE_SERVER_VERSION >= 0x000b04
     .set_client_capabilities = interface_set_client_capabilities,
 #endif
+#if SPICE_SERVER_VERSION >= 0x000b05 && \
+    defined(CONFIG_QXL_CLIENT_MONITORS_CONFIG)
+    .client_monitors_config = interface_client_monitors_config,
+#endif
 };
 
 static void qxl_enter_vga_mode(PCIQXLDevice *d)
@@ -1402,7 +1488,7 @@
             break;
         }
         trace_qxl_io_unexpected_vga_mode(d->id,
-            io_port, io_port_to_string(io_port));
+            addr, val, io_port_to_string(io_port));
         /* be nice to buggy guest drivers */
         if (io_port >= QXL_IO_UPDATE_AREA_ASYNC &&
             io_port < QXL_IO_RANGE_SIZE) {
@@ -1470,6 +1556,13 @@
             return;
         }
 
+        if (update.left < 0 || update.top < 0 || update.left >= update.right ||
+            update.top >= update.bottom) {
+            qxl_set_guest_bug(d, "QXL_IO_UPDATE_AREA: "
+                              "invalid area(%d,%d,%d,%d)\n", update.left,
+                              update.right, update.top, update.bottom);
+            break;
+        }
         if (async == QXL_ASYNC) {
             cookie = qxl_cookie_new(QXL_COOKIE_TYPE_IO,
                                     QXL_IO_UPDATE_AREA_ASYNC);
@@ -1501,6 +1594,7 @@
         qxl_set_mode(d, val, 0);
         break;
     case QXL_IO_LOG:
+        trace_qxl_io_log(d->id, d->ram->log_buf);
         if (d->guestdebug) {
             fprintf(stderr, "qxl/guest-%d: %" PRId64 ": %s", d->id,
                     qemu_get_clock_ns(vm_clock), d->ram->log_buf);
@@ -1594,9 +1688,9 @@
 static uint64_t ioport_read(void *opaque, target_phys_addr_t addr,
                             unsigned size)
 {
-    PCIQXLDevice *d = opaque;
+    PCIQXLDevice *qxl = opaque;
 
-    trace_qxl_io_read_unexpected(d->id);
+    trace_qxl_io_read_unexpected(qxl->id);
     return 0xff;
 }
 
@@ -1626,6 +1720,7 @@
     uint32_t old_pending;
     uint32_t le_events = cpu_to_le32(events);
 
+    trace_qxl_send_events(d->id, events);
     assert(qemu_spice_display_is_running(&d->ssd));
     old_pending = __sync_fetch_and_or(&d->ram->int_pending, le_events);
     if ((old_pending & le_events) == le_events) {
@@ -1910,6 +2005,7 @@
     if (qxl->id == 0) {
         vga_dirty_log_start(&qxl->vga);
     }
+    memory_region_set_flush_coalesced(&qxl->io_bar);
 
 
     pci_register_bar(&qxl->pci, QXL_IO_RANGE_INDEX,
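qxl_crc32() above compensates for zlib's convention of XOR-ing both the seed and the result with 0xffffffff: feeding 0xffffffff as the seed and XOR-ing the result again cancels both steps and leaves a "raw" CRC with an all-zero initial register and no final inversion. A standalone check of the trick (link with -lz; the 0xcbf43926 value is the well-known CRC-32 of "123456789"):

#include <stdio.h>
#include <zlib.h>

/* Undo zlib's pre/post conditioning, as hw/qxl.c does. */
static unsigned long raw_crc32(const unsigned char *p, unsigned len)
{
    return crc32(0xffffffff, p, len) ^ 0xffffffff;
}

int main(void)
{
    const unsigned char msg[] = "123456789";

    printf("zlib crc32 : 0x%08lx\n", crc32(0, msg, 9));   /* 0xcbf43926 */
    printf("raw crc32  : 0x%08lx\n", raw_crc32(msg, 9));  /* no final XOR */
    return 0;
}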
diff --git a/hw/srp.h b/hw/srp.h
index 3009bd5..5e0cad5 100644
--- a/hw/srp.h
+++ b/hw/srp.h
@@ -177,13 +177,13 @@
     uint8_t    reserved1[6];
     uint64_t   tag;
     uint8_t    reserved2[4];
-    uint64_t   lun QEMU_PACKED;
+    uint64_t   lun;
     uint8_t    reserved3[2];
     uint8_t    tsk_mgmt_func;
     uint8_t    reserved4;
     uint64_t   task_tag;
     uint8_t    reserved5[8];
-};
+} QEMU_PACKED;
 
 /*
  * We need the packed attribute because the SRP spec only aligns the
@@ -198,14 +198,14 @@
     uint8_t    data_in_desc_cnt;
     uint64_t   tag;
     uint8_t    reserved2[4];
-    uint64_t   lun QEMU_PACKED;
+    uint64_t   lun;
     uint8_t    reserved3;
     uint8_t    task_attr;
     uint8_t    reserved4;
     uint8_t    add_cdb_len;
     uint8_t    cdb[16];
     uint8_t    add_data[0];
-};
+} QEMU_PACKED;
 
 enum {
     SRP_RSP_FLAG_RSPVALID = 1 << 0,
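The srp.h change moves QEMU_PACKED (a wrapper around __attribute__((packed))) from individual misaligned uint64_t members onto the whole struct, so every field gets the unpadded wire-format layout the SRP spec describes. A small demonstration of what struct-level packing does to layout; the struct here is a made-up reduction, not the real SRP request:

#include <stdint.h>
#include <stdio.h>

/* Natural alignment: the compiler may insert padding before 'lun'. */
struct unpacked {
    uint8_t  opcode;
    uint8_t  reserved[3];
    uint64_t lun;
};

/* Packing the whole struct removes padding between all members,
 * which is what a fixed wire format requires. */
struct packed_all {
    uint8_t  opcode;
    uint8_t  reserved[3];
    uint64_t lun;
} __attribute__((packed));

int main(void)
{
    printf("unpacked:   %zu bytes\n", sizeof(struct unpacked));   /* typically 16 */
    printf("packed_all: %zu bytes\n", sizeof(struct packed_all)); /* 12 */
    return 0;
}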
diff --git a/hw/usb/dev-network.c b/hw/usb/dev-network.c
index c84892c..e4a4359 100644
--- a/hw/usb/dev-network.c
+++ b/hw/usb/dev-network.c
@@ -1001,6 +1001,13 @@
     return 0;
 }
 
+/* Prepare to receive the next packet */
+static void usb_net_reset_in_buf(USBNetState *s)
+{
+    s->in_ptr = s->in_len = 0;
+    qemu_flush_queued_packets(&s->nic->nc);
+}
+
 static int rndis_parse(USBNetState *s, uint8_t *data, int length)
 {
     uint32_t msg_type;
@@ -1025,7 +1032,8 @@
 
     case RNDIS_RESET_MSG:
         rndis_clear_responsequeue(s);
-        s->out_ptr = s->in_ptr = s->in_len = 0;
+        s->out_ptr = 0;
+        usb_net_reset_in_buf(s);
         return rndis_reset_response(s, (rndis_reset_msg_type *) data);
 
     case RNDIS_KEEPALIVE_MSG:
@@ -1135,7 +1143,7 @@
     int ret = USB_RET_NAK;
 
     if (s->in_ptr > s->in_len) {
-        s->in_ptr = s->in_len = 0;
+        usb_net_reset_in_buf(s);
         ret = USB_RET_NAK;
         return ret;
     }
@@ -1152,7 +1160,7 @@
     if (s->in_ptr >= s->in_len &&
                     (is_rndis(s) || (s->in_len & (64 - 1)) || !ret)) {
         /* no short packet necessary */
-        s->in_ptr = s->in_len = 0;
+        usb_net_reset_in_buf(s);
     }
 
 #ifdef TRAFFIC_DEBUG
@@ -1250,20 +1258,32 @@
 static ssize_t usbnet_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
     USBNetState *s = DO_UPCAST(NICState, nc, nc)->opaque;
-    struct rndis_packet_msg_type *msg;
+    uint8_t *in_buf = s->in_buf;
+    size_t total_size = size;
 
     if (is_rndis(s)) {
-        msg = (struct rndis_packet_msg_type *) s->in_buf;
         if (s->rndis_state != RNDIS_DATA_INITIALIZED) {
             return -1;
         }
-        if (size + sizeof(struct rndis_packet_msg_type) > sizeof(s->in_buf))
-            return -1;
+        total_size += sizeof(struct rndis_packet_msg_type);
+    }
+    if (total_size > sizeof(s->in_buf)) {
+        return -1;
+    }
 
+    /* Only accept packet if input buffer is empty */
+    if (s->in_len > 0) {
+        return 0;
+    }
+
+    if (is_rndis(s)) {
+        struct rndis_packet_msg_type *msg;
+
+        msg = (struct rndis_packet_msg_type *)in_buf;
         memset(msg, 0, sizeof(struct rndis_packet_msg_type));
         msg->MessageType = cpu_to_le32(RNDIS_PACKET_MSG);
-        msg->MessageLength = cpu_to_le32(size + sizeof(struct rndis_packet_msg_type));
-        msg->DataOffset = cpu_to_le32(sizeof(struct rndis_packet_msg_type) - 8);
+        msg->MessageLength = cpu_to_le32(size + sizeof(*msg));
+        msg->DataOffset = cpu_to_le32(sizeof(*msg) - 8);
         msg->DataLength = cpu_to_le32(size);
         /* msg->OOBDataOffset;
          * msg->OOBDataLength;
@@ -1273,14 +1293,11 @@
          * msg->VcHandle;
          * msg->Reserved;
          */
-        memcpy(msg + 1, buf, size);
-        s->in_len = size + sizeof(struct rndis_packet_msg_type);
-    } else {
-        if (size > sizeof(s->in_buf))
-            return -1;
-        memcpy(s->in_buf, buf, size);
-        s->in_len = size;
+        in_buf += sizeof(*msg);
     }
+
+    memcpy(in_buf, buf, size);
+    s->in_len = total_size;
     s->in_ptr = 0;
     return size;
 }
diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c
index 2f3e9c0..6a5da84 100644
--- a/hw/usb/hcd-ehci.c
+++ b/hw/usb/hcd-ehci.c
@@ -34,6 +34,7 @@
 #include "monitor.h"
 #include "trace.h"
 #include "dma.h"
+#include "sysemu.h"
 
 #define EHCI_DEBUG   0
 
@@ -139,6 +140,7 @@
 #define NB_PORTS         6        // Number of downstream ports
 #define BUFF_SIZE        5*4096   // Max bytes to transfer per transaction
 #define MAX_QH           100      // Max allowable queue heads in a chain
+#define MIN_FR_PER_TICK  3        // Min frames to process when catching up
 
 /*  Internal periodic / asynchronous schedule state machine states
  */
@@ -389,6 +391,9 @@
     USBBus bus;
     qemu_irq irq;
     MemoryRegion mem;
+    MemoryRegion mem_caps;
+    MemoryRegion mem_opreg;
+    MemoryRegion mem_ports;
     int companion_count;
 
     /* properties */
@@ -398,10 +403,10 @@
      *  EHCI spec version 1.0 Section 2.3
      *  Host Controller Operational Registers
      */
+    uint8_t caps[OPREGBASE];
     union {
-        uint8_t mmio[MMIO_SIZE];
+        uint32_t opreg[(PORTSC_BEGIN-OPREGBASE)/sizeof(uint32_t)];
         struct {
-            uint8_t cap[OPREGBASE];
             uint32_t usbcmd;
             uint32_t usbsts;
             uint32_t usbintr;
@@ -411,9 +416,9 @@
             uint32_t asynclistaddr;
             uint32_t notused[9];
             uint32_t configflag;
-            uint32_t portsc[NB_PORTS];
         };
     };
+    uint32_t portsc[NB_PORTS];
 
     /*
      *  Internal states, shadow registers, etc
@@ -471,22 +476,12 @@
 };
 
 static const char *ehci_mmio_names[] = {
-    [CAPLENGTH]         = "CAPLENGTH",
-    [HCIVERSION]        = "HCIVERSION",
-    [HCSPARAMS]         = "HCSPARAMS",
-    [HCCPARAMS]         = "HCCPARAMS",
     [USBCMD]            = "USBCMD",
     [USBSTS]            = "USBSTS",
     [USBINTR]           = "USBINTR",
     [FRINDEX]           = "FRINDEX",
     [PERIODICLISTBASE]  = "P-LIST BASE",
     [ASYNCLISTADDR]     = "A-LIST ADDR",
-    [PORTSC_BEGIN]      = "PORTSC #0",
-    [PORTSC_BEGIN + 4]  = "PORTSC #1",
-    [PORTSC_BEGIN + 8]  = "PORTSC #2",
-    [PORTSC_BEGIN + 12] = "PORTSC #3",
-    [PORTSC_BEGIN + 16] = "PORTSC #4",
-    [PORTSC_BEGIN + 20] = "PORTSC #5",
     [CONFIGFLAG]        = "CONFIGFLAG",
 };
 
@@ -509,7 +504,8 @@
 
 static const char *addr2str(target_phys_addr_t addr)
 {
-    return nr2str(ehci_mmio_names, ARRAY_SIZE(ehci_mmio_names), addr);
+    return nr2str(ehci_mmio_names, ARRAY_SIZE(ehci_mmio_names),
+                  addr + OPREGBASE);
 }
 
 static void ehci_trace_usbsts(uint32_t mask, int state)
@@ -853,10 +849,10 @@
     return NULL;
 }
 
-static void ehci_queues_rip_unused(EHCIState *ehci, int async, int flush)
+static void ehci_queues_rip_unused(EHCIState *ehci, int async)
 {
     EHCIQueueHead *head = async ? &ehci->aqueues : &ehci->pqueues;
-    const char *warn = (async && !flush) ? "guest unlinked busy QH" : NULL;
+    const char *warn = async ? "guest unlinked busy QH" : NULL;
     uint64_t maxage = FRAME_TIMER_NS * ehci->maxframes * 4;
     EHCIQueue *q, *tmp;
 
@@ -866,13 +862,25 @@
             q->ts = ehci->last_run_ns;
             continue;
         }
-        if (!flush && ehci->last_run_ns < q->ts + maxage) {
+        if (ehci->last_run_ns < q->ts + maxage) {
             continue;
         }
         ehci_free_queue(q, warn);
     }
 }
 
+static void ehci_queues_rip_unseen(EHCIState *ehci, int async)
+{
+    EHCIQueueHead *head = async ? &ehci->aqueues : &ehci->pqueues;
+    EHCIQueue *q, *tmp;
+
+    QTAILQ_FOREACH_SAFE(q, head, next, tmp) {
+        if (!q->seen) {
+            ehci_free_queue(q, NULL);
+        }
+    }
+}
+
 static void ehci_queues_rip_device(EHCIState *ehci, USBDevice *dev, int async)
 {
     EHCIQueueHead *head = async ? &ehci->aqueues : &ehci->pqueues;
@@ -1018,7 +1026,7 @@
     }
 
     s->companion_count++;
-    s->mmio[0x05] = (s->companion_count << 4) | portcount;
+    s->caps[0x05] = (s->companion_count << 4) | portcount;
 
     return 0;
 }
@@ -1063,7 +1071,8 @@
         }
     }
 
-    memset(&s->mmio[OPREGBASE], 0x00, MMIO_SIZE - OPREGBASE);
+    memset(&s->opreg, 0x00, sizeof(s->opreg));
+    memset(&s->portsc, 0x00, sizeof(s->portsc));
 
     s->usbcmd = NB_MAXINTRATE << USBCMD_ITC_SH;
     s->usbsts = USBSTS_HALT;
@@ -1090,50 +1099,35 @@
     qemu_bh_cancel(s->async_bh);
 }
 
-static uint32_t ehci_mem_readb(void *ptr, target_phys_addr_t addr)
+static uint64_t ehci_caps_read(void *ptr, target_phys_addr_t addr,
+                               unsigned size)
+{
+    EHCIState *s = ptr;
+    return s->caps[addr];
+}
+
+static uint64_t ehci_opreg_read(void *ptr, target_phys_addr_t addr,
+                                unsigned size)
 {
     EHCIState *s = ptr;
     uint32_t val;
 
-    val = s->mmio[addr];
-
+    val = s->opreg[addr >> 2];
+    trace_usb_ehci_opreg_read(addr + OPREGBASE, addr2str(addr), val);
     return val;
 }
 
-static uint32_t ehci_mem_readw(void *ptr, target_phys_addr_t addr)
+static uint64_t ehci_port_read(void *ptr, target_phys_addr_t addr,
+                               unsigned size)
 {
     EHCIState *s = ptr;
     uint32_t val;
 
-    val = s->mmio[addr] | (s->mmio[addr+1] << 8);
-
+    val = s->portsc[addr >> 2];
+    trace_usb_ehci_portsc_read(addr + PORTSC_BEGIN, addr >> 2, val);
     return val;
 }
 
-static uint32_t ehci_mem_readl(void *ptr, target_phys_addr_t addr)
-{
-    EHCIState *s = ptr;
-    uint32_t val;
-
-    val = s->mmio[addr] | (s->mmio[addr+1] << 8) |
-          (s->mmio[addr+2] << 16) | (s->mmio[addr+3] << 24);
-
-    trace_usb_ehci_mmio_readl(addr, addr2str(addr), val);
-    return val;
-}
-
-static void ehci_mem_writeb(void *ptr, target_phys_addr_t addr, uint32_t val)
-{
-    fprintf(stderr, "EHCI doesn't handle byte writes to MMIO\n");
-    exit(1);
-}
-
-static void ehci_mem_writew(void *ptr, target_phys_addr_t addr, uint32_t val)
-{
-    fprintf(stderr, "EHCI doesn't handle 16-bit writes to MMIO\n");
-    exit(1);
-}
-
 static void handle_port_owner_write(EHCIState *s, int port, uint32_t owner)
 {
     USBDevice *dev = s->ports[port].dev;
@@ -1162,11 +1156,17 @@
     }
 }
 
-static void handle_port_status_write(EHCIState *s, int port, uint32_t val)
+static void ehci_port_write(void *ptr, target_phys_addr_t addr,
+                            uint64_t val, unsigned size)
 {
+    EHCIState *s = ptr;
+    int port = addr >> 2;
     uint32_t *portsc = &s->portsc[port];
+    uint32_t old = *portsc;
     USBDevice *dev = s->ports[port].dev;
 
+    trace_usb_ehci_portsc_write(addr + PORTSC_BEGIN, addr >> 2, val);
+
     /* Clear rwc bits */
     *portsc &= ~(val & PORTSC_RWC_MASK);
     /* The guest may clear, but not set the PED bit */
@@ -1198,39 +1198,20 @@
 
     *portsc &= ~PORTSC_RO_MASK;
     *portsc |= val;
+    trace_usb_ehci_portsc_change(addr + PORTSC_BEGIN, addr >> 2, *portsc, old);
 }
 
-static void ehci_mem_writel(void *ptr, target_phys_addr_t addr, uint32_t val)
+static void ehci_opreg_write(void *ptr, target_phys_addr_t addr,
+                             uint64_t val, unsigned size)
 {
     EHCIState *s = ptr;
-    uint32_t *mmio = (uint32_t *)(&s->mmio[addr]);
+    uint32_t *mmio = s->opreg + (addr >> 2);
     uint32_t old = *mmio;
     int i;
 
-    trace_usb_ehci_mmio_writel(addr, addr2str(addr), val);
+    trace_usb_ehci_opreg_write(addr + OPREGBASE, addr2str(addr), val);
 
-    /* Only aligned reads are allowed on OHCI */
-    if (addr & 3) {
-        fprintf(stderr, "usb-ehci: Mis-aligned write to addr 0x"
-                TARGET_FMT_plx "\n", addr);
-        return;
-    }
-
-    if (addr >= PORTSC && addr < PORTSC + 4 * NB_PORTS) {
-        handle_port_status_write(s, (addr-PORTSC)/4, val);
-        trace_usb_ehci_mmio_change(addr, addr2str(addr), *mmio, old);
-        return;
-    }
-
-    if (addr < OPREGBASE) {
-        fprintf(stderr, "usb-ehci: write attempt to read-only register"
-                TARGET_FMT_plx "\n", addr);
-        return;
-    }
-
-
-    /* Do any register specific pre-write processing here.  */
-    switch(addr) {
+    switch (addr + OPREGBASE) {
     case USBCMD:
         if (val & USBCMD_HCRESET) {
             ehci_reset(s);
@@ -1241,7 +1222,7 @@
         /* not supporting dynamic frame list size at the moment */
         if ((val & USBCMD_FLS) && !(s->usbcmd & USBCMD_FLS)) {
             fprintf(stderr, "attempt to set frame list size -- value %d\n",
-                    val & USBCMD_FLS);
+                    (int)val & USBCMD_FLS);
             val &= ~USBCMD_FLS;
         }
 
@@ -1308,7 +1289,7 @@
     }
 
     *mmio = val;
-    trace_usb_ehci_mmio_change(addr, addr2str(addr), *mmio, old);
+    trace_usb_ehci_opreg_change(addr + OPREGBASE, addr2str(addr), *mmio, old);
 }
 
 
@@ -1732,7 +1713,7 @@
         ehci_set_usbsts(ehci, USBSTS_REC);
     }
 
-    ehci_queues_rip_unused(ehci, async, 0);
+    ehci_queues_rip_unused(ehci, async);
 
     /*  Find the head of the list (4.9.1.1) */
     for(i = 0; i < MAX_QH; i++) {
@@ -2364,7 +2345,7 @@
          */
         if (ehci->usbcmd & USBCMD_IAAD) {
             /* Remove all unseen qhs from the async qhs queue */
-            ehci_queues_rip_unused(ehci, async, 1);
+            ehci_queues_rip_unseen(ehci, async);
             trace_usb_ehci_doorbell_ack();
             ehci->usbcmd &= ~USBCMD_IAAD;
             ehci_raise_irq(ehci, USBSTS_IAA);
@@ -2417,7 +2398,7 @@
         ehci_set_fetch_addr(ehci, async,entry);
         ehci_set_state(ehci, async, EST_FETCHENTRY);
         ehci_advance_state(ehci, async);
-        ehci_queues_rip_unused(ehci, async, 0);
+        ehci_queues_rip_unused(ehci, async);
         break;
 
     default:
@@ -2446,7 +2427,7 @@
         if (ehci->frindex == 0x00004000) {
             ehci_raise_irq(ehci, USBSTS_FLR);
             ehci->frindex = 0;
-            if (ehci->usbsts_frindex > 0x00004000) {
+            if (ehci->usbsts_frindex >= 0x00004000) {
                 ehci->usbsts_frindex -= 0x00004000;
             } else {
                 ehci->usbsts_frindex = 0;
@@ -2481,6 +2462,19 @@
         }
 
         for (i = 0; i < frames; i++) {
+            /*
+             * If we're running behind schedule, we should not catch up
+             * too fast, as that will make some guests unhappy:
+             * 1) We must process a minimum of MIN_FR_PER_TICK frames,
+             *    otherwise we will never catch up
+             * 2) Process frames until the guest has requested an irq (IOC)
+             */
+            if (i >= MIN_FR_PER_TICK) {
+                ehci_commit_irq(ehci);
+                if ((ehci->usbsts & USBINTR_MASK) & ehci->usbintr) {
+                    break;
+                }
+            }
             ehci_update_frindex(ehci, 1);
             ehci_advance_periodic_state(ehci);
             ehci->last_run_ns += FRAME_TIMER_NS;
@@ -2520,11 +2514,28 @@
     ehci_advance_async_state(ehci);
 }
 
-static const MemoryRegionOps ehci_mem_ops = {
-    .old_mmio = {
-        .read = { ehci_mem_readb, ehci_mem_readw, ehci_mem_readl },
-        .write = { ehci_mem_writeb, ehci_mem_writew, ehci_mem_writel },
-    },
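+/* Capability registers: read-only, byte-accessible */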
+static const MemoryRegionOps ehci_mmio_caps_ops = {
+    .read = ehci_caps_read,
+    .valid.min_access_size = 1,
+    .valid.max_access_size = 4,
+    .impl.min_access_size = 1,
+    .impl.max_access_size = 1,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const MemoryRegionOps ehci_mmio_opreg_ops = {
+    .read = ehci_opreg_read,
+    .write = ehci_opreg_write,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static const MemoryRegionOps ehci_mmio_port_ops = {
+    .read = ehci_port_read,
+    .write = ehci_port_write,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
@@ -2562,6 +2573,32 @@
     return 0;
 }
 
+static void usb_ehci_vm_state_change(void *opaque, int running, RunState state)
+{
+    EHCIState *ehci = opaque;
+
+    /*
+     * We don't migrate the EHCIQueue-s, instead we rebuild them from the
+     * schedule in guest memory. We must do the rebuild ASAP, so that
+     * USB devices which have async handled packets have a packet in the
+     * ep queue to match the completion with.
+     */
+    if (state == RUN_STATE_RUNNING) {
+        ehci_advance_async_state(ehci);
+    }
+
+    /*
+     * The schedule rebuilt from guest memory could cause the migration dest
+     * to miss a QH unlink, and fail to cancel packets, since the unlinked QH
+     * will never have existed on the destination. Therefore we must flush the
+     * async schedule on savevm to catch any not yet noticed unlinks.
+     */
+    if (state == RUN_STATE_SAVE_VM) {
+        ehci_advance_async_state(ehci);
+        ehci_queues_rip_unseen(ehci, 1);
+    }
+}
+
 static const VMStateDescription vmstate_ehci = {
     .name        = "ehci",
     .version_id  = 2,
@@ -2681,19 +2718,19 @@
     pci_conf[0x6e] = 0x00;
     pci_conf[0x6f] = 0xc0;  // USBLEFCTLSTS
 
-    // 2.2 host controller interface version
-    s->mmio[0x00] = (uint8_t) OPREGBASE;
-    s->mmio[0x01] = 0x00;
-    s->mmio[0x02] = 0x00;
-    s->mmio[0x03] = 0x01;        // HC version
-    s->mmio[0x04] = NB_PORTS;    // Number of downstream ports
-    s->mmio[0x05] = 0x00;        // No companion ports at present
-    s->mmio[0x06] = 0x00;
-    s->mmio[0x07] = 0x00;
-    s->mmio[0x08] = 0x80;        // We can cache whole frame, not 64-bit capable
-    s->mmio[0x09] = 0x68;        // EECP
-    s->mmio[0x0a] = 0x00;
-    s->mmio[0x0b] = 0x00;
+    /* 2.2 host controller interface version */
+    s->caps[0x00] = (uint8_t) OPREGBASE;
+    s->caps[0x01] = 0x00;
+    s->caps[0x02] = 0x00;
+    s->caps[0x03] = 0x01;        /* HC version */
+    s->caps[0x04] = NB_PORTS;    /* Number of downstream ports */
+    s->caps[0x05] = 0x00;        /* No companion ports at present */
+    s->caps[0x06] = 0x00;
+    s->caps[0x07] = 0x00;
+    s->caps[0x08] = 0x80;        /* We can cache whole frame, no 64-bit */
+    s->caps[0x09] = 0x68;        /* EECP */
+    s->caps[0x0a] = 0x00;
+    s->caps[0x0b] = 0x00;
 
     s->irq = s->dev.irq[3];
 
@@ -2711,8 +2748,20 @@
     usb_packet_init(&s->ipacket);
 
     qemu_register_reset(ehci_reset, s);
+    qemu_add_vm_change_state_handler(usb_ehci_vm_state_change, s);
 
-    memory_region_init_io(&s->mem, &ehci_mem_ops, s, "ehci", MMIO_SIZE);
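+    /* Split the MMIO BAR into capability, operational and port regions */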
+    memory_region_init(&s->mem, "ehci", MMIO_SIZE);
+    memory_region_init_io(&s->mem_caps, &ehci_mmio_caps_ops, s,
+                          "capabilities", OPREGBASE);
+    memory_region_init_io(&s->mem_opreg, &ehci_mmio_opreg_ops, s,
+                          "operational", PORTSC_BEGIN - OPREGBASE);
+    memory_region_init_io(&s->mem_ports, &ehci_mmio_port_ops, s,
+                          "ports", PORTSC_END - PORTSC_BEGIN);
+
+    memory_region_add_subregion(&s->mem, 0,            &s->mem_caps);
+    memory_region_add_subregion(&s->mem, OPREGBASE,    &s->mem_opreg);
+    memory_region_add_subregion(&s->mem, PORTSC_BEGIN, &s->mem_ports);
+
     pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->mem);
 
     return 0;
diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c
index c7c8786..cdc8bc3 100644
--- a/hw/usb/hcd-uhci.c
+++ b/hw/usb/hcd-uhci.c
@@ -1000,6 +1000,9 @@
         }
         assert(ret == TD_RESULT_ASYNC_START);
         assert(int_mask == 0);
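+        /* A short packet terminates the queue when short packet detect is
+         * set, so don't pre-queue the TDs following this one. */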
+        if (ptd.ctrl & TD_CTRL_SPD) {
+            break;
+        }
         plink = ptd.link;
     }
 }
@@ -1097,7 +1100,7 @@
 
         case TD_RESULT_ASYNC_START:
             trace_usb_uhci_td_async(curr_qh & ~0xf, link & ~0xf);
-            if (is_valid(td.link)) {
+            if (is_valid(td.link) && !(td.ctrl & TD_CTRL_SPD)) {
                 uhci_fill_queue(s, &td);
             }
             link = curr_qh ? qh.link : td.link;
diff --git a/hw/usb/host-linux.c b/hw/usb/host-linux.c
index 8df9207..44f1a64 100644
--- a/hw/usb/host-linux.c
+++ b/hw/usb/host-linux.c
@@ -1045,7 +1045,6 @@
 
     /* Note request is (bRequestType << 8) | bRequest */
     trace_usb_host_req_control(s->bus_num, s->addr, p, request, value, index);
-    assert(p->result == 0);
 
     switch (request) {
     case DeviceOutRequest | USB_REQ_SET_ADDRESS:
@@ -1074,6 +1073,7 @@
     }
 
     /* The rest are asynchronous */
+    assert(p && p->result == 0);
 
     if (length > sizeof(dev->data_buf)) {
         fprintf(stderr, "husb: ctrl buffer too small (%d > %zu)\n",
diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c
index 5301a69..b10241a 100644
--- a/hw/usb/redirect.c
+++ b/hw/usb/redirect.c
@@ -43,7 +43,6 @@
 #define EP2I(ep_address) (((ep_address & 0x80) >> 3) | (ep_address & 0x0f))
 #define I2EP(i) (((i & 0x10) << 3) | (i & 0x0f))
 
-typedef struct Cancelled Cancelled;
 typedef struct USBRedirDevice USBRedirDevice;
 
 /* Struct to hold buffered packets (iso or int input packets) */
@@ -58,6 +57,7 @@
     uint8_t type;
     uint8_t interval;
     uint8_t interface; /* bInterfaceNumber this ep belongs to */
+    uint16_t max_packet_size; /* In bytes, not wMaxPacketSize format !! */
     uint8_t iso_started;
     uint8_t iso_error; /* For reporting iso errors to the HC */
     uint8_t interrupt_started;
@@ -65,8 +65,20 @@
     uint8_t bufpq_prefilled;
     uint8_t bufpq_dropping_packets;
     QTAILQ_HEAD(, buf_packet) bufpq;
-    int bufpq_size;
-    int bufpq_target_size;
+    int32_t bufpq_size;
+    int32_t bufpq_target_size;
+};
+
+struct PacketIdQueueEntry {
+    uint64_t id;
+    QTAILQ_ENTRY(PacketIdQueueEntry) next;
+};
+
+struct PacketIdQueue {
+    USBRedirDevice *dev;
+    const char *name;
+    QTAILQ_HEAD(, PacketIdQueueEntry) head;
+    int size;
 };
 
 struct USBRedirDevice {
@@ -86,7 +98,8 @@
     int64_t next_attach_time;
     struct usbredirparser *parser;
     struct endp_data endpoint[MAX_ENDPOINTS];
-    QTAILQ_HEAD(, Cancelled) cancelled;
+    struct PacketIdQueue cancelled;
+    struct PacketIdQueue already_in_flight;
     /* Data for device filtering */
     struct usb_redir_device_connect_header device_info;
     struct usb_redir_interface_info_header interface_info;
@@ -94,11 +107,6 @@
     int filter_rules_count;
 };
 
-struct Cancelled {
-    uint64_t id;
-    QTAILQ_ENTRY(Cancelled)next;
-};
-
 static void usbredir_hello(void *priv, struct usb_redir_hello_header *h);
 static void usbredir_device_connect(void *priv,
     struct usb_redir_device_connect_header *device_connect);
@@ -134,6 +142,8 @@
 static int usbredir_handle_status(USBRedirDevice *dev,
                                        int status, int actual_len);
 
+#define VERSION "qemu usb-redir guest " QEMU_VERSION
+
 /*
  * Logging stuff
  */
@@ -232,6 +242,11 @@
         return 0;
     }
 
+    /* Don't send new data to the chardev until our state is fully synced */
+    if (!runstate_check(RUN_STATE_RUNNING)) {
+        return 0;
+    }
+
     return qemu_chr_fe_write(dev->cs, data, count);
 }
 
@@ -239,37 +254,103 @@
  * Cancelled and buffered packets helpers
  */
 
+static void packet_id_queue_init(struct PacketIdQueue *q,
+    USBRedirDevice *dev, const char *name)
+{
+    q->dev = dev;
+    q->name = name;
+    QTAILQ_INIT(&q->head);
+    q->size = 0;
+}
+
+static void packet_id_queue_add(struct PacketIdQueue *q, uint64_t id)
+{
+    USBRedirDevice *dev = q->dev;
+    struct PacketIdQueueEntry *e;
+
+    DPRINTF("adding packet id %"PRIu64" to %s queue\n", id, q->name);
+
+    e = g_malloc0(sizeof(struct PacketIdQueueEntry));
+    e->id = id;
+    QTAILQ_INSERT_TAIL(&q->head, e, next);
+    q->size++;
+}
+
+static int packet_id_queue_remove(struct PacketIdQueue *q, uint64_t id)
+{
+    USBRedirDevice *dev = q->dev;
+    struct PacketIdQueueEntry *e;
+
+    QTAILQ_FOREACH(e, &q->head, next) {
+        if (e->id == id) {
+            DPRINTF("removing packet id %"PRIu64" from %s queue\n",
+                    id, q->name);
+            QTAILQ_REMOVE(&q->head, e, next);
+            q->size--;
+            g_free(e);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static void packet_id_queue_empty(struct PacketIdQueue *q)
+{
+    USBRedirDevice *dev = q->dev;
+    struct PacketIdQueueEntry *e, *next_e;
+
+    DPRINTF("removing %d packet-ids from %s queue\n", q->size, q->name);
+
+    QTAILQ_FOREACH_SAFE(e, &q->head, next, next_e) {
+        QTAILQ_REMOVE(&q->head, e, next);
+        g_free(e);
+    }
+    q->size = 0;
+}
+
 static void usbredir_cancel_packet(USBDevice *udev, USBPacket *p)
 {
     USBRedirDevice *dev = DO_UPCAST(USBRedirDevice, dev, udev);
-    Cancelled *c;
 
-    DPRINTF("cancel packet id %"PRIu64"\n", p->id);
-
-    c = g_malloc0(sizeof(Cancelled));
-    c->id = p->id;
-    QTAILQ_INSERT_TAIL(&dev->cancelled, c, next);
-
+    packet_id_queue_add(&dev->cancelled, p->id);
     usbredirparser_send_cancel_data_packet(dev->parser, p->id);
     usbredirparser_do_write(dev->parser);
 }
 
 static int usbredir_is_cancelled(USBRedirDevice *dev, uint64_t id)
 {
-    Cancelled *c;
-
     if (!dev->dev.attached) {
         return 1; /* Treat everything as cancelled after a disconnect */
     }
+    return packet_id_queue_remove(&dev->cancelled, id);
+}
 
-    QTAILQ_FOREACH(c, &dev->cancelled, next) {
-        if (c->id == id) {
-            QTAILQ_REMOVE(&dev->cancelled, c, next);
-            g_free(c);
-            return 1;
-        }
+static void usbredir_fill_already_in_flight_from_ep(USBRedirDevice *dev,
+    struct USBEndpoint *ep)
+{
+    USBPacket *p;
+
+    QTAILQ_FOREACH(p, &ep->queue, queue) {
+        packet_id_queue_add(&dev->already_in_flight, p->id);
     }
-    return 0;
+}
+
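+/*
+ * Record the ids of all packets currently queued on the device's endpoints.
+ * Called from usbredir_pre_save(), so that packets the host controller
+ * re-submits after migration can be recognized and answered with
+ * USB_RET_ASYNC instead of being sent to the remote side again.
+ */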
+static void usbredir_fill_already_in_flight(USBRedirDevice *dev)
+{
+    int ep;
+    struct USBDevice *udev = &dev->dev;
+
+    usbredir_fill_already_in_flight_from_ep(dev, &udev->ep_ctl);
+
+    for (ep = 0; ep < USB_MAX_ENDPOINTS; ep++) {
+        usbredir_fill_already_in_flight_from_ep(dev, &udev->ep_in[ep]);
+        usbredir_fill_already_in_flight_from_ep(dev, &udev->ep_out[ep]);
+    }
+}
+
+static int usbredir_already_in_flight(USBRedirDevice *dev, uint64_t id)
+{
+    return packet_id_queue_remove(&dev->already_in_flight, id);
 }
 
 static USBPacket *usbredir_find_packet_by_id(USBRedirDevice *dev,
@@ -487,6 +568,10 @@
 
     DPRINTF("bulk-out ep %02X len %zd id %"PRIu64"\n", ep, p->iov.size, p->id);
 
+    if (usbredir_already_in_flight(dev, p->id)) {
+        return USB_RET_ASYNC;
+    }
+
     bulk_packet.endpoint  = ep;
     bulk_packet.length    = p->iov.size;
     bulk_packet.stream_id = 0;
@@ -567,6 +652,10 @@
         DPRINTF("interrupt-out ep %02X len %zd id %"PRIu64"\n", ep,
                 p->iov.size, p->id);
 
+        if (usbredir_already_in_flight(dev, p->id)) {
+            return USB_RET_ASYNC;
+        }
+
         interrupt_packet.endpoint  = ep;
         interrupt_packet.length    = p->iov.size;
 
@@ -709,6 +798,10 @@
     USBRedirDevice *dev = DO_UPCAST(USBRedirDevice, dev, udev);
     struct usb_redir_control_packet_header control_packet;
 
+    if (usbredir_already_in_flight(dev, p->id)) {
+        return USB_RET_ASYNC;
+    }
+
     /* Special cases for certain standard device requests */
     switch (request) {
     case DeviceOutRequest | USB_REQ_SET_ADDRESS:
@@ -763,6 +856,7 @@
     usbredir_device_disconnect(dev);
 
     if (dev->parser) {
+        DPRINTF("destroying usbredirparser\n");
         usbredirparser_destroy(dev->parser);
         dev->parser = NULL;
     }
@@ -771,14 +865,13 @@
 static void usbredir_chardev_open(USBRedirDevice *dev)
 {
     uint32_t caps[USB_REDIR_CAPS_SIZE] = { 0, };
-    char version[32];
+    int flags = 0;
 
     /* Make sure any pending closes are handled (no-op if none pending) */
     usbredir_chardev_close_bh(dev);
     qemu_bh_cancel(dev->chardev_close_bh);
 
-    strcpy(version, "qemu usb-redir guest ");
-    pstrcat(version, sizeof(version), qemu_get_version());
+    DPRINTF("creating usbredirparser\n");
 
     dev->parser = qemu_oom_check(usbredirparser_create());
     dev->parser->priv = dev;
@@ -807,7 +900,12 @@
     usbredirparser_caps_set_cap(caps, usb_redir_cap_filter);
     usbredirparser_caps_set_cap(caps, usb_redir_cap_ep_info_max_packet_size);
     usbredirparser_caps_set_cap(caps, usb_redir_cap_64bits_ids);
-    usbredirparser_init(dev->parser, version, caps, USB_REDIR_CAPS_SIZE, 0);
+
+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        flags |= usbredirparser_fl_no_hello;
+    }
+    usbredirparser_init(dev->parser, VERSION, caps, USB_REDIR_CAPS_SIZE,
+                        flags);
     usbredirparser_do_write(dev->parser);
 }
 
@@ -853,6 +951,11 @@
         return 0;
     }
 
+    /* Don't read new data from the chardev until our state is fully synced */
+    if (!runstate_check(RUN_STATE_RUNNING)) {
+        return 0;
+    }
+
     /* usbredir_parser_do_read will consume *all* data we give it */
     return 1024 * 1024;
 }
@@ -878,9 +981,11 @@
 
     switch (event) {
     case CHR_EVENT_OPENED:
+        DPRINTF("chardev open\n");
         usbredir_chardev_open(dev);
         break;
     case CHR_EVENT_CLOSED:
+        DPRINTF("chardev close\n");
         qemu_bh_schedule(dev->chardev_close_bh);
         break;
     }
@@ -890,6 +995,15 @@
  * init + destroy
  */
 
+static void usbredir_vm_state_change(void *priv, int running, RunState state)
+{
+    USBRedirDevice *dev = priv;
+
+    if (state == RUN_STATE_RUNNING && dev->parser != NULL) {
+        usbredirparser_do_write(dev->parser); /* Flush any pending writes */
+    }
+}
+
 static int usbredir_initfn(USBDevice *udev)
 {
     USBRedirDevice *dev = DO_UPCAST(USBRedirDevice, dev, udev);
@@ -914,7 +1028,8 @@
     dev->chardev_close_bh = qemu_bh_new(usbredir_chardev_close_bh, dev);
     dev->attach_timer = qemu_new_timer_ms(vm_clock, usbredir_do_attach, dev);
 
-    QTAILQ_INIT(&dev->cancelled);
+    packet_id_queue_init(&dev->cancelled, dev, "cancelled");
+    packet_id_queue_init(&dev->already_in_flight, dev, "already-in-flight");
     for (i = 0; i < MAX_ENDPOINTS; i++) {
         QTAILQ_INIT(&dev->endpoint[i].bufpq);
     }
@@ -927,19 +1042,17 @@
     qemu_chr_add_handlers(dev->cs, usbredir_chardev_can_read,
                           usbredir_chardev_read, usbredir_chardev_event, dev);
 
+    qemu_add_vm_change_state_handler(usbredir_vm_state_change, dev);
     add_boot_device_path(dev->bootindex, &udev->qdev, NULL);
     return 0;
 }
 
 static void usbredir_cleanup_device_queues(USBRedirDevice *dev)
 {
-    Cancelled *c, *next_c;
     int i;
 
-    QTAILQ_FOREACH_SAFE(c, &dev->cancelled, next, next_c) {
-        QTAILQ_REMOVE(&dev->cancelled, c, next);
-        g_free(c);
-    }
+    packet_id_queue_empty(&dev->cancelled);
+    packet_id_queue_empty(&dev->already_in_flight);
     for (i = 0; i < MAX_ENDPOINTS; i++) {
         usbredir_free_bufpq(dev, I2EP(i));
     }
@@ -1118,6 +1231,7 @@
     qemu_del_timer(dev->attach_timer);
 
     if (dev->dev.attached) {
+        DPRINTF("detaching device\n");
         usb_device_detach(&dev->dev);
         /*
          * Delay next usb device attach to give the guest a chance to see
@@ -1195,7 +1309,8 @@
         usb_ep->ifnum = dev->endpoint[i].interface;
         if (usbredirparser_peer_has_cap(dev->parser,
                                      usb_redir_cap_ep_info_max_packet_size)) {
-            usb_ep->max_packet_size = ep_info->max_packet_size[i];
+            dev->endpoint[i].max_packet_size =
+                usb_ep->max_packet_size = ep_info->max_packet_size[i];
         }
         if (ep_info->type[i] == usb_redir_type_bulk) {
             usb_ep->pipeline = true;
@@ -1418,6 +1533,322 @@
     }
 }
 
+/*
+ * Migration code
+ */
+
+static void usbredir_pre_save(void *priv)
+{
+    USBRedirDevice *dev = priv;
+
+    usbredir_fill_already_in_flight(dev);
+}
+
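+/*
+ * Restore the usb device's speed and per-endpoint state from the migrated
+ * device_info and endpoint data.
+ */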
+static int usbredir_post_load(void *priv, int version_id)
+{
+    USBRedirDevice *dev = priv;
+    struct USBEndpoint *usb_ep;
+    int i;
+
+    switch (dev->device_info.speed) {
+    case usb_redir_speed_low:
+        dev->dev.speed = USB_SPEED_LOW;
+        break;
+    case usb_redir_speed_full:
+        dev->dev.speed = USB_SPEED_FULL;
+        break;
+    case usb_redir_speed_high:
+        dev->dev.speed = USB_SPEED_HIGH;
+        break;
+    case usb_redir_speed_super:
+        dev->dev.speed = USB_SPEED_SUPER;
+        break;
+    default:
+        dev->dev.speed = USB_SPEED_FULL;
+    }
+    dev->dev.speedmask = (1 << dev->dev.speed);
+
+    for (i = 0; i < MAX_ENDPOINTS; i++) {
+        usb_ep = usb_ep_get(&dev->dev,
+                            (i & 0x10) ? USB_TOKEN_IN : USB_TOKEN_OUT,
+                            i & 0x0f);
+        usb_ep->type = dev->endpoint[i].type;
+        usb_ep->ifnum = dev->endpoint[i].interface;
+        usb_ep->max_packet_size = dev->endpoint[i].max_packet_size;
+        if (dev->endpoint[i].type == usb_redir_type_bulk) {
+            usb_ep->pipeline = true;
+        }
+    }
+    return 0;
+}
+
+/* For usbredirparser migration */
+static void usbredir_put_parser(QEMUFile *f, void *priv, size_t unused)
+{
+    USBRedirDevice *dev = priv;
+    uint8_t *data;
+    int len;
+
+    if (dev->parser == NULL) {
+        qemu_put_be32(f, 0);
+        return;
+    }
+
+    usbredirparser_serialize(dev->parser, &data, &len);
+    qemu_oom_check(data);
+
+    qemu_put_be32(f, len);
+    qemu_put_buffer(f, data, len);
+
+    free(data);
+}
+
+static int usbredir_get_parser(QEMUFile *f, void *priv, size_t unused)
+{
+    USBRedirDevice *dev = priv;
+    uint8_t *data;
+    int len, ret;
+
+    len = qemu_get_be32(f);
+    if (len == 0) {
+        return 0;
+    }
+
+    /*
+     * Our chardev should be open already at this point, otherwise the
+     * usbredir channel will be broken (i.e. spice without seamless migration)
+     */
+    if (dev->parser == NULL) {
+        ERROR("get_parser called with closed chardev, failing migration\n");
+        return -1;
+    }
+
+    data = g_malloc(len);
+    qemu_get_buffer(f, data, len);
+
+    ret = usbredirparser_unserialize(dev->parser, data, len);
+
+    g_free(data);
+
+    return ret;
+}
+
+static const VMStateInfo usbredir_parser_vmstate_info = {
+    .name = "usb-redir-parser",
+    .put  = usbredir_put_parser,
+    .get  = usbredir_get_parser,
+};
+
+
+/* For buffered packets (iso/irq) queue migration */
+static void usbredir_put_bufpq(QEMUFile *f, void *priv, size_t unused)
+{
+    struct endp_data *endp = priv;
+    struct buf_packet *bufp;
+    int remain = endp->bufpq_size;
+
+    qemu_put_be32(f, endp->bufpq_size);
+    QTAILQ_FOREACH(bufp, &endp->bufpq, next) {
+        qemu_put_be32(f, bufp->len);
+        qemu_put_be32(f, bufp->status);
+        qemu_put_buffer(f, bufp->data, bufp->len);
+        remain--;
+    }
+    assert(remain == 0);
+}
+
+static int usbredir_get_bufpq(QEMUFile *f, void *priv, size_t unused)
+{
+    struct endp_data *endp = priv;
+    struct buf_packet *bufp;
+    int i;
+
+    endp->bufpq_size = qemu_get_be32(f);
+    for (i = 0; i < endp->bufpq_size; i++) {
+        bufp = g_malloc(sizeof(struct buf_packet));
+        bufp->len = qemu_get_be32(f);
+        bufp->status = qemu_get_be32(f);
+        bufp->data = qemu_oom_check(malloc(bufp->len)); /* regular malloc! */
+        qemu_get_buffer(f, bufp->data, bufp->len);
+        QTAILQ_INSERT_TAIL(&endp->bufpq, bufp, next);
+    }
+    return 0;
+}
+
+static const VMStateInfo usbredir_ep_bufpq_vmstate_info = {
+    .name = "usb-redir-bufpq",
+    .put  = usbredir_put_bufpq,
+    .get  = usbredir_get_bufpq,
+};
+
+
+/* For endp_data migration */
+static const VMStateDescription usbredir_ep_vmstate = {
+    .name = "usb-redir-ep",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(type, struct endp_data),
+        VMSTATE_UINT8(interval, struct endp_data),
+        VMSTATE_UINT8(interface, struct endp_data),
+        VMSTATE_UINT16(max_packet_size, struct endp_data),
+        VMSTATE_UINT8(iso_started, struct endp_data),
+        VMSTATE_UINT8(iso_error, struct endp_data),
+        VMSTATE_UINT8(interrupt_started, struct endp_data),
+        VMSTATE_UINT8(interrupt_error, struct endp_data),
+        VMSTATE_UINT8(bufpq_prefilled, struct endp_data),
+        VMSTATE_UINT8(bufpq_dropping_packets, struct endp_data),
+        {
+            .name         = "bufpq",
+            .version_id   = 0,
+            .field_exists = NULL,
+            .size         = 0,
+            .info         = &usbredir_ep_bufpq_vmstate_info,
+            .flags        = VMS_SINGLE,
+            .offset       = 0,
+        },
+        VMSTATE_INT32(bufpq_target_size, struct endp_data),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+
+/* For PacketIdQueue migration */
+static void usbredir_put_packet_id_q(QEMUFile *f, void *priv, size_t unused)
+{
+    struct PacketIdQueue *q = priv;
+    USBRedirDevice *dev = q->dev;
+    struct PacketIdQueueEntry *e;
+    int remain = q->size;
+
+    DPRINTF("put_packet_id_q %s size %d\n", q->name, q->size);
+    qemu_put_be32(f, q->size);
+    QTAILQ_FOREACH(e, &q->head, next) {
+        qemu_put_be64(f, e->id);
+        remain--;
+    }
+    assert(remain == 0);
+}
+
+static int usbredir_get_packet_id_q(QEMUFile *f, void *priv, size_t unused)
+{
+    struct PacketIdQueue *q = priv;
+    USBRedirDevice *dev = q->dev;
+    int i, size;
+    uint64_t id;
+
+    size = qemu_get_be32(f);
+    DPRINTF("get_packet_id_q %s size %d\n", q->name, size);
+    for (i = 0; i < size; i++) {
+        id = qemu_get_be64(f);
+        packet_id_queue_add(q, id);
+    }
+    assert(q->size == size);
+    return 0;
+}
+
+static const VMStateInfo usbredir_ep_packet_id_q_vmstate_info = {
+    .name = "usb-redir-packet-id-q",
+    .put  = usbredir_put_packet_id_q,
+    .get  = usbredir_get_packet_id_q,
+};
+
+static const VMStateDescription usbredir_ep_packet_id_queue_vmstate = {
+    .name = "usb-redir-packet-id-queue",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        {
+            .name         = "queue",
+            .version_id   = 0,
+            .field_exists = NULL,
+            .size         = 0,
+            .info         = &usbredir_ep_packet_id_q_vmstate_info,
+            .flags        = VMS_SINGLE,
+            .offset       = 0,
+        },
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+
+/* For usb_redir_device_connect_header migration */
+static const VMStateDescription usbredir_device_info_vmstate = {
+    .name = "usb-redir-device-info",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8(speed, struct usb_redir_device_connect_header),
+        VMSTATE_UINT8(device_class, struct usb_redir_device_connect_header),
+        VMSTATE_UINT8(device_subclass, struct usb_redir_device_connect_header),
+        VMSTATE_UINT8(device_protocol, struct usb_redir_device_connect_header),
+        VMSTATE_UINT16(vendor_id, struct usb_redir_device_connect_header),
+        VMSTATE_UINT16(product_id, struct usb_redir_device_connect_header),
+        VMSTATE_UINT16(device_version_bcd,
+                       struct usb_redir_device_connect_header),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+
+/* For usb_redir_interface_info_header migration */
+static const VMStateDescription usbredir_interface_info_vmstate = {
+    .name = "usb-redir-interface-info",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(interface_count,
+                       struct usb_redir_interface_info_header),
+        VMSTATE_UINT8_ARRAY(interface,
+                            struct usb_redir_interface_info_header, 32),
+        VMSTATE_UINT8_ARRAY(interface_class,
+                            struct usb_redir_interface_info_header, 32),
+        VMSTATE_UINT8_ARRAY(interface_subclass,
+                            struct usb_redir_interface_info_header, 32),
+        VMSTATE_UINT8_ARRAY(interface_protocol,
+                            struct usb_redir_interface_info_header, 32),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+
+/* And finally the USBRedirDevice vmstate itself */
+static const VMStateDescription usbredir_vmstate = {
+    .name = "usb-redir",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = usbredir_pre_save,
+    .post_load = usbredir_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_USB_DEVICE(dev, USBRedirDevice),
+        VMSTATE_TIMER(attach_timer, USBRedirDevice),
+        {
+            .name         = "parser",
+            .version_id   = 0,
+            .field_exists = NULL,
+            .size         = 0,
+            .info         = &usbredir_parser_vmstate_info,
+            .flags        = VMS_SINGLE,
+            .offset       = 0,
+        },
+        VMSTATE_STRUCT_ARRAY(endpoint, USBRedirDevice, MAX_ENDPOINTS, 1,
+                             usbredir_ep_vmstate, struct endp_data),
+        VMSTATE_STRUCT(cancelled, USBRedirDevice, 1,
+                       usbredir_ep_packet_id_queue_vmstate,
+                       struct PacketIdQueue),
+        VMSTATE_STRUCT(already_in_flight, USBRedirDevice, 1,
+                       usbredir_ep_packet_id_queue_vmstate,
+                       struct PacketIdQueue),
+        VMSTATE_STRUCT(device_info, USBRedirDevice, 1,
+                       usbredir_device_info_vmstate,
+                       struct usb_redir_device_connect_header),
+        VMSTATE_STRUCT(interface_info, USBRedirDevice, 1,
+                       usbredir_interface_info_vmstate,
+                       struct usb_redir_interface_info_header),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static Property usbredir_properties[] = {
     DEFINE_PROP_CHR("chardev", USBRedirDevice, cs),
     DEFINE_PROP_UINT8("debug", USBRedirDevice, debug, 0),
@@ -1438,6 +1869,7 @@
     uc->handle_reset   = usbredir_handle_reset;
     uc->handle_data    = usbredir_handle_data;
     uc->handle_control = usbredir_handle_control;
+    dc->vmsd           = &usbredir_vmstate;
     dc->props          = usbredir_properties;
 }
 
diff --git a/hw/vga-isa-mm.c b/hw/vga-isa-mm.c
index 44ae7d9..306e6ba 100644
--- a/hw/vga-isa-mm.c
+++ b/hw/vga-isa-mm.c
@@ -107,6 +107,7 @@
     s_ioport_ctrl = g_malloc(sizeof(*s_ioport_ctrl));
     memory_region_init_io(s_ioport_ctrl, &vga_mm_ctrl_ops, s,
                           "vga-mm-ctrl", 0x100000);
+    memory_region_set_flush_coalesced(s_ioport_ctrl);
 
     vga_io_memory = g_malloc(sizeof(*vga_io_memory));
     /* XXX: endianness? */
diff --git a/hw/vga.c b/hw/vga.c
index 80299ea..afaef0d 100644
--- a/hw/vga.c
+++ b/hw/vga.c
@@ -361,6 +361,8 @@
     VGACommonState *s = opaque;
     int val, index;
 
+    qemu_flush_coalesced_mmio_buffer();
+
     if (vga_ioport_invalid(s, addr)) {
         val = 0xff;
     } else {
@@ -453,6 +455,8 @@
     VGACommonState *s = opaque;
     int index;
 
+    qemu_flush_coalesced_mmio_buffer();
+
     /* check port range access depending on color/monochrome mode */
     if (vga_ioport_invalid(s, addr)) {
         return;
@@ -2338,6 +2342,7 @@
     vga_mem = g_malloc(sizeof(*vga_mem));
     memory_region_init_io(vga_mem, &vga_mem_ops, s,
                           "vga-lowmem", 0x20000);
+    memory_region_set_flush_coalesced(vga_mem);
 
     return vga_mem;
 }
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index b1998b2..6490743 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -447,10 +447,6 @@
     VirtIONet *n = to_virtio_net(vdev);
 
     qemu_flush_queued_packets(&n->nic->nc);
-
-    /* We now have RX buffers, signal to the IO thread to break out of the
-     * select to re-poll the tap file descriptor */
-    qemu_notify_event();
 }
 
 static int virtio_net_can_receive(NetClientState *nc)
diff --git a/hw/vmware_vga.c b/hw/vmware_vga.c
index b68e883..e815a04 100644
--- a/hw/vmware_vga.c
+++ b/hw/vmware_vga.c
@@ -1186,6 +1186,7 @@
 
     memory_region_init_io(&s->io_bar, &vmsvga_io_ops, &s->chip,
                           "vmsvga-io", 0x10);
+    memory_region_set_flush_coalesced(&s->io_bar);
     pci_register_bar(&s->card, 0, PCI_BASE_ADDRESS_SPACE_IO, &s->io_bar);
 
     vmsvga_init(&s->chip, pci_address_space(dev),
diff --git a/hw/xen-host-pci-device.c b/hw/xen-host-pci-device.c
index e7ff680..743b37b 100644
--- a/hw/xen-host-pci-device.c
+++ b/hw/xen-host-pci-device.c
@@ -47,13 +47,13 @@
 }
 
 
-/* This size should be enough to read the first 7 lines of a ressource file */
-#define XEN_HOST_PCI_RESSOURCE_BUFFER_SIZE 400
+/* This size should be enough to read the first 7 lines of a resource file */
+#define XEN_HOST_PCI_RESOURCE_BUFFER_SIZE 400
 static int xen_host_pci_get_resource(XenHostPCIDevice *d)
 {
     int i, rc, fd;
     char path[PATH_MAX];
-    char buf[XEN_HOST_PCI_RESSOURCE_BUFFER_SIZE];
+    char buf[XEN_HOST_PCI_RESOURCE_BUFFER_SIZE];
     unsigned long long start, end, flags, size;
     char *endptr, *s;
     uint8_t type;
diff --git a/hw/xen_nic.c b/hw/xen_nic.c
index 8b79bfb..cf7d559 100644
--- a/hw/xen_nic.c
+++ b/hw/xen_nic.c
@@ -415,6 +415,7 @@
 {
     struct XenNetDev *netdev = container_of(xendev, struct XenNetDev, xendev);
     net_tx_packets(netdev);
+    qemu_flush_queued_packets(&netdev->nic->nc);
 }
 
 static int net_free(struct XenDevice *xendev)
diff --git a/hw/xen_pt.h b/hw/xen_pt.h
index 41904ec..112477a 100644
--- a/hw/xen_pt.h
+++ b/hw/xen_pt.h
@@ -96,7 +96,7 @@
  * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
  */
 
-/* emulated register infomation */
+/* emulated register information */
 struct XenPTRegInfo {
     uint32_t offset;
     uint32_t size;
@@ -140,7 +140,7 @@
     (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
      uint32_t base_offset, uint8_t *size);
 
-/* emulated register group infomation */
+/* emulated register group information */
 struct XenPTRegGroupInfo {
     uint8_t grp_id;
     XenPTRegisterGroupType grp_type;
diff --git a/hw/xen_pt_config_init.c b/hw/xen_pt_config_init.c
index 00eb3d9..e524a40 100644
--- a/hw/xen_pt_config_init.c
+++ b/hw/xen_pt_config_init.c
@@ -562,7 +562,7 @@
     return 0;
 }
 
-/* Header Type0 reg static infomation table */
+/* Header Type0 reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_header0[] = {
     /* Vendor ID reg */
     {
@@ -753,7 +753,7 @@
  * Vital Product Data Capability
  */
 
-/* Vital Product Data Capability Structure reg static infomation table */
+/* Vital Product Data Capability Structure reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_vpd[] = {
     {
         .offset     = PCI_CAP_LIST_NEXT,
@@ -775,7 +775,7 @@
  * Vendor Specific Capability
  */
 
-/* Vendor Specific Capability Structure reg static infomation table */
+/* Vendor Specific Capability Structure reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_vendor[] = {
     {
         .offset     = PCI_CAP_LIST_NEXT,
@@ -866,7 +866,7 @@
     return 0;
 }
 
-/* PCI Express Capability Structure reg static infomation table */
+/* PCI Express Capability Structure reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_pcie[] = {
     /* Next Pointer reg */
     {
@@ -981,7 +981,7 @@
     return 0;
 }
 
-/* Power Management Capability reg static infomation table */
+/* Power Management Capability reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_pm[] = {
     /* Next Pointer reg */
     {
@@ -1259,7 +1259,7 @@
     return 0;
 }
 
-/* MSI Capability Structure reg static infomation table */
+/* MSI Capability Structure reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_msi[] = {
     /* Next Pointer reg */
     {
@@ -1396,7 +1396,7 @@
     return 0;
 }
 
-/* MSI-X Capability Structure reg static infomation table */
+/* MSI-X Capability Structure reg static information table */
 static XenPTRegInfo xen_pt_emu_reg_msix[] = {
     /* Next Pointer reg */
     {
diff --git a/hw/xilinx.h b/hw/xilinx.h
index 556c5aa..9830047 100644
--- a/hw/xilinx.h
+++ b/hw/xilinx.h
@@ -21,9 +21,9 @@
 {
     DeviceState *dev;
 
-    dev = qdev_create(NULL, "xlnx,xps-timer");
+    dev = qdev_create(NULL, "xlnx.xps-timer");
     qdev_prop_set_uint32(dev, "one-timer-only", oto);
-    qdev_prop_set_uint32(dev, "frequency", freq);
+    qdev_prop_set_uint32(dev, "clock-frequency", freq);
     qdev_init_nofail(dev);
     sysbus_mmio_map(sysbus_from_qdev(dev), 0, base);
     sysbus_connect_irq(sysbus_from_qdev(dev), 0, irq);
@@ -55,13 +55,17 @@
                           int txmem, int rxmem)
 {
     DeviceState *dev;
+    Error *errp = NULL;
+
     qemu_check_nic_model(nd, "xlnx.axi-ethernet");
 
     dev = qdev_create(NULL, "xlnx.axi-ethernet");
     qdev_set_nic_properties(dev, nd);
     qdev_prop_set_uint32(dev, "rxmem", rxmem);
     qdev_prop_set_uint32(dev, "txmem", txmem);
-    object_property_set_link(OBJECT(dev), OBJECT(peer), "tx_dev", NULL);
+    object_property_set_link(OBJECT(dev), OBJECT(peer), "axistream-connected",
+                             &errp);
+    assert_no_error(errp);
     qdev_init_nofail(dev);
     sysbus_mmio_map(sysbus_from_qdev(dev), 0, base);
     sysbus_connect_irq(sysbus_from_qdev(dev), 0, irq);
@@ -74,8 +78,12 @@
                            target_phys_addr_t base, qemu_irq irq,
                            qemu_irq irq2, int freqhz)
 {
+    Error *errp = NULL;
+
     qdev_prop_set_uint32(dev, "freqhz", freqhz);
-    object_property_set_link(OBJECT(dev), OBJECT(peer), "tx_dev", NULL);
+    object_property_set_link(OBJECT(dev), OBJECT(peer), "axistream-connected",
+                             &errp);
+    assert_no_error(errp);
     qdev_init_nofail(dev);
 
     sysbus_mmio_map(sysbus_from_qdev(dev), 0, base);
diff --git a/hw/xilinx_timer.c b/hw/xilinx_timer.c
index b562bd0..2e48ca2 100644
--- a/hw/xilinx_timer.c
+++ b/hw/xilinx_timer.c
@@ -24,6 +24,7 @@
 
 #include "sysbus.h"
 #include "ptimer.h"
+#include "qemu-log.h"
 
 #define D(x)
 
@@ -119,7 +120,7 @@
             break;
 
     }
-    D(printf("%s timer=%d %x=%x\n", __func__, timer, addr * 4, r));
+    D(fprintf(stderr, "%s timer=%d %x=%x\n", __func__, timer, addr * 4, r));
     return r;
 }
 
@@ -127,7 +128,7 @@
 {
     uint64_t count;
 
-    D(printf("%s timer=%d down=%d\n", __func__,
+    D(fprintf(stderr, "%s timer=%d down=%d\n", __func__,
               xt->nr, xt->regs[R_TCSR] & TCSR_UDT));
 
     ptimer_stop(xt->ptimer);
@@ -152,7 +153,7 @@
     addr >>= 2;
     timer = timer_from_addr(addr);
     xt = &t->timers[timer];
-    D(printf("%s addr=%x val=%x (timer=%d off=%d)\n",
+    D(fprintf(stderr, "%s addr=%x val=%x (timer=%d off=%d)\n",
              __func__, addr * 4, value, timer, addr & 3));
     /* Further decoding to address a specific timers reg.  */
     addr &= 3;
@@ -189,7 +190,7 @@
 {
     struct xlx_timer *xt = opaque;
     struct timerblock *t = xt->parent;
-    D(printf("%s %d\n", __func__, timer));
+    D(fprintf(stderr, "%s %d\n", __func__, xt->nr));
     xt->regs[R_TCSR] |= TCSR_TINT;
 
     if (xt->regs[R_TCSR] & TCSR_ARHT)
@@ -217,14 +218,15 @@
         ptimer_set_freq(xt->ptimer, t->freq_hz);
     }
 
-    memory_region_init_io(&t->mmio, &timer_ops, t, "xlnx,xps-timer",
+    memory_region_init_io(&t->mmio, &timer_ops, t, "xlnx.xps-timer",
                           R_MAX * 4 * num_timers(t));
     sysbus_init_mmio(dev, &t->mmio);
     return 0;
 }
 
 static Property xilinx_timer_properties[] = {
-    DEFINE_PROP_UINT32("frequency", struct timerblock, freq_hz,   62 * 1000000),
+    DEFINE_PROP_UINT32("clock-frequency", struct timerblock, freq_hz,
+                                                                62 * 1000000),
     DEFINE_PROP_UINT8("one-timer-only", struct timerblock, one_timer_only, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -239,7 +241,7 @@
 }
 
 static TypeInfo xilinx_timer_info = {
-    .name          = "xlnx,xps-timer",
+    .name          = "xlnx.xps-timer",
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(struct timerblock),
     .class_init    = xilinx_timer_class_init,
diff --git a/iohandler.c b/iohandler.c
index dea4355..a2d871b 100644
--- a/iohandler.c
+++ b/iohandler.c
@@ -56,6 +56,8 @@
 {
     IOHandlerRecord *ioh;
 
+    assert(fd >= 0);
+
     if (!fd_read && !fd_write) {
         QLIST_FOREACH(ioh, &io_handlers, next) {
             if (ioh->fd == fd) {
diff --git a/kvm-all.c b/kvm-all.c
index 39cff55..92a7137 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -92,7 +92,7 @@
     /* The man page (and posix) say ioctl numbers are signed int, but
      * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
      * unsigned, and treating them as signed here can break things */
-    unsigned irqchip_inject_ioctl;
+    unsigned irq_set_ioctl;
 #ifdef KVM_CAP_IRQ_ROUTING
     struct kvm_irq_routing *irq_routes;
     int nr_allocated_irq_routes;
@@ -870,13 +870,13 @@
 
     event.level = level;
     event.irq = irq;
-    ret = kvm_vm_ioctl(s, s->irqchip_inject_ioctl, &event);
+    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
     if (ret < 0) {
         perror("kvm_set_irq");
         abort();
     }
 
-    return (s->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
+    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 }
 
 #ifdef KVM_CAP_IRQ_ROUTING
@@ -1237,10 +1237,6 @@
         return ret;
     }
 
-    s->irqchip_inject_ioctl = KVM_IRQ_LINE;
-    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
-        s->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
-    }
     kvm_kernel_irqchip = true;
     /* If we have an in-kernel IRQ chip then we must have asynchronous
      * interrupt delivery (though the reverse is not necessarily true)
@@ -1389,6 +1385,11 @@
 
     s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
 
+    s->irq_set_ioctl = KVM_IRQ_LINE;
+    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
+        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
+    }
+
     ret = kvm_arch_init(s);
     if (ret < 0) {
         goto err;
@@ -1409,13 +1410,11 @@
     return 0;
 
 err:
-    if (s) {
-        if (s->vmfd >= 0) {
-            close(s->vmfd);
-        }
-        if (s->fd != -1) {
-            close(s->fd);
-        }
+    if (s->vmfd >= 0) {
+        close(s->vmfd);
+    }
+    if (s->fd != -1) {
+        close(s->fd);
     }
     g_free(s);
 
@@ -1576,8 +1575,6 @@
         qemu_mutex_lock_iothread();
         kvm_arch_post_run(env, run);
 
-        kvm_flush_coalesced_mmio_buffer();
-
         if (run_ret < 0) {
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
                 DPRINTF("io window exit\n");
diff --git a/linux-user/main.c b/linux-user/main.c
index 1a1c661..e84a18c 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -89,19 +89,6 @@
 }
 #endif
 
-/* timers for rdtsc */
-
-#if 0
-
-static uint64_t emu_time;
-
-int64_t cpu_get_real_ticks(void)
-{
-    return emu_time++;
-}
-
-#endif
-
 #if defined(CONFIG_USE_NPTL)
 /***********************************************************/
 /* Helper routines for implementing atomic operations.  */
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 69b27d7..fc4cc00 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -289,46 +289,29 @@
  * struct has been locked - usually with lock_user_struct().
  */
 #define __put_user(x, hptr)\
-({\
+({ __typeof(*hptr) pu_ = (x);\
     switch(sizeof(*hptr)) {\
-    case 1:\
-        *(uint8_t *)(hptr) = (uint8_t)(typeof(*hptr))(x);\
-        break;\
-    case 2:\
-        *(uint16_t *)(hptr) = tswap16((uint16_t)(typeof(*hptr))(x));\
-        break;\
-    case 4:\
-        *(uint32_t *)(hptr) = tswap32((uint32_t)(typeof(*hptr))(x));\
-        break;\
-    case 8:\
-        *(uint64_t *)(hptr) = tswap64((typeof(*hptr))(x));\
-        break;\
-    default:\
-        abort();\
+    case 1: break;\
+    case 2: pu_ = tswap16(pu_); break; \
+    case 4: pu_ = tswap32(pu_); break; \
+    case 8: pu_ = tswap64(pu_); break; \
+    default: abort();\
     }\
+    memcpy(hptr, &pu_, sizeof(pu_)); \
     0;\
 })
 
 #define __get_user(x, hptr) \
-({\
+({ __typeof(*hptr) gu_; \
+    memcpy(&gu_, hptr, sizeof(gu_)); \
     switch(sizeof(*hptr)) {\
-    case 1:\
-        x = (typeof(*hptr))*(uint8_t *)(hptr);\
-        break;\
-    case 2:\
-        x = (typeof(*hptr))tswap16(*(uint16_t *)(hptr));\
-        break;\
-    case 4:\
-        x = (typeof(*hptr))tswap32(*(uint32_t *)(hptr));\
-        break;\
-    case 8:\
-        x = (typeof(*hptr))tswap64(*(uint64_t *)(hptr));\
-        break;\
-    default:\
-        /* avoid warning */\
-        x = 0;\
-        abort();\
+    case 1: break; \
+    case 2: gu_ = tswap16(gu_); break; \
+    case 4: gu_ = tswap32(gu_); break; \
+    case 8: gu_ = tswap64(gu_); break; \
+    default: abort();\
     }\
+    (x) = gu_; \
     0;\
 })
 
diff --git a/memory.c b/memory.c
index d528d1f..4f3ade0 100644
--- a/memory.c
+++ b/memory.c
@@ -24,7 +24,6 @@
 #include "exec-obsolete.h"
 
 unsigned memory_region_transaction_depth = 0;
-static bool memory_region_update_pending = false;
 static bool global_dirty_log = false;
 
 static QTAILQ_HEAD(memory_listeners, MemoryListener) memory_listeners
@@ -311,6 +310,9 @@
     MemoryRegion *mr = opaque;
     uint64_t tmp;
 
+    if (mr->flush_coalesced_mmio) {
+        qemu_flush_coalesced_mmio_buffer();
+    }
     tmp = mr->ops->read(mr->opaque, addr, size);
     *value |= (tmp & mask) << shift;
 }
@@ -325,6 +327,9 @@
     MemoryRegion *mr = opaque;
     uint64_t tmp;
 
+    if (mr->flush_coalesced_mmio) {
+        qemu_flush_coalesced_mmio_buffer();
+    }
     tmp = (*value >> shift) & mask;
     mr->ops->write(mr->opaque, addr, tmp, size);
 }
@@ -726,33 +731,9 @@
     address_space_update_ioeventfds(as);
 }
 
-static void memory_region_update_topology(MemoryRegion *mr)
-{
-    if (memory_region_transaction_depth) {
-        memory_region_update_pending |= !mr || mr->enabled;
-        return;
-    }
-
-    if (mr && !mr->enabled) {
-        return;
-    }
-
-    MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
-
-    if (address_space_memory.root) {
-        address_space_update_topology(&address_space_memory);
-    }
-    if (address_space_io.root) {
-        address_space_update_topology(&address_space_io);
-    }
-
-    MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
-
-    memory_region_update_pending = false;
-}
-
 void memory_region_transaction_begin(void)
 {
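+    /* Flush any buffered coalesced MMIO before the memory map changes */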
+    qemu_flush_coalesced_mmio_buffer();
     ++memory_region_transaction_depth;
 }
 
@@ -760,8 +741,17 @@
 {
     assert(memory_region_transaction_depth);
     --memory_region_transaction_depth;
-    if (!memory_region_transaction_depth && memory_region_update_pending) {
-        memory_region_update_topology(NULL);
+    if (!memory_region_transaction_depth) {
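+        /* Outermost transaction finished: rebuild both address space
+         * topologies and notify the memory listeners */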
+        MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
+
+        if (address_space_memory.root) {
+            address_space_update_topology(&address_space_memory);
+        }
+        if (address_space_io.root) {
+            address_space_update_topology(&address_space_io);
+        }
+
+        MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
     }
 }
 
@@ -826,6 +816,7 @@
     mr->dirty_log_mask = 0;
     mr->ioeventfd_nb = 0;
     mr->ioeventfds = NULL;
+    mr->flush_coalesced_mmio = false;
 }
 
 static bool memory_region_access_valid(MemoryRegion *mr,
@@ -1069,8 +1060,9 @@
 {
     uint8_t mask = 1 << client;
 
+    memory_region_transaction_begin();
     mr->dirty_log_mask = (mr->dirty_log_mask & ~mask) | (log * mask);
-    memory_region_update_topology(mr);
+    memory_region_transaction_commit();
 }
 
 bool memory_region_get_dirty(MemoryRegion *mr, target_phys_addr_t addr,
@@ -1103,16 +1095,18 @@
 void memory_region_set_readonly(MemoryRegion *mr, bool readonly)
 {
     if (mr->readonly != readonly) {
+        memory_region_transaction_begin();
         mr->readonly = readonly;
-        memory_region_update_topology(mr);
+        memory_region_transaction_commit();
     }
 }
 
 void memory_region_rom_device_set_readable(MemoryRegion *mr, bool readable)
 {
     if (mr->readable != readable) {
+        memory_region_transaction_begin();
         mr->readable = readable;
-        memory_region_update_topology(mr);
+        memory_region_transaction_commit();
     }
 }
 
@@ -1176,12 +1170,16 @@
     cmr->addr = addrrange_make(int128_make64(offset), int128_make64(size));
     QTAILQ_INSERT_TAIL(&mr->coalesced, cmr, link);
     memory_region_update_coalesced_range(mr);
+    memory_region_set_flush_coalesced(mr);
 }
 
 void memory_region_clear_coalescing(MemoryRegion *mr)
 {
     CoalescedMemoryRange *cmr;
 
+    qemu_flush_coalesced_mmio_buffer();
+    mr->flush_coalesced_mmio = false;
+
     while (!QTAILQ_EMPTY(&mr->coalesced)) {
         cmr = QTAILQ_FIRST(&mr->coalesced);
         QTAILQ_REMOVE(&mr->coalesced, cmr, link);
@@ -1190,6 +1188,19 @@
     memory_region_update_coalesced_range(mr);
 }
 
+void memory_region_set_flush_coalesced(MemoryRegion *mr)
+{
+    mr->flush_coalesced_mmio = true;
+}
+
+void memory_region_clear_flush_coalesced(MemoryRegion *mr)
+{
+    qemu_flush_coalesced_mmio_buffer();
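+    /* Keep the automatic flush as long as the region has coalesced ranges */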
+    if (QTAILQ_EMPTY(&mr->coalesced)) {
+        mr->flush_coalesced_mmio = false;
+    }
+}
+
 void memory_region_add_eventfd(MemoryRegion *mr,
                                target_phys_addr_t addr,
                                unsigned size,
@@ -1206,6 +1217,7 @@
     };
     unsigned i;
 
+    memory_region_transaction_begin();
     for (i = 0; i < mr->ioeventfd_nb; ++i) {
         if (memory_region_ioeventfd_before(mrfd, mr->ioeventfds[i])) {
             break;
@@ -1217,7 +1229,7 @@
     memmove(&mr->ioeventfds[i+1], &mr->ioeventfds[i],
             sizeof(*mr->ioeventfds) * (mr->ioeventfd_nb-1 - i));
     mr->ioeventfds[i] = mrfd;
-    memory_region_update_topology(mr);
+    memory_region_transaction_commit();
 }
 
 void memory_region_del_eventfd(MemoryRegion *mr,
@@ -1236,6 +1248,7 @@
     };
     unsigned i;
 
+    memory_region_transaction_begin();
     for (i = 0; i < mr->ioeventfd_nb; ++i) {
         if (memory_region_ioeventfd_equal(mrfd, mr->ioeventfds[i])) {
             break;
@@ -1247,7 +1260,7 @@
     --mr->ioeventfd_nb;
     mr->ioeventfds = g_realloc(mr->ioeventfds,
                                   sizeof(*mr->ioeventfds)*mr->ioeventfd_nb + 1);
-    memory_region_update_topology(mr);
+    memory_region_transaction_commit();
 }
 
 static void memory_region_add_subregion_common(MemoryRegion *mr,
@@ -1256,6 +1269,8 @@
 {
     MemoryRegion *other;
 
+    memory_region_transaction_begin();
+
     assert(!subregion->parent);
     subregion->parent = mr;
     subregion->addr = offset;
@@ -1288,7 +1303,7 @@
     }
     QTAILQ_INSERT_TAIL(&mr->subregions, subregion, subregions_link);
 done:
-    memory_region_update_topology(mr);
+    memory_region_transaction_commit();
 }
 
 
@@ -1314,10 +1329,11 @@
 void memory_region_del_subregion(MemoryRegion *mr,
                                  MemoryRegion *subregion)
 {
+    memory_region_transaction_begin();
     assert(subregion->parent == mr);
     subregion->parent = NULL;
     QTAILQ_REMOVE(&mr->subregions, subregion, subregions_link);
-    memory_region_update_topology(mr);
+    memory_region_transaction_commit();
 }
 
 void memory_region_set_enabled(MemoryRegion *mr, bool enabled)
@@ -1325,8 +1341,9 @@
     if (enabled == mr->enabled) {
         return;
     }
+    memory_region_transaction_begin();
     mr->enabled = enabled;
-    memory_region_update_topology(NULL);
+    memory_region_transaction_commit();
 }
 
 void memory_region_set_address(MemoryRegion *mr, target_phys_addr_t addr)
@@ -1352,16 +1369,15 @@
 
 void memory_region_set_alias_offset(MemoryRegion *mr, target_phys_addr_t offset)
 {
-    target_phys_addr_t old_offset = mr->alias_offset;
-
     assert(mr->alias);
-    mr->alias_offset = offset;
 
-    if (offset == old_offset || !mr->parent) {
+    if (offset == mr->alias_offset) {
         return;
     }
 
-    memory_region_update_topology(mr);
+    memory_region_transaction_begin();
+    mr->alias_offset = offset;
+    memory_region_transaction_commit();
 }
 
 ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr)
@@ -1493,14 +1509,16 @@
 
 void set_system_memory_map(MemoryRegion *mr)
 {
+    memory_region_transaction_begin();
     address_space_memory.root = mr;
-    memory_region_update_topology(NULL);
+    memory_region_transaction_commit();
 }
 
 void set_system_io_map(MemoryRegion *mr)
 {
+    memory_region_transaction_begin();
     address_space_io.root = mr;
-    memory_region_update_topology(NULL);
+    memory_region_transaction_commit();
 }
 
 uint64_t io_mem_read(MemoryRegion *mr, target_phys_addr_t addr, unsigned size)
diff --git a/memory.h b/memory.h
index bd1bbae..37ce151 100644
--- a/memory.h
+++ b/memory.h
@@ -133,6 +133,7 @@
     bool enabled;
     bool rom_device;
     bool warning_printed; /* For reservations */
+    bool flush_coalesced_mmio;
     MemoryRegion *alias;
     target_phys_addr_t alias_offset;
     unsigned priority;
@@ -252,9 +253,9 @@
                             uint64_t size);
 
 /**
- * memory_region_init_ram:  Initialize RAM memory region from a user-provided.
- *                          pointer.  Accesses into the region will modify
- *                          memory directly.
+ * memory_region_init_ram_ptr:  Initialize RAM memory region from a
+ *                              user-provided pointer.  Accesses into the
+ *                              region will modify memory directly.
  *
  * @mr: the #MemoryRegion to be initialized.
  * @name: the name of the region.
@@ -521,6 +522,31 @@
 void memory_region_clear_coalescing(MemoryRegion *mr);
 
 /**
+ * memory_region_set_flush_coalesced: Enforce memory coalescing flush before
+ *                                    accesses.
+ *
+ * Ensure that pending coalesced MMIO requests are flushed before the memory
+ * region is accessed. This property is automatically enabled for all regions
+ * passed to memory_region_set_coalescing() and memory_region_add_coalescing().
+ *
+ * @mr: the memory region to be updated.
+ */
+void memory_region_set_flush_coalesced(MemoryRegion *mr);
+
+/**
+ * memory_region_clear_flush_coalesced: Disable memory coalescing flush before
+ *                                      accesses.
+ *
+ * Clear the automatic coalesced MMIO flushing enabled via
+ * memory_region_set_flush_coalesced. Note that this service has no effect on
+ * memory regions that have MMIO coalescing enabled for themselves. For them,
+ * automatic flushing will stop once coalescing is disabled.
+ *
+ * @mr: the memory region to be updated.
+ */
+void memory_region_clear_flush_coalesced(MemoryRegion *mr);
+
+/**
  * memory_region_add_eventfd: Request an eventfd to be triggered when a word
  *                            is written to a location.
  *
@@ -581,7 +607,8 @@
                                  target_phys_addr_t offset,
                                  MemoryRegion *subregion);
 /**
- * memory_region_add_subregion: Add a subregion to a container, with overlap.
+ * memory_region_add_subregion_overlap: Add a subregion to a container
+ *                                      with overlap.
  *
  * Adds a subregion at @offset.  The subregion may overlap with other
  * subregions.  Conflicts are resolved by having a higher @priority hide a
@@ -743,7 +770,7 @@
 void memory_global_dirty_log_start(void);
 
 /**
- * memory_global_dirty_log_stop: begin dirty logging for all regions
+ * memory_global_dirty_log_stop: end dirty logging for all regions
  */
 void memory_global_dirty_log_stop(void);
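
The new flush-coalesced property documented above is typically set once when a
region is created. A short usage sketch (the device state and region names are
illustrative, only the two new calls come from this patch):

    /* ensure coalesced MMIO is drained before this region is accessed */
    memory_region_set_flush_coalesced(&s->mmio);

    /* ... and on teardown, if coalescing was never enabled on the region itself */
    memory_region_clear_flush_coalesced(&s->mmio);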
 
diff --git a/net.c b/net.c
index e5d25d4..a187a7b 100644
--- a/net.c
+++ b/net.c
@@ -357,7 +357,12 @@
 {
     nc->receive_disabled = 0;
 
-    qemu_net_queue_flush(nc->send_queue);
+    if (qemu_net_queue_flush(nc->send_queue)) {
+        /* We emptied the queue successfully, signal to the IO thread to repoll
+         * the file descriptor (for tap, for example).
+         */
+        qemu_notify_event();
+    }
 }
 
 static ssize_t qemu_send_packet_async_with_flags(NetClientState *sender,
@@ -418,16 +423,27 @@
                                 void *opaque)
 {
     NetClientState *nc = opaque;
+    int ret;
 
     if (nc->link_down) {
         return iov_size(iov, iovcnt);
     }
 
-    if (nc->info->receive_iov) {
-        return nc->info->receive_iov(nc, iov, iovcnt);
-    } else {
-        return nc_sendv_compat(nc, iov, iovcnt);
+    if (nc->receive_disabled) {
+        return 0;
     }
+
+    if (nc->info->receive_iov) {
+        ret = nc->info->receive_iov(nc, iov, iovcnt);
+    } else {
+        ret = nc_sendv_compat(nc, iov, iovcnt);
+    }
+
+    if (ret == 0) {
+        nc->receive_disabled = 1;
+    }
+
+    return ret;
 }
 
 ssize_t qemu_sendv_packet_async(NetClientState *sender,
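
The changes above give qemu_deliver_packet_iov() the same flow-control behaviour
as the non-iov path: a receive callback that returns 0 means "busy", the core
disables further delivery and queues packets, and delivery resumes once the client
flushes its queue. A minimal sketch of a client honouring that contract
(MyClientState and the my_client_* helpers are hypothetical):

    static ssize_t my_client_receive(NetClientState *nc,
                                     const uint8_t *buf, size_t size)
    {
        MyClientState *s = DO_UPCAST(MyClientState, nc, nc);

        if (my_client_tx_full(s)) {
            return 0;      /* core sets receive_disabled and queues packets */
        }
        my_client_tx_push(s, buf, size);
        return size;       /* packet consumed */
    }

    /* Later, when the backlog drains:
     *     qemu_flush_queued_packets(&s->nc);
     * which, per the hunk above, also calls qemu_notify_event() once the
     * queue empties so fd handlers are re-polled. */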
diff --git a/net/hub.c b/net/hub.c
index ac157e3..650a8b4 100644
--- a/net/hub.c
+++ b/net/hub.c
@@ -97,12 +97,12 @@
             continue;
         }
 
-        if (!qemu_can_send_packet(&port->nc)) {
-            return 0;
+        if (qemu_can_send_packet(&port->nc)) {
+            return 1;
         }
     }
 
-    return 1;
+    return 0;
 }
 
 static ssize_t net_hub_port_receive(NetClientState *nc,
diff --git a/net/queue.c b/net/queue.c
index e8030aa..254f280 100644
--- a/net/queue.c
+++ b/net/queue.c
@@ -83,12 +83,12 @@
     g_free(queue);
 }
 
-static ssize_t qemu_net_queue_append(NetQueue *queue,
-                                     NetClientState *sender,
-                                     unsigned flags,
-                                     const uint8_t *buf,
-                                     size_t size,
-                                     NetPacketSent *sent_cb)
+static void qemu_net_queue_append(NetQueue *queue,
+                                  NetClientState *sender,
+                                  unsigned flags,
+                                  const uint8_t *buf,
+                                  size_t size,
+                                  NetPacketSent *sent_cb)
 {
     NetPacket *packet;
 
@@ -100,16 +100,14 @@
     memcpy(packet->data, buf, size);
 
     QTAILQ_INSERT_TAIL(&queue->packets, packet, entry);
-
-    return size;
 }
 
-static ssize_t qemu_net_queue_append_iov(NetQueue *queue,
-                                         NetClientState *sender,
-                                         unsigned flags,
-                                         const struct iovec *iov,
-                                         int iovcnt,
-                                         NetPacketSent *sent_cb)
+static void qemu_net_queue_append_iov(NetQueue *queue,
+                                      NetClientState *sender,
+                                      unsigned flags,
+                                      const struct iovec *iov,
+                                      int iovcnt,
+                                      NetPacketSent *sent_cb)
 {
     NetPacket *packet;
     size_t max_len = 0;
@@ -133,8 +131,6 @@
     }
 
     QTAILQ_INSERT_TAIL(&queue->packets, packet, entry);
-
-    return packet->size;
 }
 
 static ssize_t qemu_net_queue_deliver(NetQueue *queue,
@@ -177,7 +173,8 @@
     ssize_t ret;
 
     if (queue->delivering || !qemu_can_send_packet(sender)) {
-        return qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
+        qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
+        return 0;
     }
 
     ret = qemu_net_queue_deliver(queue, sender, flags, data, size);
@@ -201,8 +198,8 @@
     ssize_t ret;
 
     if (queue->delivering || !qemu_can_send_packet(sender)) {
-        return qemu_net_queue_append_iov(queue, sender, flags,
-                                         iov, iovcnt, sent_cb);
+        qemu_net_queue_append_iov(queue, sender, flags, iov, iovcnt, sent_cb);
+        return 0;
     }
 
     ret = qemu_net_queue_deliver_iov(queue, sender, flags, iov, iovcnt);
@@ -228,7 +225,7 @@
     }
 }
 
-void qemu_net_queue_flush(NetQueue *queue)
+bool qemu_net_queue_flush(NetQueue *queue)
 {
     while (!QTAILQ_EMPTY(&queue->packets)) {
         NetPacket *packet;
@@ -244,7 +241,7 @@
                                      packet->size);
         if (ret == 0) {
             QTAILQ_INSERT_HEAD(&queue->packets, packet, entry);
-            break;
+            return false;
         }
 
         if (packet->sent_cb) {
@@ -253,4 +250,5 @@
 
         g_free(packet);
     }
+    return true;
 }
diff --git a/net/queue.h b/net/queue.h
index 9d44a9b..fc02b33 100644
--- a/net/queue.h
+++ b/net/queue.h
@@ -53,6 +53,6 @@
                                 NetPacketSent *sent_cb);
 
 void qemu_net_queue_purge(NetQueue *queue, NetClientState *from);
-void qemu_net_queue_flush(NetQueue *queue);
+bool qemu_net_queue_flush(NetQueue *queue);
 
 #endif /* QEMU_NET_QUEUE_H */
diff --git a/net/socket.c b/net/socket.c
index 7c602e4..5e0c92e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -32,6 +32,7 @@
 #include "qemu-error.h"
 #include "qemu-option.h"
 #include "qemu_socket.h"
+#include "iov.h"
 
 typedef struct NetSocketState {
     NetClientState nc;
@@ -40,29 +41,106 @@
     int state; /* 0 = getting length, 1 = getting data */
     unsigned int index;
     unsigned int packet_len;
+    unsigned int send_index;      /* number of bytes sent (only SOCK_STREAM) */
     uint8_t buf[4096];
     struct sockaddr_in dgram_dst; /* contains inet host and port destination iff connectionless (SOCK_DGRAM) */
+    IOHandler *send_fn;           /* differs between SOCK_STREAM/SOCK_DGRAM */
+    bool read_poll;               /* waiting to receive data? */
+    bool write_poll;              /* waiting to transmit data? */
 } NetSocketState;
 
 static void net_socket_accept(void *opaque);
+static void net_socket_writable(void *opaque);
 
-/* XXX: we consider we can send the whole packet without blocking */
+/* Only read packets from socket when peer can receive them */
+static int net_socket_can_send(void *opaque)
+{
+    NetSocketState *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+static void net_socket_update_fd_handler(NetSocketState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll  ? net_socket_can_send : NULL,
+                         s->read_poll  ? s->send_fn : NULL,
+                         s->write_poll ? net_socket_writable : NULL,
+                         s);
+}
+
+static void net_socket_read_poll(NetSocketState *s, bool enable)
+{
+    s->read_poll = enable;
+    net_socket_update_fd_handler(s);
+}
+
+static void net_socket_write_poll(NetSocketState *s, bool enable)
+{
+    s->write_poll = enable;
+    net_socket_update_fd_handler(s);
+}
+
+static void net_socket_writable(void *opaque)
+{
+    NetSocketState *s = opaque;
+
+    net_socket_write_poll(s, false);
+
+    qemu_flush_queued_packets(&s->nc);
+}
+
 static ssize_t net_socket_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
     NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
-    uint32_t len;
-    len = htonl(size);
+    uint32_t len = htonl(size);
+    struct iovec iov[] = {
+        {
+            .iov_base = &len,
+            .iov_len  = sizeof(len),
+        }, {
+            .iov_base = (void *)buf,
+            .iov_len  = size,
+        },
+    };
+    size_t remaining;
+    ssize_t ret;
 
-    send_all(s->fd, (const uint8_t *)&len, sizeof(len));
-    return send_all(s->fd, buf, size);
+    remaining = iov_size(iov, 2) - s->send_index;
+    ret = iov_send(s->fd, iov, 2, s->send_index, remaining);
+
+    if (ret == -1 && errno == EAGAIN) {
+        ret = 0; /* handled further down */
+    }
+    if (ret == -1) {
+        s->send_index = 0;
+        return -errno;
+    }
+    if (ret < (ssize_t)remaining) {
+        s->send_index += ret;
+        net_socket_write_poll(s, true);
+        return 0;
+    }
+    s->send_index = 0;
+    return size;
 }
 
 static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, size_t size)
 {
     NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
+    ssize_t ret;
 
-    return sendto(s->fd, (const void *)buf, size, 0,
-                  (struct sockaddr *)&s->dgram_dst, sizeof(s->dgram_dst));
+    do {
+        ret = sendto(s->fd, buf, size, 0,
+                     (struct sockaddr *)&s->dgram_dst,
+                     sizeof(s->dgram_dst));
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret == -1 && errno == EAGAIN) {
+        net_socket_write_poll(s, true);
+        return 0;
+    }
+    return ret;
 }
 
 static void net_socket_send(void *opaque)
@@ -81,7 +159,8 @@
     } else if (size == 0) {
         /* end of connection */
     eoc:
-        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+        net_socket_read_poll(s, false);
+        net_socket_write_poll(s, false);
         if (s->listen_fd != -1) {
             qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s);
         }
@@ -152,7 +231,8 @@
         return;
     if (size == 0) {
         /* end of connection */
-        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+        net_socket_read_poll(s, false);
+        net_socket_write_poll(s, false);
         return;
     }
     qemu_send_packet(&s->nc, s->buf, size);
@@ -243,7 +323,8 @@
 {
     NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
     if (s->fd != -1) {
-        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+        net_socket_read_poll(s, false);
+        net_socket_write_poll(s, false);
         close(s->fd);
         s->fd = -1;
     }
@@ -314,8 +395,8 @@
 
     s->fd = fd;
     s->listen_fd = -1;
-
-    qemu_set_fd_handler(s->fd, net_socket_send_dgram, NULL, s);
+    s->send_fn = net_socket_send_dgram;
+    net_socket_read_poll(s, true);
 
     /* mcast: save bound address as dst */
     if (is_connected) {
@@ -332,7 +413,8 @@
 static void net_socket_connect(void *opaque)
 {
     NetSocketState *s = opaque;
-    qemu_set_fd_handler(s->fd, net_socket_send, NULL, s);
+    s->send_fn = net_socket_send;
+    net_socket_read_poll(s, true);
 }
 
 static NetClientInfo net_socket_info = {
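
The rewritten socket backend above no longer assumes the whole length-prefixed
frame can be written without blocking: it tracks send_index, returns 0 so the net
core queues the packet, and enables the write handler until the socket becomes
writable again. A standalone sketch of that pattern on a non-blocking fd (plain
POSIX sockets, not a QEMU API):

    #include <errno.h>
    #include <stdint.h>
    #include <sys/socket.h>

    /* Returns size when the frame is fully sent, 0 to retry when writable,
     * -errno on a hard error.  *sent remembers the progress so far. */
    static ssize_t nb_send(int fd, const uint8_t *buf, size_t size, size_t *sent)
    {
        ssize_t ret = send(fd, buf + *sent, size - *sent, 0);

        if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
            return 0;                 /* wait for POLLOUT */
        }
        if (ret < 0) {
            *sent = 0;
            return -errno;
        }
        *sent += ret;
        if (*sent < size) {
            return 0;                 /* partial write, keep polling */
        }
        *sent = 0;
        return size;
    }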
diff --git a/oslib-win32.c b/oslib-win32.c
index ffbc6d0..51b33e8 100644
--- a/oslib-win32.c
+++ b/oslib-win32.c
@@ -74,6 +74,30 @@
     VirtualFree(ptr, 0, MEM_RELEASE);
 }
 
+/* FIXME: add proper locking */
+struct tm *gmtime_r(const time_t *timep, struct tm *result)
+{
+    struct tm *p = gmtime(timep);
+    memset(result, 0, sizeof(*result));
+    if (p) {
+        *result = *p;
+        p = result;
+    }
+    return p;
+}
+
+/* FIXME: add proper locking */
+struct tm *localtime_r(const time_t *timep, struct tm *result)
+{
+    struct tm *p = localtime(timep);
+    memset(result, 0, sizeof(*result));
+    if (p) {
+        *result = *p;
+        p = result;
+    }
+    return p;
+}
+
 void socket_set_block(int fd)
 {
     unsigned long opt = 0;
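
The gmtime_r()/localtime_r() wrappers added above keep the POSIX calling
convention (caller-supplied result buffer, NULL on failure), even though they are
not yet thread-safe as the FIXMEs note. A small usage sketch:

    time_t now = time(NULL);
    struct tm tm_utc;

    if (gmtime_r(&now, &tm_utc) != NULL) {
        /* tm_utc now holds the broken-down UTC time */
    }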
diff --git a/qapi-schema-guest.json b/qapi-schema-guest.json
index d955cf1..ed0eb69 100644
--- a/qapi-schema-guest.json
+++ b/qapi-schema-guest.json
@@ -293,7 +293,7 @@
 ##
 # @GuestFsFreezeStatus
 #
-# An enumation of filesystem freeze states
+# An enumeration of filesystem freeze states
 #
 # @thawed: filesystems thawed/unfrozen
 #
diff --git a/qapi-schema.json b/qapi-schema.json
index a9f465a..14e44199 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -118,7 +118,7 @@
 ##
 # @RunState
 #
-# An enumation of VM run states.
+# An enumeration of VM run states.
 #
 # @debug: QEMU is running on a debugger
 #
@@ -156,6 +156,70 @@
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog' ] }
 
 ##
+# @SnapshotInfo
+#
+# @id: unique snapshot id
+#
+# @name: user chosen name
+#
+# @vm-state-size: size of the VM state
+#
+# @date-sec: UTC date of the snapshot in seconds
+#
+# @date-nsec: fractional part in nanoseconds to be used with date-sec
+#
+# @vm-clock-sec: VM clock relative to boot in seconds
+#
+# @vm-clock-nsec: fractional part in nanoseconds to be used with vm-clock-sec
+#
+# Since: 1.3
+#
+##
+
+{ 'type': 'SnapshotInfo',
+  'data': { 'id': 'str', 'name': 'str', 'vm-state-size': 'int',
+            'date-sec': 'int', 'date-nsec': 'int',
+            'vm-clock-sec': 'int', 'vm-clock-nsec': 'int' } }
+
+##
+# @ImageInfo:
+#
+# Information about a QEMU image file
+#
+# @filename: name of the image file
+#
+# @format: format of the image file
+#
+# @virtual-size: maximum capacity in bytes of the image
+#
+# @actual-size: #optional actual size on disk in bytes of the image
+#
+# @dirty-flag: #optional true if image is not cleanly closed
+#
+# @cluster-size: #optional size of a cluster in bytes
+#
+# @encrypted: #optional true if the image is encrypted
+#
+# @backing-filename: #optional name of the backing file
+#
+# @full-backing-filename: #optional full path of the backing file
+#
+# @backing-filename-format: #optional the format of the backing file
+#
+# @snapshots: #optional list of VM snapshots
+#
+# Since: 1.3
+#
+##
+
+{ 'type': 'ImageInfo',
+  'data': {'filename': 'str', 'format': 'str', '*dirty-flag': 'bool',
+           '*actual-size': 'int', 'virtual-size': 'int',
+           '*cluster-size': 'int', '*encrypted': 'bool',
+           '*backing-filename': 'str', '*full-backing-filename': 'str',
+           '*backing-filename-format': 'str', '*snapshots': ['SnapshotInfo'] } }
+
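+# An illustrative JSON serialization of ImageInfo (values are made up, not
+# taken from a real image):
+#
+# { "filename": "disk.qcow2", "format": "qcow2",
+#   "virtual-size": 1073741824, "actual-size": 197120,
+#   "cluster-size": 65536, "dirty-flag": false,
+#   "snapshots": [ { "id": "1", "name": "snap1", "vm-state-size": 0,
+#                    "date-sec": 1345000000, "date-nsec": 0,
+#                    "vm-clock-sec": 12, "vm-clock-nsec": 500000000 } ] }
+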
+##
 # @StatusInfo:
 #
 # Information about VCPU run state
@@ -785,7 +849,7 @@
 ##
 # @SpiceQueryMouseMode
 #
-# An enumation of Spice mouse states.
+# An enumeration of Spice mouse states.
 #
 # @client: Mouse cursor position is determined by the client.
 #
diff --git a/qemu-char.c b/qemu-char.c
index 767da93..7f0f895 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2141,18 +2141,13 @@
 
 static void tcp_chr_accept(void *opaque);
 
-static void tcp_chr_connect(void *opaque);
-
 static int tcp_chr_write(CharDriverState *chr, const uint8_t *buf, int len)
 {
     TCPCharDriver *s = chr->opaque;
     if (s->connected) {
         return send_all(s->fd, buf, len);
-    } else if (s->listen_fd == -1) {
-        /* (Re-)connect for unconnected writing */
-        tcp_chr_connect(chr);
-        return 0;
     } else {
+        /* XXX: indicate an error ? */
         return len;
     }
 }
@@ -2334,8 +2329,10 @@
     TCPCharDriver *s = chr->opaque;
 
     s->connected = 1;
-    qemu_set_fd_handler2(s->fd, tcp_chr_read_poll,
-                         tcp_chr_read, NULL, chr);
+    if (s->fd >= 0) {
+        qemu_set_fd_handler2(s->fd, tcp_chr_read_poll,
+                             tcp_chr_read, NULL, chr);
+    }
     qemu_chr_generic_open(chr);
 }
 
diff --git a/qemu-config.c b/qemu-config.c
index eba977e..12eafbb 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -615,6 +615,10 @@
             .name = "dump-guest-core",
             .type = QEMU_OPT_BOOL,
             .help = "Include guest memory in  a core dump",
+        }, {
+            .name = "mem-merge",
+            .type = QEMU_OPT_BOOL,
+            .help = "enable/disable memory merge support",
         },
         { /* End of list */ }
     },
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 39419a0..0ef82e9 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -34,9 +34,9 @@
 ETEXI
 
 DEF("info", img_info,
-    "info [-f fmt] filename")
+    "info [-f fmt] [--output=ofmt] filename")
 STEXI
-@item info [-f @var{fmt}] @var{filename}
+@item info [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
 ETEXI
 
 DEF("snapshot", img_snapshot,
diff --git a/qemu-img.c b/qemu-img.c
index b41e670..f17f187 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -21,12 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include "qapi-visit.h"
+#include "qapi/qmp-output-visitor.h"
+#include "qjson.h"
 #include "qemu-common.h"
 #include "qemu-option.h"
 #include "qemu-error.h"
 #include "osdep.h"
 #include "sysemu.h"
 #include "block_int.h"
+#include <getopt.h>
 #include <stdio.h>
 
 #ifdef _WIN32
@@ -84,12 +88,13 @@
            "  '-p' show progress of command (only certain commands)\n"
            "  '-S' indicates the consecutive number of bytes that must contain only zeros\n"
            "       for qemu-img to create a sparse image during conversion\n"
+           "  '--output' takes the format in which the output must be done (human or json)\n"
            "\n"
            "Parameters to check subcommand:\n"
            "  '-r' tries to repair any inconsistencies that are found during the check.\n"
            "       '-r leaks' repairs only cluster leaks, whereas '-r all' fixes all\n"
            "       kinds of errors, with a higher risk of choosing the wrong fix or\n"
-           "       hiding corruption that has already occured.\n"
+           "       hiding corruption that has already occurred.\n"
            "\n"
            "Parameters to snapshot subcommand:\n"
            "  'snapshot' is the name of the snapshot to create, apply or delete\n"
@@ -221,7 +226,8 @@
 
 static BlockDriverState *bdrv_new_open(const char *filename,
                                        const char *fmt,
-                                       int flags)
+                                       int flags,
+                                       bool require_io)
 {
     BlockDriverState *bs;
     BlockDriver *drv;
@@ -246,7 +252,7 @@
         goto fail;
     }
 
-    if (bdrv_is_encrypted(bs)) {
+    if (bdrv_is_encrypted(bs) && require_io) {
         printf("Disk image '%s' is encrypted.\n", filename);
         if (read_password(password, sizeof(password)) < 0) {
             error_report("No password given");
@@ -413,7 +419,7 @@
     }
     filename = argv[optind++];
 
-    bs = bdrv_new_open(filename, fmt, flags);
+    bs = bdrv_new_open(filename, fmt, flags, true);
     if (!bs) {
         return 1;
     }
@@ -520,7 +526,7 @@
         return -1;
     }
 
-    bs = bdrv_new_open(filename, fmt, flags);
+    bs = bdrv_new_open(filename, fmt, flags, true);
     if (!bs) {
         return 1;
     }
@@ -762,7 +768,7 @@
 
     total_sectors = 0;
     for (bs_i = 0; bs_i < bs_n; bs_i++) {
-        bs[bs_i] = bdrv_new_open(argv[optind + bs_i], fmt, BDRV_O_FLAGS);
+        bs[bs_i] = bdrv_new_open(argv[optind + bs_i], fmt, BDRV_O_FLAGS, true);
         if (!bs[bs_i]) {
             error_report("Could not open '%s'", argv[optind + bs_i]);
             ret = -1;
@@ -881,7 +887,7 @@
         return -1;
     }
 
-    out_bs = bdrv_new_open(out_filename, out_fmt, flags);
+    out_bs = bdrv_new_open(out_filename, out_fmt, flags, true);
     if (!out_bs) {
         ret = -1;
         goto out;
@@ -1102,21 +1108,174 @@
     g_free(sn_tab);
 }
 
-static int img_info(int argc, char **argv)
+static void collect_snapshots(BlockDriverState *bs, ImageInfo *info)
 {
-    int c;
-    const char *filename, *fmt;
-    BlockDriverState *bs;
-    char size_buf[128], dsize_buf[128];
+    int i, sn_count;
+    QEMUSnapshotInfo *sn_tab = NULL;
+    SnapshotInfoList *info_list, *cur_item = NULL;
+    sn_count = bdrv_snapshot_list(bs, &sn_tab);
+
+    for (i = 0; i < sn_count; i++) {
+        info->has_snapshots = true;
+        info_list = g_new0(SnapshotInfoList, 1);
+
+        info_list->value                = g_new0(SnapshotInfo, 1);
+        info_list->value->id            = g_strdup(sn_tab[i].id_str);
+        info_list->value->name          = g_strdup(sn_tab[i].name);
+        info_list->value->vm_state_size = sn_tab[i].vm_state_size;
+        info_list->value->date_sec      = sn_tab[i].date_sec;
+        info_list->value->date_nsec     = sn_tab[i].date_nsec;
+        info_list->value->vm_clock_sec  = sn_tab[i].vm_clock_nsec / 1000000000;
+        info_list->value->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000;
+
+        /* XXX: waiting for the qapi to support qemu-queue.h types */
+        if (!cur_item) {
+            info->snapshots = cur_item = info_list;
+        } else {
+            cur_item->next = info_list;
+            cur_item = info_list;
+        }
+
+    }
+
+    g_free(sn_tab);
+}
+
+static void dump_json_image_info(ImageInfo *info)
+{
+    Error *errp = NULL;
+    QString *str;
+    QmpOutputVisitor *ov = qmp_output_visitor_new();
+    QObject *obj;
+    visit_type_ImageInfo(qmp_output_get_visitor(ov),
+                         &info, NULL, &errp);
+    obj = qmp_output_get_qobject(ov);
+    str = qobject_to_json_pretty(obj);
+    assert(str != NULL);
+    printf("%s\n", qstring_get_str(str));
+    qobject_decref(obj);
+    qmp_output_visitor_cleanup(ov);
+    QDECREF(str);
+}
+
+static void collect_image_info(BlockDriverState *bs,
+                   ImageInfo *info,
+                   const char *filename,
+                   const char *fmt)
+{
     uint64_t total_sectors;
-    int64_t allocated_size;
     char backing_filename[1024];
     char backing_filename2[1024];
     BlockDriverInfo bdi;
 
+    bdrv_get_geometry(bs, &total_sectors);
+
+    info->filename        = g_strdup(filename);
+    info->format          = g_strdup(bdrv_get_format_name(bs));
+    info->virtual_size    = total_sectors * 512;
+    info->actual_size     = bdrv_get_allocated_file_size(bs);
+    info->has_actual_size = info->actual_size >= 0;
+    if (bdrv_is_encrypted(bs)) {
+        info->encrypted = true;
+        info->has_encrypted = true;
+    }
+    if (bdrv_get_info(bs, &bdi) >= 0) {
+        if (bdi.cluster_size != 0) {
+            info->cluster_size = bdi.cluster_size;
+            info->has_cluster_size = true;
+        }
+        info->dirty_flag = bdi.is_dirty;
+        info->has_dirty_flag = true;
+    }
+    bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
+    if (backing_filename[0] != '\0') {
+        info->backing_filename = g_strdup(backing_filename);
+        info->has_backing_filename = true;
+        bdrv_get_full_backing_filename(bs, backing_filename2,
+                                       sizeof(backing_filename2));
+
+        if (strcmp(backing_filename, backing_filename2) != 0) {
+            info->full_backing_filename =
+                        g_strdup(backing_filename2);
+            info->has_full_backing_filename = true;
+        }
+
+        if (bs->backing_format[0]) {
+            info->backing_filename_format = g_strdup(bs->backing_format);
+            info->has_backing_filename_format = true;
+        }
+    }
+}
+
+static void dump_human_image_info(ImageInfo *info)
+{
+    char size_buf[128], dsize_buf[128];
+    if (!info->has_actual_size) {
+        snprintf(dsize_buf, sizeof(dsize_buf), "unavailable");
+    } else {
+        get_human_readable_size(dsize_buf, sizeof(dsize_buf),
+                                info->actual_size);
+    }
+    get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size);
+    printf("image: %s\n"
+           "file format: %s\n"
+           "virtual size: %s (%" PRId64 " bytes)\n"
+           "disk size: %s\n",
+           info->filename, info->format, size_buf,
+           info->virtual_size,
+           dsize_buf);
+
+    if (info->has_encrypted && info->encrypted) {
+        printf("encrypted: yes\n");
+    }
+
+    if (info->has_cluster_size) {
+        printf("cluster_size: %" PRId64 "\n", info->cluster_size);
+    }
+
+    if (info->has_dirty_flag && info->dirty_flag) {
+        printf("cleanly shut down: no\n");
+    }
+
+    if (info->has_backing_filename) {
+        printf("backing file: %s", info->backing_filename);
+        if (info->has_full_backing_filename) {
+            printf(" (actual path: %s)", info->full_backing_filename);
+        }
+        putchar('\n');
+        if (info->has_backing_filename_format) {
+            printf("backing file format: %s\n", info->backing_filename_format);
+        }
+    }
+}
+
+enum {OPTION_OUTPUT = 256};
+
+typedef enum OutputFormat {
+    OFORMAT_JSON,
+    OFORMAT_HUMAN,
+} OutputFormat;
+
+static int img_info(int argc, char **argv)
+{
+    int c;
+    OutputFormat output_format = OFORMAT_HUMAN;
+    const char *filename, *fmt, *output;
+    BlockDriverState *bs;
+    ImageInfo *info;
+
     fmt = NULL;
+    output = NULL;
     for(;;) {
-        c = getopt(argc, argv, "f:h");
+        int option_index = 0;
+        static const struct option long_options[] = {
+            {"help", no_argument, 0, 'h'},
+            {"format", required_argument, 0, 'f'},
+            {"output", required_argument, 0, OPTION_OUTPUT},
+            {0, 0, 0, 0}
+        };
+        c = getopt_long(argc, argv, "f:h",
+                        long_options, &option_index);
         if (c == -1) {
             break;
         }
@@ -1128,6 +1287,9 @@
         case 'f':
             fmt = optarg;
             break;
+        case OPTION_OUTPUT:
+            output = optarg;
+            break;
         }
     }
     if (optind >= argc) {
@@ -1135,48 +1297,35 @@
     }
     filename = argv[optind++];
 
-    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_NO_BACKING);
+    if (output && !strcmp(output, "json")) {
+        output_format = OFORMAT_JSON;
+    } else if (output && !strcmp(output, "human")) {
+        output_format = OFORMAT_HUMAN;
+    } else if (output) {
+        error_report("--output must be used with human or json as argument.");
+        return 1;
+    }
+
+    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_NO_BACKING, false);
     if (!bs) {
         return 1;
     }
-    bdrv_get_geometry(bs, &total_sectors);
-    get_human_readable_size(size_buf, sizeof(size_buf), total_sectors * 512);
-    allocated_size = bdrv_get_allocated_file_size(bs);
-    if (allocated_size < 0) {
-        snprintf(dsize_buf, sizeof(dsize_buf), "unavailable");
-    } else {
-        get_human_readable_size(dsize_buf, sizeof(dsize_buf),
-                                allocated_size);
+
+    info = g_new0(ImageInfo, 1);
+    collect_image_info(bs, info, filename, fmt);
+
+    switch (output_format) {
+    case OFORMAT_HUMAN:
+        dump_human_image_info(info);
+        dump_snapshots(bs);
+        break;
+    case OFORMAT_JSON:
+        collect_snapshots(bs, info);
+        dump_json_image_info(info);
+        break;
     }
-    printf("image: %s\n"
-           "file format: %s\n"
-           "virtual size: %s (%" PRId64 " bytes)\n"
-           "disk size: %s\n",
-           filename, bdrv_get_format_name(bs), size_buf,
-           (total_sectors * 512),
-           dsize_buf);
-    if (bdrv_is_encrypted(bs)) {
-        printf("encrypted: yes\n");
-    }
-    if (bdrv_get_info(bs, &bdi) >= 0) {
-        if (bdi.cluster_size != 0) {
-            printf("cluster_size: %d\n", bdi.cluster_size);
-        }
-        if (bdi.is_dirty) {
-            printf("cleanly shut down: no\n");
-        }
-    }
-    bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
-    if (backing_filename[0] != '\0') {
-        bdrv_get_full_backing_filename(bs, backing_filename2,
-                                       sizeof(backing_filename2));
-        printf("backing file: %s", backing_filename);
-        if (strcmp(backing_filename, backing_filename2) != 0) {
-            printf(" (actual path: %s)", backing_filename2);
-        }
-        putchar('\n');
-    }
-    dump_snapshots(bs);
+
+    qapi_free_ImageInfo(info);
     bdrv_delete(bs);
     return 0;
 }
@@ -1248,7 +1397,7 @@
     filename = argv[optind++];
 
     /* Open the image */
-    bs = bdrv_new_open(filename, NULL, bdrv_oflags);
+    bs = bdrv_new_open(filename, NULL, bdrv_oflags, true);
     if (!bs) {
         return 1;
     }
@@ -1366,7 +1515,7 @@
      * Ignore the old backing file for unsafe rebase in case we want to correct
      * the reference to a renamed or moved backing file.
      */
-    bs = bdrv_new_open(filename, fmt, flags);
+    bs = bdrv_new_open(filename, fmt, flags, true);
     if (!bs) {
         return 1;
     }
@@ -1639,7 +1788,7 @@
     n = qemu_opt_get_size(param, BLOCK_OPT_SIZE, 0);
     qemu_opts_del(param);
 
-    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_RDWR);
+    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_RDWR, true);
     if (!bs) {
         ret = -1;
         goto out;
diff --git a/qemu-img.texi b/qemu-img.texi
index 6b42e35..8b05f2c 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -87,7 +87,7 @@
 If @code{-r} is specified, qemu-img tries to repair any inconsistencies found
 during the check. @code{-r leaks} repairs only cluster leaks, whereas
 @code{-r all} fixes all kinds of errors, with a higher risk of choosing the
-wrong fix or hiding corruption that has already occured.
+wrong fix or hiding corruption that has already occurred.
 
 Only the formats @code{qcow2}, @code{qed} and @code{vdi} support
 consistency checks.
@@ -129,12 +129,13 @@
 @var{backing_file} should have the same content as the input's base image,
 however the path, image format, etc may differ.
 
-@item info [-f @var{fmt}] @var{filename}
+@item info [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
 
 Give information about the disk image @var{filename}. Use it in
 particular to know the size reserved on disk which can be different
 from the displayed size. If VM snapshots are stored in the disk image,
-they are displayed too.
+they are displayed too. The command can output in the format @var{ofmt}
+which is either @code{human} or @code{json}.
 
 @item snapshot [-l | -a @var{snapshot} | -c @var{snapshot} | -d @var{snapshot} ] @var{filename}
 
diff --git a/qemu-options.hx b/qemu-options.hx
index 804a2d1..09c86c4 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -38,7 +38,8 @@
     "                supported accelerators are kvm, xen, tcg (default: tcg)\n"
     "                kernel_irqchip=on|off controls accelerated irqchip support\n"
     "                kvm_shadow_mem=size of KVM shadow MMU\n"
-    "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n",
+    "                dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
+    "                mem-merge=on|off controls memory merge support (default: on)\n",
     QEMU_ARCH_ALL)
 STEXI
 @item -machine [type=]@var{name}[,prop=@var{value}[,...]]
@@ -57,6 +58,10 @@
 Defines the size of the KVM shadow MMU.
 @item dump-guest-core=on|off
 Include guest memory in a core dump. The default is on.
+@item mem-merge=on|off
+Enables or disables memory merge support. This feature, when supported by
+the host, de-duplicates identical memory pages among VM instances
+(enabled by default).
 @end table
 ETEXI
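
A quick invocation example for the new property (the binary name, machine type
and disk image below are placeholders):

    qemu-system-x86_64 -machine pc,mem-merge=off disk.img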
 
@@ -1357,6 +1362,7 @@
 Not all devices are supported on all targets.  Use -net nic,model=?
 for a list of available devices for your target.
 
+@item -netdev user,id=@var{id}[,@var{option}][,@var{option}][,...]
 @item -net user[,@var{option}][,@var{option}][,...]
 Use the user mode network stack which requires no administrator
 privilege to run. Valid options are:
@@ -1365,6 +1371,7 @@
 @item vlan=@var{n}
 Connect user mode stack to VLAN @var{n} (@var{n} = 0 is the default).
 
+@item id=@var{id}
 @item name=@var{name}
 Assign symbolic name for use in monitor commands.
 
@@ -1490,6 +1497,7 @@
 syntax gives undefined results. Their use for new applications is discouraged
 as they will be removed from future versions.
 
+@item -netdev tap,id=@var{id}[,fd=@var{h}][,ifname=@var{name}][,script=@var{file}][,downscript=@var{dfile}][,helper=@var{helper}]
 @item -net tap[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}][,ifname=@var{name}][,script=@var{file}][,downscript=@var{dfile}][,helper=@var{helper}]
 Connect the host TAP network interface @var{name} to VLAN @var{n}.
 
@@ -1529,6 +1537,7 @@
                  -net nic -net tap,"helper=/usr/local/libexec/qemu-bridge-helper"
 @end example
 
+@item -netdev bridge,id=@var{id}[,br=@var{bridge}][,helper=@var{helper}]
 @item -net bridge[,vlan=@var{n}][,name=@var{name}][,br=@var{bridge}][,helper=@var{helper}]
 Connect a host TAP network interface to a host bridge device.
 
@@ -1551,6 +1560,7 @@
 qemu-system-i386 linux.img -net bridge,br=qemubr0 -net nic,model=virtio
 @end example
 
+@item -netdev socket,id=@var{id}[,fd=@var{h}][,listen=[@var{host}]:@var{port}][,connect=@var{host}:@var{port}]
 @item -net socket[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}] [,listen=[@var{host}]:@var{port}][,connect=@var{host}:@var{port}]
 
 Connect the VLAN @var{n} to a remote VLAN in another QEMU virtual
@@ -1573,6 +1583,7 @@
                  -net socket,connect=127.0.0.1:1234
 @end example
 
+@item -netdev socket,id=@var{id}[,fd=@var{h}][,mcast=@var{maddr}:@var{port}[,localaddr=@var{addr}]]
 @item -net socket[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}][,mcast=@var{maddr}:@var{port}[,localaddr=@var{addr}]]
 
 Create a VLAN @var{n} shared with another QEMU virtual
@@ -1624,6 +1635,7 @@
                  -net socket,mcast=239.192.168.1:1102,localaddr=1.2.3.4
 @end example
 
+@item -netdev vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
 @item -net vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
 Connect VLAN @var{n} to PORT @var{n} of a vde switch running on host and
 listening for incoming connections on @var{socketpath}. Use GROUP @var{groupname}
diff --git a/qemu-os-win32.h b/qemu-os-win32.h
index 753679b..3b5a35b 100644
--- a/qemu-os-win32.h
+++ b/qemu-os-win32.h
@@ -68,6 +68,12 @@
 /* Declaration of ffs() is missing in MinGW's strings.h. */
 int ffs(int i);
 
+/* Missing POSIX functions. Don't use MinGW-w64 macros. */
+#undef gmtime_r
+struct tm *gmtime_r(const time_t *timep, struct tm *result);
+#undef localtime_r
+struct tm *localtime_r(const time_t *timep, struct tm *result);
+
 static inline void os_setup_signal_handling(void) {}
 static inline void os_daemonize(void) {}
 static inline void os_setup_post(void) {}
diff --git a/qemu-timer.h b/qemu-timer.h
index f8af595..da7e97c 100644
--- a/qemu-timer.h
+++ b/qemu-timer.h
@@ -218,7 +218,7 @@
     return val;
 }
 
-#elif defined(__sparc_v8plus__) || defined(__sparc_v8plusa__) || defined(__sparc_v9__)
+#elif defined(__sparc__)
 
 static inline int64_t cpu_get_real_ticks (void)
 {
@@ -227,6 +227,8 @@
     asm volatile("rd %%tick,%0" : "=r"(rval));
     return rval;
 #else
+    /* We need an %o or %g register for this.  For recent enough gcc
+       there is an "h" constraint for that.  Don't bother with that.  */
     union {
         uint64_t i64;
         struct {
@@ -234,8 +236,8 @@
             uint32_t low;
         }       i32;
     } rval;
-    asm volatile("rd %%tick,%1; srlx %1,32,%0"
-                 : "=r"(rval.i32.high), "=r"(rval.i32.low));
+    asm volatile("rd %%tick,%%g1; srlx %%g1,32,%0; mov %%g1,%1"
+                 : "=r"(rval.i32.high), "=r"(rval.i32.low) : : "g1");
     return rval.i64;
 #endif
 }
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index a639c5b..53a6f87 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -28,7 +28,21 @@
     output="$PWD"
 fi
 
-for arch in x86 powerpc s390; do
+# This will pick up non-directories too (e.g. "Kconfig") but we will
+# ignore them in the next loop.
+ARCHLIST=$(cd "$linux/arch" && echo *)
+
+for arch in $ARCHLIST; do
+    # Discard anything which isn't a KVM-supporting architecture
+    if ! [ -e "$linux/arch/$arch/include/asm/kvm.h" ]; then
+        continue
+    fi
+
+    # Blacklist architectures which have KVM headers but are actually dead
+    if [ "$arch" = "ia64" ]; then
+        continue
+    fi
+
     make -C "$linux" INSTALL_HDR_PATH="$tmpdir" SRCARCH=$arch headers_install
 
     rm -rf "$output/linux-headers/asm-$arch"
diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c
index 025b374..1542e43 100644
--- a/slirp/tcp_subr.c
+++ b/slirp/tcp_subr.c
@@ -114,9 +114,9 @@
 	int win = 0;
 
 	DEBUG_CALL("tcp_respond");
-	DEBUG_ARG("tp = %lx", (long)tp);
-	DEBUG_ARG("ti = %lx", (long)ti);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("tp = %p", tp);
+	DEBUG_ARG("ti = %p", ti);
+	DEBUG_ARG("m = %p", m);
 	DEBUG_ARG("ack = %u", ack);
 	DEBUG_ARG("seq = %u", seq);
 	DEBUG_ARG("flags = %x", flags);
@@ -124,7 +124,7 @@
 	if (tp)
 		win = sbspace(&tp->t_socket->so_rcv);
         if (m == NULL) {
-		if ((m = m_get(tp->t_socket->slirp)) == NULL)
+		if (!tp || (m = m_get(tp->t_socket->slirp)) == NULL)
 			return;
 		tlen = 0;
 		m->m_data += IF_MAXLINKHDR;
diff --git a/slirp/tftp.c b/slirp/tftp.c
index b78765f..1a79c45 100644
--- a/slirp/tftp.c
+++ b/slirp/tftp.c
@@ -37,6 +37,10 @@
 
 static void tftp_session_terminate(struct tftp_session *spt)
 {
+    if (spt->fd >= 0) {
+        close(spt->fd);
+        spt->fd = -1;
+    }
     g_free(spt->filename);
     spt->slirp = NULL;
 }
@@ -54,7 +58,7 @@
 
     /* sessions time out after 5 inactive seconds */
     if ((int)(curtime - spt->timestamp) > 5000) {
-        g_free(spt->filename);
+        tftp_session_terminate(spt);
         goto found;
     }
   }
@@ -64,6 +68,7 @@
  found:
   memset(spt, 0, sizeof(*spt));
   memcpy(&spt->client_ip, &tp->ip.ip_src, sizeof(spt->client_ip));
+  spt->fd = -1;
   spt->client_port = tp->udp.uh_sport;
   spt->slirp = slirp;
 
@@ -92,37 +97,36 @@
   return -1;
 }
 
-static int tftp_read_data(struct tftp_session *spt, uint16_t block_nr,
+static int tftp_read_data(struct tftp_session *spt, uint32_t block_nr,
                           uint8_t *buf, int len)
 {
-  int fd;
-  int bytes_read = 0;
+    int bytes_read = 0;
 
-  fd = open(spt->filename, O_RDONLY | O_BINARY);
+    if (spt->fd < 0) {
+        spt->fd = open(spt->filename, O_RDONLY | O_BINARY);
+    }
 
-  if (fd < 0) {
-    return -1;
-  }
+    if (spt->fd < 0) {
+        return -1;
+    }
 
-  if (len) {
-    lseek(fd, block_nr * 512, SEEK_SET);
+    if (len) {
+        lseek(spt->fd, block_nr * 512, SEEK_SET);
 
-    bytes_read = read(fd, buf, len);
-  }
+        bytes_read = read(spt->fd, buf, len);
+    }
 
-  close(fd);
-
-  return bytes_read;
+    return bytes_read;
 }
 
 static int tftp_send_oack(struct tftp_session *spt,
-                          const char *key, uint32_t value,
+                          const char *keys[], uint32_t values[], int nb,
                           struct tftp_t *recv_tp)
 {
     struct sockaddr_in saddr, daddr;
     struct mbuf *m;
     struct tftp_t *tp;
-    int n = 0;
+    int i, n = 0;
 
     m = m_get(spt->slirp);
 
@@ -136,10 +140,12 @@
     m->m_data += sizeof(struct udpiphdr);
 
     tp->tp_op = htons(TFTP_OACK);
-    n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%s",
-                  key) + 1;
-    n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%u",
-                  value) + 1;
+    for (i = 0; i < nb; i++) {
+        n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%s",
+                      keys[i]) + 1;
+        n += snprintf(tp->x.tp_buf + n, sizeof(tp->x.tp_buf) - n, "%u",
+                      values[i]) + 1;
+    }
 
     saddr.sin_addr = recv_tp->ip.ip_dst;
     saddr.sin_port = recv_tp->udp.uh_dport;
@@ -193,23 +199,18 @@
   tftp_session_terminate(spt);
 }
 
-static int tftp_send_data(struct tftp_session *spt,
-                          uint16_t block_nr,
-			  struct tftp_t *recv_tp)
+static void tftp_send_next_block(struct tftp_session *spt,
+                                 struct tftp_t *recv_tp)
 {
   struct sockaddr_in saddr, daddr;
   struct mbuf *m;
   struct tftp_t *tp;
   int nobytes;
 
-  if (block_nr < 1) {
-    return -1;
-  }
-
   m = m_get(spt->slirp);
 
   if (!m) {
-    return -1;
+    return;
   }
 
   memset(m->m_data, 0, m->m_size);
@@ -219,7 +220,7 @@
   m->m_data += sizeof(struct udpiphdr);
 
   tp->tp_op = htons(TFTP_DATA);
-  tp->x.tp_data.tp_block_nr = htons(block_nr);
+  tp->x.tp_data.tp_block_nr = htons((spt->block_nr + 1) & 0xffff);
 
   saddr.sin_addr = recv_tp->ip.ip_dst;
   saddr.sin_port = recv_tp->udp.uh_dport;
@@ -227,7 +228,7 @@
   daddr.sin_addr = spt->client_ip;
   daddr.sin_port = spt->client_port;
 
-  nobytes = tftp_read_data(spt, block_nr - 1, tp->x.tp_data.tp_buf, 512);
+  nobytes = tftp_read_data(spt, spt->block_nr, tp->x.tp_data.tp_buf, 512);
 
   if (nobytes < 0) {
     m_free(m);
@@ -236,7 +237,7 @@
 
     tftp_send_error(spt, 1, "File not found", tp);
 
-    return -1;
+    return;
   }
 
   m->m_len = sizeof(struct tftp_t) - (512 - nobytes) -
@@ -251,7 +252,7 @@
     tftp_session_terminate(spt);
   }
 
-  return 0;
+  spt->block_nr++;
 }
 
 static void tftp_handle_rrq(Slirp *slirp, struct tftp_t *tp, int pktlen)
@@ -260,6 +261,9 @@
   int s, k;
   size_t prefix_len;
   char *req_fname;
+  const char *option_name[2];
+  uint32_t option_value[2];
+  int nb_options = 0;
 
   /* check if a session already exists and if so terminate it */
   s = tftp_session_find(slirp, tp);
@@ -337,7 +341,7 @@
       return;
   }
 
-  while (k < pktlen) {
+  while (k < pktlen && nb_options < ARRAY_SIZE(option_name)) {
       const char *key, *value;
 
       key = &tp->x.tp_buf[k];
@@ -364,12 +368,32 @@
 	      }
 	  }
 
-	  tftp_send_oack(spt, "tsize", tsize, tp);
-	  return;
+          option_name[nb_options] = "tsize";
+          option_value[nb_options] = tsize;
+          nb_options++;
+      } else if (strcasecmp(key, "blksize") == 0) {
+          int blksize = atoi(value);
+
+          /* If the blksize option is bigger than what we will
+           * emit, accept the option with our packet size.
+           * Otherwise, simply behave as if we had not seen the option.
+           */
+          if (blksize >= 512) {
+              option_name[nb_options] = "blksize";
+              option_value[nb_options] = 512;
+              nb_options++;
+          }
       }
   }
 
-  tftp_send_data(spt, 1, tp);
+  if (nb_options > 0) {
+      assert(nb_options <= ARRAY_SIZE(option_name));
+      tftp_send_oack(spt, option_name, option_value, nb_options, tp);
+      return;
+  }
+
+  spt->block_nr = 0;
+  tftp_send_next_block(spt, tp);
 }
 
 static void tftp_handle_ack(Slirp *slirp, struct tftp_t *tp, int pktlen)
@@ -382,11 +406,7 @@
     return;
   }
 
-  if (tftp_send_data(&slirp->tftp_sessions[s],
-		     ntohs(tp->x.tp_data.tp_block_nr) + 1,
-		     tp) < 0) {
-    return;
-  }
+  tftp_send_next_block(&slirp->tftp_sessions[s], tp);
 }
 
 static void tftp_handle_error(Slirp *slirp, struct tftp_t *tp, int pktlen)
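
The OACK helper above packs each accepted option as a pair of NUL-terminated
strings, the option name followed by its value printed in decimal, which is the
layout defined by the TFTP option extension (RFC 2347). A standalone sketch of
building such a payload, outside of the slirp mbuf plumbing:

    #include <stdint.h>
    #include <stdio.h>

    /* Pack "name\0value\0" pairs into buf; returns the number of bytes used. */
    static int pack_oack(char *buf, size_t buflen,
                         const char *keys[], uint32_t values[], int nb)
    {
        int i, n = 0;

        for (i = 0; i < nb; i++) {
            n += snprintf(buf + n, buflen - n, "%s", keys[i]) + 1;
            n += snprintf(buf + n, buflen - n, "%u", values[i]) + 1;
        }
        return n;
    }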
diff --git a/slirp/tftp.h b/slirp/tftp.h
index 72e5e91..51704e4 100644
--- a/slirp/tftp.h
+++ b/slirp/tftp.h
@@ -33,9 +33,11 @@
 struct tftp_session {
     Slirp *slirp;
     char *filename;
+    int fd;
 
     struct in_addr client_ip;
     uint16_t client_port;
+    uint32_t block_nr;
 
     int timestamp;
 };
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 12de6a3..4a9011a 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -426,27 +426,15 @@
 
         return EXIT_GOTO_TB;
     } else {
-        int lab_over = gen_new_label();
+        TCGv_i64 z = tcg_const_i64(0);
+        TCGv_i64 d = tcg_const_i64(dest);
+        TCGv_i64 p = tcg_const_i64(ctx->pc);
 
-        /* ??? Consider using either
-             movi pc, next
-             addi tmp, pc, disp
-             movcond pc, cond, 0, tmp, pc
-           or
-             setcond tmp, cond, 0
-             movi pc, next
-             neg tmp, tmp
-             andi tmp, tmp, disp
-             add pc, pc, tmp
-           The current diamond subgraph surely isn't efficient.  */
+        tcg_gen_movcond_i64(cond, cpu_pc, cmp, z, d, p);
 
-        tcg_gen_brcondi_i64(cond, cmp, 0, lab_true);
-        tcg_gen_movi_i64(cpu_pc, ctx->pc);
-        tcg_gen_br(lab_over);
-        gen_set_label(lab_true);
-        tcg_gen_movi_i64(cpu_pc, dest);
-        gen_set_label(lab_over);
-
+        tcg_temp_free_i64(z);
+        tcg_temp_free_i64(d);
+        tcg_temp_free_i64(p);
         return EXIT_PC_UPDATED;
     }
 }
@@ -521,61 +509,67 @@
 static void gen_cmov(TCGCond cond, int ra, int rb, int rc,
                      int islit, uint8_t lit, int mask)
 {
-    TCGCond inv_cond = tcg_invert_cond(cond);
-    int l1;
-
-    if (unlikely(rc == 31))
-        return;
-
-    l1 = gen_new_label();
-
-    if (ra != 31) {
-        if (mask) {
-            TCGv tmp = tcg_temp_new();
-            tcg_gen_andi_i64(tmp, cpu_ir[ra], 1);
-            tcg_gen_brcondi_i64(inv_cond, tmp, 0, l1);
-            tcg_temp_free(tmp);
-        } else
-            tcg_gen_brcondi_i64(inv_cond, cpu_ir[ra], 0, l1);
-    } else {
-        /* Very uncommon case - Do not bother to optimize.  */
-        TCGv tmp = tcg_const_i64(0);
-        tcg_gen_brcondi_i64(inv_cond, tmp, 0, l1);
-        tcg_temp_free(tmp);
-    }
-
-    if (islit)
-        tcg_gen_movi_i64(cpu_ir[rc], lit);
-    else
-        tcg_gen_mov_i64(cpu_ir[rc], cpu_ir[rb]);
-    gen_set_label(l1);
-}
-
-static void gen_fcmov(TCGCond cond, int ra, int rb, int rc)
-{
-    TCGv cmp_tmp;
-    int l1;
+    TCGv_i64 c1, z, v1;
 
     if (unlikely(rc == 31)) {
         return;
     }
 
-    cmp_tmp = tcg_temp_new();
-    if (unlikely(ra == 31)) {
-        tcg_gen_movi_i64(cmp_tmp, 0);
+    if (ra == 31) {
+        /* Very uncommon case - Do not bother to optimize.  */
+        c1 = tcg_const_i64(0);
+    } else if (mask) {
+        c1 = tcg_const_i64(1);
+        tcg_gen_and_i64(c1, c1, cpu_ir[ra]);
     } else {
-        gen_fold_mzero(cond, cmp_tmp, cpu_fir[ra]);
+        c1 = cpu_ir[ra];
+    }
+    if (islit) {
+        v1 = tcg_const_i64(lit);
+    } else {
+        v1 = cpu_ir[rb];
+    }
+    z = tcg_const_i64(0);
+
+    tcg_gen_movcond_i64(cond, cpu_ir[rc], c1, z, v1, cpu_ir[rc]);
+
+    tcg_temp_free_i64(z);
+    if (ra == 31 || mask) {
+        tcg_temp_free_i64(c1);
+    }
+    if (islit) {
+        tcg_temp_free_i64(v1);
+    }
+}
+
+static void gen_fcmov(TCGCond cond, int ra, int rb, int rc)
+{
+    TCGv_i64 c1, z, v1;
+
+    if (unlikely(rc == 31)) {
+        return;
     }
 
-    l1 = gen_new_label();
-    tcg_gen_brcondi_i64(tcg_invert_cond(cond), cmp_tmp, 0, l1);
-    tcg_temp_free(cmp_tmp);
+    c1 = tcg_temp_new_i64();
+    if (unlikely(ra == 31)) {
+        tcg_gen_movi_i64(c1, 0);
+    } else {
+        gen_fold_mzero(cond, c1, cpu_fir[ra]);
+    }
+    if (rb == 31) {
+        v1 = tcg_const_i64(0);
+    } else {
+        v1 = cpu_fir[rb];
+    }
+    z = tcg_const_i64(0);
 
-    if (rb != 31)
-        tcg_gen_mov_i64(cpu_fir[rc], cpu_fir[rb]);
-    else
-        tcg_gen_movi_i64(cpu_fir[rc], 0);
-    gen_set_label(l1);
+    tcg_gen_movcond_i64(cond, cpu_fir[rc], c1, z, v1, cpu_fir[rc]);
+
+    tcg_temp_free_i64(z);
+    tcg_temp_free_i64(c1);
+    if (rb == 31) {
+        tcg_temp_free_i64(v1);
+    }
 }
 
 #define QUAL_RM_N       0x080   /* Round mode nearest even */
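
The branch and conditional-move conversions above (including gen_cmov() and
gen_fcmov()) replace branch-and-label diamonds with a single conditional move.
The semantics assumed here, as a sketch rather than TCG documentation:

    /* tcg_gen_movcond_i64(cond, ret, c1, c2, v1, v2) behaves like
     *     ret = compare(c1, c2, cond) ? v1 : v2;
     * e.g. in gen_cmov() the old value of cpu_ir[rc] is passed as v2, so the
     * destination is left unchanged when the condition is false. */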
diff --git a/target-mips/Makefile.objs b/target-mips/Makefile.objs
index ca20f21..3eeeeac 100644
--- a/target-mips/Makefile.objs
+++ b/target-mips/Makefile.objs
@@ -1,2 +1,2 @@
-obj-y += translate.o op_helper.o helper.o cpu.o
+obj-y += translate.o op_helper.o lmi_helper.o helper.o cpu.o
 obj-$(CONFIG_SOFTMMU) += machine.o
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 109ac37..f35ed78 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -303,4 +303,63 @@
 DEF_HELPER_2(pmon, void, env, int)
 DEF_HELPER_1(wait, void, env)
 
+/* Loongson multimedia functions.  */
+DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psubsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pshufh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsswh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsshb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packushb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(punpcklhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pavgh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pavgb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pcmpeqw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgth, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psllw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psllh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psraw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrah, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pmullh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhuh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaddhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pasubub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_1(biadd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+DEF_HELPER_FLAGS_1(pmovmskb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+
 #include "def-helper.h"
diff --git a/target-mips/lmi_helper.c b/target-mips/lmi_helper.c
new file mode 100644
index 0000000..1b24353
--- /dev/null
+++ b/target-mips/lmi_helper.c
@@ -0,0 +1,744 @@
+/*
+ *  Loongson Multimedia Instruction emulation helpers for QEMU.
+ *
+ *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cpu.h"
+#include "helper.h"
+
+/* If the byte ordering doesn't matter, i.e. all columns are treated
+   identically, then this union can be used directly.  If byte ordering
+   does matter, we generally ignore dumping to memory.  */
+typedef union {
+    uint8_t  ub[8];
+    int8_t   sb[8];
+    uint16_t uh[4];
+    int16_t  sh[4];
+    uint32_t uw[2];
+    int32_t  sw[2];
+    uint64_t d;
+} LMIValue;
+
+/* Some byte ordering issues can be mitigated by XORing in the following.  */
+#ifdef HOST_WORDS_BIGENDIAN
+# define BYTE_ORDER_XOR(N) N
+#else
+# define BYTE_ORDER_XOR(N) 0
+#endif
+
+#define SATSB(x)  (x < -0x80 ? -0x80 : x > 0x7f ? 0x7f : x)
+#define SATUB(x)  (x > 0xff ? 0xff : x)
+
+#define SATSH(x)  (x < -0x8000 ? -0x8000 : x > 0x7fff ? 0x7fff : x)
+#define SATUH(x)  (x > 0xffff ? 0xffff : x)
+
+#define SATSW(x) \
+    (x < -0x80000000ll ? -0x80000000ll : x > 0x7fffffff ? 0x7fffffff : x)
+#define SATUW(x)  (x > 0xffffffffull ? 0xffffffffull : x)
+
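+/* Worked example (illustrative, not used by the helpers below): the
+ * saturating byte add in helper_paddsb() clamps into [-0x80, 0x7f]:
+ *     int r = 0x70 + 0x30;       // 160, does not fit in int8_t
+ *     int8_t out = SATSB(r);     // 127 (0x7f) instead of wrapping to -96
+ */
+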
+uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.sb[i] + vt.sb[i];
+        vs.sb[i] = SATSB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] + vt.ub[i];
+        vs.ub[i] = SATUB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.sh[i] + vt.sh[i];
+        vs.sh[i] = SATSH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddush(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.uh[i] + vt.uh[i];
+        vs.uh[i] = SATUH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        vs.ub[i] += vt.ub[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] += vt.uh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] += vt.uw[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.sb[i] - vt.sb[i];
+        vs.sb[i] = SATSB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] - vt.ub[i];
+        vs.ub[i] = SATUB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.sh[i] - vt.sh[i];
+        vs.sh[i] = SATSH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubush(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.uh[i] - vt.uh[i];
+        vs.uh[i] = SATUH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        vs.ub[i] -= vt.ub[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] -= vt.uh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] -= vt.uw[i];
+    }
+    return vs.d;
+}
+
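+/* PSHUFH: each 2-bit field of ft selects one of the four halfwords
+   of fs.  */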
+uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs;
+    unsigned i;
+
+    vs.d = fs;
+    vd.d = 0;
+    for (i = 0; i < 4; i++, ft >>= 2) {
+        vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
+    }
+    return vd.d;
+}
+
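+/* Pack the two signed words of fs and the two of ft into four halfwords
+   with signed saturation.  */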
+uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    int64_t tmp;
+
+    tmp = (int32_t)(fs >> 0);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 0;
+
+    tmp = (int32_t)(fs >> 32);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 16;
+
+    tmp = (int32_t)(ft >> 0);
+    tmp = SATSH(tmp);
+    fd |= (uint64_t)(tmp & 0xffff) << 32;
+
+    tmp = (int32_t)(ft >> 32);
+    tmp = SATSH(tmp);
+    fd |= (uint64_t)(tmp & 0xffff) << 48;
+
+    return fd;
+}
+
+uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = fs >> (i * 16);
+        tmp = SATSB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
+    }
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = ft >> (i * 16);
+        tmp = SATSB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
+    }
+
+    return fd;
+}
+
+uint64_t helper_packushb(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = fs >> (i * 16);
+        tmp = SATUB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
+    }
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = ft >> (i * 16);
+        tmp = SATUB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
+    }
+
+    return fd;
+}
+
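+/* The unpack helpers interleave the low (punpckl*) or high (punpckh*)
+   lanes of fs and ft.  */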
+uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
+{
+    return (fs & 0xffffffff) | (ft << 32);
+}
+
+uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
+{
+    return (fs >> 32) | (ft & ~0xffffffffull);
+}
+
+uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[0 ^ host];
+    vd.uh[1 ^ host] = vt.uh[0 ^ host];
+    vd.uh[2 ^ host] = vs.uh[1 ^ host];
+    vd.uh[3 ^ host] = vt.uh[1 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[2 ^ host];
+    vd.uh[1 ^ host] = vt.uh[2 ^ host];
+    vd.uh[2 ^ host] = vs.uh[3 ^ host];
+    vd.uh[3 ^ host] = vt.uh[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[0 ^ host];
+    vd.ub[1 ^ host] = vt.ub[0 ^ host];
+    vd.ub[2 ^ host] = vs.ub[1 ^ host];
+    vd.ub[3 ^ host] = vt.ub[1 ^ host];
+    vd.ub[4 ^ host] = vs.ub[2 ^ host];
+    vd.ub[5 ^ host] = vt.ub[2 ^ host];
+    vd.ub[6 ^ host] = vs.ub[3 ^ host];
+    vd.ub[7 ^ host] = vt.ub[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[4 ^ host];
+    vd.ub[1 ^ host] = vt.ub[4 ^ host];
+    vd.ub[2 ^ host] = vs.ub[5 ^ host];
+    vd.ub[3 ^ host] = vt.ub[5 ^ host];
+    vd.ub[4 ^ host] = vs.ub[6 ^ host];
+    vd.ub[5 ^ host] = vt.ub[6 ^ host];
+    vd.ub[6 ^ host] = vs.ub[7 ^ host];
+    vd.ub[7 ^ host] = vt.ub[7 ^ host];
+
+    return vd.d;
+}
+
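+/* Averages round up: (a + b + 1) >> 1.  */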
+uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pminub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
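+/* Compares set a lane to all ones when true, all zeros when false.  */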
+uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; i++) {
+        vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; i++) {
+        vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
+    }
+    return vs.d;
+}
+
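+/* The shift helpers use the low 7 bits of ft as the count.  Logical shifts
+   return zero once the count reaches the lane width; arithmetic shifts
+   clamp the count to width - 1.  */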
+uint64_t helper_psllw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] <<= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psraw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        ft = 31;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.sw[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psllh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] <<= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrah(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        ft = 15;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.sh[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.sh[i] *= vt.sh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int32_t r = vs.sh[i] * vt.sh[i];
+        vs.sh[i] = r >> 16;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        uint32_t r = (uint32_t)vs.uh[i] * vt.uh[i];
+        vs.uh[i] = r >> 16;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vs, vt;
+    uint32_t p0, p1;
+
+    vs.d = fs;
+    vt.d = ft;
+    p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];
+    p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
+    p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];
+    p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
+
+    return ((uint64_t)p1 << 32) | p0;
+}
+
+uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] - vt.ub[i];
+        vs.ub[i] = (r < 0 ? -r : r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_biadd(uint64_t fs)
+{
+    unsigned i, fd;
+
+    for (i = fd = 0; i < 8; ++i) {
+        fd += (fs >> (i * 8)) & 0xff;
+    }
+    return fd & 0xffff;
+}
+
+uint64_t helper_pmovmskb(uint64_t fs)
+{
+    unsigned fd = 0;
+
+    fd |= ((fs >>  7) & 1) << 0;
+    fd |= ((fs >> 15) & 1) << 1;
+    fd |= ((fs >> 23) & 1) << 2;
+    fd |= ((fs >> 31) & 1) << 3;
+    fd |= ((fs >> 39) & 1) << 4;
+    fd |= ((fs >> 47) & 1) << 5;
+    fd |= ((fs >> 55) & 1) << 6;
+    fd |= ((fs >> 63) & 1) << 7;
+
+    return fd & 0xff;
+}
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 52eeb2b..fa79d49 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -28,7 +28,7 @@
 #define GEN_HELPER 1
 #include "helper.h"
 
-//#define MIPS_DEBUG_DISAS
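+/* Defined as 0/1 rather than via #ifdef so the debug code below is always
+   compiled and type-checked, but optimized away when disabled.  */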
+#define MIPS_DEBUG_DISAS 0
 //#define MIPS_DEBUG_SIGN_EXTENSIONS
 
 /* MIPS major opcodes */
@@ -446,6 +446,103 @@
     OPC_BC2     = (0x08 << 21) | OPC_CP2,
 };
 
+#define MASK_LMI(op)  (MASK_OP_MAJOR(op) | (op & (0x1F << 21)) | (op & 0x1F))
+
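+/* Loongson CP2 opcodes are distinguished by bits 25..21 (the group) and
+   bits 4..0 (the function); MASK_LMI keeps exactly those fields.  */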
+enum {
+    OPC_PADDSH  = (24 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSH = (25 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDH   = (26 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDW   = (27 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDSB  = (28 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSB = (29 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDB   = (30 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDD   = (31 << 21) | (0x00) | OPC_CP2,
+
+    OPC_PSUBSH  = (24 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSH = (25 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBH   = (26 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBW   = (27 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBSB  = (28 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSB = (29 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBB   = (30 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBD   = (31 << 21) | (0x01) | OPC_CP2,
+
+    OPC_PSHUFH   = (24 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSWH = (25 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSHB = (26 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKUSHB = (27 << 21) | (0x02) | OPC_CP2,
+    OPC_XOR_CP2  = (28 << 21) | (0x02) | OPC_CP2,
+    OPC_NOR_CP2  = (29 << 21) | (0x02) | OPC_CP2,
+    OPC_AND_CP2  = (30 << 21) | (0x02) | OPC_CP2,
+    OPC_PANDN    = (31 << 21) | (0x02) | OPC_CP2,
+
+    OPC_PUNPCKLHW = (24 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHHW = (25 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKLBH = (26 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHBH = (27 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_0  = (28 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_1  = (29 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_2  = (30 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_3  = (31 << 21) | (0x03) | OPC_CP2,
+
+    OPC_PAVGH   = (24 << 21) | (0x08) | OPC_CP2,
+    OPC_PAVGB   = (25 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXSH  = (26 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINSH  = (27 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXUB  = (28 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINUB  = (29 << 21) | (0x08) | OPC_CP2,
+
+    OPC_PCMPEQW = (24 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTW = (25 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQH = (26 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTH = (27 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQB = (28 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTB = (29 << 21) | (0x09) | OPC_CP2,
+
+    OPC_PSLLW   = (24 << 21) | (0x0A) | OPC_CP2,
+    OPC_PSLLH   = (25 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULLH  = (26 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHH  = (27 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULUW  = (28 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHUH = (29 << 21) | (0x0A) | OPC_CP2,
+
+    OPC_PSRLW     = (24 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRLH     = (25 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAW     = (26 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAH     = (27 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKLWD = (28 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKHWD = (29 << 21) | (0x0B) | OPC_CP2,
+
+    OPC_ADDU_CP2 = (24 << 21) | (0x0C) | OPC_CP2,
+    OPC_OR_CP2   = (25 << 21) | (0x0C) | OPC_CP2,
+    OPC_ADD_CP2  = (26 << 21) | (0x0C) | OPC_CP2,
+    OPC_DADD_CP2 = (27 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQU_CP2 = (28 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQ_CP2  = (29 << 21) | (0x0C) | OPC_CP2,
+
+    OPC_SUBU_CP2 = (24 << 21) | (0x0D) | OPC_CP2,
+    OPC_PASUBUB  = (25 << 21) | (0x0D) | OPC_CP2,
+    OPC_SUB_CP2  = (26 << 21) | (0x0D) | OPC_CP2,
+    OPC_DSUB_CP2 = (27 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLTU_CP2 = (28 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLT_CP2  = (29 << 21) | (0x0D) | OPC_CP2,
+
+    OPC_SLL_CP2  = (24 << 21) | (0x0E) | OPC_CP2,
+    OPC_DSLL_CP2 = (25 << 21) | (0x0E) | OPC_CP2,
+    OPC_PEXTRH   = (26 << 21) | (0x0E) | OPC_CP2,
+    OPC_PMADDHW  = (27 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLEU_CP2 = (28 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLE_CP2  = (29 << 21) | (0x0E) | OPC_CP2,
+
+    OPC_SRL_CP2  = (24 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRL_CP2 = (25 << 21) | (0x0F) | OPC_CP2,
+    OPC_SRA_CP2  = (26 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRA_CP2 = (27 << 21) | (0x0F) | OPC_CP2,
+    OPC_BIADD    = (28 << 21) | (0x0F) | OPC_CP2,
+    OPC_PMOVMSKB = (29 << 21) | (0x0F) | OPC_CP2,
+};
+
+
 #define MASK_CP3(op)       MASK_OP_MAJOR(op) | (op & 0x3F)
 
 enum {
@@ -566,22 +663,25 @@
       "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
       "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", };
 
-#ifdef MIPS_DEBUG_DISAS
-#define MIPS_DEBUG(fmt, ...)                         \
-        qemu_log_mask(CPU_LOG_TB_IN_ASM,                \
-                       TARGET_FMT_lx ": %08x " fmt "\n", \
-                       ctx->pc, ctx->opcode , ## __VA_ARGS__)
-#define LOG_DISAS(...) qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__)
-#else
-#define MIPS_DEBUG(fmt, ...) do { } while(0)
-#define LOG_DISAS(...) do { } while (0)
-#endif
+#define MIPS_DEBUG(fmt, ...)                                                  \
+    do {                                                                      \
+        if (MIPS_DEBUG_DISAS) {                                               \
+            qemu_log_mask(CPU_LOG_TB_IN_ASM,                                  \
+                          TARGET_FMT_lx ": %08x " fmt "\n",                   \
+                          ctx->pc, ctx->opcode , ## __VA_ARGS__);             \
+        }                                                                     \
+    } while (0)
+
+#define LOG_DISAS(...)                                                        \
+    do {                                                                      \
+        if (MIPS_DEBUG_DISAS) {                                               \
+            qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__);                 \
+        }                                                                     \
+    } while (0)
 
 #define MIPS_INVAL(op)                                                        \
-do {                                                                          \
     MIPS_DEBUG("Invalid %s %03x %03x %03x", op, ctx->opcode >> 26,            \
-               ctx->opcode & 0x3F, ((ctx->opcode >> 16) & 0x1F));             \
-} while (0)
+               ctx->opcode & 0x3F, ((ctx->opcode >> 16) & 0x1F))
 
 /* General purpose registers moves. */
 static inline void gen_load_gpr (TCGv t, int reg)
@@ -1431,7 +1531,8 @@
 }
 
 /* Logic with immediate operand */
-static void gen_logic_imm (CPUMIPSState *env, uint32_t opc, int rt, int rs, int16_t imm)
+static void gen_logic_imm(CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
+                          int rt, int rs, int16_t imm)
 {
     target_ulong uimm;
     const char *opn = "imm logic";
@@ -1474,7 +1575,8 @@
 }
 
 /* Set on less than with immediate operand */
-static void gen_slt_imm (CPUMIPSState *env, uint32_t opc, int rt, int rs, int16_t imm)
+static void gen_slt_imm(CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
+                        int rt, int rs, int16_t imm)
 {
     target_ulong uimm = (target_long)imm; /* Sign extend to 32/64 bits */
     const char *opn = "imm arith";
@@ -1775,7 +1877,8 @@
 }
 
 /* Conditional move */
-static void gen_cond_move (CPUMIPSState *env, uint32_t opc, int rd, int rs, int rt)
+static void gen_cond_move(CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
+                          int rd, int rs, int rt)
 {
     const char *opn = "cond move";
     int l1;
@@ -1813,7 +1916,8 @@
 }
 
 /* Logic */
-static void gen_logic (CPUMIPSState *env, uint32_t opc, int rd, int rs, int rt)
+static void gen_logic(CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
+                      int rd, int rs, int rt)
 {
     const char *opn = "logic";
 
@@ -1874,7 +1978,8 @@
 }
 
 /* Set on lower than */
-static void gen_slt (CPUMIPSState *env, uint32_t opc, int rd, int rs, int rt)
+static void gen_slt(CPUMIPSState *env, DisasContext *ctx, uint32_t opc,
+                    int rd, int rs, int rt)
 {
     const char *opn = "slt";
     TCGv t0, t1;
@@ -2380,8 +2485,8 @@
 }
 
 /* Godson integer instructions */
-static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
-                                int rd, int rs, int rt)
+static void gen_loongson_integer(DisasContext *ctx, uint32_t opc,
+                                 int rd, int rs, int rt)
 {
     const char *opn = "loongson";
     TCGv t0, t1;
@@ -2594,6 +2699,278 @@
     tcg_temp_free(t1);
 }
 
+/* Loongson multimedia instructions */
+static void gen_loongson_multimedia(DisasContext *ctx, int rd, int rs, int rt)
+{
+    const char *opn = "loongson_cp2";
+    uint32_t opc, shift_max;
+    TCGv_i64 t0, t1;
+
+    opc = MASK_LMI(ctx->opcode);
+    switch (opc) {
+    case OPC_ADD_CP2:
+    case OPC_SUB_CP2:
+    case OPC_DADD_CP2:
+    case OPC_DSUB_CP2:
+        t0 = tcg_temp_local_new_i64();
+        t1 = tcg_temp_local_new_i64();
+        break;
+    default:
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        break;
+    }
+
+    gen_load_fpr64(ctx, t0, rs);
+    gen_load_fpr64(ctx, t1, rt);
+
+#define LMI_HELPER(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0, t1); opn = #LO; break
+#define LMI_HELPER_1(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0); opn = #LO; break
+#define LMI_DIRECT(UP, LO, OP) \
+    case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); opn = #LO; break
+
+    switch (opc) {
+    LMI_HELPER(PADDSH, paddsh);
+    LMI_HELPER(PADDUSH, paddush);
+    LMI_HELPER(PADDH, paddh);
+    LMI_HELPER(PADDW, paddw);
+    LMI_HELPER(PADDSB, paddsb);
+    LMI_HELPER(PADDUSB, paddusb);
+    LMI_HELPER(PADDB, paddb);
+
+    LMI_HELPER(PSUBSH, psubsh);
+    LMI_HELPER(PSUBUSH, psubush);
+    LMI_HELPER(PSUBH, psubh);
+    LMI_HELPER(PSUBW, psubw);
+    LMI_HELPER(PSUBSB, psubsb);
+    LMI_HELPER(PSUBUSB, psubusb);
+    LMI_HELPER(PSUBB, psubb);
+
+    LMI_HELPER(PSHUFH, pshufh);
+    LMI_HELPER(PACKSSWH, packsswh);
+    LMI_HELPER(PACKSSHB, packsshb);
+    LMI_HELPER(PACKUSHB, packushb);
+
+    LMI_HELPER(PUNPCKLHW, punpcklhw);
+    LMI_HELPER(PUNPCKHHW, punpckhhw);
+    LMI_HELPER(PUNPCKLBH, punpcklbh);
+    LMI_HELPER(PUNPCKHBH, punpckhbh);
+    LMI_HELPER(PUNPCKLWD, punpcklwd);
+    LMI_HELPER(PUNPCKHWD, punpckhwd);
+
+    LMI_HELPER(PAVGH, pavgh);
+    LMI_HELPER(PAVGB, pavgb);
+    LMI_HELPER(PMAXSH, pmaxsh);
+    LMI_HELPER(PMINSH, pminsh);
+    LMI_HELPER(PMAXUB, pmaxub);
+    LMI_HELPER(PMINUB, pminub);
+
+    LMI_HELPER(PCMPEQW, pcmpeqw);
+    LMI_HELPER(PCMPGTW, pcmpgtw);
+    LMI_HELPER(PCMPEQH, pcmpeqh);
+    LMI_HELPER(PCMPGTH, pcmpgth);
+    LMI_HELPER(PCMPEQB, pcmpeqb);
+    LMI_HELPER(PCMPGTB, pcmpgtb);
+
+    LMI_HELPER(PSLLW, psllw);
+    LMI_HELPER(PSLLH, psllh);
+    LMI_HELPER(PSRLW, psrlw);
+    LMI_HELPER(PSRLH, psrlh);
+    LMI_HELPER(PSRAW, psraw);
+    LMI_HELPER(PSRAH, psrah);
+
+    LMI_HELPER(PMULLH, pmullh);
+    LMI_HELPER(PMULHH, pmulhh);
+    LMI_HELPER(PMULHUH, pmulhuh);
+    LMI_HELPER(PMADDHW, pmaddhw);
+
+    LMI_HELPER(PASUBUB, pasubub);
+    LMI_HELPER_1(BIADD, biadd);
+    LMI_HELPER_1(PMOVMSKB, pmovmskb);
+
+    LMI_DIRECT(PADDD, paddd, add);
+    LMI_DIRECT(PSUBD, psubd, sub);
+    LMI_DIRECT(XOR_CP2, xor, xor);
+    LMI_DIRECT(NOR_CP2, nor, nor);
+    LMI_DIRECT(AND_CP2, and, and);
+    LMI_DIRECT(PANDN, pandn, andc);
+    LMI_DIRECT(OR_CP2, or, or);
+
+    case OPC_PINSRH_0:
+        tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
+        opn = "pinsrh_0";
+        break;
+    case OPC_PINSRH_1:
+        tcg_gen_deposit_i64(t0, t0, t1, 16, 16);
+        opn = "pinsrh_1";
+        break;
+    case OPC_PINSRH_2:
+        tcg_gen_deposit_i64(t0, t0, t1, 32, 16);
+        opn = "pinsrh_2";
+        break;
+    case OPC_PINSRH_3:
+        tcg_gen_deposit_i64(t0, t0, t1, 48, 16);
+        opn = "pinsrh_3";
+        break;
+
+    case OPC_PEXTRH:
+        tcg_gen_andi_i64(t1, t1, 3);
+        tcg_gen_shli_i64(t1, t1, 4);
+        tcg_gen_shr_i64(t0, t0, t1);
+        tcg_gen_ext16u_i64(t0, t0);
+        opn = "pextrh";
+        break;
+
+    case OPC_ADDU_CP2:
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "addu";
+        break;
+    case OPC_SUBU_CP2:
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "subu";
+        break;
+
+    case OPC_SLL_CP2:
+        opn = "sll";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRL_CP2:
+        opn = "srl";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRA_CP2:
+        opn = "sra";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_DSLL_CP2:
+        opn = "dsll";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRL_CP2:
+        opn = "dsrl";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRA_CP2:
+        opn = "dsra";
+        shift_max = 64;
+        goto do_shift;
+    do_shift:
+        /* Mask the count so the TCG shift itself is never undefined;
+           counts of shift_max or more are zeroed below.  */
+        tcg_gen_andi_i64(t1, t1, 0x3f);
+
+        switch (opc) {
+        case OPC_SLL_CP2:
+        case OPC_DSLL_CP2:
+            tcg_gen_shl_i64(t0, t0, t1);
+            break;
+        case OPC_SRA_CP2:
+        case OPC_DSRA_CP2:
+            /* Since SRA is UndefinedResult without sign-extended inputs,
+               we can treat SRA and DSRA the same.  */
+            tcg_gen_sar_i64(t0, t0, t1);
+            break;
+        case OPC_SRL_CP2:
+            /* We want to shift in zeros for SRL; zero-extend first.  */
+            tcg_gen_ext32u_i64(t0, t0);
+            /* FALLTHRU */
+        case OPC_DSRL_CP2:
+            tcg_gen_shr_i64(t0, t0, t1);
+            break;
+        }
+
+        if (shift_max == 32) {
+            tcg_gen_ext32s_i64(t0, t0);
+        }
+
+        /* Shifts larger than MAX produce zero.  */
+        tcg_gen_setcondi_i64(TCG_COND_LTU, t1, t1, shift_max);
+        tcg_gen_neg_i64(t1, t1);
+        tcg_gen_and_i64(t0, t0, t1);
+        break;
+
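+    /* ADD/DADD trap on signed overflow: the operands had the same sign
+       but the sum does not, i.e. (rs ^ sum) & ~(rs ^ rt) is negative.  */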
+    case OPC_ADD_CP2:
+    case OPC_DADD_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            tcg_gen_mov_i64(t2, t0);
+            tcg_gen_add_i64(t0, t1, t2);
+            if (opc == OPC_ADD_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_andc_i64(t1, t2, t1);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_ADD_CP2 ? "add" : "dadd");
+            break;
+        }
+
+    case OPC_SUB_CP2:
+    case OPC_DSUB_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            tcg_gen_mov_i64(t2, t0);
+            tcg_gen_sub_i64(t0, t1, t2);
+            if (opc == OPC_SUB_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_and_i64(t1, t1, t2);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_SUB_CP2 ? "sub" : "dsub");
+            break;
+        }
+
+    case OPC_PMULUW:
+        tcg_gen_ext32u_i64(t0, t0);
+        tcg_gen_ext32u_i64(t1, t1);
+        tcg_gen_mul_i64(t0, t0, t1);
+        opn = "pmuluw";
+        break;
+
+    case OPC_SEQU_CP2:
+    case OPC_SEQ_CP2:
+    case OPC_SLTU_CP2:
+    case OPC_SLT_CP2:
+    case OPC_SLEU_CP2:
+    case OPC_SLE_CP2:
+        /* ??? Document is unclear: Set FCC[CC].  Does that mean the
+           FD field is the CC field?  */
+    default:
+        MIPS_INVAL(opn);
+        generate_exception(ctx, EXCP_RI);
+        return;
+    }
+
+#undef LMI_HELPER
+#undef LMI_DIRECT
+
+    gen_store_fpr64(ctx, t0, rd);
+
+    (void)opn; /* avoid a compiler warning */
+    MIPS_DEBUG("%s %s, %s, %s", opn,
+               fregnames[rd], fregnames[rs], fregnames[rt]);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
 /* Traps */
 static void gen_trap (DisasContext *ctx, uint32_t opc,
                       int rs, int rt, int16_t imm)
@@ -8778,10 +9155,10 @@
         gen_arith_imm(env, ctx, OPC_ADDIU, rx, rx, imm);
         break;
     case M16_OPC_SLTI:
-        gen_slt_imm(env, OPC_SLTI, 24, rx, imm);
+        gen_slt_imm(env, ctx, OPC_SLTI, 24, rx, imm);
         break;
     case M16_OPC_SLTIU:
-        gen_slt_imm(env, OPC_SLTIU, 24, rx, imm);
+        gen_slt_imm(env, ctx, OPC_SLTIU, 24, rx, imm);
         break;
     case M16_OPC_I8:
         switch (funct) {
@@ -8992,15 +9369,13 @@
     case M16_OPC_SLTI:
         {
             int16_t imm = (uint8_t) ctx->opcode;
-
-            gen_slt_imm(env, OPC_SLTI, 24, rx, imm);
+            gen_slt_imm(env, ctx, OPC_SLTI, 24, rx, imm);
         }
         break;
     case M16_OPC_SLTIU:
         {
             int16_t imm = (uint8_t) ctx->opcode;
-
-            gen_slt_imm(env, OPC_SLTIU, 24, rx, imm);
+            gen_slt_imm(env, ctx, OPC_SLTIU, 24, rx, imm);
         }
         break;
     case M16_OPC_I8:
@@ -9075,8 +9450,7 @@
     case M16_OPC_CMPI:
         {
             int16_t imm = (uint8_t) ctx->opcode;
-
-            gen_logic_imm(env, OPC_XORI, 24, rx, imm);
+            gen_logic_imm(env, ctx, OPC_XORI, 24, rx, imm);
         }
         break;
 #if defined(TARGET_MIPS64)
@@ -9188,10 +9562,10 @@
             }
             break;
         case RR_SLT:
-            gen_slt(env, OPC_SLT, 24, rx, ry);
+            gen_slt(env, ctx, OPC_SLT, 24, rx, ry);
             break;
         case RR_SLTU:
-            gen_slt(env, OPC_SLTU, 24, rx, ry);
+            gen_slt(env, ctx, OPC_SLTU, 24, rx, ry);
             break;
         case RR_BREAK:
             generate_exception(ctx, EXCP_BREAK);
@@ -9212,22 +9586,22 @@
             break;
 #endif
         case RR_CMP:
-            gen_logic(env, OPC_XOR, 24, rx, ry);
+            gen_logic(env, ctx, OPC_XOR, 24, rx, ry);
             break;
         case RR_NEG:
             gen_arith(env, ctx, OPC_SUBU, rx, 0, ry);
             break;
         case RR_AND:
-            gen_logic(env, OPC_AND, rx, rx, ry);
+            gen_logic(env, ctx, OPC_AND, rx, rx, ry);
             break;
         case RR_OR:
-            gen_logic(env, OPC_OR, rx, rx, ry);
+            gen_logic(env, ctx, OPC_OR, rx, rx, ry);
             break;
         case RR_XOR:
-            gen_logic(env, OPC_XOR, rx, rx, ry);
+            gen_logic(env, ctx, OPC_XOR, rx, rx, ry);
             break;
         case RR_NOT:
-            gen_logic(env, OPC_NOR, rx, ry, 0);
+            gen_logic(env, ctx, OPC_NOR, rx, ry, 0);
             break;
         case RR_MFHI:
             gen_HILO(ctx, OPC_MFHI, rx);
@@ -9849,12 +10223,13 @@
     int rs = mmreg(uMIPS_RS(ctx->opcode));
     int encoded = ZIMM(ctx->opcode, 0, 4);
 
-    gen_logic_imm(env, OPC_ANDI, rd, rs, decoded_imm[encoded]);
+    gen_logic_imm(env, ctx, OPC_ANDI, rd, rs, decoded_imm[encoded]);
 }
 
 static void gen_ldst_multiple (DisasContext *ctx, uint32_t opc, int reglist,
                                int base, int16_t offset)
 {
+    const char *opn = "ldst_multiple";
     TCGv t0, t1;
     TCGv_i32 t2;
 
@@ -9874,19 +10249,24 @@
     switch (opc) {
     case LWM32:
         gen_helper_lwm(cpu_env, t0, t1, t2);
+        opn = "lwm";
         break;
     case SWM32:
         gen_helper_swm(cpu_env, t0, t1, t2);
+        opn = "swm";
         break;
 #ifdef TARGET_MIPS64
     case LDM:
         gen_helper_ldm(cpu_env, t0, t1, t2);
+        opn = "ldm";
         break;
     case SDM:
         gen_helper_sdm(cpu_env, t0, t1, t2);
+        opn = "sdm";
         break;
 #endif
     }
+    (void)opn;
     MIPS_DEBUG("%s, %x, %d(%s)", opn, reglist, offset, regnames[base]);
     tcg_temp_free(t0);
     tcg_temp_free(t1);
@@ -9905,25 +10285,25 @@
     case NOT16 + 1:
     case NOT16 + 2:
     case NOT16 + 3:
-        gen_logic(env, OPC_NOR, rd, rs, 0);
+        gen_logic(env, ctx, OPC_NOR, rd, rs, 0);
         break;
     case XOR16 + 0:
     case XOR16 + 1:
     case XOR16 + 2:
     case XOR16 + 3:
-        gen_logic(env, OPC_XOR, rd, rd, rs);
+        gen_logic(env, ctx, OPC_XOR, rd, rd, rs);
         break;
     case AND16 + 0:
     case AND16 + 1:
     case AND16 + 2:
     case AND16 + 3:
-        gen_logic(env, OPC_AND, rd, rd, rs);
+        gen_logic(env, ctx, OPC_AND, rd, rd, rs);
         break;
     case OR16 + 0:
     case OR16 + 1:
     case OR16 + 2:
     case OR16 + 3:
-        gen_logic(env, OPC_OR, rd, rd, rs);
+        gen_logic(env, ctx, OPC_OR, rd, rd, rs);
         break;
     case LWM16 + 0:
     case LWM16 + 1:
@@ -10737,7 +11117,7 @@
             case XOR32:
                 mips32_op = OPC_XOR;
             do_logic:
-                gen_logic(env, mips32_op, rd, rs, rt);
+                gen_logic(env, ctx, mips32_op, rd, rs, rt);
                 break;
                 /* Set less than */
             case SLT:
@@ -10746,7 +11126,7 @@
             case SLTU:
                 mips32_op = OPC_SLTU;
             do_slt:
-                gen_slt(env, mips32_op, rd, rs, rt);
+                gen_slt(env, ctx, mips32_op, rd, rs, rt);
                 break;
             default:
                 goto pool32a_invalid;
@@ -10762,7 +11142,7 @@
             case MOVZ:
                 mips32_op = OPC_MOVZ;
             do_cmov:
-                gen_cond_move(env, mips32_op, rd, rs, rt);
+                gen_cond_move(env, ctx, mips32_op, rd, rs, rt);
                 break;
             case LWXS:
                 gen_ldxs(ctx, rs, rt, rd);
@@ -11175,7 +11555,7 @@
                target. */
             break;
         case LUI:
-            gen_logic_imm(env, OPC_LUI, rs, -1, imm);
+            gen_logic_imm(env, ctx, OPC_LUI, rs, -1, imm);
             break;
         case SYNCI:
             break;
@@ -11294,7 +11674,7 @@
     case ANDI32:
         mips32_op = OPC_ANDI;
     do_logici:
-        gen_logic_imm(env, mips32_op, rt, rs, imm);
+        gen_logic_imm(env, ctx, mips32_op, rt, rs, imm);
         break;
 
         /* Set less than immediate */
@@ -11304,7 +11684,7 @@
     case SLTIU32:
         mips32_op = OPC_SLTIU;
     do_slti:
-        gen_slt_imm(env, mips32_op, rt, rs, imm);
+        gen_slt_imm(env, ctx, mips32_op, rt, rs, imm);
         break;
     case JALX32:
         offset = (int32_t)(ctx->opcode & 0x3FFFFFF) << 2;
@@ -11781,7 +12161,7 @@
         case OPC_MOVZ:
             check_insn(env, ctx, ISA_MIPS4 | ISA_MIPS32 |
                                  INSN_LOONGSON2E | INSN_LOONGSON2F);
-            gen_cond_move(env, op1, rd, rs, rt);
+            gen_cond_move(env, ctx, op1, rd, rs, rt);
             break;
         case OPC_ADD ... OPC_SUBU:
             gen_arith(env, ctx, op1, rd, rs, rt);
@@ -11808,13 +12188,13 @@
             break;
         case OPC_SLT:          /* Set on less than */
         case OPC_SLTU:
-            gen_slt(env, op1, rd, rs, rt);
+            gen_slt(env, ctx, op1, rd, rs, rt);
             break;
         case OPC_AND:          /* Logic*/
         case OPC_OR:
         case OPC_NOR:
         case OPC_XOR:
-            gen_logic(env, op1, rd, rs, rt);
+            gen_logic(env, ctx, op1, rd, rs, rt);
             break;
         case OPC_MULT ... OPC_DIVU:
             if (sa) {
@@ -12215,13 +12595,13 @@
          break;
     case OPC_SLTI: /* Set on less than with immediate opcode */
     case OPC_SLTIU:
-         gen_slt_imm(env, op, rt, rs, imm);
+         gen_slt_imm(env, ctx, op, rt, rs, imm);
          break;
     case OPC_ANDI: /* Arithmetic with immediate opcode */
     case OPC_LUI:
     case OPC_ORI:
     case OPC_XORI:
-         gen_logic_imm(env, op, rt, rs, imm);
+         gen_logic_imm(env, ctx, op, rt, rs, imm);
          break;
     case OPC_J ... OPC_JAL: /* Jump */
          offset = (int32_t)(ctx->opcode & 0x3FFFFFF) << 2;
@@ -12316,10 +12696,14 @@
     case OPC_LDC2:
     case OPC_SWC2:
     case OPC_SDC2:
-    case OPC_CP2:
         /* COP2: Not implemented. */
         generate_exception_err(ctx, EXCP_CpU, 2);
         break;
+    case OPC_CP2:
+        check_insn(env, ctx, INSN_LOONGSON2F);
+        /* Note that these instructions use different fields.  */
+        gen_loongson_multimedia(ctx, sa, rd, rt);
+        break;
 
     case OPC_CP3:
         if (env->CP0_Config1 & (1 << CP0C1_FP)) {
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index 6e4f108..6c1a47d 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -1,30 +1,26 @@
 #include "def-helper.h"
 
 DEF_HELPER_1(ldtlb, void, env)
-DEF_HELPER_1(raise_illegal_instruction, void, env)
-DEF_HELPER_1(raise_slot_illegal_instruction, void, env)
-DEF_HELPER_1(raise_fpu_disable, void, env)
-DEF_HELPER_1(raise_slot_fpu_disable, void, env)
-DEF_HELPER_1(debug, void, env)
-DEF_HELPER_2(sleep, void, env, i32)
-DEF_HELPER_2(trapa, void, env, i32)
+DEF_HELPER_1(raise_illegal_instruction, noreturn, env)
+DEF_HELPER_1(raise_slot_illegal_instruction, noreturn, env)
+DEF_HELPER_1(raise_fpu_disable, noreturn, env)
+DEF_HELPER_1(raise_slot_fpu_disable, noreturn, env)
+DEF_HELPER_1(debug, noreturn, env)
+DEF_HELPER_1(sleep, noreturn, env)
+DEF_HELPER_2(trapa, noreturn, env, i32)
 
 DEF_HELPER_3(movcal, void, env, i32, i32)
 DEF_HELPER_1(discard_movcal_backup, void, env)
 DEF_HELPER_2(ocbi, void, env, i32)
 
-DEF_HELPER_3(addv, i32, env, i32, i32)
-DEF_HELPER_3(addc, i32, env, i32, i32)
-DEF_HELPER_3(subv, i32, env, i32, i32)
-DEF_HELPER_3(subc, i32, env, i32, i32)
 DEF_HELPER_3(div1, i32, env, i32, i32)
 DEF_HELPER_3(macl, void, env, i32, i32)
 DEF_HELPER_3(macw, void, env, i32, i32)
 
 DEF_HELPER_2(ld_fpscr, void, env, i32)
 
-DEF_HELPER_1(fabs_FT, f32, f32)
-DEF_HELPER_1(fabs_DT, f64, f64)
+DEF_HELPER_FLAGS_1(fabs_FT, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
+DEF_HELPER_FLAGS_1(fabs_DT, TCG_CALL_CONST | TCG_CALL_PURE, f64, f64)
 DEF_HELPER_3(fadd_FT, f32, env, f32, f32)
 DEF_HELPER_3(fadd_DT, f64, env, f64, f64)
 DEF_HELPER_2(fcnvsd_FT_DT, f64, env, f32)
@@ -41,7 +37,7 @@
 DEF_HELPER_4(fmac_FT, f32, env, f32, f32, f32)
 DEF_HELPER_3(fmul_FT, f32, env, f32, f32)
 DEF_HELPER_3(fmul_DT, f64, env, f64, f64)
-DEF_HELPER_1(fneg_T, f32, f32)
+DEF_HELPER_FLAGS_1(fneg_T, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
 DEF_HELPER_3(fsub_FT, f32, env, f32, f32)
 DEF_HELPER_3(fsub_DT, f64, env, f64, f64)
 DEF_HELPER_2(fsqrt_FT, f32, env, f32)
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 9b4328d..60ec4cb 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -21,7 +21,8 @@
 #include "cpu.h"
 #include "helper.h"
 
-static void cpu_restore_state_from_retaddr(CPUSH4State *env, uintptr_t retaddr)
+static inline void cpu_restore_state_from_retaddr(CPUSH4State *env,
+                                                  uintptr_t retaddr)
 {
     TranslationBlock *tb;
 
@@ -77,8 +78,8 @@
 #endif
 }
 
-static inline void raise_exception(CPUSH4State *env, int index,
-                                   uintptr_t retaddr)
+static inline void QEMU_NORETURN raise_exception(CPUSH4State *env, int index,
+                                                 uintptr_t retaddr)
 {
     env->exception_index = index;
     cpu_restore_state_from_retaddr(env, retaddr);
@@ -87,43 +88,40 @@
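+/* The translator now stores the guest PC before raising these exceptions,
+   so the helpers pass a zero retaddr and no retaddr-based unwind is needed.  */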
 
 void helper_raise_illegal_instruction(CPUSH4State *env)
 {
-    raise_exception(env, 0x180, GETPC());
+    raise_exception(env, 0x180, 0);
 }
 
 void helper_raise_slot_illegal_instruction(CPUSH4State *env)
 {
-    raise_exception(env, 0x1a0, GETPC());
+    raise_exception(env, 0x1a0, 0);
 }
 
 void helper_raise_fpu_disable(CPUSH4State *env)
 {
-    raise_exception(env, 0x800, GETPC());
+    raise_exception(env, 0x800, 0);
 }
 
 void helper_raise_slot_fpu_disable(CPUSH4State *env)
 {
-    raise_exception(env, 0x820, GETPC());
+    raise_exception(env, 0x820, 0);
 }
 
 void helper_debug(CPUSH4State *env)
 {
-    env->exception_index = EXCP_DEBUG;
-    cpu_loop_exit(env);
+    raise_exception(env, EXCP_DEBUG, 0);
 }
 
-void helper_sleep(CPUSH4State *env, uint32_t next_pc)
+void helper_sleep(CPUSH4State *env)
 {
     env->halted = 1;
     env->in_sleep = 1;
-    env->exception_index = EXCP_HLT;
-    env->pc = next_pc;
-    cpu_loop_exit(env);
+    raise_exception(env, EXCP_HLT, 0);
 }
 
 void helper_trapa(CPUSH4State *env, uint32_t tra)
 {
     env->tra = tra << 2;
-    raise_exception(env, 0x160, GETPC());
+    raise_exception(env, 0x160, 0);
 }
 
 void helper_movcal(CPUSH4State *env, uint32_t address, uint32_t value)
@@ -177,51 +175,6 @@
     }
 }
 
-uint32_t helper_addc(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    uint32_t tmp0, tmp1;
-
-    tmp1 = arg0 + arg1;
-    tmp0 = arg1;
-    arg1 = tmp1 + (env->sr & 1);
-    if (tmp0 > tmp1)
-	env->sr |= SR_T;
-    else
-	env->sr &= ~SR_T;
-    if (tmp1 > arg1)
-	env->sr |= SR_T;
-    return arg1;
-}
-
-uint32_t helper_addv(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    uint32_t dest, src, ans;
-
-    if ((int32_t) arg1 >= 0)
-	dest = 0;
-    else
-	dest = 1;
-    if ((int32_t) arg0 >= 0)
-	src = 0;
-    else
-	src = 1;
-    src += dest;
-    arg1 += arg0;
-    if ((int32_t) arg1 >= 0)
-	ans = 0;
-    else
-	ans = 1;
-    ans += dest;
-    if (src == 0 || src == 2) {
-	if (ans == 1)
-	    env->sr |= SR_T;
-	else
-	    env->sr &= ~SR_T;
-    } else
-	env->sr &= ~SR_T;
-    return arg1;
-}
-
 #define T (env->sr & SR_T)
 #define Q (env->sr & SR_Q ? 1 : 0)
 #define M (env->sr & SR_M ? 1 : 0)
@@ -375,51 +328,6 @@
     }
 }
 
-uint32_t helper_subc(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    uint32_t tmp0, tmp1;
-
-    tmp1 = arg1 - arg0;
-    tmp0 = arg1;
-    arg1 = tmp1 - (env->sr & SR_T);
-    if (tmp0 < tmp1)
-	env->sr |= SR_T;
-    else
-	env->sr &= ~SR_T;
-    if (tmp1 < arg1)
-	env->sr |= SR_T;
-    return arg1;
-}
-
-uint32_t helper_subv(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    int32_t dest, src, ans;
-
-    if ((int32_t) arg1 >= 0)
-	dest = 0;
-    else
-	dest = 1;
-    if ((int32_t) arg0 >= 0)
-	src = 0;
-    else
-	src = 1;
-    src += dest;
-    arg1 -= arg0;
-    if ((int32_t) arg1 >= 0)
-	ans = 0;
-    else
-	ans = 1;
-    ans += dest;
-    if (src == 1) {
-	if (ans == 1)
-	    env->sr |= SR_T;
-	else
-	    env->sr &= ~SR_T;
-    } else
-	env->sr &= ~SR_T;
-    return arg1;
-}
-
 static inline void set_t(CPUSH4State *env)
 {
     env->sr |= SR_T;
@@ -475,9 +383,7 @@
         cause = (env->fpscr & FPSCR_CAUSE_MASK) >> FPSCR_CAUSE_SHIFT;
         enable = (env->fpscr & FPSCR_ENABLE_MASK) >> FPSCR_ENABLE_SHIFT;
         if (cause & enable) {
-            cpu_restore_state_from_retaddr(env, retaddr);
-            env->exception_index = 0x120;
-            cpu_loop_exit(env);
+            raise_exception(env, 0x120, retaddr);
         }
     }
 }
@@ -623,8 +529,7 @@
 float32 helper_fmac_FT(CPUSH4State *env, float32 t0, float32 t1, float32 t2)
 {
     set_float_exception_flags(0, &env->fp_status);
-    t0 = float32_mul(t0, t1, &env->fp_status);
-    t0 = float32_add(t0, t2, &env->fp_status);
+    t0 = float32_muladd(t0, t1, t2, 0, &env->fp_status);
     update_fpscr(env, GETPC());
     return t0;
 }
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index d05c74c..0fa83ca 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -18,7 +18,6 @@
  */
 
 #define DEBUG_DISAS
-#define SH4_DEBUG_DISAS
 //#define SH4_SINGLE_STEP
 
 #include "cpu.h"
@@ -32,8 +31,6 @@
 typedef struct DisasContext {
     struct TranslationBlock *tb;
     target_ulong pc;
-    uint32_t sr;
-    uint32_t fpscr;
     uint16_t opcode;
     uint32_t flags;
     int bstate;
@@ -47,7 +44,7 @@
 #if defined(CONFIG_USER_ONLY)
 #define IS_USER(ctx) 1
 #else
-#define IS_USER(ctx) (!(ctx->sr & SR_MD))
+#define IS_USER(ctx) (!(ctx->flags & SR_MD))
 #endif
 
 enum {
@@ -339,16 +336,6 @@
     gen_jump(ctx);
 }
 
-static inline void gen_set_t(void)
-{
-    tcg_gen_ori_i32(cpu_sr, cpu_sr, SR_T);
-}
-
-static inline void gen_clr_t(void)
-{
-    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
-}
-
 static inline void gen_cmp(int cond, TCGv t0, TCGv t1)
 {
     TCGv t;
@@ -423,44 +410,47 @@
 #define B11_8 ((ctx->opcode >> 8) & 0xf)
 #define B15_12 ((ctx->opcode >> 12) & 0xf)
 
-#define REG(x) ((x) < 8 && (ctx->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB) ? \
-		(cpu_gregs[x + 16]) : (cpu_gregs[x]))
+#define REG(x) ((x) < 8 && (ctx->flags & (SR_MD | SR_RB)) == (SR_MD | SR_RB) \
+                ? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define ALTREG(x) ((x) < 8 && (ctx->sr & (SR_MD | SR_RB)) != (SR_MD | SR_RB) \
+#define ALTREG(x) ((x) < 8 && (ctx->flags & (SR_MD | SR_RB)) != (SR_MD | SR_RB)\
 		? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define FREG(x) (ctx->fpscr & FPSCR_FR ? (x) ^ 0x10 : (x))
+#define FREG(x) (ctx->flags & FPSCR_FR ? (x) ^ 0x10 : (x))
 #define XHACK(x) ((((x) & 1 ) << 4) | ((x) & 0xe))
-#define XREG(x) (ctx->fpscr & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
+#define XREG(x) (ctx->flags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
 #define DREG(x) FREG(x) /* Assumes lsb of (x) is always 0 */
 
 #define CHECK_NOT_DELAY_SLOT \
   if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL))     \
   {                                                           \
+      tcg_gen_movi_i32(cpu_pc, ctx->pc);                      \
       gen_helper_raise_slot_illegal_instruction(cpu_env);     \
-      ctx->bstate = BS_EXCP;                                  \
+      ctx->bstate = BS_BRANCH;                                \
       return;                                                 \
   }
 
 #define CHECK_PRIVILEGED                                        \
   if (IS_USER(ctx)) {                                           \
+      tcg_gen_movi_i32(cpu_pc, ctx->pc);                        \
       if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
           gen_helper_raise_slot_illegal_instruction(cpu_env);   \
       } else {                                                  \
           gen_helper_raise_illegal_instruction(cpu_env);        \
       }                                                         \
-      ctx->bstate = BS_EXCP;                                    \
+      ctx->bstate = BS_BRANCH;                                  \
       return;                                                   \
   }
 
 #define CHECK_FPU_ENABLED                                       \
   if (ctx->flags & SR_FD) {                                     \
+      tcg_gen_movi_i32(cpu_pc, ctx->pc);                        \
       if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
           gen_helper_raise_slot_fpu_disable(cpu_env);           \
       } else {                                                  \
           gen_helper_raise_fpu_disable(cpu_env);                \
       }                                                         \
-      ctx->bstate = BS_EXCP;                                    \
+      ctx->bstate = BS_BRANCH;                                  \
       return;                                                   \
   }
 
@@ -519,7 +509,7 @@
 	tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_S);
 	return;
     case 0x0008:		/* clrt */
-	gen_clr_t();
+        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
 	return;
     case 0x0038:		/* ldtlb */
 	CHECK_PRIVILEGED
@@ -537,21 +527,22 @@
 	tcg_gen_ori_i32(cpu_sr, cpu_sr, SR_S);
 	return;
     case 0x0018:		/* sett */
-	gen_set_t();
+        tcg_gen_ori_i32(cpu_sr, cpu_sr, SR_T);
 	return;
     case 0xfbfd:		/* frchg */
 	tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_FR);
 	ctx->bstate = BS_STOP;
 	return;
     case 0xf3fd:		/* fschg */
-	tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_SZ);
+        tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_SZ);
 	ctx->bstate = BS_STOP;
 	return;
     case 0x0009:		/* nop */
 	return;
     case 0x001b:		/* sleep */
 	CHECK_PRIVILEGED
-        gen_helper_sleep(cpu_env, tcg_const_i32(ctx->pc + 2));
+        tcg_gen_movi_i32(cpu_pc, ctx->pc + 2);
+        gen_helper_sleep(cpu_env);
 	return;
     }
 
@@ -732,17 +723,7 @@
 	}
 	return;
     case 0x6009:		/* swap.w Rm,Rn */
-	{
-	    TCGv high, low;
-	    high = tcg_temp_new();
-	    tcg_gen_shli_i32(high, REG(B7_4), 16);
-	    low = tcg_temp_new();
-	    tcg_gen_shri_i32(low, REG(B7_4), 16);
-	    tcg_gen_ext16u_i32(low, low);
-	    tcg_gen_or_i32(REG(B11_8), high, low);
-	    tcg_temp_free(low);
-	    tcg_temp_free(high);
-	}
+        tcg_gen_rotli_i32(REG(B11_8), REG(B7_4), 16);
 	return;
     case 0x200d:		/* xtrct Rm,Rn */
 	{
@@ -751,7 +732,6 @@
 	    tcg_gen_shli_i32(high, REG(B7_4), 16);
 	    low = tcg_temp_new();
 	    tcg_gen_shri_i32(low, REG(B11_8), 16);
-	    tcg_gen_ext16u_i32(low, low);
 	    tcg_gen_or_i32(REG(B11_8), high, low);
 	    tcg_temp_free(low);
 	    tcg_temp_free(high);
@@ -761,10 +741,43 @@
 	tcg_gen_add_i32(REG(B11_8), REG(B11_8), REG(B7_4));
 	return;
     case 0x300e:		/* addc Rm,Rn */
-        gen_helper_addc(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
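+            /* addc Rm,Rn: Rn = Rn + Rm + T, with T set to the carry out
+               of either addition.  */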
+            TCGv t0, t1, t2;
+            t0 = tcg_temp_new();
+            tcg_gen_andi_i32(t0, cpu_sr, SR_T);
+            t1 = tcg_temp_new();
+            tcg_gen_add_i32(t1, REG(B7_4), REG(B11_8));
+            tcg_gen_add_i32(t0, t0, t1);
+            t2 = tcg_temp_new();
+            tcg_gen_setcond_i32(TCG_COND_GTU, t2, REG(B11_8), t1);
+            tcg_gen_setcond_i32(TCG_COND_GTU, t1, t1, t0);
+            tcg_gen_or_i32(t1, t1, t2);
+            tcg_temp_free(t2);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_temp_free(t1);
+            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x300f:		/* addv Rm,Rn */
-        gen_helper_addv(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
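+            /* addv Rm,Rn: Rn = Rn + Rm, with T set on signed overflow,
+               i.e. when (sum ^ Rn) & ~(Rm ^ Rn) is negative.  */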
+            TCGv t0, t1, t2;
+            t0 = tcg_temp_new();
+            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
+            t1 = tcg_temp_new();
+            tcg_gen_xor_i32(t1, t0, REG(B11_8));
+            t2 = tcg_temp_new();
+            tcg_gen_xor_i32(t2, REG(B7_4), REG(B11_8));
+            tcg_gen_andc_i32(t1, t1, t2);
+            tcg_temp_free(t2);
+            tcg_gen_shri_i32(t1, t1, 31);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_temp_free(t1);
+            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x2009:		/* and Rm,Rn */
 	tcg_gen_and_i32(REG(B11_8), REG(B11_8), REG(B7_4));
@@ -1013,10 +1026,43 @@
 	tcg_gen_sub_i32(REG(B11_8), REG(B11_8), REG(B7_4));
 	return;
     case 0x300a:		/* subc Rm,Rn */
-        gen_helper_subc(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
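+            /* subc Rm,Rn: Rn = Rn - Rm - T, with T set to the borrow out
+               of either subtraction.  */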
+            TCGv t0, t1, t2;
+            t0 = tcg_temp_new();
+            tcg_gen_andi_i32(t0, cpu_sr, SR_T);
+            t1 = tcg_temp_new();
+            tcg_gen_sub_i32(t1, REG(B11_8), REG(B7_4));
+            tcg_gen_sub_i32(t0, t1, t0);
+            t2 = tcg_temp_new();
+            tcg_gen_setcond_i32(TCG_COND_LTU, t2, REG(B11_8), t1);
+            tcg_gen_setcond_i32(TCG_COND_LTU, t1, t1, t0);
+            tcg_gen_or_i32(t1, t1, t2);
+            tcg_temp_free(t2);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_temp_free(t1);
+            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x300b:		/* subv Rm,Rn */
-        gen_helper_subv(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
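+            /* subv Rm,Rn: Rn = Rn - Rm, with T set on signed overflow,
+               i.e. when (diff ^ Rn) & (Rn ^ Rm) is negative.  */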
+            TCGv t0, t1, t2;
+            t0 = tcg_temp_new();
+            tcg_gen_sub_i32(t0, REG(B11_8), REG(B7_4));
+            t1 = tcg_temp_new();
+            tcg_gen_xor_i32(t1, t0, REG(B11_8));
+            t2 = tcg_temp_new();
+            tcg_gen_xor_i32(t2, REG(B11_8), REG(B7_4));
+            tcg_gen_and_i32(t1, t1, t2);
+            tcg_temp_free(t2);
+            tcg_gen_shri_i32(t1, t1, 31);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_temp_free(t1);
+            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x2008:		/* tst Rm,Rn */
 	{
@@ -1031,7 +1077,7 @@
 	return;
     case 0xf00c: /* fmov {F,D,X}Rm,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_SZ) {
+        if (ctx->flags & FPSCR_SZ) {
 	    TCGv_i64 fp = tcg_temp_new_i64();
 	    gen_load_fpr64(fp, XREG(B7_4));
 	    gen_store_fpr64(fp, XREG(B11_8));
@@ -1042,7 +1088,7 @@
 	return;
     case 0xf00a: /* fmov {F,D,X}Rm,@Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_SZ) {
+        if (ctx->flags & FPSCR_SZ) {
 	    TCGv addr_hi = tcg_temp_new();
 	    int fr = XREG(B7_4);
 	    tcg_gen_addi_i32(addr_hi, REG(B11_8), 4);
@@ -1055,7 +1101,7 @@
 	return;
     case 0xf008: /* fmov @Rm,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_SZ) {
+        if (ctx->flags & FPSCR_SZ) {
 	    TCGv addr_hi = tcg_temp_new();
 	    int fr = XREG(B11_8);
 	    tcg_gen_addi_i32(addr_hi, REG(B7_4), 4);
@@ -1068,7 +1114,7 @@
 	return;
     case 0xf009: /* fmov @Rm+,{F,D,X}Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_SZ) {
+        if (ctx->flags & FPSCR_SZ) {
 	    TCGv addr_hi = tcg_temp_new();
 	    int fr = XREG(B11_8);
 	    tcg_gen_addi_i32(addr_hi, REG(B7_4), 4);
@@ -1083,7 +1129,7 @@
 	return;
     case 0xf00b: /* fmov {F,D,X}Rm,@-Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_SZ) {
+        if (ctx->flags & FPSCR_SZ) {
 	    TCGv addr = tcg_temp_new_i32();
 	    int fr = XREG(B7_4);
 	    tcg_gen_subi_i32(addr, REG(B11_8), 4);
@@ -1106,7 +1152,7 @@
 	{
 	    TCGv addr = tcg_temp_new_i32();
 	    tcg_gen_add_i32(addr, REG(B7_4), REG(0));
-	    if (ctx->fpscr & FPSCR_SZ) {
+            if (ctx->flags & FPSCR_SZ) {
 		int fr = XREG(B11_8);
 		tcg_gen_qemu_ld32u(cpu_fregs[fr	 ], addr, ctx->memidx);
 		tcg_gen_addi_i32(addr, addr, 4);
@@ -1122,7 +1168,7 @@
 	{
 	    TCGv addr = tcg_temp_new();
 	    tcg_gen_add_i32(addr, REG(B11_8), REG(0));
-	    if (ctx->fpscr & FPSCR_SZ) {
+            if (ctx->flags & FPSCR_SZ) {
 		int fr = XREG(B7_4);
 		tcg_gen_qemu_ld32u(cpu_fregs[fr	 ], addr, ctx->memidx);
 		tcg_gen_addi_i32(addr, addr, 4);
@@ -1141,7 +1187,7 @@
     case 0xf005: /* fcmp/gt Rm,Rn - FPSCR: R[PR,Enable.V]/W[Cause,Flag] */
 	{
 	    CHECK_FPU_ENABLED
-	    if (ctx->fpscr & FPSCR_PR) {
+            if (ctx->flags & FPSCR_PR) {
                 TCGv_i64 fp0, fp1;
 
 		if (ctx->opcode & 0x0110)
@@ -1210,7 +1256,7 @@
     case 0xf00e: /* fmac FR0,RM,Rn */
         {
             CHECK_FPU_ENABLED
-            if (ctx->fpscr & FPSCR_PR) {
+            if (ctx->flags & FPSCR_PR) {
                 break; /* illegal instruction */
             } else {
                 gen_helper_fmac_FT(cpu_fregs[FREG(B11_8)], cpu_env,
@@ -1366,6 +1412,7 @@
 	{
 	    TCGv imm;
 	    CHECK_NOT_DELAY_SLOT
+            tcg_gen_movi_i32(cpu_pc, ctx->pc);
 	    imm = tcg_const_i32(B7_0);
             gen_helper_trapa(cpu_env, imm);
 	    tcg_temp_free(imm);
@@ -1605,7 +1652,7 @@
         */
         if (ctx->features & SH_FEATURE_SH4A) {
 	    int label = gen_new_label();
-	    gen_clr_t();
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
 	    tcg_gen_or_i32(cpu_sr, cpu_sr, cpu_ldst);
 	    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ldst, 0, label);
 	    tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
@@ -1739,7 +1786,7 @@
 	return;
     case 0xf02d: /* float FPUL,FRn/DRn - FPSCR: R[PR,Enable.I]/W[Cause,Flag] */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_PR) {
+        if (ctx->flags & FPSCR_PR) {
 	    TCGv_i64 fp;
 	    if (ctx->opcode & 0x0100)
 		break; /* illegal instruction */
@@ -1754,7 +1801,7 @@
 	return;
     case 0xf03d: /* ftrc FRm/DRm,FPUL - FPSCR: R[PR,Enable.V]/W[Cause,Flag] */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_PR) {
+        if (ctx->flags & FPSCR_PR) {
 	    TCGv_i64 fp;
 	    if (ctx->opcode & 0x0100)
 		break; /* illegal instruction */
@@ -1775,7 +1822,7 @@
 	return;
     case 0xf05d: /* fabs FRn/DRn */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_PR) {
+        if (ctx->flags & FPSCR_PR) {
 	    if (ctx->opcode & 0x0100)
 		break; /* illegal instruction */
 	    TCGv_i64 fp = tcg_temp_new_i64();
@@ -1789,7 +1836,7 @@
 	return;
     case 0xf06d: /* fsqrt FRn */
 	CHECK_FPU_ENABLED
-	if (ctx->fpscr & FPSCR_PR) {
+        if (ctx->flags & FPSCR_PR) {
 	    if (ctx->opcode & 0x0100)
 		break; /* illegal instruction */
 	    TCGv_i64 fp = tcg_temp_new_i64();
@@ -1807,13 +1854,13 @@
 	break;
     case 0xf08d: /* fldi0 FRn - FPSCR: R[PR] */
 	CHECK_FPU_ENABLED
-	if (!(ctx->fpscr & FPSCR_PR)) {
+        if (!(ctx->flags & FPSCR_PR)) {
 	    tcg_gen_movi_i32(cpu_fregs[FREG(B11_8)], 0);
 	}
 	return;
     case 0xf09d: /* fldi1 FRn - FPSCR: R[PR] */
 	CHECK_FPU_ENABLED
-	if (!(ctx->fpscr & FPSCR_PR)) {
+        if (!(ctx->flags & FPSCR_PR)) {
 	    tcg_gen_movi_i32(cpu_fregs[FREG(B11_8)], 0x3f800000);
 	}
 	return;
@@ -1837,7 +1884,7 @@
 	return;
     case 0xf0ed: /* fipr FVm,FVn */
         CHECK_FPU_ENABLED
-        if ((ctx->fpscr & FPSCR_PR) == 0) {
+        if ((ctx->flags & FPSCR_PR) == 0) {
             TCGv m, n;
             m = tcg_const_i32((ctx->opcode >> 8) & 3);
             n = tcg_const_i32((ctx->opcode >> 10) & 3);
@@ -1850,7 +1897,7 @@
     case 0xf0fd: /* ftrv XMTRX,FVn */
         CHECK_FPU_ENABLED
         if ((ctx->opcode & 0x0300) == 0x0100 &&
-            (ctx->fpscr & FPSCR_PR) == 0) {
+            (ctx->flags & FPSCR_PR) == 0) {
             TCGv n;
             n = tcg_const_i32((ctx->opcode >> 10) & 3);
             gen_helper_ftrv(cpu_env, n);
@@ -1864,12 +1911,13 @@
 	    ctx->opcode, ctx->pc);
     fflush(stderr);
 #endif
+    tcg_gen_movi_i32(cpu_pc, ctx->pc);
     if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
         gen_helper_raise_slot_illegal_instruction(cpu_env);
     } else {
         gen_helper_raise_illegal_instruction(cpu_env);
     }
-    ctx->bstate = BS_EXCP;
+    ctx->bstate = BS_BRANCH;
 }
 
 static void decode_opc(DisasContext * ctx)
@@ -1923,16 +1971,14 @@
     ctx.pc = pc_start;
     ctx.flags = (uint32_t)tb->flags;
     ctx.bstate = BS_NONE;
-    ctx.sr = env->sr;
-    ctx.fpscr = env->fpscr;
-    ctx.memidx = (env->sr & SR_MD) == 0 ? 1 : 0;
+    ctx.memidx = (ctx.flags & SR_MD) == 0 ? 1 : 0;
     /* We don't know if the delayed pc came from a dynamic or static branch,
        so assume it is a dynamic branch.  */
     ctx.delayed_pc = -1; /* use delayed pc from env pointer */
     ctx.tb = tb;
     ctx.singlestep_enabled = env->singlestep_enabled;
     ctx.features = env->features;
-    ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA);
+    ctx.has_movcal = (ctx.flags & TB_FLAG_PENDING_MOVCA);
 
     ii = -1;
     num_insns = 0;
@@ -1947,7 +1993,7 @@
 		    /* We have hit a breakpoint - make sure PC is up-to-date */
 		    tcg_gen_movi_i32(cpu_pc, ctx.pc);
                     gen_helper_debug(cpu_env);
-		    ctx.bstate = BS_EXCP;
+                    ctx.bstate = BS_BRANCH;
 		    break;
 		}
 	    }
@@ -2022,9 +2068,6 @@
     }
 
 #ifdef DEBUG_DISAS
-#ifdef SH4_DEBUG_DISAS
-    qemu_log_mask(CPU_LOG_TB_IN_ASM, "\n");
-#endif
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
 	qemu_log("IN:\n");	/* , lookup_symbol(pc_start)); */
 	log_target_disas(pc_start, ctx.pc - pc_start, 0);
diff --git a/target-xtensa/cpu.h b/target-xtensa/cpu.h
index 177094a..7348277 100644
--- a/target-xtensa/cpu.h
+++ b/target-xtensa/cpu.h
@@ -36,6 +36,7 @@
 #include "config.h"
 #include "qemu-common.h"
 #include "cpu-defs.h"
+#include "fpu/softfloat.h"
 
 #define TARGET_HAS_ICE 1
 
@@ -325,6 +326,8 @@
     uint32_t sregs[256];
     uint32_t uregs[256];
     uint32_t phys_regs[MAX_NAREG];
+    float32 fregs[16];
+    float_status fp_status;
 
     xtensa_tlb_entry itlb[7][MAX_TLB_WAY_SIZE];
     xtensa_tlb_entry dtlb[10][MAX_TLB_WAY_SIZE];
@@ -465,6 +468,8 @@
 #define XTENSA_TBFLAG_LITBASE 0x8
 #define XTENSA_TBFLAG_DEBUG 0x10
 #define XTENSA_TBFLAG_ICOUNT 0x20
+#define XTENSA_TBFLAG_CPENABLE_MASK 0x3fc0
+#define XTENSA_TBFLAG_CPENABLE_SHIFT 6
 
 static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
         target_ulong *cs_base, int *flags)
@@ -488,6 +493,9 @@
             *flags |= XTENSA_TBFLAG_ICOUNT;
         }
     }
+    if (xtensa_option_enabled(env->config, XTENSA_OPTION_COPROCESSOR)) {
+        *flags |= env->sregs[CPENABLE] << XTENSA_TBFLAG_CPENABLE_SHIFT;
+    }
 }
 
 #include "cpu-all.h"
diff --git a/target-xtensa/helper.h b/target-xtensa/helper.h
index 152fec0..4cc0088 100644
--- a/target-xtensa/helper.h
+++ b/target-xtensa/helper.h
@@ -36,4 +36,25 @@
 DEF_HELPER_3(wsr_dbreaka, void, env, i32, i32)
 DEF_HELPER_3(wsr_dbreakc, void, env, i32, i32)
 
+DEF_HELPER_2(wur_fcr, void, env, i32)
+DEF_HELPER_FLAGS_1(abs_s, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
+DEF_HELPER_FLAGS_1(neg_s, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
+DEF_HELPER_3(add_s, f32, env, f32, f32)
+DEF_HELPER_3(sub_s, f32, env, f32, f32)
+DEF_HELPER_3(mul_s, f32, env, f32, f32)
+DEF_HELPER_4(madd_s, f32, env, f32, f32, f32)
+DEF_HELPER_4(msub_s, f32, env, f32, f32, f32)
+DEF_HELPER_FLAGS_3(ftoi, TCG_CALL_CONST | TCG_CALL_PURE, i32, f32, i32, i32)
+DEF_HELPER_FLAGS_3(ftoui, TCG_CALL_CONST | TCG_CALL_PURE, i32, f32, i32, i32)
+DEF_HELPER_3(itof, f32, env, i32, i32)
+DEF_HELPER_3(uitof, f32, env, i32, i32)
+
+DEF_HELPER_4(un_s, void, env, i32, f32, f32)
+DEF_HELPER_4(oeq_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ueq_s, void, env, i32, f32, f32)
+DEF_HELPER_4(olt_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ult_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ole_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ule_s, void, env, i32, f32, f32)
+
 #include "def-helper.h"
diff --git a/target-xtensa/op_helper.c b/target-xtensa/op_helper.c
index 2659c0e..ae0c099 100644
--- a/target-xtensa/op_helper.c
+++ b/target-xtensa/op_helper.c
@@ -771,3 +771,137 @@
     }
     env->sregs[DBREAKC + i] = v;
 }
+
+void HELPER(wur_fcr)(CPUXtensaState *env, uint32_t v)
+{
+    static const int rounding_mode[] = {
+        float_round_nearest_even,
+        float_round_to_zero,
+        float_round_up,
+        float_round_down,
+    };
+
+    env->uregs[FCR] = v & 0xfffff07f;
+    set_float_rounding_mode(rounding_mode[v & 3], &env->fp_status);
+}
+
+float32 HELPER(abs_s)(float32 v)
+{
+    return float32_abs(v);
+}
+
+float32 HELPER(neg_s)(float32 v)
+{
+    return float32_chs(v);
+}
+
+float32 HELPER(add_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_add(a, b, &env->fp_status);
+}
+
+float32 HELPER(sub_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_sub(a, b, &env->fp_status);
+}
+
+float32 HELPER(mul_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_mul(a, b, &env->fp_status);
+}
+
+float32 HELPER(madd_s)(CPUXtensaState *env, float32 a, float32 b, float32 c)
+{
+    return float32_muladd(b, c, a, 0,
+            &env->fp_status);
+}
+
+float32 HELPER(msub_s)(CPUXtensaState *env, float32 a, float32 b, float32 c)
+{
+    return float32_muladd(b, c, a, float_muladd_negate_product,
+            &env->fp_status);
+}
+
+uint32_t HELPER(ftoi)(float32 v, uint32_t rounding_mode, uint32_t scale)
+{
+    float_status fp_status = {0};
+
+    set_float_rounding_mode(rounding_mode, &fp_status);
+    return float32_to_int32(
+            float32_scalbn(v, scale, &fp_status), &fp_status);
+}
+
+uint32_t HELPER(ftoui)(float32 v, uint32_t rounding_mode, uint32_t scale)
+{
+    float_status fp_status = {0};
+    float32 res;
+
+    set_float_rounding_mode(rounding_mode, &fp_status);
+
+    res = float32_scalbn(v, scale, &fp_status);
+
+    if (float32_is_neg(v) && !float32_is_any_nan(v)) {
+        return float32_to_int32(res, &fp_status);
+    } else {
+        return float32_to_uint32(res, &fp_status);
+    }
+}
+
+float32 HELPER(itof)(CPUXtensaState *env, uint32_t v, uint32_t scale)
+{
+    return float32_scalbn(int32_to_float32(v, &env->fp_status),
+            (int32_t)scale, &env->fp_status);
+}
+
+float32 HELPER(uitof)(CPUXtensaState *env, uint32_t v, uint32_t scale)
+{
+    return float32_scalbn(uint32_to_float32(v, &env->fp_status),
+            (int32_t)scale, &env->fp_status);
+}
+
+static inline void set_br(CPUXtensaState *env, bool v, uint32_t br)
+{
+    if (v) {
+        env->sregs[BR] |= br;
+    } else {
+        env->sregs[BR] &= ~br;
+    }
+}
+
+void HELPER(un_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_unordered_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(oeq_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_eq_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ueq_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v == float_relation_equal || v == float_relation_unordered, br);
+}
+
+void HELPER(olt_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_lt_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ult_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v == float_relation_less || v == float_relation_unordered, br);
+}
+
+void HELPER(ole_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_le_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ule_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v != float_relation_greater, br);
+}
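
The helpers above map the Xtensa FP boolean predicates onto softfloat comparison results: UEQ and ULT treat an unordered (NaN) operand as satisfying the predicate, and ULE is simply "not greater". A minimal host-float sketch of those truth tables (not QEMU's softfloat API, and ignoring exception-flag behaviour) is:

    /* Sketch of the unordered predicates computed by the new helpers. */
    #include <assert.h>
    #include <math.h>
    #include <stdbool.h>

    static bool ueq(float a, float b) { return a == b || isnan(a) || isnan(b); }
    static bool ult(float a, float b) { return a <  b || isnan(a) || isnan(b); }
    static bool ule(float a, float b) { return !(a > b); } /* NaN is never greater */

    int main(void)
    {
        float qnan = nanf("");
        assert(ueq(1.0f, 1.0f) && ueq(qnan, 1.0f));
        assert(ult(1.0f, 2.0f) && ult(qnan, 2.0f) && !ult(2.0f, 1.0f));
        assert(ule(1.0f, 1.0f) && ule(qnan, 1.0f) && !ule(2.0f, 1.0f));
        return 0;
    }
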
diff --git a/target-xtensa/overlay_tool.h b/target-xtensa/overlay_tool.h
index a3a5650..e395053 100644
--- a/target-xtensa/overlay_tool.h
+++ b/target-xtensa/overlay_tool.h
@@ -58,6 +58,7 @@
     XCHAL_OPTION(XCHAL_HAVE_SEXT, XTENSA_OPTION_MISC_OP_SEXT) | \
     XCHAL_OPTION(XCHAL_HAVE_CLAMPS, XTENSA_OPTION_MISC_OP_CLAMPS) | \
     XCHAL_OPTION(XCHAL_HAVE_CP, XTENSA_OPTION_COPROCESSOR) | \
+    XCHAL_OPTION(XCHAL_HAVE_BOOLEANS, XTENSA_OPTION_BOOLEAN) | \
     XCHAL_OPTION(XCHAL_HAVE_FP, XTENSA_OPTION_FP_COPROCESSOR) | \
     XCHAL_OPTION(XCHAL_HAVE_RELEASE_SYNC, XTENSA_OPTION_MP_SYNCHRO) | \
     XCHAL_OPTION(XCHAL_HAVE_S32C1I, XTENSA_OPTION_CONDITIONAL_STORE) | \
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 1900bd5..ba3ffcb 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -65,11 +65,14 @@
     bool debug;
     bool icount;
     TCGv_i32 next_icount;
+
+    unsigned cpenable;
 } DisasContext;
 
 static TCGv_ptr cpu_env;
 static TCGv_i32 cpu_pc;
 static TCGv_i32 cpu_R[16];
+static TCGv_i32 cpu_FR[16];
 static TCGv_i32 cpu_SR[256];
 static TCGv_i32 cpu_UR[256];
 
@@ -155,6 +158,12 @@
         "ar8", "ar9", "ar10", "ar11",
         "ar12", "ar13", "ar14", "ar15",
     };
+    static const char * const fregnames[] = {
+        "f0", "f1", "f2", "f3",
+        "f4", "f5", "f6", "f7",
+        "f8", "f9", "f10", "f11",
+        "f12", "f13", "f14", "f15",
+    };
     int i;
 
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
@@ -167,6 +176,12 @@
                 regnames[i]);
     }
 
+    for (i = 0; i < 16; i++) {
+        cpu_FR[i] = tcg_global_mem_new_i32(TCG_AREG0,
+                offsetof(CPUXtensaState, fregs[i]),
+                fregnames[i]);
+    }
+
     for (i = 0; i < 256; ++i) {
         if (sregnames[i]) {
             cpu_SR[i] = tcg_global_mem_new_i32(TCG_AREG0,
@@ -318,6 +333,15 @@
     }
 }
 
+static void gen_check_cpenable(DisasContext *dc, unsigned cp)
+{
+    if (option_enabled(dc, XTENSA_OPTION_COPROCESSOR) &&
+            !(dc->cpenable & (1 << cp))) {
+        gen_exception_cause(dc, COPROCESSOR0_DISABLED + cp);
+        dc->is_jmp = DISAS_UPDATE;
+    }
+}
+
 static void gen_jump_slot(DisasContext *dc, TCGv dest, int slot)
 {
     tcg_gen_mov_i32(cpu_pc, dest);
@@ -566,6 +590,13 @@
     }
 }
 
+static void gen_wsr_cpenable(DisasContext *dc, uint32_t sr, TCGv_i32 v)
+{
+    tcg_gen_andi_i32(cpu_SR[sr], v, 0xff);
+    /* This can change tb->flags, so exit tb */
+    gen_jumpi_check_loop_end(dc, -1);
+}
+
 static void gen_wsr_intset(DisasContext *dc, uint32_t sr, TCGv_i32 v)
 {
     tcg_gen_andi_i32(cpu_SR[sr], v,
@@ -668,6 +699,7 @@
         [DBREAKA + 1] = gen_wsr_dbreaka,
         [DBREAKC] = gen_wsr_dbreakc,
         [DBREAKC + 1] = gen_wsr_dbreakc,
+        [CPENABLE] = gen_wsr_cpenable,
         [INTSET] = gen_wsr_intset,
         [INTCLEAR] = gen_wsr_intclear,
         [INTENABLE] = gen_wsr_intenable,
@@ -692,6 +724,23 @@
     }
 }
 
+static void gen_wur(uint32_t ur, TCGv_i32 s)
+{
+    switch (ur) {
+    case FCR:
+        gen_helper_wur_fcr(cpu_env, s);
+        break;
+
+    case FSR:
+        tcg_gen_andi_i32(cpu_UR[ur], s, 0xffffff80);
+        break;
+
+    default:
+        tcg_gen_mov_i32(cpu_UR[ur], s);
+        break;
+    }
+}
+
 static void gen_load_store_alignment(DisasContext *dc, int shift,
         TCGv_i32 addr, bool no_hw_alignment)
 {
@@ -1761,13 +1810,11 @@
 
             case 15: /*WUR*/
                 gen_window_check1(dc, RRR_T);
-                {
-                    if (uregnames[RSR_SR]) {
-                        tcg_gen_mov_i32(cpu_UR[RSR_SR], cpu_R[RRR_T]);
-                    } else {
-                        qemu_log("WUR %d not implemented, ", RSR_SR);
-                        TBD();
-                    }
+                if (uregnames[RSR_SR]) {
+                    gen_wur(RSR_SR, cpu_R[RRR_T]);
+                } else {
+                    qemu_log("WUR %d not implemented, ", RSR_SR);
+                    TBD();
                 }
                 break;
 
@@ -1778,12 +1825,30 @@
         case 5:
             gen_window_check2(dc, RRR_R, RRR_T);
             {
-                int shiftimm = RRR_S | (OP1 << 4);
+                int shiftimm = RRR_S | ((OP1 & 1) << 4);
                 int maskimm = (1 << (OP2 + 1)) - 1;
 
                 TCGv_i32 tmp = tcg_temp_new_i32();
-                tcg_gen_shri_i32(tmp, cpu_R[RRR_T], shiftimm);
-                tcg_gen_andi_i32(cpu_R[RRR_R], tmp, maskimm);
+
+                if (shiftimm) {
+                    tcg_gen_shri_i32(tmp, cpu_R[RRR_T], shiftimm);
+                } else {
+                    tcg_gen_mov_i32(tmp, cpu_R[RRR_T]);
+                }
+
+                switch (maskimm) {
+                case 0xff:
+                    tcg_gen_ext8u_i32(cpu_R[RRR_R], tmp);
+                    break;
+
+                case 0xffff:
+                    tcg_gen_ext16u_i32(cpu_R[RRR_R], tmp);
+                    break;
+
+                default:
+                    tcg_gen_andi_i32(cpu_R[RRR_R], tmp, maskimm);
+                    break;
+                }
                 tcg_temp_free(tmp);
             }
             break;
@@ -1797,8 +1862,34 @@
             break;
 
         case 8: /*LSCXp*/
-            HAS_OPTION(XTENSA_OPTION_COPROCESSOR);
-            TBD();
+            switch (OP2) {
+            case 0: /*LSXf*/
+            case 1: /*LSXUf*/
+            case 4: /*SSXf*/
+            case 5: /*SSXUf*/
+                HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
+                gen_window_check2(dc, RRR_S, RRR_T);
+                gen_check_cpenable(dc, 0);
+                {
+                    TCGv_i32 addr = tcg_temp_new_i32();
+                    tcg_gen_add_i32(addr, cpu_R[RRR_S], cpu_R[RRR_T]);
+                    gen_load_store_alignment(dc, 2, addr, false);
+                    if (OP2 & 0x4) {
+                        tcg_gen_qemu_st32(cpu_FR[RRR_R], addr, dc->cring);
+                    } else {
+                        tcg_gen_qemu_ld32u(cpu_FR[RRR_R], addr, dc->cring);
+                    }
+                    if (OP2 & 0x1) {
+                        tcg_gen_mov_i32(cpu_R[RRR_S], addr);
+                    }
+                    tcg_temp_free(addr);
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         case 9: /*LSC4*/
@@ -1836,12 +1927,213 @@
 
         case 10: /*FP0*/
             HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
-            TBD();
+            switch (OP2) {
+            case 0: /*ADD.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_add_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 1: /*SUB.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_sub_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 2: /*MUL.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_mul_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 4: /*MADD.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_madd_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_R], cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 5: /*MSUB.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_msub_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_R], cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 8: /*ROUND.Sf*/
+            case 9: /*TRUNC.Sf*/
+            case 10: /*FLOOR.Sf*/
+            case 11: /*CEIL.Sf*/
+            case 14: /*UTRUNC.Sf*/
+                gen_window_check1(dc, RRR_R);
+                gen_check_cpenable(dc, 0);
+                {
+                    static const unsigned rounding_mode_const[] = {
+                        float_round_nearest_even,
+                        float_round_to_zero,
+                        float_round_down,
+                        float_round_up,
+                        [6] = float_round_to_zero,
+                    };
+                    TCGv_i32 rounding_mode = tcg_const_i32(
+                            rounding_mode_const[OP2 & 7]);
+                    TCGv_i32 scale = tcg_const_i32(RRR_T);
+
+                    if (OP2 == 14) {
+                        gen_helper_ftoui(cpu_R[RRR_R], cpu_FR[RRR_S],
+                                rounding_mode, scale);
+                    } else {
+                        gen_helper_ftoi(cpu_R[RRR_R], cpu_FR[RRR_S],
+                                rounding_mode, scale);
+                    }
+
+                    tcg_temp_free(rounding_mode);
+                    tcg_temp_free(scale);
+                }
+                break;
+
+            case 12: /*FLOAT.Sf*/
+            case 13: /*UFLOAT.Sf*/
+                gen_window_check1(dc, RRR_S);
+                gen_check_cpenable(dc, 0);
+                {
+                    TCGv_i32 scale = tcg_const_i32(-RRR_T);
+
+                    if (OP2 == 13) {
+                        gen_helper_uitof(cpu_FR[RRR_R], cpu_env,
+                                cpu_R[RRR_S], scale);
+                    } else {
+                        gen_helper_itof(cpu_FR[RRR_R], cpu_env,
+                                cpu_R[RRR_S], scale);
+                    }
+                    tcg_temp_free(scale);
+                }
+                break;
+
+            case 15: /*FP1OP*/
+                switch (RRR_T) {
+                case 0: /*MOV.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 1: /*ABS.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    gen_helper_abs_s(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 4: /*RFRf*/
+                    gen_window_check1(dc, RRR_R);
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_R[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 5: /*WFRf*/
+                    gen_window_check1(dc, RRR_S);
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_R[RRR_S]);
+                    break;
+
+                case 6: /*NEG.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    gen_helper_neg_s(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                default: /*reserved*/
+                    RESERVED();
+                    break;
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         case 11: /*FP1*/
             HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
-            TBD();
+
+#define gen_compare(rel, br, a, b) \
+    do { \
+        TCGv_i32 bit = tcg_const_i32(1 << br); \
+        \
+        gen_check_cpenable(dc, 0); \
+        gen_helper_##rel(cpu_env, bit, cpu_FR[a], cpu_FR[b]); \
+        tcg_temp_free(bit); \
+    } while (0)
+
+            switch (OP2) {
+            case 1: /*UN.Sf*/
+                gen_compare(un_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 2: /*OEQ.Sf*/
+                gen_compare(oeq_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 3: /*UEQ.Sf*/
+                gen_compare(ueq_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 4: /*OLT.Sf*/
+                gen_compare(olt_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 5: /*ULT.Sf*/
+                gen_compare(ult_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 6: /*OLE.Sf*/
+                gen_compare(ole_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 7: /*ULE.Sf*/
+                gen_compare(ule_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+#undef gen_compare
+
+            case 8: /*MOVEQZ.Sf*/
+            case 9: /*MOVNEZ.Sf*/
+            case 10: /*MOVLTZ.Sf*/
+            case 11: /*MOVGEZ.Sf*/
+                gen_window_check1(dc, RRR_T);
+                gen_check_cpenable(dc, 0);
+                {
+                    static const TCGCond cond[] = {
+                        TCG_COND_NE,
+                        TCG_COND_EQ,
+                        TCG_COND_GE,
+                        TCG_COND_LT
+                    };
+                    int label = gen_new_label();
+                    tcg_gen_brcondi_i32(cond[OP2 - 8], cpu_R[RRR_T], 0, label);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    gen_set_label(label);
+                }
+                break;
+
+            case 12: /*MOVF.Sf*/
+            case 13: /*MOVT.Sf*/
+                HAS_OPTION(XTENSA_OPTION_BOOLEAN);
+                gen_check_cpenable(dc, 0);
+                {
+                    int label = gen_new_label();
+                    TCGv_i32 tmp = tcg_temp_new_i32();
+
+                    tcg_gen_andi_i32(tmp, cpu_SR[BR], 1 << RRR_T);
+                    tcg_gen_brcondi_i32(
+                            OP2 & 1 ? TCG_COND_EQ : TCG_COND_NE,
+                            tmp, 0, label);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    gen_set_label(label);
+                    tcg_temp_free(tmp);
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         default: /*reserved*/
@@ -2072,8 +2364,34 @@
         break;
 
     case 3: /*LSCIp*/
-        HAS_OPTION(XTENSA_OPTION_COPROCESSOR);
-        TBD();
+        switch (RRI8_R) {
+        case 0: /*LSIf*/
+        case 4: /*SSIf*/
+        case 8: /*LSIUf*/
+        case 12: /*SSIUf*/
+            HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
+            gen_window_check1(dc, RRI8_S);
+            gen_check_cpenable(dc, 0);
+            {
+                TCGv_i32 addr = tcg_temp_new_i32();
+                tcg_gen_addi_i32(addr, cpu_R[RRI8_S], RRI8_IMM8 << 2);
+                gen_load_store_alignment(dc, 2, addr, false);
+                if (RRI8_R & 0x4) {
+                    tcg_gen_qemu_st32(cpu_FR[RRI8_T], addr, dc->cring);
+                } else {
+                    tcg_gen_qemu_ld32u(cpu_FR[RRI8_T], addr, dc->cring);
+                }
+                if (RRI8_R & 0x8) {
+                    tcg_gen_mov_i32(cpu_R[RRI8_S], addr);
+                }
+                tcg_temp_free(addr);
+            }
+            break;
+
+        default: /*reserved*/
+            RESERVED();
+            break;
+        }
         break;
 
     case 4: /*MAC16d*/
@@ -2502,7 +2820,9 @@
         break;
     }
 
-    gen_check_loop_end(dc, 0);
+    if (dc->is_jmp == DISAS_NEXT) {
+        gen_check_loop_end(dc, 0);
+    }
     dc->pc = dc->next_pc;
 
     return;
@@ -2569,6 +2889,8 @@
     dc.ccount_delta = 0;
     dc.debug = tb->flags & XTENSA_TBFLAG_DEBUG;
     dc.icount = tb->flags & XTENSA_TBFLAG_ICOUNT;
+    dc.cpenable = (tb->flags & XTENSA_TBFLAG_CPENABLE_MASK) >>
+        XTENSA_TBFLAG_CPENABLE_SHIFT;
 
     init_litbase(&dc);
     init_sar_tracker(&dc);
@@ -2710,6 +3032,16 @@
         cpu_fprintf(f, "AR%02d=%08x%c", i, env->phys_regs[i],
                 (i % 4) == 3 ? '\n' : ' ');
     }
+
+    if (xtensa_option_enabled(env->config, XTENSA_OPTION_FP_COPROCESSOR)) {
+        cpu_fprintf(f, "\n");
+
+        for (i = 0; i < 16; ++i) {
+            cpu_fprintf(f, "F%02d=%08x (%+10.8e)%c", i,
+                    float32_val(env->fregs[i]),
+                    *(float *)&env->fregs[i], (i % 2) == 1 ? '\n' : ' ');
+        }
+    }
 }
 
 void restore_state_to_opc(CPUXtensaState *env, TranslationBlock *tb, int pc_pos)
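
For clarity, this is what the gen_compare macro in the FP1 case above expands to for a single relation (OLT.Sf); the other comparison cases differ only in the helper that is called:

    {
        TCGv_i32 bit = tcg_const_i32(1 << RRR_R);

        gen_check_cpenable(dc, 0);
        gen_helper_olt_s(cpu_env, bit, cpu_FR[RRR_S], cpu_FR[RRR_T]);
        tcg_temp_free(bit);
    }
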
diff --git a/tcg/README b/tcg/README
index cfdfd96..33783ee 100644
--- a/tcg/README
+++ b/tcg/README
@@ -307,6 +307,12 @@
 
 Set DEST to 1 if (T1 cond T2) is true, otherwise set to 0.
 
+* movcond_i32/i64 cond, dest, c1, c2, v1, v2
+
+dest = (c1 cond c2 ? v1 : v2)
+
+Set DEST to V1 if (C1 cond C2) is true, otherwise set to V2.
+
 ********* Type conversions
 
 * ext_i32_i64 t0, t1
@@ -386,7 +392,8 @@
 
 Exit the current TB and jump to the TB index 'index' (constant) if the
 current TB was linked to this TB. Otherwise execute the next
-instructions.
+instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
+at most once with each slot index per TB.
 
 * qemu_ld8u t0, t1, flags
 qemu_ld8s t0, t1, flags
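
As a usage illustration for the movcond op documented above, a frontend can select between two values without emitting a branch. The sketch below assumes a tcg_gen_movcond_i32 generator helper whose operands follow the documented order (cond, dest, c1, c2, v1, v2):

    /* dst = (a <u b) ? a : b, i.e. an unsigned minimum, emitted with a
       single movcond_i32 op (sketch; generator helper name assumed). */
    static void gen_umin_i32(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b)
    {
        tcg_gen_movcond_i32(TCG_COND_LTU, dst, a, b, a, b);
    }
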
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index aed3b53..2bad0a2 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -145,12 +145,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index c0b8f72..e2299ca 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -73,6 +73,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #define TCG_TARGET_HAS_GUEST_BASE
 
diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
index 8b81b70..2c79c10 100644
--- a/tcg/hppa/tcg-target.c
+++ b/tcg/hppa/tcg-target.c
@@ -175,12 +175,6 @@
     *insn_ptr = insn;
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -820,19 +814,34 @@
     tcg_out32(s, op);
 }
 
+static TCGCond const tcg_high_cond[] = {
+    [TCG_COND_EQ] = TCG_COND_EQ,
+    [TCG_COND_NE] = TCG_COND_NE,
+    [TCG_COND_LT] = TCG_COND_LT,
+    [TCG_COND_LE] = TCG_COND_LT,
+    [TCG_COND_GT] = TCG_COND_GT,
+    [TCG_COND_GE] = TCG_COND_GT,
+    [TCG_COND_LTU] = TCG_COND_LTU,
+    [TCG_COND_LEU] = TCG_COND_LTU,
+    [TCG_COND_GTU] = TCG_COND_GTU,
+    [TCG_COND_GEU] = TCG_COND_GTU
+};
+
 static void tcg_out_brcond2(TCGContext *s, int cond, TCGArg al, TCGArg ah,
                             TCGArg bl, int blconst, TCGArg bh, int bhconst,
                             int label_index)
 {
     switch (cond) {
     case TCG_COND_EQ:
-    case TCG_COND_NE:
-        tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, al, bl, blconst);
-        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
+        tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, al, bl, blconst);
+        tcg_out_brcond(s, TCG_COND_EQ, ah, bh, bhconst, label_index);
         break;
-
+    case TCG_COND_NE:
+        tcg_out_brcond(s, TCG_COND_NE, al, bl, bhconst, label_index);
+        tcg_out_brcond(s, TCG_COND_NE, ah, bh, bhconst, label_index);
+        break;
     default:
-        tcg_out_brcond(s, cond, ah, bh, bhconst, label_index);
+        tcg_out_brcond(s, tcg_high_cond[cond], ah, bh, bhconst, label_index);
         tcg_out_comclr(s, TCG_COND_NE, TCG_REG_R0, ah, bh, bhconst);
         tcg_out_brcond(s, tcg_unsigned_cond(cond),
                        al, bl, blconst, label_index);
@@ -853,9 +862,8 @@
 {
     int scratch = TCG_REG_R20;
 
-    if (ret != al && ret != ah
-        && (blconst || ret != bl)
-        && (bhconst || ret != bh)) {
+    /* Note that the low parts are fully consumed before scratch is set.  */
+    if (ret != ah && (bhconst || ret != bh)) {
         scratch = ret;
     }
 
@@ -867,18 +875,49 @@
         tcg_out_movi(s, TCG_TYPE_I32, scratch, cond == TCG_COND_NE);
         break;
 
-    default:
+    case TCG_COND_GE:
+    case TCG_COND_GEU:
+    case TCG_COND_LT:
+    case TCG_COND_LTU:
+        /* Optimize compares with low part zero.  */
+        if (bl == 0) {
+            tcg_out_setcond(s, cond, ret, ah, bh, bhconst);
+            return;
+        }
+        /* FALLTHRU */
+
+    case TCG_COND_LE:
+    case TCG_COND_LEU:
+    case TCG_COND_GT:
+    case TCG_COND_GTU:
+        /* <= : ah < bh | (ah == bh && al <= bl) */
         tcg_out_setcond(s, tcg_unsigned_cond(cond), scratch, al, bl, blconst);
         tcg_out_comclr(s, TCG_COND_EQ, TCG_REG_R0, ah, bh, bhconst);
         tcg_out_movi(s, TCG_TYPE_I32, scratch, 0);
-        tcg_out_comclr(s, cond, TCG_REG_R0, ah, bh, bhconst);
+        tcg_out_comclr(s, tcg_invert_cond(tcg_high_cond[cond]),
+                       TCG_REG_R0, ah, bh, bhconst);
         tcg_out_movi(s, TCG_TYPE_I32, scratch, 1);
         break;
+
+    default:
+        tcg_abort();
     }
 
     tcg_out_mov(s, TCG_TYPE_I32, ret, scratch);
 }
 
+static void tcg_out_movcond(TCGContext *s, int cond, TCGArg ret,
+                            TCGArg c1, TCGArg c2, int c2const,
+                            TCGArg v1, int v1const)
+{
+    tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, c1, c2, c2const);
+    if (v1const) {
+        tcg_out_movi(s, TCG_TYPE_I32, ret, v1);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_I32, ret, v1);
+    }
+}
+
 #if defined(CONFIG_SOFTMMU)
 #include "../../softmmu_defs.h"
 
@@ -943,10 +982,11 @@
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, r1, offset);
     }
 
-    /* Compute the value that ought to appear in the TLB for a hit, namely, the page
-       of the address.  We include the low N bits of the address to catch unaligned
-       accesses and force them onto the slow path.  Do this computation after having
-       issued the load from the TLB slot to give the load time to complete.  */
+    /* Compute the value that ought to appear in the TLB for a hit, namely,
+       the page of the address.  We include the low N bits of the address
+       to catch unaligned accesses and force them onto the slow path.  Do
+       this computation after having issued the load from the TLB slot to
+       give the load time to complete.  */
     tcg_out_andi(s, r0, addrlo, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
 
     /* If not equal, jump to lab_miss. */
@@ -959,6 +999,36 @@
 
     return ret;
 }
+
+static int tcg_out_arg_reg32(TCGContext *s, int argno, TCGArg v, bool vconst)
+{
+    if (argno < 4) {
+        if (vconst) {
+            tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[argno], v);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[argno], v);
+        }
+    } else {
+        if (vconst && v != 0) {
+            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R20, v);
+            v = TCG_REG_R20;
+        }
+        tcg_out_st(s, TCG_TYPE_I32, v, TCG_REG_CALL_STACK,
+                   TCG_TARGET_CALL_STACK_OFFSET - ((argno - 3) * 4));
+    }
+    return argno + 1;
+}
+
+static int tcg_out_arg_reg64(TCGContext *s, int argno, TCGArg vl, TCGArg vh)
+{
+    /* 64-bit arguments must go in even reg pairs and stack slots.  */
+    if (argno & 1) {
+        argno++;
+    }
+    argno = tcg_out_arg_reg32(s, argno, vl, false);
+    argno = tcg_out_arg_reg32(s, argno, vh, false);
+    return argno;
+}
 #endif
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo_reg, int datahi_reg,
@@ -1039,39 +1109,36 @@
     /* Note that addrhi_reg is only used for 64-bit guests.  */
     int addrhi_reg = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);
     int mem_index = *args;
-    int lab1, lab2, argreg, offset;
+    int lab1, lab2, argno, offset;
 
     lab1 = gen_new_label();
     lab2 = gen_new_label();
 
     offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read);
-    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg, addrhi_reg,
-                              opc & 3, lab1, offset);
+    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg,
+                              addrhi_reg, opc & 3, lab1, offset);
 
     /* TLB Hit.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : TCG_REG_R25),
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20,
+               (offset ? TCG_REG_R1 : TCG_REG_R25),
                offsetof(CPUArchState, tlb_table[mem_index][0].addend) - offset);
-    tcg_out_qemu_ld_direct(s, datalo_reg, datahi_reg, addrlo_reg, TCG_REG_R20, opc);
+    tcg_out_qemu_ld_direct(s, datalo_reg, datahi_reg, addrlo_reg,
+                           TCG_REG_R20, opc);
     tcg_out_branch(s, lab2, 1);
 
     /* TLB Miss.  */
     /* label1: */
     tcg_out_label(s, lab1, s->code_ptr);
 
-    argreg = TCG_REG_R26;
-    tcg_out_mov(s, TCG_TYPE_I32, argreg--, addrlo_reg);
+    argno = 0;
+    argno = tcg_out_arg_reg32(s, argno, TCG_AREG0, false);
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_mov(s, TCG_TYPE_I32, argreg--, addrhi_reg);
+        argno = tcg_out_arg_reg64(s, argno, addrlo_reg, addrhi_reg);
+    } else {
+        argno = tcg_out_arg_reg32(s, argno, addrlo_reg, false);
     }
-    tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+    argno = tcg_out_arg_reg32(s, argno, mem_index, true);
 
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
     tcg_out_call(s, qemu_ld_helpers[opc & 3]);
 
     switch (opc) {
@@ -1107,8 +1174,8 @@
 #endif
 }
 
-static void tcg_out_qemu_st_direct(TCGContext *s, int datalo_reg, int datahi_reg,
-                                   int addr_reg, int opc)
+static void tcg_out_qemu_st_direct(TCGContext *s, int datalo_reg,
+                                   int datahi_reg, int addr_reg, int opc)
 {
 #ifdef TARGET_WORDS_BIGENDIAN
     const int bswap = 0;
@@ -1161,17 +1228,18 @@
     /* Note that addrhi_reg is only used for 64-bit guests.  */
     int addrhi_reg = (TARGET_LONG_BITS == 64 ? *args++ : TCG_REG_R0);
     int mem_index = *args;
-    int lab1, lab2, argreg, offset;
+    int lab1, lab2, argno, next, offset;
 
     lab1 = gen_new_label();
     lab2 = gen_new_label();
 
     offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
-    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg, addrhi_reg,
-                              opc, lab1, offset);
+    offset = tcg_out_tlb_read(s, TCG_REG_R26, TCG_REG_R25, addrlo_reg,
+                              addrhi_reg, opc, lab1, offset);
 
     /* TLB Hit.  */
-    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20, (offset ? TCG_REG_R1 : TCG_REG_R25),
+    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R20,
+               (offset ? TCG_REG_R1 : TCG_REG_R25),
                offsetof(CPUArchState, tlb_table[mem_index][0].addend) - offset);
 
     /* There are no indexed stores, so we must do this addition explicitly.
@@ -1184,63 +1252,46 @@
     /* label1: */
     tcg_out_label(s, lab1, s->code_ptr);
 
-    argreg = TCG_REG_R26;
-    tcg_out_mov(s, TCG_TYPE_I32, argreg--, addrlo_reg);
+    argno = 0;
+    argno = tcg_out_arg_reg32(s, argno, TCG_AREG0, false);
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_mov(s, TCG_TYPE_I32, argreg--, addrhi_reg);
+        argno = tcg_out_arg_reg64(s, argno, addrlo_reg, addrhi_reg);
+    } else {
+        argno = tcg_out_arg_reg32(s, argno, addrlo_reg, false);
     }
 
+    next = (argno < 4 ? tcg_target_call_iarg_regs[argno] : TCG_REG_R20);
     switch(opc) {
     case 0:
-        tcg_out_andi(s, argreg--, datalo_reg, 0xff);
-        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        tcg_out_andi(s, next, datalo_reg, 0xff);
+        argno = tcg_out_arg_reg32(s, argno, next, false);
         break;
     case 1:
-        tcg_out_andi(s, argreg--, datalo_reg, 0xffff);
-        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        tcg_out_andi(s, next, datalo_reg, 0xffff);
+        argno = tcg_out_arg_reg32(s, argno, next, false);
         break;
     case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, argreg--, datalo_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
+        argno = tcg_out_arg_reg32(s, argno, datalo_reg, false);
         break;
     case 3:
-        /* Because of the alignment required by the 64-bit data argument,
-           we will always use R23/R24.  Also, we will always run out of
-           argument registers for storing mem_index, so that will have
-           to go on the stack.  */
-        if (mem_index == 0) {
-            argreg = TCG_REG_R0;
-        } else {
-            argreg = TCG_REG_R20;
-            tcg_out_movi(s, TCG_TYPE_I32, argreg, mem_index);
-        }
-        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R23, datahi_reg);
-        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R24, datalo_reg);
-        tcg_out_st(s, TCG_TYPE_I32, argreg, TCG_REG_CALL_STACK,
-                   TCG_TARGET_CALL_STACK_OFFSET - 4);
+        argno = tcg_out_arg_reg64(s, argno, datalo_reg, datahi_reg);
         break;
     default:
         tcg_abort();
     }
+    argno = tcg_out_arg_reg32(s, argno, mem_index, true);
 
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
     tcg_out_call(s, qemu_st_helpers[opc]);
 
     /* label2: */
     tcg_out_label(s, lab2, s->code_ptr);
 #else
-    /* There are no indexed stores, so if GUEST_BASE is set we must do the add
-       explicitly.  Careful to avoid R20, which is used for the bswaps to follow.  */
+    /* There are no indexed stores, so if GUEST_BASE is set we must do
+       the add explicitly.  Careful to avoid R20, which is used for the
+       bswaps to follow.  */
     if (GUEST_BASE != 0) {
-        tcg_out_arith(s, TCG_REG_R31, addrlo_reg, TCG_GUEST_BASE_REG, INSN_ADDL);
+        tcg_out_arith(s, TCG_REG_R31, addrlo_reg,
+                      TCG_GUEST_BASE_REG, INSN_ADDL);
         addrlo_reg = TCG_REG_R31;
     }
     tcg_out_qemu_st_direct(s, datalo_reg, datahi_reg, addrlo_reg, opc);
@@ -1475,6 +1526,11 @@
                          args[3], const_args[3], args[4], const_args[4]);
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2], const_args[2],
+                        args[3], const_args[3]);
+        break;
+
     case INDEX_op_add2_i32:
         tcg_out_add2(s, args[0], args[1], args[2], args[3],
                      args[4], args[5], const_args[4]);
@@ -1583,6 +1639,10 @@
     { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
 
+    /* ??? We can actually support a signed 14-bit arg3, but we
+       only have existing constraints for a signed 11-bit.  */
+    { INDEX_op_movcond_i32, { "r", "rZ", "rI", "rI", "0" } },
+
     { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
     { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rK", "rZ" } },
 
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index 01ef960..5351353 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -96,6 +96,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
+#define TCG_TARGET_HAS_movcond_i32      1
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
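
The reworked hppa tcg_out_brcond2/tcg_out_setcond2 above split a double-word comparison into a compare of the high halves (using tcg_high_cond), with ties broken by an unsigned compare of the low halves. A plain-C sketch of that decomposition, using signed LE as the example:

    /* Sketch of the 64-bit-on-32-bit comparison scheme: signedness only
       matters for the high halves; a high-half tie falls back to an
       unsigned compare of the low halves. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool le_s64_by_halves(int32_t ah, uint32_t al, int32_t bh, uint32_t bl)
    {
        if (ah != bh) {
            return ah < bh;      /* tcg_high_cond[LE] == LT on the high half */
        }
        return al <= bl;         /* unsigned low-half compare on a tie */
    }

    int main(void)
    {
        assert(le_s64_by_halves(-1, 5, 0, 0));    /* negative < positive   */
        assert(le_s64_by_halves(0, 3, 0, 3));     /* equal                 */
        assert(!le_s64_by_halves(0, 4, 0, 3));    /* high tie, low decides */
        return 0;
    }
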
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 34c2df8..122d636 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -75,9 +75,7 @@
     TCG_REG_R8,
     TCG_REG_R9,
 #else
-    TCG_REG_EAX,
-    TCG_REG_EDX,
-    TCG_REG_ECX
+    /* 32 bit mode uses stack based calling convention (GCC default). */
 #endif
 };
 
@@ -88,6 +86,18 @@
 #endif
 };
 
+/* Registers used with the L constraint, which are the first argument
+   registers on x86_64, and two random call-clobbered registers on
+   i386. */
+#if TCG_TARGET_REG_BITS == 64
+# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
+# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
+# define TCG_REG_L2 tcg_target_call_iarg_regs[2]
+#else
+# define TCG_REG_L0 TCG_REG_EAX
+# define TCG_REG_L1 TCG_REG_EDX
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -114,16 +124,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    if (TCG_TARGET_REG_BITS == 64) {
-        return 6;
-    }
-
-    return 0;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -179,16 +179,16 @@
         /* qemu_ld/st address constraint */
     case 'L':
         ct->ct |= TCG_CT_REG;
-        if (TCG_TARGET_REG_BITS == 64) {
+#if TCG_TARGET_REG_BITS == 64
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[0]);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[1]);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[2]);
-        } else {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L2);
+#else
             tcg_regset_set32(ct->u.regs, 0, 0xff);
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_EAX);
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_EDX);
-        }
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
+#endif
         break;
 
     case 'e':
@@ -249,6 +249,7 @@
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
+#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 #define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
 #define OPC_DEC_r32	(0x48)
 #define OPC_IMUL_GvEv	(0xaf | P_EXT)
@@ -263,6 +264,7 @@
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
+#define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz	(0xc7)
 #define OPC_MOVL_Iv     (0xb8)
 #define OPC_MOVSBL	(0xbe | P_EXT)
@@ -935,6 +937,24 @@
 }
 #endif
 
+static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
+                              TCGArg c1, TCGArg c2, int const_c2,
+                              TCGArg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, 0);
+    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
+}
+
+#if TCG_TARGET_REG_BITS == 64
+static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
+                              TCGArg c1, TCGArg c2, int const_c2,
+                              TCGArg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
+    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
+}
+#endif
+
 static void tcg_out_branch(TCGContext *s, int call, tcg_target_long dest)
 {
     tcg_target_long disp = dest - (tcg_target_long)s->code_ptr - 5;
@@ -1009,8 +1029,8 @@
                                     uint8_t **label_ptr, int which)
 {
     const int addrlo = args[addrlo_idx];
-    const int r0 = tcg_target_call_iarg_regs[0];
-    const int r1 = tcg_target_call_iarg_regs[1];
+    const int r0 = TCG_REG_L0;
+    const int r1 = TCG_REG_L1;
     TCGType type = TCG_TYPE_I32;
     int rexw = 0;
 
@@ -1172,8 +1192,7 @@
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+    tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1206,14 +1225,10 @@
     tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
                  mem_index);
     /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
 #endif
 
     tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
@@ -1279,11 +1294,9 @@
                use the ADDR32 prefix.  For now, do nothing.  */
 
             if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64,
-                             tcg_target_call_iarg_regs[0], GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW,
-                            tcg_target_call_iarg_regs[0], base);
-                base = tcg_target_call_iarg_regs[0];
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
                 offset = 0;
             }
         }
@@ -1304,8 +1317,8 @@
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
-       means that the second argument reg is definitely free here.  */
-    int scratch = tcg_target_call_iarg_regs[1];
+       means that TCG_REG_L1 is definitely free here.  */
+    const int scratch = TCG_REG_L1;
 
     switch (sizeop) {
     case 0:
@@ -1378,8 +1391,7 @@
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
-    tcg_out_qemu_st_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+    tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1414,18 +1426,14 @@
     stack_adjust += 4;
 #else
     tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                tcg_target_call_iarg_regs[1], data_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+                TCG_REG_L1, data_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_L2, mem_index);
     stack_adjust = 0;
     /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
 #endif
 
     tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
@@ -1452,11 +1460,9 @@
                use the ADDR32 prefix.  For now, do nothing.  */
 
             if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64,
-                             tcg_target_call_iarg_regs[0], GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW,
-                            tcg_target_call_iarg_regs[0], base);
-                base = tcg_target_call_iarg_regs[0];
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
                 offset = 0;
             }
         }
@@ -1543,18 +1549,35 @@
         break;
 
     OP_32_64(st8):
-        tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R,
-                             args[0], args[1], args[2]);
+        if (const_args[0]) {
+            tcg_out_modrm_offset(s, OPC_MOVB_EvIz,
+                                 0, args[1], args[2]);
+            tcg_out8(s, args[0]);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R,
+                                 args[0], args[1], args[2]);
+        }
         break;
     OP_32_64(st16):
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16,
-                             args[0], args[1], args[2]);
+        if (const_args[0]) {
+            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16,
+                                 0, args[1], args[2]);
+            tcg_out16(s, args[0]);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16,
+                                 args[0], args[1], args[2]);
+        }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_st32_i64:
 #endif
     case INDEX_op_st_i32:
-        tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
+        if (const_args[0]) {
+            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, args[1], args[2]);
+            tcg_out32(s, args[0]);
+        } else {
+            tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
+        }
         break;
 
     OP_32_64(add):
@@ -1650,6 +1673,10 @@
         tcg_out_setcond32(s, args[3], args[0], args[1],
                           args[2], const_args[2]);
         break;
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond32(s, args[5], args[0], args[1],
+                          args[2], const_args[2], args[3]);
+        break;
 
     OP_32_64(bswap16):
         tcg_out_rolw_8(s, args[0]);
@@ -1758,7 +1785,13 @@
         tcg_out_ld(s, TCG_TYPE_I64, args[0], args[1], args[2]);
         break;
     case INDEX_op_st_i64:
-        tcg_out_st(s, TCG_TYPE_I64, args[0], args[1], args[2]);
+        if (const_args[0]) {
+            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW,
+                                 0, args[1], args[2]);
+            tcg_out32(s, args[0]);
+        } else {
+            tcg_out_st(s, TCG_TYPE_I64, args[0], args[1], args[2]);
+        }
         break;
     case INDEX_op_qemu_ld32s:
         tcg_out_qemu_ld(s, args, 2 | 4);
@@ -1772,6 +1805,10 @@
         tcg_out_setcond64(s, args[3], args[0], args[1],
                           args[2], const_args[2]);
         break;
+    case INDEX_op_movcond_i64:
+        tcg_out_movcond64(s, args[5], args[0], args[1],
+                          args[2], const_args[2], args[3]);
+        break;
 
     case INDEX_op_bswap64_i64:
         tcg_out_bswap64(s, args[0]);
@@ -1820,9 +1857,9 @@
     { INDEX_op_ld16u_i32, { "r", "r" } },
     { INDEX_op_ld16s_i32, { "r", "r" } },
     { INDEX_op_ld_i32, { "r", "r" } },
-    { INDEX_op_st8_i32, { "q", "r" } },
-    { INDEX_op_st16_i32, { "r", "r" } },
-    { INDEX_op_st_i32, { "r", "r" } },
+    { INDEX_op_st8_i32, { "qi", "r" } },
+    { INDEX_op_st16_i32, { "ri", "r" } },
+    { INDEX_op_st_i32, { "ri", "r" } },
 
     { INDEX_op_add_i32, { "r", "r", "ri" } },
     { INDEX_op_sub_i32, { "r", "0", "ri" } },
@@ -1856,6 +1893,7 @@
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
 
     { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
 
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
@@ -1873,10 +1911,10 @@
     { INDEX_op_ld32u_i64, { "r", "r" } },
     { INDEX_op_ld32s_i64, { "r", "r" } },
     { INDEX_op_ld_i64, { "r", "r" } },
-    { INDEX_op_st8_i64, { "r", "r" } },
-    { INDEX_op_st16_i64, { "r", "r" } },
-    { INDEX_op_st32_i64, { "r", "r" } },
-    { INDEX_op_st_i64, { "r", "r" } },
+    { INDEX_op_st8_i64, { "ri", "r" } },
+    { INDEX_op_st16_i64, { "ri", "r" } },
+    { INDEX_op_st32_i64, { "ri", "r" } },
+    { INDEX_op_st_i64, { "re", "r" } },
 
     { INDEX_op_add_i64, { "r", "0", "re" } },
     { INDEX_op_mul_i64, { "r", "0", "re" } },
@@ -1910,6 +1948,7 @@
     { INDEX_op_ext32u_i64, { "r", "r" } },
 
     { INDEX_op_deposit_i64, { "Q", "0", "Q" } },
+    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "0" } },
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
@@ -2008,15 +2047,17 @@
 #if TCG_TARGET_REG_BITS == 32
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
-    tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[1], TCG_REG_ESP,
-               (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4);
+    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+    /* jmp *tb.  */
+    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
+                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
+                         + stack_addend);
 #else
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
-#endif
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
-
     /* jmp *tb.  */
     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
+#endif
 
     /* TB epilogue */
     tb_ret_addr = s->code_ptr;
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 8be42f3..ace63ba 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -67,7 +67,11 @@
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
 #define TCG_TARGET_STACK_ALIGN 16
+#if defined(_WIN64)
+#define TCG_TARGET_CALL_STACK_OFFSET 32
+#else
 #define TCG_TARGET_CALL_STACK_OFFSET 0
+#endif
 
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
@@ -86,6 +90,12 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
+#if defined(__x86_64__) || defined(__i686__)
+/* Use cmov only if the compiler is already doing so.  */
+#define TCG_TARGET_HAS_movcond_i32      1
+#else
+#define TCG_TARGET_HAS_movcond_i32      0
+#endif
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -107,6 +117,7 @@
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      1
+#define TCG_TARGET_HAS_movcond_i64      1
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 1745038..dc9c12c 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -176,12 +176,6 @@
     TCG_REG_R8
 };
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 8;
-}
-
 /*
  * opcode formation
  */
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index c22962a..368aee4 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -133,6 +133,8 @@
 #define TCG_TARGET_HAS_rot_i64          1
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i64      0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub r1, r0, r3 */
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 74db83d..f70910a 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -68,7 +68,7 @@
 #endif
 
 /* check if we really need so many registers :P */
-static const int tcg_target_reg_alloc_order[] = {
+static const TCGReg tcg_target_reg_alloc_order[] = {
     TCG_REG_S0,
     TCG_REG_S1,
     TCG_REG_S2,
@@ -94,14 +94,14 @@
     TCG_REG_V1
 };
 
-static const int tcg_target_call_iarg_regs[4] = {
+static const TCGReg tcg_target_call_iarg_regs[4] = {
     TCG_REG_A0,
     TCG_REG_A1,
     TCG_REG_A2,
     TCG_REG_A3
 };
 
-static const int tcg_target_call_oarg_regs[2] = {
+static const TCGReg tcg_target_call_oarg_regs[2] = {
     TCG_REG_V0,
     TCG_REG_V1
 };
@@ -185,12 +185,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -278,6 +272,8 @@
 enum {
     OPC_BEQ      = 0x04 << 26,
     OPC_BNE      = 0x05 << 26,
+    OPC_BLEZ     = 0x06 << 26,
+    OPC_BGTZ     = 0x07 << 26,
     OPC_ADDIU    = 0x09 << 26,
     OPC_SLTI     = 0x0A << 26,
     OPC_SLTIU    = 0x0B << 26,
@@ -298,12 +294,16 @@
     OPC_SPECIAL  = 0x00 << 26,
     OPC_SLL      = OPC_SPECIAL | 0x00,
     OPC_SRL      = OPC_SPECIAL | 0x02,
+    OPC_ROTR     = OPC_SPECIAL | (0x01 << 21) | 0x02,
     OPC_SRA      = OPC_SPECIAL | 0x03,
     OPC_SLLV     = OPC_SPECIAL | 0x04,
     OPC_SRLV     = OPC_SPECIAL | 0x06,
+    OPC_ROTRV    = OPC_SPECIAL | (0x01 <<  6) | 0x06,
     OPC_SRAV     = OPC_SPECIAL | 0x07,
     OPC_JR       = OPC_SPECIAL | 0x08,
     OPC_JALR     = OPC_SPECIAL | 0x09,
+    OPC_MOVZ     = OPC_SPECIAL | 0x0A,
+    OPC_MOVN     = OPC_SPECIAL | 0x0B,
     OPC_MFHI     = OPC_SPECIAL | 0x10,
     OPC_MFLO     = OPC_SPECIAL | 0x12,
     OPC_MULT     = OPC_SPECIAL | 0x18,
@@ -319,7 +319,13 @@
     OPC_SLT      = OPC_SPECIAL | 0x2A,
     OPC_SLTU     = OPC_SPECIAL | 0x2B,
 
+    OPC_REGIMM   = 0x01 << 26,
+    OPC_BLTZ     = OPC_REGIMM | (0x00 << 16),
+    OPC_BGEZ     = OPC_REGIMM | (0x01 << 16),
+
     OPC_SPECIAL3 = 0x1f << 26,
+    OPC_INS      = OPC_SPECIAL3 | 0x004,
+    OPC_WSBH     = OPC_SPECIAL3 | 0x0a0,
     OPC_SEB      = OPC_SPECIAL3 | 0x420,
     OPC_SEH      = OPC_SPECIAL3 | 0x620,
 };
@@ -327,7 +333,8 @@
 /*
  * Type reg
  */
-static inline void tcg_out_opc_reg(TCGContext *s, int opc, int rd, int rs, int rt)
+static inline void tcg_out_opc_reg(TCGContext *s, int opc,
+                                   TCGReg rd, TCGReg rs, TCGReg rt)
 {
     int32_t inst;
 
@@ -341,7 +348,8 @@
 /*
  * Type immediate
  */
-static inline void tcg_out_opc_imm(TCGContext *s, int opc, int rt, int rs, int imm)
+static inline void tcg_out_opc_imm(TCGContext *s, int opc,
+                                   TCGReg rt, TCGReg rs, TCGArg imm)
 {
     int32_t inst;
 
@@ -355,7 +363,8 @@
 /*
  * Type branch
  */
-static inline void tcg_out_opc_br(TCGContext *s, int opc, int rt, int rs)
+static inline void tcg_out_opc_br(TCGContext *s, int opc,
+                                  TCGReg rt, TCGReg rs)
 {
     /* We pay attention here to not modify the branch target by reading
        the existing value and using it again. This ensure that caches and
@@ -368,7 +377,8 @@
 /*
  * Type sa
  */
-static inline void tcg_out_opc_sa(TCGContext *s, int opc, int rd, int rt, int sa)
+static inline void tcg_out_opc_sa(TCGContext *s, int opc,
+                                  TCGReg rd, TCGReg rt, TCGArg sa)
 {
     int32_t inst;
 
@@ -407,38 +417,47 @@
     }
 }
 
-static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0x00ff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, ret, ret, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap16s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_reg(s, OPC_SEH, ret, 0, ret);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
     tcg_out_opc_sa(s, OPC_SRA, ret, ret, 16);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_sa(s, OPC_ROTR, ret, ret, 16);
+#else
     /* ret and arg must be different and can't be register at */
     if (ret == arg || ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
@@ -456,9 +475,10 @@
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_ext8s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
 #ifdef _MIPS_ARCH_MIPS32R2
     tcg_out_opc_reg(s, OPC_SEB, ret, 0, arg);
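
On MIPS32R2 the byte swaps above become a single WSBH (swap the bytes within each halfword), followed by SEH for the signed 16-bit case or a 16-bit rotate for the full 32-bit swap; the non-R2 fallbacks drop their ANDI masking, apparently relying on callers to pass zero-extended inputs (note the new ANDI in the qemu_st path further down). A plain C model of the R2 equivalence; the model_* names are illustrative, not patch code:

#include <stdint.h>

/* wsbh: swap the two bytes of each 16-bit half. */
static uint32_t model_wsbh(uint32_t x)
{
    return ((x & 0x00ff00ffu) << 8) | ((x & 0xff00ff00u) >> 8);
}

/* bswap32 = wsbh followed by a 16-bit rotate (the ROTR emitted above);
   bswap16s additionally sign-extends the low halfword via SEH. */
static uint32_t model_bswap32(uint32_t x)
{
    uint32_t t = model_wsbh(x);
    return (t << 16) | (t >> 16);
}
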
@@ -468,7 +488,7 @@
 #endif
 }
 
-static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_ext16s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
 #ifdef _MIPS_ARCH_MIPS32R2
     tcg_out_opc_reg(s, OPC_SEH, ret, 0, arg);
@@ -478,8 +498,8 @@
 #endif
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int opc, int arg,
-                              int arg1, tcg_target_long arg2)
+static inline void tcg_out_ldst(TCGContext *s, int opc, TCGArg arg,
+                                TCGReg arg1, TCGArg arg2)
 {
     if (arg2 == (int16_t) arg2) {
         tcg_out_opc_imm(s, opc, arg, arg1, arg2);
@@ -502,7 +522,7 @@
     tcg_out_ldst(s, OPC_SW, arg, arg1, arg2);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
+static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val)
 {
     if (val == (int16_t)val) {
         tcg_out_opc_imm(s, OPC_ADDIU, reg, reg, val);
@@ -543,7 +563,7 @@
 #undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
 #define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
     tcg_out_movi(s, TCG_TYPE_I32, A, arg);
-DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, uint32_t arg)
+DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, TCGArg arg)
 #undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
 
 /* We don't use the macro for this one to avoid an unnecessary reg-reg
@@ -573,8 +593,8 @@
 #endif
 }
 
-static void tcg_out_brcond(TCGContext *s, TCGCond cond, int arg1,
-                           int arg2, int label_index)
+static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGArg arg1,
+                           TCGArg arg2, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
@@ -586,32 +606,48 @@
         tcg_out_opc_br(s, OPC_BNE, arg1, arg2);
         break;
     case TCG_COND_LT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
         tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
         tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_LE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
         tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
@@ -631,8 +667,9 @@
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_brcond2(TCGContext *s, TCGCond cond, int arg1,
-                            int arg2, int arg3, int arg4, int label_index)
+static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGArg arg1,
+                            TCGArg arg2, TCGArg arg3, TCGArg arg4,
+                            int label_index)
 {
     void *label_ptr;
 
@@ -694,8 +731,70 @@
     reloc_pc16(label_ptr, (tcg_target_long) s->code_ptr);
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, int ret,
-                            int arg1, int arg2)
+static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGArg c1, TCGArg c2, TCGArg v)
+{
+    switch (cond) {
+    case TCG_COND_EQ:
+        if (c1 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c2);
+        } else if (c2 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c1);
+        } else {
+            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        }
+        break;
+    case TCG_COND_NE:
+        if (c1 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c2);
+        } else if (c2 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c1);
+        } else {
+            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        }
+        break;
+    case TCG_COND_LT:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LTU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GE:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GEU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LE:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LEU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GT:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GTU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    default:
+        tcg_abort();
+        break;
+    }
+}
+
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGArg arg1, TCGArg arg2)
 {
     switch (cond) {
     case TCG_COND_EQ:
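
The new tcg_out_movcond above implements movcond_i32 with the MIPS conditional moves: the condition is materialised in $at via SLT/SLTU (or XOR for equality), and MOVN/MOVZ then overwrite the destination with the "true" value only when the condition holds. Because the destination is constrained to start out equal to the "false" value (the "0" constraint added below), the semantics reduce to the following C model; names are illustrative, not patch code:

#include <stdint.h>

/* ret must already hold the "false" value; v1 is the "true" value.
   This mirrors the SLT + MOVN pair used for TCG_COND_LT. */
static int32_t model_movcond_lt(int32_t ret /* == vfalse */, int32_t c1,
                                int32_t c2, int32_t v1)
{
    int at = (c1 < c2);        /* slt  $at, c1, c2   */
    if (at != 0) {             /* movn ret, v1, $at  */
        ret = v1;
    }
    return ret;
}
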
@@ -754,8 +853,8 @@
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_setcond2(TCGContext *s, TCGCond cond, int ret,
-                             int arg1, int arg2, int arg3, int arg4)
+static void tcg_out_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
+                             TCGArg arg1, TCGArg arg2, TCGArg arg3, TCGArg arg4)
 {
     switch (cond) {
     case TCG_COND_EQ:
@@ -842,18 +941,17 @@
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     void *label1_ptr, *label2_ptr;
     int arg_num;
-#endif
-#if TARGET_LONG_BITS == 64
-# if defined(CONFIG_SOFTMMU)
+    int mem_index, s_bits;
+    int addr_meml;
+# if TARGET_LONG_BITS == 64
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_memh;
 #endif
     data_regl = *args++;
     if (opc == 3)
@@ -861,11 +959,22 @@
     else
         data_regh = 0;
     addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
     addr_regh = *args++;
-#endif
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    addr_memh = 0;
+    addr_meml = 4;
+#  else
+    addr_memh = 4;
+    addr_meml = 0;
+#  endif
+# else
+    addr_meml = 0;
+# endif
     mem_index = *args;
     s_bits = opc & 3;
+#endif
 
     if (opc == 3) {
 #if defined(TCG_TARGET_WORDS_BIGENDIAN)
@@ -879,18 +988,6 @@
         data_reg1 = data_regl;
         data_reg2 = 0;
     }
-#if TARGET_LONG_BITS == 64
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-# else
-    addr_memh = 4;
-    addr_meml = 0;
-# endif
-#else
-    addr_meml = 0;
-#endif
-
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
@@ -1029,23 +1126,45 @@
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     uint8_t *label1_ptr, *label2_ptr;
     int arg_num;
+    int mem_index, s_bits;
+    int addr_meml;
 #endif
 #if TARGET_LONG_BITS == 64
 # if defined(CONFIG_SOFTMMU)
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_memh;
 #endif
-
     data_regl = *args++;
     if (opc == 3) {
         data_regh = *args++;
+    } else {
+        data_regh = 0;
+    }
+    addr_regl = *args++;
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
+    addr_regh = *args++;
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    addr_memh = 0;
+    addr_meml = 4;
+#  else
+    addr_memh = 4;
+    addr_meml = 0;
+#  endif
+# else
+    addr_meml = 0;
+# endif
+    mem_index = *args;
+    s_bits = opc;
+#endif
+
+    if (opc == 3) {
 #if defined(TCG_TARGET_WORDS_BIGENDIAN)
         data_reg1 = data_regh;
         data_reg2 = data_regl;
@@ -1056,23 +1175,7 @@
     } else {
         data_reg1 = data_regl;
         data_reg2 = 0;
-        data_regh = 0;
     }
-    addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
-    addr_regh = *args++;
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-# else
-    addr_memh = 4;
-    addr_meml = 0;
-# endif
-#else
-    addr_meml = 0;
-#endif
-    mem_index = *args;
-    s_bits = opc;
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
@@ -1157,7 +1260,8 @@
         break;
     case 1:
         if (TCG_NEED_BSWAP) {
-            tcg_out_bswap16(s, TCG_REG_T0, data_reg1);
+            tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_T0, data_reg1, 0xffff);
+            tcg_out_bswap16(s, TCG_REG_T0, TCG_REG_T0);
             tcg_out_opc_imm(s, OPC_SH, TCG_REG_T0, TCG_REG_A0, 0);
         } else {
             tcg_out_opc_imm(s, OPC_SH, data_reg1, TCG_REG_A0, 0);
@@ -1377,6 +1481,31 @@
             tcg_out_opc_reg(s, OPC_SRLV, args[0], args[2], args[1]);
         }
         break;
+    case INDEX_op_rotl_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], 0x20 - args[2]);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, 32);
+            tcg_out_opc_reg(s, OPC_SUBU, TCG_REG_AT, TCG_REG_AT, args[2]);
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], TCG_REG_AT, args[1]);
+        }
+        break;
+    case INDEX_op_rotr_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], args[2]);
+        } else {
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], args[2], args[1]);
+        }
+        break;
+
+    /* The bswap routines do not work on non-R2 CPUs. In that case
+       we let TCG generate the corresponding code. */
+    case INDEX_op_bswap16_i32:
+        tcg_out_bswap16(s, args[0], args[1]);
+        break;
+    case INDEX_op_bswap32_i32:
+        tcg_out_bswap32(s, args[0], args[1]);
+        break;
 
     case INDEX_op_ext8s_i32:
         tcg_out_ext8s(s, args[0], args[1]);
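
ROTR/ROTRV only rotate right, so the rotl_i32 case above is emitted as 32 - n followed by ROTRV when the amount is in a register (for a constant amount the 0x20 - args[2] is folded at translation time). A plain C model of that identity; the helper names are illustrative:

#include <stdint.h>

static uint32_t model_rotr(uint32_t x, unsigned n)
{
    n &= 31;                   /* ROTRV only looks at the low 5 bits */
    return n ? (x >> n) | (x << (32 - n)) : x;
}

static uint32_t model_rotl(uint32_t x, unsigned n)
{
    return model_rotr(x, (32 - n) & 31);   /* what SUBU + ROTRV compute */
}
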
@@ -1385,6 +1514,11 @@
         tcg_out_ext16s(s, args[0], args[1]);
         break;
 
+    case INDEX_op_deposit_i32:
+        tcg_out_opc_imm(s, OPC_INS, args[0], args[2],
+                        ((args[3] + args[4] - 1) << 11) | (args[3] << 6));
+        break;
+
     case INDEX_op_brcond_i32:
         tcg_out_brcond(s, args[2], args[0], args[1], args[3]);
         break;
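
The deposit_i32 case above maps onto the R2 INS instruction, whose immediate fields carry msb = ofs + len - 1 in bits 15..11 and lsb = ofs in bits 10..6, which is exactly what the shift expression builds. A standalone sketch of the field packing and of deposit's semantics; illustrative only, not the patch's helpers:

#include <stdint.h>

/* Immediate fields of INS as built above: msb in bits 15..11, lsb in 10..6. */
static uint32_t model_ins_fields(unsigned ofs, unsigned len)
{
    return ((ofs + len - 1u) << 11) | (ofs << 6);
}

/* What deposit computes: insert the low `len` bits of src into dst at `ofs`. */
static uint32_t model_deposit(uint32_t dst, uint32_t src,
                              unsigned ofs, unsigned len)
{
    uint32_t mask = (len < 32 ? (1u << len) - 1 : 0xffffffffu) << ofs;
    return (dst & ~mask) | ((src << ofs) & mask);
}
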
@@ -1392,6 +1526,10 @@
         tcg_out_brcond2(s, args[4], args[0], args[1], args[2], args[3], args[5]);
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2], args[3]);
+        break;
+
     case INDEX_op_setcond_i32:
         tcg_out_setcond(s, args[3], args[0], args[1], args[2]);
         break;
@@ -1453,34 +1591,42 @@
     { INDEX_op_st16_i32, { "rZ", "r" } },
     { INDEX_op_st_i32, { "rZ", "r" } },
 
-    { INDEX_op_add_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_add_i32, { "r", "rZ", "rJ" } },
     { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rZ" } },
     { INDEX_op_div_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_remu_i32, { "r", "rZ", "rZ" } },
-    { INDEX_op_sub_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_sub_i32, { "r", "rZ", "rJ" } },
 
-    { INDEX_op_and_i32, { "r", "rZ", "rIZ" } },
+    { INDEX_op_and_i32, { "r", "rZ", "rI" } },
     { INDEX_op_nor_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_not_i32, { "r", "rZ" } },
     { INDEX_op_or_i32, { "r", "rZ", "rIZ" } },
     { INDEX_op_xor_i32, { "r", "rZ", "rIZ" } },
 
-    { INDEX_op_shl_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_shr_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_sar_i32, { "r", "rZ", "riZ" } },
+    { INDEX_op_shl_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_shr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_sar_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotl_i32, { "r", "rZ", "ri" } },
+
+    { INDEX_op_bswap16_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "r", "r" } },
 
     { INDEX_op_ext8s_i32, { "r", "rZ" } },
     { INDEX_op_ext16s_i32, { "r", "rZ" } },
 
+    { INDEX_op_deposit_i32, { "r", "0", "rZ" } },
+
     { INDEX_op_brcond_i32, { "rZ", "rZ" } },
+    { INDEX_op_movcond_i32, { "r", "rZ", "rZ", "rZ", "0" } },
     { INDEX_op_setcond_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rZ", "rZ" } },
 
-    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
-    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
+    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
+    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_brcond2_i32, { "rZ", "rZ", "rZ", "rZ" } },
 
 #if TARGET_LONG_BITS == 32
@@ -1520,7 +1666,6 @@
     TCG_REG_S5,
     TCG_REG_S6,
     TCG_REG_S7,
-    TCG_REG_GP,
     TCG_REG_FP,
     TCG_REG_RA,       /* should be last for ABI compliance */
 };
@@ -1530,11 +1675,15 @@
 {
     int i, frame_size;
 
-    /* reserve some stack space */
+    /* reserve some stack space, also for TCG temps. */

     frame_size = ARRAY_SIZE(tcg_target_callee_save_regs) * 4
-                 + TCG_STATIC_CALL_ARGS_SIZE;
+                 + TCG_STATIC_CALL_ARGS_SIZE
+                 + CPU_TEMP_BUF_NLONGS * sizeof(long);
     frame_size = (frame_size + TCG_TARGET_STACK_ALIGN - 1) &
                  ~(TCG_TARGET_STACK_ALIGN - 1);
+    tcg_set_frame(s, TCG_REG_SP, ARRAY_SIZE(tcg_target_callee_save_regs) * 4
+                  + TCG_STATIC_CALL_ARGS_SIZE,
+                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 
     /* TB prologue */
     tcg_out_addi(s, TCG_REG_SP, -frame_size);
@@ -1586,8 +1735,7 @@
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_T0);   /* internal use */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);   /* return address */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);   /* stack pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP);   /* global pointer */
 
     tcg_add_target_add_op_defs(mips_op_defs);
-    tcg_set_frame(s, TCG_AREG0, offsetof(CPUArchState, temp_buf),
-                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 }
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 1c61931..d147e70 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -80,16 +80,34 @@
 #define TCG_TARGET_HAS_div_i32          1
 #define TCG_TARGET_HAS_not_i32          1
 #define TCG_TARGET_HAS_nor_i32          1
-#define TCG_TARGET_HAS_rot_i32          0
 #define TCG_TARGET_HAS_ext8s_i32        1
 #define TCG_TARGET_HAS_ext16s_i32       1
-#define TCG_TARGET_HAS_bswap32_i32      0
-#define TCG_TARGET_HAS_bswap16_i32      0
 #define TCG_TARGET_HAS_andc_i32         0
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
+
+/* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
+#if defined(_MIPS_ARCH_MIPS4) || defined(_MIPS_ARCH_MIPS32) || \
+    defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_LOONGSON2E) || \
+    defined(_MIPS_ARCH_LOONGSON2F)
+#define TCG_TARGET_HAS_movcond_i32      1
+#else
+#define TCG_TARGET_HAS_movcond_i32      0
+#endif
+
+/* optional instructions only implemented on MIPS32R2 */
+#ifdef _MIPS_ARCH_MIPS32R2
+#define TCG_TARGET_HAS_bswap16_i32      1
+#define TCG_TARGET_HAS_bswap32_i32      1
+#define TCG_TARGET_HAS_rot_i32          1
+#define TCG_TARGET_HAS_deposit_i32      1
+#else
+#define TCG_TARGET_HAS_bswap16_i32      0
+#define TCG_TARGET_HAS_bswap32_i32      0
+#define TCG_TARGET_HAS_rot_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#endif
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub  rd, zero, rt   */
diff --git a/tcg/optimize.c b/tcg/optimize.c
index fba0ed9..35532a1 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -39,8 +39,6 @@
     TCG_TEMP_UNDEF = 0,
     TCG_TEMP_CONST,
     TCG_TEMP_COPY,
-    TCG_TEMP_HAS_COPY,
-    TCG_TEMP_ANY
 } tcg_temp_state;
 
 struct tcg_temp_info {
@@ -52,39 +50,19 @@
 
 static struct tcg_temp_info temps[TCG_MAX_TEMPS];
 
-/* Reset TEMP's state to TCG_TEMP_ANY.  If TEMP was a representative of some
-   class of equivalent temp's, a new representative should be chosen in this
-   class. */
-static void reset_temp(TCGArg temp, int nb_temps, int nb_globals)
+/* Reset TEMP's state to TCG_TEMP_UNDEF.  If TEMP had only one other copy,
+   clear the copy flag from that remaining temp as well.  */
+static void reset_temp(TCGArg temp)
 {
-    int i;
-    TCGArg new_base = (TCGArg)-1;
-    if (temps[temp].state == TCG_TEMP_HAS_COPY) {
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (i >= nb_globals) {
-                temps[i].state = TCG_TEMP_HAS_COPY;
-                new_base = i;
-                break;
-            }
+    if (temps[temp].state == TCG_TEMP_COPY) {
+        if (temps[temp].prev_copy == temps[temp].next_copy) {
+            temps[temps[temp].next_copy].state = TCG_TEMP_UNDEF;
+        } else {
+            temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
+            temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
         }
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (new_base == (TCGArg)-1) {
-                temps[i].state = TCG_TEMP_ANY;
-            } else {
-                temps[i].val = new_base;
-            }
-        }
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-    } else if (temps[temp].state == TCG_TEMP_COPY) {
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-        new_base = temps[temp].val;
     }
-    temps[temp].state = TCG_TEMP_ANY;
-    if (new_base != (TCGArg)-1 && temps[new_base].next_copy == new_base) {
-        temps[new_base].state = TCG_TEMP_ANY;
-    }
+    temps[temp].state = TCG_TEMP_UNDEF;
 }
 
 static int op_bits(TCGOpcode op)
@@ -107,36 +85,83 @@
     }
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg dst,
-                            TCGArg src, int nb_temps, int nb_globals)
+static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
 {
-        reset_temp(dst, nb_temps, nb_globals);
-        assert(temps[src].state != TCG_TEMP_COPY);
-        /* Don't try to copy if one of temps is a global or either one
-           is local and another is register */
-        if (src >= nb_globals && dst >= nb_globals &&
-            tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
-            assert(temps[src].state != TCG_TEMP_CONST);
-            if (temps[src].state != TCG_TEMP_HAS_COPY) {
-                temps[src].state = TCG_TEMP_HAS_COPY;
+    TCGArg i;
+
+    /* If this is already a global, we can't do better. */
+    if (temp < s->nb_globals) {
+        return temp;
+    }
+
+    /* Search for a global first. */
+    for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+        if (i < s->nb_globals) {
+            return i;
+        }
+    }
+
+    /* If it is a temp, search for a temp local. */
+    if (!s->temps[temp].temp_local) {
+        for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+            if (s->temps[i].temp_local) {
+                return i;
+            }
+        }
+    }
+
+    /* Failing to find a better representation, return the same temp. */
+    return temp;
+}
+
+static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
+{
+    TCGArg i;
+
+    if (arg1 == arg2) {
+        return true;
+    }
+
+    if (temps[arg1].state != TCG_TEMP_COPY
+        || temps[arg2].state != TCG_TEMP_COPY) {
+        return false;
+    }
+
+    for (i = temps[arg1].next_copy ; i != arg1 ; i = temps[i].next_copy) {
+        if (i == arg2) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
+                            TCGArg dst, TCGArg src)
+{
+        reset_temp(dst);
+        assert(temps[src].state != TCG_TEMP_CONST);
+
+        if (s->temps[src].type == s->temps[dst].type) {
+            if (temps[src].state != TCG_TEMP_COPY) {
+                temps[src].state = TCG_TEMP_COPY;
                 temps[src].next_copy = src;
                 temps[src].prev_copy = src;
             }
             temps[dst].state = TCG_TEMP_COPY;
-            temps[dst].val = src;
             temps[dst].next_copy = temps[src].next_copy;
             temps[dst].prev_copy = src;
             temps[temps[dst].next_copy].prev_copy = dst;
             temps[src].next_copy = dst;
         }
+
         gen_args[0] = dst;
         gen_args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val,
-                             int nb_temps, int nb_globals)
+static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val)
 {
-        reset_temp(dst, nb_temps, nb_globals);
+        reset_temp(dst);
         temps[dst].state = TCG_TEMP_CONST;
         temps[dst].val = val;
         gen_args[0] = dst;
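
With the rewrite above, each equivalence class of copies is kept on a doubly linked circular list threaded through next_copy/prev_copy, and find_better_copy walks that ring preferring a global, then a temp local, before settling for the temp itself. A tiny standalone model of the ring walk; the struct and names are illustrative, not the TCG data structures:

/* Minimal model: temps on the same ring are copies of one another. */
struct copy_info {
    int next_copy;
    int prev_copy;
};

/* Walk the ring starting at t and return the first member below
   nb_globals, i.e. prefer a global representative when one exists. */
static int model_find_global_copy(const struct copy_info *temps,
                                  int t, int nb_globals)
{
    int i;
    if (t < nb_globals) {
        return t;
    }
    for (i = temps[t].next_copy; i != t; i = temps[i].next_copy) {
        if (i < nb_globals) {
            return i;
        }
    }
    return t;
}
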
@@ -267,58 +292,88 @@
     return res;
 }
 
+/* Return 2 if the condition can't be simplified, and the result
+   of the condition (0 or 1) if it can */
 static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
                                        TCGArg y, TCGCond c)
 {
-    switch (op_bits(op)) {
-    case 32:
-        switch (c) {
-        case TCG_COND_EQ:
-            return (uint32_t)x == (uint32_t)y;
-        case TCG_COND_NE:
-            return (uint32_t)x != (uint32_t)y;
-        case TCG_COND_LT:
-            return (int32_t)x < (int32_t)y;
-        case TCG_COND_GE:
-            return (int32_t)x >= (int32_t)y;
-        case TCG_COND_LE:
-            return (int32_t)x <= (int32_t)y;
-        case TCG_COND_GT:
-            return (int32_t)x > (int32_t)y;
-        case TCG_COND_LTU:
-            return (uint32_t)x < (uint32_t)y;
-        case TCG_COND_GEU:
-            return (uint32_t)x >= (uint32_t)y;
-        case TCG_COND_LEU:
-            return (uint32_t)x <= (uint32_t)y;
-        case TCG_COND_GTU:
-            return (uint32_t)x > (uint32_t)y;
+    if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
+        switch (op_bits(op)) {
+        case 32:
+            switch (c) {
+            case TCG_COND_EQ:
+                return (uint32_t)temps[x].val == (uint32_t)temps[y].val;
+            case TCG_COND_NE:
+                return (uint32_t)temps[x].val != (uint32_t)temps[y].val;
+            case TCG_COND_LT:
+                return (int32_t)temps[x].val < (int32_t)temps[y].val;
+            case TCG_COND_GE:
+                return (int32_t)temps[x].val >= (int32_t)temps[y].val;
+            case TCG_COND_LE:
+                return (int32_t)temps[x].val <= (int32_t)temps[y].val;
+            case TCG_COND_GT:
+                return (int32_t)temps[x].val > (int32_t)temps[y].val;
+            case TCG_COND_LTU:
+                return (uint32_t)temps[x].val < (uint32_t)temps[y].val;
+            case TCG_COND_GEU:
+                return (uint32_t)temps[x].val >= (uint32_t)temps[y].val;
+            case TCG_COND_LEU:
+                return (uint32_t)temps[x].val <= (uint32_t)temps[y].val;
+            case TCG_COND_GTU:
+                return (uint32_t)temps[x].val > (uint32_t)temps[y].val;
+            }
+            break;
+        case 64:
+            switch (c) {
+            case TCG_COND_EQ:
+                return (uint64_t)temps[x].val == (uint64_t)temps[y].val;
+            case TCG_COND_NE:
+                return (uint64_t)temps[x].val != (uint64_t)temps[y].val;
+            case TCG_COND_LT:
+                return (int64_t)temps[x].val < (int64_t)temps[y].val;
+            case TCG_COND_GE:
+                return (int64_t)temps[x].val >= (int64_t)temps[y].val;
+            case TCG_COND_LE:
+                return (int64_t)temps[x].val <= (int64_t)temps[y].val;
+            case TCG_COND_GT:
+                return (int64_t)temps[x].val > (int64_t)temps[y].val;
+            case TCG_COND_LTU:
+                return (uint64_t)temps[x].val < (uint64_t)temps[y].val;
+            case TCG_COND_GEU:
+                return (uint64_t)temps[x].val >= (uint64_t)temps[y].val;
+            case TCG_COND_LEU:
+                return (uint64_t)temps[x].val <= (uint64_t)temps[y].val;
+            case TCG_COND_GTU:
+                return (uint64_t)temps[x].val > (uint64_t)temps[y].val;
+            }
+            break;
         }
-        break;
-    case 64:
+    } else if (temps_are_copies(x, y)) {
         switch (c) {
-        case TCG_COND_EQ:
-            return (uint64_t)x == (uint64_t)y;
-        case TCG_COND_NE:
-            return (uint64_t)x != (uint64_t)y;
-        case TCG_COND_LT:
-            return (int64_t)x < (int64_t)y;
-        case TCG_COND_GE:
-            return (int64_t)x >= (int64_t)y;
-        case TCG_COND_LE:
-            return (int64_t)x <= (int64_t)y;
         case TCG_COND_GT:
-            return (int64_t)x > (int64_t)y;
         case TCG_COND_LTU:
-            return (uint64_t)x < (uint64_t)y;
-        case TCG_COND_GEU:
-            return (uint64_t)x >= (uint64_t)y;
-        case TCG_COND_LEU:
-            return (uint64_t)x <= (uint64_t)y;
+        case TCG_COND_LT:
         case TCG_COND_GTU:
-            return (uint64_t)x > (uint64_t)y;
+        case TCG_COND_NE:
+            return 0;
+        case TCG_COND_GE:
+        case TCG_COND_GEU:
+        case TCG_COND_LE:
+        case TCG_COND_LEU:
+        case TCG_COND_EQ:
+            return 1;
         }
-        break;
+    } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
+        switch (c) {
+        case TCG_COND_LTU:
+            return 0;
+        case TCG_COND_GEU:
+            return 1;
+        default:
+            return 2;
+        }
+    } else {
+        return 2;
     }
 
     fprintf(stderr,
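
do_constant_folding_cond now folds on the constant values stored in temps[], and also handles two partial cases: both operands being copies of each other, and an unsigned comparison against a constant zero; it returns 2 whenever the outcome cannot be decided at optimization time. A condensed model of that tri-state contract, showing only the LTU/GEU reasoning; illustrative, not patch code:

#include <stdint.h>
#include <stdbool.h>

/* Return 0 or 1 when decidable, 2 otherwise. */
static int model_fold_ltu(bool x_const, uint64_t x, bool y_const, uint64_t y,
                          bool same_copy)
{
    if (x_const && y_const) {
        return x < y;
    }
    if (same_copy) {
        return 0;              /* a value is never unsigned-below itself */
    }
    if (y_const && y == 0) {
        return 0;              /* nothing is unsigned-less-than zero */
    }
    return 2;                  /* leave the setcond/brcond in place */
}
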
@@ -327,7 +382,6 @@
     tcg_abort();
 }
 
-
 /* Propagate constants and copies, fold constant expressions. */
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -337,12 +391,12 @@
     const TCGOpDef *def;
     TCGArg *gen_args;
     TCGArg tmp;
+    TCGCond cond;
+
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
-       If this temp is a copy of other ones then this equivalence class'
-       representative is kept in VALS' element.
-       If this temp is neither copy nor constant then corresponding VALS'
-       element is unused. */
+       If this temp is a copy of other ones then the other copies are
+       available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
@@ -354,11 +408,18 @@
         op = gen_opc_buf[op_index];
         def = &tcg_op_defs[op];
         /* Do copy propagation */
-        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS))) {
-            assert(op != INDEX_op_call);
+        if (op == INDEX_op_call) {
+            int nb_oargs = args[0] >> 16;
+            int nb_iargs = args[0] & 0xffff;
+            for (i = nb_oargs + 1; i < nb_oargs + nb_iargs + 1; i++) {
+                if (temps[args[i]].state == TCG_TEMP_COPY) {
+                    args[i] = find_better_copy(s, args[i]);
+                }
+            }
+        } else {
             for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
                 if (temps[args[i]].state == TCG_TEMP_COPY) {
-                    args[i] = temps[args[i]].val;
+                    args[i] = find_better_copy(s, args[i]);
                 }
             }
         }
@@ -373,7 +434,10 @@
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
+            /* Prefer the constant in the second argument, and then the form
+               op a, a, b, which is better handled by non-RISC hosts. */
+            if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
+                && temps[args[2]].state != TCG_TEMP_CONST)) {
                 tmp = args[1];
                 args[1] = args[2];
                 args[2] = tmp;
@@ -397,6 +461,25 @@
                 args[3] = tcg_swap_cond(args[3]);
             }
             break;
+        CASE_OP_32_64(movcond):
+            cond = args[5];
+            if (temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[2]].state != TCG_TEMP_CONST) {
+                tmp = args[1];
+                args[1] = args[2];
+                args[2] = tmp;
+                cond = tcg_swap_cond(cond);
+            }
+            /* For movcond, we canonicalize the "false" input reg to match
+               the destination reg so that the tcg backend can implement
+               a "move if true" operation.  */
+            if (args[0] == args[3]) {
+                tmp = args[3];
+                args[3] = args[4];
+                args[4] = tmp;
+                cond = tcg_invert_cond(cond);
+            }
+            args[5] = cond;
         default:
             break;
         }
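
The movcond canonicalization above does two things: it moves a constant into the second comparison operand (swapping the condition accordingly), and it swaps the two value operands whenever the destination aliases the "true" input, inverting the condition so that backends only ever have to implement "move if true". A short sketch of the second transformation; the argument names follow the args[] layout, everything else is illustrative:

/* args[] layout of movcond: dest, c1, c2, vtrue, vfalse, cond */
struct model_movcond {
    int dest, c1, c2, vtrue, vfalse, cond;
};

static int model_invert_cond(int cond)
{
    return cond ^ 1;   /* stand-in for tcg_invert_cond(); illustrative only */
}

static void model_canonicalize(struct model_movcond *a)
{
    if (a->dest == a->vtrue) {
        int t = a->vtrue;
        a->vtrue = a->vfalse;
        a->vfalse = t;
        a->cond = model_invert_cond(a->cond);
        /* dest now aliases the "false" input, so a backend can emit
           "move vtrue into dest when cond holds" and nothing else. */
    }
}
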
@@ -411,7 +494,7 @@
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -438,14 +521,11 @@
             }
             if (temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
-                if ((temps[args[0]].state == TCG_TEMP_COPY
-                    && temps[args[0]].val == args[1])
-                    || args[0] == args[1]) {
+                if (temps_are_copies(args[0], args[1])) {
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1],
-                                    nb_temps, nb_globals);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
                 }
                 args += 3;
@@ -463,7 +543,7 @@
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -477,13 +557,12 @@
         switch (op) {
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
-            if (args[1] == args[2]) {
-                if (args[1] == args[0]) {
+            if (temps_are_copies(args[1], args[2])) {
+                if (temps_are_copies(args[0], args[1])) {
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(s, gen_args, args[0], args[1], nb_temps,
-                                    nb_globals);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
                 }
                 args += 3;
@@ -494,21 +573,34 @@
             break;
         }
 
+        /* Simplify expression for "op r, a, a => movi r, 0" cases */
+        switch (op) {
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(xor):
+            if (temps_are_copies(args[1], args[2])) {
+                gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
+                gen_args += 2;
+                args += 3;
+                continue;
+            }
+            break;
+        default:
+            break;
+        }
+
         /* Propagate constants through copy operations and do constant
            folding.  Constants will be substituted to arguments by register
            allocator where needed and possible.  Also detect copies. */
         switch (op) {
         CASE_OP_32_64(mov):
-            if ((temps[args[1]].state == TCG_TEMP_COPY
-                && temps[args[1]].val == args[0])
-                || args[0] == args[1]) {
+            if (temps_are_copies(args[0], args[1])) {
                 args += 2;
                 gen_opc_buf[op_index] = INDEX_op_nop;
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1],
-                                nb_temps, nb_globals);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                 gen_args += 2;
                 args += 2;
                 break;
@@ -520,7 +612,7 @@
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(gen_args, args[0], args[1], nb_temps, nb_globals);
+            tcg_opt_gen_movi(gen_args, args[0], args[1]);
             gen_args += 2;
             args += 2;
             break;
@@ -535,9 +627,9 @@
             if (temps[args[1]].state == TCG_TEMP_CONST) {
                 gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
             }
@@ -565,10 +657,10 @@
                 gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -576,16 +668,34 @@
             }
             args += 3;
             break;
-        CASE_OP_32_64(setcond):
+        CASE_OP_32_64(deposit):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tmp = do_constant_folding_cond(op, temps[args[1]].val,
-                                               temps[args[2]].val, args[3]);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tmp = ((1ull << args[4]) - 1);
+                tmp = (temps[args[1]].val & ~(tmp << args[3]))
+                      | ((temps[args[2]].val & tmp) << args[3]);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
+                gen_args[0] = args[0];
+                gen_args[1] = args[1];
+                gen_args[2] = args[2];
+                gen_args[3] = args[3];
+                gen_args[4] = args[4];
+                gen_args += 5;
+            }
+            args += 5;
+            break;
+        CASE_OP_32_64(setcond):
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
+            if (tmp != 2) {
+                gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                gen_args += 2;
+            } else {
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -595,10 +705,9 @@
             args += 4;
             break;
         CASE_OP_32_64(brcond):
-            if (temps[args[0]].state == TCG_TEMP_CONST
-                && temps[args[1]].state == TCG_TEMP_CONST) {
-                if (do_constant_folding_cond(op, temps[args[0]].val,
-                                             temps[args[1]].val, args[2])) {
+            tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
+            if (tmp != 2) {
+                if (tmp) {
                     memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
                     gen_opc_buf[op_index] = INDEX_op_br;
                     gen_args[0] = args[3];
@@ -608,7 +717,7 @@
                 }
             } else {
                 memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -617,15 +726,41 @@
             }
             args += 4;
             break;
+        CASE_OP_32_64(movcond):
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
+            if (tmp != 2) {
+                if (temps_are_copies(args[0], args[4-tmp])) {
+                    gen_opc_buf[op_index] = INDEX_op_nop;
+                } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
+                    gen_opc_buf[op_index] = op_to_movi(op);
+                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val);
+                    gen_args += 2;
+                } else {
+                    gen_opc_buf[op_index] = op_to_mov(op);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[4-tmp]);
+                    gen_args += 2;
+                }
+            } else {
+                reset_temp(args[0]);
+                gen_args[0] = args[0];
+                gen_args[1] = args[1];
+                gen_args[2] = args[2];
+                gen_args[3] = args[3];
+                gen_args[4] = args[4];
+                gen_args[5] = args[5];
+                gen_args += 6;
+            }
+            args += 6;
+            break;
         case INDEX_op_call:
             nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
             if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
                 for (i = 0; i < nb_globals; i++) {
-                    reset_temp(i, nb_temps, nb_globals);
+                    reset_temp(i);
                 }
             }
             for (i = 0; i < (args[0] >> 16); i++) {
-                reset_temp(args[i + 1], nb_temps, nb_globals);
+                reset_temp(args[i + 1]);
             }
             i = nb_call_args + 3;
             while (i) {
@@ -635,21 +770,17 @@
                 i--;
             }
             break;
-        case INDEX_op_set_label:
-        case INDEX_op_jmp:
-        case INDEX_op_br:
-            memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
-            for (i = 0; i < def->nb_args; i++) {
-                *gen_args = *args;
-                args++;
-                gen_args++;
-            }
-            break;
         default:
             /* Default case: we do know nothing about operation so no
-               propagation is done.  We only trash output args.  */
-            for (i = 0; i < def->nb_oargs; i++) {
-                reset_temp(args[i], nb_temps, nb_globals);
+               propagation is done.  We trash everything if the operation
+               is the end of a basic block, otherwise we only trash the
+               output args.  */
+            if (def->flags & TCG_OPF_BB_END) {
+                memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+            } else {
+                for (i = 0; i < def->nb_oargs; i++) {
+                    reset_temp(args[i]);
+                }
             }
             for (i = 0; i < def->nb_args; i++) {
                 gen_args[i] = args[i];
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 26c4b33..90c275d 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -221,12 +221,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return ARRAY_SIZE (tcg_target_call_iarg_regs);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -390,6 +384,7 @@
 #define ORC    XO31(412)
 #define EQV    XO31(284)
 #define NAND   XO31(476)
+#define ISEL   XO31( 15)
 
 #define LBZX   XO31( 87)
 #define LHZX   XO31(279)
@@ -1269,6 +1264,72 @@
         );
 }
 
+static void tcg_out_movcond (TCGContext *s, TCGCond cond,
+                             TCGArg dest,
+                             TCGArg c1, TCGArg c2,
+                             TCGArg v1, TCGArg v2,
+                             int const_c2)
+{
+    tcg_out_cmp (s, cond, c1, c2, const_c2, 7);
+
+    if (1) {
+        /* At least here, on a 7747A, bit-twiddling hacks are outperformed
+           by branchy code (the testing was not scientific). */
+        if (dest == v2) {
+            cond = tcg_invert_cond (cond);
+            v2 = v1;
+        }
+        else {
+            if (dest != v1) {
+                tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
+            }
+        }
+        /* Branch forward over one insn */
+        tcg_out32 (s, tcg_to_bc[cond] | 8);
+        tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
+    }
+    else {
+        /* isel version, "if (1)" above should be replaced once a way
+           to figure out availability of isel on the underlying
+           hardware is found */
+        int tab, bc;
+
+        switch (cond) {
+        case TCG_COND_EQ:
+            tab = TAB (dest, v1, v2);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_NE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_LTU:
+        case TCG_COND_LT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_LT;
+            break;
+        case TCG_COND_GEU:
+        case TCG_COND_GE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_LT;
+            break;
+        case TCG_COND_LEU:
+        case TCG_COND_LE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_GT;
+            break;
+        case TCG_COND_GTU:
+        case TCG_COND_GT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_GT;
+            break;
+        default:
+            tcg_abort ();
+        }
+        tcg_out32 (s, ISEL | tab | ((bc + 28) << 6));
+    }
+}
+
 static void tcg_out_brcond (TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             int label_index)
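
tcg_out_movcond above currently always takes the branchy path: preload the "true" value into the destination (or invert the condition when the destination already aliases the "false" value), then conditionally branch over a single move of the other value; the isel sequence is kept for later, once run-time detection of isel becomes available. A plain C model of the branchy form; illustrative only:

#include <stdint.h>

/* dest = cond ? v1 : v2, expressed the way the emitted code behaves. */
static int32_t model_ppc_movcond(int cond_holds, int32_t v1, int32_t v2)
{
    int32_t dest = v1;          /* mov dest, v1                             */
    if (!cond_holds) {          /* bc skips the next insn when cond holds   */
        dest = v2;              /* mov dest, v2                             */
    }
    return dest;
}
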
@@ -1826,6 +1887,13 @@
             );
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond (s, args[5], args[0],
+                         args[1], args[2],
+                         args[3], args[4],
+                         const_args[2]);
+        break;
+
     default:
         tcg_dump_ops (s);
         tcg_abort ();
@@ -1922,6 +1990,7 @@
     { INDEX_op_ext16u_i32, { "r", "r" } },
 
     { INDEX_op_deposit_i32, { "r", "0", "r" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "r" } },
 
     { -1 },
 };
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 2f37fd2..3259d89 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -92,6 +92,7 @@
 #define TCG_TARGET_HAS_nand_i32         1
 #define TCG_TARGET_HAS_nor_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
+#define TCG_TARGET_HAS_movcond_i32      1
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 337cd41..19944bc 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -208,12 +208,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static int tcg_target_get_call_iarg_regs_count (int flags)
-{
-    return ARRAY_SIZE (tcg_target_call_iarg_regs);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint (TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 97eec08..57569e8 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -83,6 +83,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rot_i64          0
@@ -103,6 +104,7 @@
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index aac11d9..3b90605 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -356,11 +356,6 @@
     }
 }
 
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return sizeof(tcg_target_call_iarg_regs) / sizeof(int);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 4f7dfab..ed55c33 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -63,6 +63,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -84,6 +85,7 @@
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #define TCG_TARGET_HAS_GUEST_BASE
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index baed3b4..8fd7f86 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -59,7 +59,15 @@
 };
 #endif
 
-#define ARG_OFFSET 1
+/* Define some temporary registers.  T2 is used for constant generation.  */
+#define TCG_REG_T1  TCG_REG_G1
+#define TCG_REG_T2  TCG_REG_O7
+
+#ifdef CONFIG_USE_GUEST_BASE
+# define TCG_GUEST_BASE_REG TCG_REG_I5
+#else
+# define TCG_GUEST_BASE_REG TCG_REG_G0
+#endif
 
 static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_L0,
@@ -70,11 +78,25 @@
     TCG_REG_L5,
     TCG_REG_L6,
     TCG_REG_L7,
+
     TCG_REG_I0,
     TCG_REG_I1,
     TCG_REG_I2,
     TCG_REG_I3,
     TCG_REG_I4,
+    TCG_REG_I5,
+
+    TCG_REG_G2,
+    TCG_REG_G3,
+    TCG_REG_G4,
+    TCG_REG_G5,
+
+    TCG_REG_O0,
+    TCG_REG_O1,
+    TCG_REG_O2,
+    TCG_REG_O3,
+    TCG_REG_O4,
+    TCG_REG_O5,
 };
 
 static const int tcg_target_call_iarg_regs[6] = {
@@ -133,12 +155,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 6;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -157,7 +173,6 @@
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_O0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_O1);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_O2);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O3);
         break;
     case 'I':
         ct->ct |= TCG_CT_CONST_S11;
@@ -236,7 +251,7 @@
 #define ARITH_XOR  (INSN_OP(2) | INSN_OP3(0x03))
 #define ARITH_SUB  (INSN_OP(2) | INSN_OP3(0x04))
 #define ARITH_SUBCC (INSN_OP(2) | INSN_OP3(0x14))
-#define ARITH_ADDX (INSN_OP(2) | INSN_OP3(0x10))
+#define ARITH_ADDX (INSN_OP(2) | INSN_OP3(0x08))
 #define ARITH_SUBX (INSN_OP(2) | INSN_OP3(0x0c))
 #define ARITH_UMUL (INSN_OP(2) | INSN_OP3(0x0a))
 #define ARITH_UDIV (INSN_OP(2) | INSN_OP3(0x0e))
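
The one-liner above corrects the op3 field of ARITH_ADDX: in the SPARC format-3 encoding, addx (add with carry) is op3 0x08, while 0x10 encodes addcc, so the old value would have set the condition codes instead of consuming the carry. A standalone sketch of the encoding, assuming only the standard format-3 field layout; the helper is illustrative and does not use the file's INSN_* macros:

#include <stdint.h>

/* Format 3: op=2 | rd | op3 | rs1 | i=0 | rs2.  op3 0x08 selects addx. */
static uint32_t model_addx_insn(unsigned rd, unsigned rs1, unsigned rs2)
{
    return (2u << 30) | (rd << 25) | (0x08u << 19) | (rs1 << 14) | rs2;
}
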
@@ -288,6 +303,16 @@
 #define ASI_PRIMARY_LITTLE 0x88
 #endif
 
+#define LDUH_LE    (LDUHA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDSH_LE    (LDSHA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDUW_LE    (LDUWA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDSW_LE    (LDSWA | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define LDX_LE     (LDXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+
+#define STH_LE     (STHA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define STW_LE     (STWA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+#define STX_LE     (STXA  | INSN_ASI(ASI_PRIMARY_LITTLE))
+
 static inline void tcg_out_arith(TCGContext *s, int rd, int rs1, int rs2,
                                  int op)
 {
@@ -353,71 +378,50 @@
         tcg_out_sethi(s, ret, ~arg);
         tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR);
     } else {
-        tcg_out_movi_imm32(s, TCG_REG_I4, arg >> (TCG_TARGET_REG_BITS / 2));
-        tcg_out_arithi(s, TCG_REG_I4, TCG_REG_I4, 32, SHIFT_SLLX);
-        tcg_out_movi_imm32(s, ret, arg);
-        tcg_out_arith(s, ret, ret, TCG_REG_I4, ARITH_OR);
+        tcg_out_movi_imm32(s, ret, arg >> (TCG_TARGET_REG_BITS / 2));
+        tcg_out_arithi(s, ret, ret, 32, SHIFT_SLLX);
+        tcg_out_movi_imm32(s, TCG_REG_T2, arg);
+        tcg_out_arith(s, ret, ret, TCG_REG_T2, ARITH_OR);
     }
 }
 
-static inline void tcg_out_ld_raw(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ldst_rr(TCGContext *s, int data, int a1,
+                                   int a2, int op)
 {
-    tcg_out_sethi(s, ret, arg);
-    tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
-              INSN_IMM13(arg & 0x3ff));
+    tcg_out32(s, op | INSN_RD(data) | INSN_RS1(a1) | INSN_RS2(a2));
 }
 
-static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
-                                  tcg_target_long arg)
+static inline void tcg_out_ldst(TCGContext *s, int ret, int addr,
+                                int offset, int op)
 {
-    if (!check_fit_tl(arg, 10))
-        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ffULL);
-    if (TCG_TARGET_REG_BITS == 64) {
-        tcg_out32(s, LDX | INSN_RD(ret) | INSN_RS1(ret) |
-                  INSN_IMM13(arg & 0x3ff));
-    } else {
-        tcg_out32(s, LDUW | INSN_RD(ret) | INSN_RS1(ret) |
-                  INSN_IMM13(arg & 0x3ff));
-    }
-}
-
-static inline void tcg_out_ldst(TCGContext *s, int ret, int addr, int offset, int op)
-{
-    if (check_fit_tl(offset, 13))
+    if (check_fit_tl(offset, 13)) {
         tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(addr) |
                   INSN_IMM13(offset));
-    else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
-        tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
-                  INSN_RS2(addr));
+    } else {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, offset);
+        tcg_out_ldst_rr(s, ret, addr, TCG_REG_T1, op);
     }
 }
 
-static inline void tcg_out_ldst_asi(TCGContext *s, int ret, int addr,
-                                    int offset, int op, int asi)
-{
-    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, offset);
-    tcg_out32(s, op | INSN_RD(ret) | INSN_RS1(TCG_REG_I5) |
-              INSN_ASI(asi) | INSN_RS2(addr));
-}
-
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                               TCGReg arg1, tcg_target_long arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst(s, ret, arg1, arg2, LDUW);
-    else
-        tcg_out_ldst(s, ret, arg1, arg2, LDX);
+    tcg_out_ldst(s, ret, arg1, arg2, (type == TCG_TYPE_I32 ? LDUW : LDX));
 }
 
 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                               TCGReg arg1, tcg_target_long arg2)
 {
-    if (type == TCG_TYPE_I32)
-        tcg_out_ldst(s, arg, arg1, arg2, STW);
-    else
-        tcg_out_ldst(s, arg, arg1, arg2, STX);
+    tcg_out_ldst(s, arg, arg1, arg2, (type == TCG_TYPE_I32 ? STW : STX));
+}
+
+static inline void tcg_out_ld_ptr(TCGContext *s, int ret,
+                                  tcg_target_long arg)
+{
+    if (!check_fit_tl(arg, 10)) {
+        tcg_out_movi(s, TCG_TYPE_PTR, ret, arg & ~0x3ff);
+    }
+    tcg_out_ld(s, TCG_TYPE_PTR, ret, ret, arg & 0x3ff);
 }
 
 static inline void tcg_out_sety(TCGContext *s, int rs)
@@ -436,20 +440,21 @@
         if (check_fit_tl(val, 13))
             tcg_out_arithi(s, reg, reg, val, ARITH_ADD);
         else {
-            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_I5, val);
-            tcg_out_arith(s, reg, reg, TCG_REG_I5, ARITH_ADD);
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, val);
+            tcg_out_arith(s, reg, reg, TCG_REG_T1, ARITH_ADD);
         }
     }
 }
 
-static inline void tcg_out_andi(TCGContext *s, int reg, tcg_target_long val)
+static inline void tcg_out_andi(TCGContext *s, int rd, int rs,
+                                tcg_target_long val)
 {
     if (val != 0) {
         if (check_fit_tl(val, 13))
-            tcg_out_arithi(s, reg, reg, val, ARITH_AND);
+            tcg_out_arithi(s, rd, rs, val, ARITH_AND);
         else {
-            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_I5, val);
-            tcg_out_arith(s, reg, reg, TCG_REG_I5, ARITH_AND);
+            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T1, val);
+            tcg_out_arith(s, rd, rs, TCG_REG_T1, ARITH_AND);
         }
     }
 }
@@ -461,8 +466,8 @@
     if (uns) {
         tcg_out_sety(s, TCG_REG_G0);
     } else {
-        tcg_out_arithi(s, TCG_REG_I5, rs1, 31, SHIFT_SRA);
-        tcg_out_sety(s, TCG_REG_I5);
+        tcg_out_arithi(s, TCG_REG_T1, rs1, 31, SHIFT_SRA);
+        tcg_out_sety(s, TCG_REG_T1);
     }
 
     tcg_out_arithc(s, rd, rs1, val2, val2const,
@@ -477,30 +482,33 @@
 static void tcg_out_branch_i32(TCGContext *s, int opc, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
+    uint32_t off22;
 
     if (l->has_value) {
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2)
-                      | INSN_OFF22(l->u.value - (unsigned long)s->code_ptr)));
+        off22 = INSN_OFF22(l->u.value - (unsigned long)s->code_ptr);
     } else {
+        /* Make sure to preserve destinations during retranslation.  */
+        off22 = *(uint32_t *)s->code_ptr & INSN_OFF22(-1);
         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP22, label_index, 0);
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2) | 0));
     }
+    tcg_out32(s, INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x2) | off22);
 }
 
 #if TCG_TARGET_REG_BITS == 64
 static void tcg_out_branch_i64(TCGContext *s, int opc, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
+    uint32_t off19;
 
     if (l->has_value) {
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
-                      (0x5 << 19) |
-                      INSN_OFF19(l->u.value - (unsigned long)s->code_ptr)));
+        off19 = INSN_OFF19(l->u.value - (unsigned long)s->code_ptr);
     } else {
+        /* Make sure to preserve destinations during retranslation.  */
+        off19 = *(uint32_t *)s->code_ptr & INSN_OFF19(-1);
         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP19, label_index, 0);
-        tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
-                      (0x5 << 19) | 0));
     }
+    tcg_out32(s, (INSN_OP(0) | INSN_COND(opc, 0) | INSN_OP2(0x1) |
+                  (0x5 << 19) | off19));
 }
 #endif
 
@@ -608,8 +616,8 @@
     case TCG_COND_GTU:
     case TCG_COND_GEU:
         if (c2const && c2 != 0) {
-            tcg_out_movi_imm13(s, TCG_REG_I5, c2);
-            c2 = TCG_REG_I5;
+            tcg_out_movi_imm13(s, TCG_REG_T1, c2);
+            c2 = TCG_REG_T1;
         }
         t = c1, c1 = c2, c2 = t, c2const = 0;
         cond = tcg_swap_cond(cond);
@@ -621,18 +629,10 @@
 
     default:
         tcg_out_cmp(s, c1, c2, c2const);
-#if defined(__sparc_v9__) || defined(__sparc_v8plus__)
         tcg_out_movi_imm13(s, ret, 0);
-        tcg_out32 (s, ARITH_MOVCC | INSN_RD(ret)
-                   | INSN_RS1(tcg_cond_to_bcond[cond])
-                   | MOVCC_ICC | INSN_IMM11(1));
-#else
-        t = gen_new_label();
-        tcg_out_branch_i32(s, INSN_COND(tcg_cond_to_bcond[cond], 1), t);
-        tcg_out_movi_imm13(s, ret, 1);
-        tcg_out_movi_imm13(s, ret, 0);
-        tcg_out_label(s, t, s->code_ptr);
-#endif
+        tcg_out32(s, ARITH_MOVCC | INSN_RD(ret)
+                  | INSN_RS1(tcg_cond_to_bcond[cond])
+                  | MOVCC_ICC | INSN_IMM11(1));
         return;
     }
 
@@ -664,15 +664,15 @@
 
     switch (cond) {
     case TCG_COND_EQ:
-        tcg_out_setcond_i32(s, TCG_COND_EQ, TCG_REG_I5, al, bl, blconst);
+        tcg_out_setcond_i32(s, TCG_COND_EQ, TCG_REG_T1, al, bl, blconst);
         tcg_out_setcond_i32(s, TCG_COND_EQ, ret, ah, bh, bhconst);
-        tcg_out_arith(s, ret, ret, TCG_REG_I5, ARITH_AND);
+        tcg_out_arith(s, ret, ret, TCG_REG_T1, ARITH_AND);
         break;
 
     case TCG_COND_NE:
-        tcg_out_setcond_i32(s, TCG_COND_NE, TCG_REG_I5, al, al, blconst);
+        tcg_out_setcond_i32(s, TCG_COND_NE, TCG_REG_T1, al, al, blconst);
         tcg_out_setcond_i32(s, TCG_COND_NE, ret, ah, bh, bhconst);
-        tcg_out_arith(s, ret, ret, TCG_REG_I5, ARITH_OR);
+        tcg_out_arith(s, ret, ret, TCG_REG_T1, ARITH_OR);
         break;
 
     default:
@@ -695,14 +695,36 @@
 /* Generate global QEMU prologue and epilogue code */
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
-    tcg_set_frame(s, TCG_REG_I6, TCG_TARGET_CALL_STACK_OFFSET,
-                  CPU_TEMP_BUF_NLONGS * (int)sizeof(long));
+    int tmp_buf_size, frame_size;
+
+    /* The TCG temp buffer is at the top of the frame, immediately
+       below the frame pointer.  */
+    tmp_buf_size = CPU_TEMP_BUF_NLONGS * (int)sizeof(long);
+    tcg_set_frame(s, TCG_REG_I6, TCG_TARGET_STACK_BIAS - tmp_buf_size,
+                  tmp_buf_size);
+
+    /* TCG_TARGET_CALL_STACK_OFFSET includes the stack bias, but is
+       otherwise the minimal frame usable by callees.  */
+    frame_size = TCG_TARGET_CALL_STACK_OFFSET - TCG_TARGET_STACK_BIAS;
+    frame_size += TCG_STATIC_CALL_ARGS_SIZE + tmp_buf_size;
+    frame_size += TCG_TARGET_STACK_ALIGN - 1;
+    frame_size &= -TCG_TARGET_STACK_ALIGN;
     tcg_out32(s, SAVE | INSN_RD(TCG_REG_O6) | INSN_RS1(TCG_REG_O6) |
-              INSN_IMM13(-(TCG_TARGET_STACK_MINFRAME +
-                           CPU_TEMP_BUF_NLONGS * (int)sizeof(long))));
+              INSN_IMM13(-frame_size));
+
+#ifdef CONFIG_USE_GUEST_BASE
+    if (GUEST_BASE != 0) {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, GUEST_BASE);
+        tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
+    }
+#endif
+
     tcg_out32(s, JMPL | INSN_RD(TCG_REG_G0) | INSN_RS1(TCG_REG_I1) |
               INSN_RS2(TCG_REG_G0));
-    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_I0);
+    /* delay slot */
+    tcg_out_nop(s);
+
+    /* No epilogue required.  We issue ret + restore directly in the TB.  */
 }
 
 #if defined(CONFIG_SOFTMMU)
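
The prologue now derives the SAVE immediate from the ABI constants introduced in tcg-target.h later in this series, instead of the precomputed TCG_TARGET_STACK_MINFRAME, and places the TCG temp buffer immediately below the (biased) frame pointer. A rough worked example for a 64-bit host; TCG_STATIC_CALL_ARGS_SIZE and CPU_TEMP_BUF_NLONGS live in tcg.c/tcg.h and are assumed here to be 128 and 128:

    /* Worked example of the SAVE immediate on a 64-bit host (sketch). */
    static int sparc64_tcg_frame_size(void)
    {
        int bias    = 2047;                 /* TCG_TARGET_STACK_BIAS */
        int offset  = 128 + 6 * 8 + bias;   /* TCG_TARGET_CALL_STACK_OFFSET */
        int tmp_buf = 128 * 8;              /* CPU_TEMP_BUF_NLONGS * sizeof(long), assumed */
        int frame;

        frame  = offset - bias;             /* 176: register save area + arg slots */
        frame += 128;                       /* TCG_STATIC_CALL_ARGS_SIZE, assumed */
        frame += tmp_buf;                   /* 1024: temp buffer below the frame pointer */
        frame  = (frame + 16 - 1) & -16;    /* round up to TCG_TARGET_STACK_ALIGN */
        return frame;                       /* 1328 -> SAVE %sp, -1328, %sp */
    }
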
@@ -726,418 +748,309 @@
     helper_stl_mmu,
     helper_stq_mmu,
 };
-#endif
 
-#if TARGET_LONG_BITS == 32
-#define TARGET_LD_OP LDUW
-#else
-#define TARGET_LD_OP LDX
-#endif
+/* Perform the TLB load and compare.
 
-#if defined(CONFIG_SOFTMMU)
-#if HOST_LONG_BITS == 32
-#define TARGET_ADDEND_LD_OP LDUW
-#else
-#define TARGET_ADDEND_LD_OP LDX
-#endif
-#endif
+   Inputs:
+   ADDRLO_IDX contains the index into ARGS of the low part of the
+   address; the high part of the address is at ADDR_LOW_IDX+1.
 
-#ifdef __arch64__
-#define HOST_LD_OP LDX
-#define HOST_ST_OP STX
-#define HOST_SLL_OP SHIFT_SLLX
-#define HOST_SRA_OP SHIFT_SRAX
-#else
-#define HOST_LD_OP LDUW
-#define HOST_ST_OP STW
-#define HOST_SLL_OP SHIFT_SLL
-#define HOST_SRA_OP SHIFT_SRA
-#endif
+   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
-                            int opc)
+   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
+   This should be offsetof addr_read or addr_write.
+
+   The result of the TLB comparison is in %[ix]cc.  The sanitized address
+   is in the returned register, maybe %o0.  The TLB addend is in %o1.  */
+
+static int tcg_out_tlb_load(TCGContext *s, int addrlo_idx, int mem_index,
+                            int s_bits, const TCGArg *args, int which)
 {
-    int addr_reg, data_reg, arg0, arg1, arg2, mem_index, s_bits;
-#if defined(CONFIG_SOFTMMU)
-    uint32_t *label1_ptr, *label2_ptr;
-#endif
+    const int addrlo = args[addrlo_idx];
+    const int r0 = TCG_REG_O0;
+    const int r1 = TCG_REG_O1;
+    const int r2 = TCG_REG_O2;
+    int addr = addrlo;
+    int tlb_ofs;
 
-    data_reg = *args++;
-    addr_reg = *args++;
-    mem_index = *args;
-    s_bits = opc & 3;
+    if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) {
+        /* Assemble the 64-bit address in R0.  */
+        tcg_out_arithi(s, r0, addrlo, 0, SHIFT_SRL);
+        tcg_out_arithi(s, r1, args[addrlo_idx + 1], 32, SHIFT_SLLX);
+        tcg_out_arith(s, r0, r0, r1, ARITH_OR);
+    }
 
-    arg0 = TCG_REG_O0;
-    arg1 = TCG_REG_O1;
-    arg2 = TCG_REG_O2;
+    /* Shift the page number down to tlb-entry.  */
+    tcg_out_arithi(s, r1, addrlo,
+                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS, SHIFT_SRL);
 
-#if defined(CONFIG_SOFTMMU)
-    /* srl addr_reg, x, arg1 */
-    tcg_out_arithi(s, arg1, addr_reg, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS,
-                   SHIFT_SRL);
-    /* and addr_reg, x, arg0 */
-    tcg_out_arithi(s, arg0, addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
-                   ARITH_AND);
+    /* Mask out the page offset, except for the required alignment.  */
+    tcg_out_andi(s, r0, addr, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
 
-    /* and arg1, x, arg1 */
-    tcg_out_andi(s, arg1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+    /* Compute tlb index, modulo tlb size.  */
+    tcg_out_andi(s, r1, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
 
-    /* add arg1, x, arg1 */
-    tcg_out_addi(s, arg1, offsetof(CPUArchState,
-                                   tlb_table[mem_index][0].addr_read));
+    /* Relative to the current ENV.  */
+    tcg_out_arith(s, r1, TCG_AREG0, r1, ARITH_ADD);
 
-    /* add env, arg1, arg1 */
-    tcg_out_arith(s, arg1, TCG_AREG0, arg1, ARITH_ADD);
+    /* Find a base address that can load both tlb comparator and addend.  */
+    tlb_ofs = offsetof(CPUArchState, tlb_table[mem_index][0]);
+    if (!check_fit_tl(tlb_ofs + sizeof(CPUTLBEntry), 13)) {
+        tcg_out_addi(s, r1, tlb_ofs);
+        tlb_ofs = 0;
+    }
 
-    /* ld [arg1], arg2 */
-    tcg_out32(s, TARGET_LD_OP | INSN_RD(arg2) | INSN_RS1(arg1) |
-              INSN_RS2(TCG_REG_G0));
+    /* Load the tlb comparator and the addend.  */
+    tcg_out_ld(s, TCG_TYPE_TL, r2, r1, tlb_ofs + which);
+    tcg_out_ld(s, TCG_TYPE_PTR, r1, r1, tlb_ofs+offsetof(CPUTLBEntry, addend));
 
     /* subcc arg0, arg2, %g0 */
-    tcg_out_arith(s, TCG_REG_G0, arg0, arg2, ARITH_SUBCC);
+    tcg_out_cmp(s, r0, r2, 0);
 
-    /* will become:
-       be label1
-        or
-       be,pt %xcc label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, 0);
+    /* If the guest address must be zero-extended, do so now.  */
+    if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
+        tcg_out_arithi(s, r0, addrlo, 0, SHIFT_SRL);
+        return r0;
+    }
+    return addrlo;
+}
+#endif /* CONFIG_SOFTMMU */
 
-    /* mov (delay slot) */
-    tcg_out_mov(s, TCG_TYPE_PTR, arg0, addr_reg);
+static const int qemu_ld_opc[8] = {
+#ifdef TARGET_WORDS_BIGENDIAN
+    LDUB, LDUH, LDUW, LDX, LDSB, LDSH, LDSW, LDX
+#else
+    LDUB, LDUH_LE, LDUW_LE, LDX_LE, LDSB, LDSH_LE, LDSW_LE, LDX_LE
+#endif
+};
 
-    /* mov */
-    tcg_out_movi(s, TCG_TYPE_I32, arg1, mem_index);
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
+static const int qemu_st_opc[4] = {
+#ifdef TARGET_WORDS_BIGENDIAN
+    STB, STH, STW, STX
+#else
+    STB, STH_LE, STW_LE, STX_LE
+#endif
+};
 
-    /* XXX: move that code at the end of the TB */
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int sizeop)
+{
+    int addrlo_idx = 1, datalo, datahi, addr_reg;
+#if defined(CONFIG_SOFTMMU)
+    int memi_idx, memi, s_bits, n;
+    uint32_t *label_ptr[2];
+#endif
+
+    datahi = datalo = args[0];
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        datahi = args[1];
+        addrlo_idx = 2;
+    }
+
+#if defined(CONFIG_SOFTMMU)
+    memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
+    memi = args[memi_idx];
+    s_bits = sizeop & 3;
+
+    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, s_bits, args,
+                                offsetof(CPUTLBEntry, addr_read));
+
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        int reg64;
+
+        /* bne,pn %[xi]cc, label0 */
+        label_ptr[0] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_NE, 0) | INSN_OP2(0x1)
+                      | ((TARGET_LONG_BITS == 64) << 21)));
+
+        /* TLB Hit.  */
+        /* Load all 64-bits into an O/G register.  */
+        reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
+        tcg_out_ldst_rr(s, reg64, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
+
+        /* Move the two 32-bit pieces into the destination registers.  */
+        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
+        if (reg64 != datalo) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
+        }
+
+        /* b,a,pt label1 */
+        label_ptr[1] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x1)
+                      | (1 << 29) | (1 << 19)));
+    } else {
+        /* The fast path is exactly one insn.  Thus we can perform the
+           entire TLB Hit in the (annulled) delay slot of the branch
+           over the TLB Miss case.  */
+
+        /* beq,a,pt %[xi]cc, label0 */
+        label_ptr[0] = NULL;
+        label_ptr[1] = (uint32_t *)s->code_ptr;
+        tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
+                      | ((TARGET_LONG_BITS == 64) << 21)
+                      | (1 << 29) | (1 << 19)));
+        /* delay slot */
+        tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_ld_opc[sizeop]);
+    }
+
+    /* TLB Miss.  */
+
+    if (label_ptr[0]) {
+        *label_ptr[0] |= INSN_OFF19((unsigned long)s->code_ptr -
+                                    (unsigned long)label_ptr[0]);
+    }
+    n = 0;
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
+                    args[addrlo_idx + 1]);
+    }
+    tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
+                args[addrlo_idx]);
+
     /* qemu_ld_helper[s_bits](arg0, arg1) */
     tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_ld_helpers[s_bits]
                            - (tcg_target_ulong)s->code_ptr) >> 2)
                          & 0x3fffffff));
-    /* Store AREG0 in stack to avoid ugly glibc bugs that mangle
-       global registers */
-    // delay slot
-    tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                 TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                 sizeof(long), HOST_ST_OP);
-    tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                 TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                 sizeof(long), HOST_LD_OP);
+    /* delay slot */
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[n], memi);
 
-    /* data_reg = sign_extend(arg0) */
-    switch(opc) {
+    n = tcg_target_call_oarg_regs[0];
+    /* datalo = sign_extend(arg0) */
+    switch (sizeop) {
     case 0 | 4:
-        /* sll arg0, 24/56, data_reg */
-        tcg_out_arithi(s, data_reg, arg0, (int)sizeof(tcg_target_long) * 8 - 8,
-                       HOST_SLL_OP);
-        /* sra data_reg, 24/56, data_reg */
-        tcg_out_arithi(s, data_reg, data_reg,
-                       (int)sizeof(tcg_target_long) * 8 - 8, HOST_SRA_OP);
+        /* Recall that SRA sign extends from bit 31 through bit 63.  */
+        tcg_out_arithi(s, datalo, n, 24, SHIFT_SLL);
+        tcg_out_arithi(s, datalo, datalo, 24, SHIFT_SRA);
         break;
     case 1 | 4:
-        /* sll arg0, 16/48, data_reg */
-        tcg_out_arithi(s, data_reg, arg0,
-                       (int)sizeof(tcg_target_long) * 8 - 16, HOST_SLL_OP);
-        /* sra data_reg, 16/48, data_reg */
-        tcg_out_arithi(s, data_reg, data_reg,
-                       (int)sizeof(tcg_target_long) * 8 - 16, HOST_SRA_OP);
+        tcg_out_arithi(s, datalo, n, 16, SHIFT_SLL);
+        tcg_out_arithi(s, datalo, datalo, 16, SHIFT_SRA);
         break;
     case 2 | 4:
-        /* sll arg0, 32, data_reg */
-        tcg_out_arithi(s, data_reg, arg0, 32, HOST_SLL_OP);
-        /* sra data_reg, 32, data_reg */
-        tcg_out_arithi(s, data_reg, data_reg, 32, HOST_SRA_OP);
+        tcg_out_arithi(s, datalo, n, 0, SHIFT_SRA);
         break;
+    case 3:
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_out_mov(s, TCG_TYPE_REG, datahi, n);
+            tcg_out_mov(s, TCG_TYPE_REG, datalo, n + 1);
+            break;
+        }
+        /* FALLTHRU */
     case 0:
     case 1:
     case 2:
-    case 3:
     default:
         /* mov */
-        tcg_out_mov(s, TCG_TYPE_REG, data_reg, arg0);
+        tcg_out_mov(s, TCG_TYPE_REG, datalo, n);
         break;
     }
 
-    /* will become:
-       ba label2 */
-    label2_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, 0);
-
-    /* nop (delay slot */
-    tcg_out_nop(s);
-
-    /* label1: */
-#if TARGET_LONG_BITS == 32
-    /* be label1 */
-    *label1_ptr = (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x2) |
-                   INSN_OFF22((unsigned long)s->code_ptr -
-                              (unsigned long)label1_ptr));
+    *label_ptr[1] |= INSN_OFF19((unsigned long)s->code_ptr -
+                                (unsigned long)label_ptr[1]);
 #else
-    /* be,pt %xcc label1 */
-    *label1_ptr = (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1) |
-                   (0x5 << 19) | INSN_OFF19((unsigned long)s->code_ptr -
-                              (unsigned long)label1_ptr));
-#endif
-
-    /* ld [arg1 + x], arg1 */
-    tcg_out_ldst(s, arg1, arg1, offsetof(CPUTLBEntry, addend) -
-                 offsetof(CPUTLBEntry, addr_read), TARGET_ADDEND_LD_OP);
-
-#if TARGET_LONG_BITS == 32
-    /* and addr_reg, x, arg0 */
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_I5, 0xffffffff);
-    tcg_out_arith(s, arg0, addr_reg, TCG_REG_I5, ARITH_AND);
-    /* add arg0, arg1, arg0 */
-    tcg_out_arith(s, arg0, arg0, arg1, ARITH_ADD);
-#else
-    /* add addr_reg, arg1, arg0 */
-    tcg_out_arith(s, arg0, addr_reg, arg1, ARITH_ADD);
-#endif
-
-#else
-    arg0 = addr_reg;
-#endif
-
-    switch(opc) {
-    case 0:
-        /* ldub [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDUB);
-        break;
-    case 0 | 4:
-        /* ldsb [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDSB);
-        break;
-    case 1:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* lduh [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDUH);
-#else
-        /* lduha [arg0] ASI_PRIMARY_LITTLE, data_reg */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, LDUHA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 1 | 4:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* ldsh [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDSH);
-#else
-        /* ldsha [arg0] ASI_PRIMARY_LITTLE, data_reg */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, LDSHA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 2:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* lduw [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDUW);
-#else
-        /* lduwa [arg0] ASI_PRIMARY_LITTLE, data_reg */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, LDUWA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 2 | 4:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* ldsw [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDSW);
-#else
-        /* ldswa [arg0] ASI_PRIMARY_LITTLE, data_reg */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, LDSWA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 3:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* ldx [arg0], data_reg */
-        tcg_out_ldst(s, data_reg, arg0, 0, LDX);
-#else
-        /* ldxa [arg0] ASI_PRIMARY_LITTLE, data_reg */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, LDXA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    default:
-        tcg_abort();
+    addr_reg = args[addrlo_idx];
+    if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
+        tcg_out_arithi(s, TCG_REG_T1, addr_reg, 0, SHIFT_SRL);
+        addr_reg = TCG_REG_T1;
     }
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        int reg64 = (datalo < 16 ? datalo : TCG_REG_O0);
 
-#if defined(CONFIG_SOFTMMU)
-    /* label2: */
-    *label2_ptr = (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x2) |
-                   INSN_OFF22((unsigned long)s->code_ptr -
-                              (unsigned long)label2_ptr));
-#endif
+        tcg_out_ldst_rr(s, reg64, addr_reg,
+                        (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+                        qemu_ld_opc[sizeop]);
+
+        tcg_out_arithi(s, datahi, reg64, 32, SHIFT_SRLX);
+        if (reg64 != datalo) {
+            tcg_out_mov(s, TCG_TYPE_I32, datalo, reg64);
+        }
+    } else {
+        tcg_out_ldst_rr(s, datalo, addr_reg,
+                        (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+                        qemu_ld_opc[sizeop]);
+    }
+#endif /* CONFIG_SOFTMMU */
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
-                            int opc)
+static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int sizeop)
 {
-    int addr_reg, data_reg, arg0, arg1, arg2, mem_index, s_bits;
+    int addrlo_idx = 1, datalo, datahi, addr_reg;
 #if defined(CONFIG_SOFTMMU)
-    uint32_t *label1_ptr, *label2_ptr;
+    int memi_idx, memi, n;
+    uint32_t *label_ptr;
 #endif
 
-    data_reg = *args++;
-    addr_reg = *args++;
-    mem_index = *args;
-
-    s_bits = opc;
-
-    arg0 = TCG_REG_O0;
-    arg1 = TCG_REG_O1;
-    arg2 = TCG_REG_O2;
-
-#if defined(CONFIG_SOFTMMU)
-    /* srl addr_reg, x, arg1 */
-    tcg_out_arithi(s, arg1, addr_reg, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS,
-                   SHIFT_SRL);
-
-    /* and addr_reg, x, arg0 */
-    tcg_out_arithi(s, arg0, addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1),
-                   ARITH_AND);
-
-    /* and arg1, x, arg1 */
-    tcg_out_andi(s, arg1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
-
-    /* add arg1, x, arg1 */
-    tcg_out_addi(s, arg1, offsetof(CPUArchState,
-                                   tlb_table[mem_index][0].addr_write));
-
-    /* add env, arg1, arg1 */
-    tcg_out_arith(s, arg1, TCG_AREG0, arg1, ARITH_ADD);
-
-    /* ld [arg1], arg2 */
-    tcg_out32(s, TARGET_LD_OP | INSN_RD(arg2) | INSN_RS1(arg1) |
-              INSN_RS2(TCG_REG_G0));
-
-    /* subcc arg0, arg2, %g0 */
-    tcg_out_arith(s, TCG_REG_G0, arg0, arg2, ARITH_SUBCC);
-
-    /* will become:
-       be label1
-        or
-       be,pt %xcc label1 */
-    label1_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, 0);
-
-    /* mov (delay slot) */
-    tcg_out_mov(s, TCG_TYPE_PTR, arg0, addr_reg);
-
-    /* mov */
-    tcg_out_mov(s, TCG_TYPE_REG, arg1, data_reg);
-
-    /* mov */
-    tcg_out_movi(s, TCG_TYPE_I32, arg2, mem_index);
-
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
-    /* XXX: move that code at the end of the TB */
-    /* qemu_st_helper[s_bits](arg0, arg1, arg2) */
-    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[s_bits]
-                           - (tcg_target_ulong)s->code_ptr) >> 2)
-                         & 0x3fffffff));
-    /* Store AREG0 in stack to avoid ugly glibc bugs that mangle
-       global registers */
-    // delay slot
-    tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                 TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                 sizeof(long), HOST_ST_OP);
-    tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                 TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                 sizeof(long), HOST_LD_OP);
-
-    /* will become:
-       ba label2 */
-    label2_ptr = (uint32_t *)s->code_ptr;
-    tcg_out32(s, 0);
-
-    /* nop (delay slot) */
-    tcg_out_nop(s);
-
-#if TARGET_LONG_BITS == 32
-    /* be label1 */
-    *label1_ptr = (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x2) |
-                   INSN_OFF22((unsigned long)s->code_ptr -
-                              (unsigned long)label1_ptr));
-#else
-    /* be,pt %xcc label1 */
-    *label1_ptr = (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1) |
-                   (0x5 << 19) | INSN_OFF19((unsigned long)s->code_ptr -
-                              (unsigned long)label1_ptr));
-#endif
-
-    /* ld [arg1 + x], arg1 */
-    tcg_out_ldst(s, arg1, arg1, offsetof(CPUTLBEntry, addend) -
-                 offsetof(CPUTLBEntry, addr_write), TARGET_ADDEND_LD_OP);
-
-#if TARGET_LONG_BITS == 32
-    /* and addr_reg, x, arg0 */
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_I5, 0xffffffff);
-    tcg_out_arith(s, arg0, addr_reg, TCG_REG_I5, ARITH_AND);
-    /* add arg0, arg1, arg0 */
-    tcg_out_arith(s, arg0, arg0, arg1, ARITH_ADD);
-#else
-    /* add addr_reg, arg1, arg0 */
-    tcg_out_arith(s, arg0, addr_reg, arg1, ARITH_ADD);
-#endif
-
-#else
-    arg0 = addr_reg;
-#endif
-
-    switch(opc) {
-    case 0:
-        /* stb data_reg, [arg0] */
-        tcg_out_ldst(s, data_reg, arg0, 0, STB);
-        break;
-    case 1:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* sth data_reg, [arg0] */
-        tcg_out_ldst(s, data_reg, arg0, 0, STH);
-#else
-        /* stha data_reg, [arg0] ASI_PRIMARY_LITTLE */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, STHA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 2:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* stw data_reg, [arg0] */
-        tcg_out_ldst(s, data_reg, arg0, 0, STW);
-#else
-        /* stwa data_reg, [arg0] ASI_PRIMARY_LITTLE */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, STWA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    case 3:
-#ifdef TARGET_WORDS_BIGENDIAN
-        /* stx data_reg, [arg0] */
-        tcg_out_ldst(s, data_reg, arg0, 0, STX);
-#else
-        /* stxa data_reg, [arg0] ASI_PRIMARY_LITTLE */
-        tcg_out_ldst_asi(s, data_reg, arg0, 0, STXA, ASI_PRIMARY_LITTLE);
-#endif
-        break;
-    default:
-        tcg_abort();
+    datahi = datalo = args[0];
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        datahi = args[1];
+        addrlo_idx = 2;
     }
 
 #if defined(CONFIG_SOFTMMU)
-    /* label2: */
-    *label2_ptr = (INSN_OP(0) | INSN_COND(COND_A, 0) | INSN_OP2(0x2) |
-                   INSN_OFF22((unsigned long)s->code_ptr -
-                              (unsigned long)label2_ptr));
-#endif
+    memi_idx = addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
+    memi = args[memi_idx];
+
+    addr_reg = tcg_out_tlb_load(s, addrlo_idx, memi, sizeop, args,
+                                offsetof(CPUTLBEntry, addr_write));
+
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        /* Reconstruct the full 64-bit value.  */
+        tcg_out_arithi(s, TCG_REG_T1, datalo, 0, SHIFT_SRL);
+        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
+        tcg_out_arith(s, TCG_REG_O2, TCG_REG_T1, TCG_REG_O2, ARITH_OR);
+        datalo = TCG_REG_O2;
+    }
+
+    /* The fast path is exactly one insn.  Thus we can perform the entire
+       TLB Hit in the (annulled) delay slot of the branch over TLB Miss.  */
+    /* beq,a,pt %[xi]cc, label0 */
+    label_ptr = (uint32_t *)s->code_ptr;
+    tcg_out32(s, (INSN_OP(0) | INSN_COND(COND_E, 0) | INSN_OP2(0x1)
+                  | ((TARGET_LONG_BITS == 64) << 21)
+                  | (1 << 29) | (1 << 19)));
+    /* delay slot */
+    tcg_out_ldst_rr(s, datalo, addr_reg, TCG_REG_O1, qemu_st_opc[sizeop]);
+
+    /* TLB Miss.  */
+
+    n = 0;
+    tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[n++], TCG_AREG0);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
+                    args[addrlo_idx + 1]);
+    }
+    tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++],
+                args[addrlo_idx]);
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datahi);
+    }
+    tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n++], datalo);
+
+    /* qemu_st_helper[s_bits](arg0, arg1, arg2) */
+    tcg_out32(s, CALL | ((((tcg_target_ulong)qemu_st_helpers[sizeop]
+                           - (tcg_target_ulong)s->code_ptr) >> 2)
+                         & 0x3fffffff));
+    /* delay slot */
+    tcg_out_movi(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[n], memi);
+
+    *label_ptr |= INSN_OFF19((unsigned long)s->code_ptr -
+                             (unsigned long)label_ptr);
+#else
+    addr_reg = args[addrlo_idx];
+    if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
+        tcg_out_arithi(s, TCG_REG_T1, addr_reg, 0, SHIFT_SRL);
+        addr_reg = TCG_REG_T1;
+    }
+    if (TCG_TARGET_REG_BITS == 32 && sizeop == 3) {
+        tcg_out_arithi(s, TCG_REG_T1, datalo, 0, SHIFT_SRL);
+        tcg_out_arithi(s, TCG_REG_O2, datahi, 32, SHIFT_SLLX);
+        tcg_out_arith(s, TCG_REG_O2, TCG_REG_T1, TCG_REG_O2, ARITH_OR);
+        datalo = TCG_REG_O2;
+    }
+    tcg_out_ldst_rr(s, datalo, addr_reg,
+                    (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
+                    qemu_st_opc[sizeop]);
+#endif /* CONFIG_SOFTMMU */
 }
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
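
tcg_out_tlb_load() above open-codes the softmmu fast path shared by loads and stores: index the TLB by page number, compare the masked guest address against the stored comparator, and leave the result in the condition codes so the following branch (with its annulled delay slot) chooses between the inline access and the helper call. In C terms it computes roughly the following; this is only an illustrative sketch against QEMU's CPUTLBEntry layout, not code from this patch:

    /* Illustrative only; relies on QEMU's per-target cpu.h definitions. */
    #include "cpu.h"

    static inline int tlb_hit_host_addr(CPUArchState *env, target_ulong addr,
                                        int mem_index, int s_bits,
                                        uintptr_t *host_addr)
    {
        unsigned idx = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
        CPUTLBEntry *ent = &env->tlb_table[mem_index][idx];
        target_ulong cmp = addr & (TARGET_PAGE_MASK | ((1 << s_bits) - 1));

        if (cmp == ent->addr_read) {            /* addr_write for stores */
            *host_addr = addr + ent->addend;    /* the addend loaded into %o1 */
            return 1;                           /* fast path: inline access */
        }
        return 0;                               /* slow path: call the MMU helper */
    }
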
@@ -1156,39 +1069,33 @@
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
-            tcg_out_sethi(s, TCG_REG_I5, args[0] & 0xffffe000);
-            tcg_out32(s, JMPL | INSN_RD(TCG_REG_G0) | INSN_RS1(TCG_REG_I5) |
-                      INSN_IMM13((args[0] & 0x1fff)));
+            uint32_t old_insn = *(uint32_t *)s->code_ptr;
             s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+            /* Make sure to preserve links during retranslation.  */
+            tcg_out32(s, CALL | (old_insn & ~INSN_OP(-1)));
         } else {
             /* indirect jump method */
-            tcg_out_ld_ptr(s, TCG_REG_I5, (tcg_target_long)(s->tb_next + args[0]));
-            tcg_out32(s, JMPL | INSN_RD(TCG_REG_G0) | INSN_RS1(TCG_REG_I5) |
+            tcg_out_ld_ptr(s, TCG_REG_T1,
+                           (tcg_target_long)(s->tb_next + args[0]));
+            tcg_out32(s, JMPL | INSN_RD(TCG_REG_G0) | INSN_RS1(TCG_REG_T1) |
                       INSN_RS2(TCG_REG_G0));
         }
         tcg_out_nop(s);
         s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
         break;
     case INDEX_op_call:
-        if (const_args[0])
+        if (const_args[0]) {
             tcg_out32(s, CALL | ((((tcg_target_ulong)args[0]
                                    - (tcg_target_ulong)s->code_ptr) >> 2)
                                  & 0x3fffffff));
-        else {
-            tcg_out_ld_ptr(s, TCG_REG_I5,
+        } else {
+            tcg_out_ld_ptr(s, TCG_REG_T1,
                            (tcg_target_long)(s->tb_next + args[0]));
-            tcg_out32(s, JMPL | INSN_RD(TCG_REG_O7) | INSN_RS1(TCG_REG_I5) |
+            tcg_out32(s, JMPL | INSN_RD(TCG_REG_O7) | INSN_RS1(TCG_REG_T1) |
                       INSN_RS2(TCG_REG_G0));
         }
-        /* Store AREG0 in stack to avoid ugly glibc bugs that mangle
-           global registers */
-        // delay slot
-        tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                     TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                     sizeof(long), HOST_ST_OP);
-        tcg_out_ldst(s, TCG_AREG0, TCG_REG_CALL_STACK,
-                     TCG_TARGET_CALL_STACK_OFFSET - TCG_STATIC_CALL_ARGS_SIZE -
-                     sizeof(long), HOST_LD_OP);
+        /* delay slot */
+        tcg_out_nop(s);
         break;
     case INDEX_op_jmp:
     case INDEX_op_br:
@@ -1260,13 +1167,16 @@
         goto gen_arith;
     case INDEX_op_shl_i32:
         c = SHIFT_SLL;
-        goto gen_arith;
+    do_shift32:
+        /* Limit immediate shift count lest we create an illegal insn.  */
+        tcg_out_arithc(s, args[0], args[1], args[2] & 31, const_args[2], c);
+        break;
     case INDEX_op_shr_i32:
         c = SHIFT_SRL;
-        goto gen_arith;
+        goto do_shift32;
     case INDEX_op_sar_i32:
         c = SHIFT_SRA;
-        goto gen_arith;
+        goto do_shift32;
     case INDEX_op_mul_i32:
         c = ARITH_UMUL;
         goto gen_arith;
@@ -1287,11 +1197,11 @@
 
     case INDEX_op_rem_i32:
     case INDEX_op_remu_i32:
-        tcg_out_div32(s, TCG_REG_I5, args[1], args[2], const_args[2],
+        tcg_out_div32(s, TCG_REG_T1, args[1], args[2], const_args[2],
                       opc == INDEX_op_remu_i32);
-        tcg_out_arithc(s, TCG_REG_I5, TCG_REG_I5, args[2], const_args[2],
+        tcg_out_arithc(s, TCG_REG_T1, TCG_REG_T1, args[2], const_args[2],
                        ARITH_UMUL);
-        tcg_out_arith(s, args[0], args[1], TCG_REG_I5, ARITH_SUB);
+        tcg_out_arith(s, args[0], args[1], TCG_REG_T1, ARITH_SUB);
         break;
 
     case INDEX_op_brcond_i32:
@@ -1356,6 +1266,9 @@
         tcg_out_qemu_ld(s, args, 2 | 4);
         break;
 #endif
+    case INDEX_op_qemu_ld64:
+        tcg_out_qemu_ld(s, args, 3);
+        break;
     case INDEX_op_qemu_st8:
         tcg_out_qemu_st(s, args, 0);
         break;
@@ -1365,6 +1278,9 @@
     case INDEX_op_qemu_st32:
         tcg_out_qemu_st(s, args, 2);
         break;
+    case INDEX_op_qemu_st64:
+        tcg_out_qemu_st(s, args, 3);
+        break;
 
 #if TCG_TARGET_REG_BITS == 64
     case INDEX_op_movi_i64:
@@ -1381,13 +1297,16 @@
         break;
     case INDEX_op_shl_i64:
         c = SHIFT_SLLX;
-        goto gen_arith;
+    do_shift64:
+        /* Limit immediate shift count lest we create an illegal insn.  */
+        tcg_out_arithc(s, args[0], args[1], args[2] & 63, const_args[2], c);
+        break;
     case INDEX_op_shr_i64:
         c = SHIFT_SRLX;
-        goto gen_arith;
+        goto do_shift64;
     case INDEX_op_sar_i64:
         c = SHIFT_SRAX;
-        goto gen_arith;
+        goto do_shift64;
     case INDEX_op_mul_i64:
         c = ARITH_MULX;
         goto gen_arith;
@@ -1399,11 +1318,11 @@
         goto gen_arith;
     case INDEX_op_rem_i64:
     case INDEX_op_remu_i64:
-        tcg_out_arithc(s, TCG_REG_I5, args[1], args[2], const_args[2],
+        tcg_out_arithc(s, TCG_REG_T1, args[1], args[2], const_args[2],
                        opc == INDEX_op_rem_i64 ? ARITH_SDIVX : ARITH_UDIVX);
-        tcg_out_arithc(s, TCG_REG_I5, TCG_REG_I5, args[2], const_args[2],
+        tcg_out_arithc(s, TCG_REG_T1, TCG_REG_T1, args[2], const_args[2],
                        ARITH_MULX);
-        tcg_out_arith(s, args[0], args[1], TCG_REG_I5, ARITH_SUB);
+        tcg_out_arith(s, args[0], args[1], TCG_REG_T1, ARITH_SUB);
         break;
     case INDEX_op_ext32s_i64:
         if (const_args[1]) {
@@ -1429,13 +1348,6 @@
                             args[2], const_args[2]);
         break;
 
-    case INDEX_op_qemu_ld64:
-        tcg_out_qemu_ld(s, args, 3);
-        break;
-    case INDEX_op_qemu_st64:
-        tcg_out_qemu_st(s, args, 3);
-        break;
-
 #endif
     gen_arith:
         tcg_out_arithc(s, args[0], args[1], args[2], const_args[2], c);
@@ -1500,20 +1412,6 @@
     { INDEX_op_mulu2_i32, { "r", "r", "r", "rJ" } },
 #endif
 
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-#if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_qemu_ld32u, { "r", "L" } },
-    { INDEX_op_qemu_ld32s, { "r", "L" } },
-#endif
-
-    { INDEX_op_qemu_st8, { "L", "L" } },
-    { INDEX_op_qemu_st16, { "L", "L" } },
-    { INDEX_op_qemu_st32, { "L", "L" } },
-
 #if TCG_TARGET_REG_BITS == 64
     { INDEX_op_mov_i64, { "r", "r" } },
     { INDEX_op_movi_i64, { "r" } },
@@ -1528,8 +1426,6 @@
     { INDEX_op_st16_i64, { "r", "r" } },
     { INDEX_op_st32_i64, { "r", "r" } },
     { INDEX_op_st_i64, { "r", "r" } },
-    { INDEX_op_qemu_ld64, { "L", "L" } },
-    { INDEX_op_qemu_st64, { "L", "L" } },
 
     { INDEX_op_add_i64, { "r", "r", "rJ" } },
     { INDEX_op_mul_i64, { "r", "r", "rJ" } },
@@ -1557,6 +1453,47 @@
     { INDEX_op_brcond_i64, { "r", "rJ" } },
     { INDEX_op_setcond_i64, { "r", "r", "rJ" } },
 #endif
+
+#if TCG_TARGET_REG_BITS == 64
+    { INDEX_op_qemu_ld8u, { "r", "L" } },
+    { INDEX_op_qemu_ld8s, { "r", "L" } },
+    { INDEX_op_qemu_ld16u, { "r", "L" } },
+    { INDEX_op_qemu_ld16s, { "r", "L" } },
+    { INDEX_op_qemu_ld32, { "r", "L" } },
+    { INDEX_op_qemu_ld32u, { "r", "L" } },
+    { INDEX_op_qemu_ld32s, { "r", "L" } },
+    { INDEX_op_qemu_ld64, { "r", "L" } },
+
+    { INDEX_op_qemu_st8, { "L", "L" } },
+    { INDEX_op_qemu_st16, { "L", "L" } },
+    { INDEX_op_qemu_st32, { "L", "L" } },
+    { INDEX_op_qemu_st64, { "L", "L" } },
+#elif TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+    { INDEX_op_qemu_ld8u, { "r", "L" } },
+    { INDEX_op_qemu_ld8s, { "r", "L" } },
+    { INDEX_op_qemu_ld16u, { "r", "L" } },
+    { INDEX_op_qemu_ld16s, { "r", "L" } },
+    { INDEX_op_qemu_ld32, { "r", "L" } },
+    { INDEX_op_qemu_ld64, { "r", "r", "L" } },
+
+    { INDEX_op_qemu_st8, { "L", "L" } },
+    { INDEX_op_qemu_st16, { "L", "L" } },
+    { INDEX_op_qemu_st32, { "L", "L" } },
+    { INDEX_op_qemu_st64, { "L", "L", "L" } },
+#else
+    { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld32, { "r", "L", "L" } },
+    { INDEX_op_qemu_ld64, { "L", "L", "L", "L" } },
+
+    { INDEX_op_qemu_st8, { "L", "L", "L" } },
+    { INDEX_op_qemu_st16, { "L", "L", "L" } },
+    { INDEX_op_qemu_st32, { "L", "L", "L" } },
+    { INDEX_op_qemu_st64, { "L", "L", "L", "L" } },
+#endif
+
     { -1 },
 };
 
@@ -1583,25 +1520,23 @@
                      (1 << TCG_REG_O7));
 
     tcg_regset_clear(s->reserved_regs);
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_G0);
-#if TCG_TARGET_REG_BITS == 64
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I4); // for internal use
-#endif
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I5); // for internal use
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I6);
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I7);
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_O6);
-    tcg_regset_set_reg(s->reserved_regs, TCG_REG_O7);
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_G0); /* zero */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_G6); /* reserved for os */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_G7); /* thread pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I6); /* frame pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_I7); /* return address */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_O6); /* stack pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_T1); /* for internal use */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_T2); /* for internal use */
+
     tcg_add_target_add_op_defs(sparc_op_defs);
 }
 
 #if TCG_TARGET_REG_BITS == 64
 # define ELF_HOST_MACHINE  EM_SPARCV9
-#elif defined(__sparc_v8plus__)
+#else
 # define ELF_HOST_MACHINE  EM_SPARC32PLUS
 # define ELF_HOST_FLAGS    EF_SPARC_32PLUS
-#else
-# define ELF_HOST_MACHINE  EM_SPARC
 #endif
 
 typedef struct {
@@ -1657,3 +1592,18 @@
 
     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
 }
+
+void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
+{
+    uint32_t *ptr = (uint32_t *)jmp_addr;
+    tcg_target_long disp = (tcg_target_long)(addr - jmp_addr) >> 2;
+
+    /* We can reach the entire address space for 32-bit.  For 64-bit
+       the code_gen_buffer can't be larger than 2GB.  */
+    if (TCG_TARGET_REG_BITS == 64 && !check_fit_tl(disp, 30)) {
+        tcg_abort();
+    }
+
+    *ptr = CALL | (disp & 0x3fffffff);
+    flush_icache_range(jmp_addr, jmp_addr + 4);
+}
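
tb_set_jmp_target1() is what makes the new direct-jump method work: the single instruction reserved at tb_jmp_offset is overwritten with a CALL whose 30-bit word displacement reaches anywhere within 2GB of the jump site, hence the check on 64-bit hosts where code_gen_buffer must stay below that size. A sketch of the encoding, taking CALL to be op field 01 (SPARC format 1), as in the backend's INSN_OP(1):

    #include <stdint.h>

    #define CALL  (1u << 30)    /* SPARC format 1: op = 01, 30-bit word displacement */

    static uint32_t encode_call(uintptr_t insn_addr, uintptr_t target)
    {
        intptr_t disp = (intptr_t)(target - insn_addr) >> 2;   /* word displacement */

        /* On 64-bit hosts |disp| must fit in 30 bits, i.e. code_gen_buffer < 2GB. */
        return CALL | ((uint32_t)disp & 0x3fffffff);
    }

flush_icache_range() then makes the patched word visible to instruction fetch.
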
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 0ea87be..6314ffb 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -66,22 +66,19 @@
 #define TCG_CT_CONST_S13 0x200
 
 /* used for function call generation */
-#define TCG_REG_CALL_STACK TCG_REG_I6
-#ifdef __arch64__
-// Reserve space for AREG0
-#define TCG_TARGET_STACK_MINFRAME (176 + 4 * (int)sizeof(long) + \
-                                   TCG_STATIC_CALL_ARGS_SIZE)
-#define TCG_TARGET_CALL_STACK_OFFSET (2047 - 16)
-#define TCG_TARGET_STACK_ALIGN 16
+#define TCG_REG_CALL_STACK TCG_REG_O6
+
+#if TCG_TARGET_REG_BITS == 64
+#define TCG_TARGET_STACK_BIAS           2047
+#define TCG_TARGET_STACK_ALIGN          16
+#define TCG_TARGET_CALL_STACK_OFFSET    (128 + 6*8 + TCG_TARGET_STACK_BIAS)
 #else
-// AREG0 + one word for alignment
-#define TCG_TARGET_STACK_MINFRAME (92 + (2 + 1) * (int)sizeof(long) + \
-                                   TCG_STATIC_CALL_ARGS_SIZE)
-#define TCG_TARGET_CALL_STACK_OFFSET TCG_TARGET_STACK_MINFRAME
-#define TCG_TARGET_STACK_ALIGN 8
+#define TCG_TARGET_STACK_BIAS           0
+#define TCG_TARGET_STACK_ALIGN          8
+#define TCG_TARGET_CALL_STACK_OFFSET    (64 + 4 + 6*4)
 #endif
 
-#ifdef __arch64__
+#if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_EXTEND_ARGS 1
 #endif
 
@@ -102,6 +99,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div_i64          1
@@ -123,15 +121,12 @@
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
-#ifdef CONFIG_SOLARIS
-#define TCG_AREG0 TCG_REG_G2
-#elif defined(__sparc_v9__)
-#define TCG_AREG0 TCG_REG_G5
-#else
-#define TCG_AREG0 TCG_REG_G6
-#endif
+#define TCG_TARGET_HAS_GUEST_BASE
+
+#define TCG_AREG0 TCG_REG_I0
 
 static inline void flush_icache_range(tcg_target_ulong start,
                                       tcg_target_ulong stop)
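
The replacement constants follow straight from the SPARC ABI: on a 64-bit host the stack pointer is biased by 2047 and a callee may always rely on the 16-register save area (16 * 8 = 128 bytes) plus six outgoing argument slots (6 * 8 = 48 bytes); on a 32-bit host there is no bias and the minimal frame is 16 * 4 + 4 (aggregate-return pointer slot) + 6 * 4 bytes. A quick check of the arithmetic behind the new macro values:

    /* Concrete values of the new TCG_TARGET_CALL_STACK_OFFSET definitions. */
    _Static_assert(128 + 6 * 8 + 2047 == 2223, "sparc64: save area + arg slots + bias");
    _Static_assert(64 + 4 + 6 * 4 == 92,       "sparc32: save area + return slot + arg slots");
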
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 169d3b2..6d28f82 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -2118,6 +2118,44 @@
     tcg_temp_free_i64(t1);
 }
 
+static inline void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret,
+                                       TCGv_i32 c1, TCGv_i32 c2,
+                                       TCGv_i32 v1, TCGv_i32 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i32) {
+        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_setcond_i32(cond, t0, c1, c2);
+        tcg_gen_neg_i32(t0, t0);
+        tcg_gen_and_i32(t1, v1, t0);
+        tcg_gen_andc_i32(ret, v2, t0);
+        tcg_gen_or_i32(ret, ret, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
+                                       TCGv_i64 c1, TCGv_i64 c2,
+                                       TCGv_i64 v1, TCGv_i64 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i64) {
+        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_setcond_i64(cond, t0, c1, c2);
+        tcg_gen_neg_i64(t0, t0);
+        tcg_gen_and_i64(t1, v1, t0);
+        tcg_gen_andc_i64(ret, v2, t0);
+        tcg_gen_or_i64(ret, ret, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
 /***************************************/
 /* QEMU specific operations. Their type depend on the QEMU CPU
    type. */
@@ -2434,6 +2472,7 @@
 #define tcg_gen_deposit_tl tcg_gen_deposit_i64
 #define tcg_const_tl tcg_const_i64
 #define tcg_const_local_tl tcg_const_local_i64
+#define tcg_gen_movcond_tl tcg_gen_movcond_i64
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -2505,6 +2544,7 @@
 #define tcg_gen_deposit_tl tcg_gen_deposit_i32
 #define tcg_const_tl tcg_const_i32
 #define tcg_const_local_tl tcg_const_local_i32
+#define tcg_gen_movcond_tl tcg_gen_movcond_i32
 #endif
 
 #if TCG_TARGET_REG_BITS == 32
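
None of the backends touched in this series implement movcond natively (they all set TCG_TARGET_HAS_movcond_* to 0), so they take the fallback above: setcond produces 0 or 1, neg turns that into an all-zeros or all-ones mask, and the and/andc/or sequence picks v1 or v2 without a branch. The same trick in plain C:

    #include <stdint.h>

    /* ret = (c1 < c2 unsigned ? v1 : v2), branch-free, exactly as the generic
       tcg_gen_movcond_i32 expansion does for the LTU condition. */
    static uint32_t movcond_ltu(uint32_t c1, uint32_t c2, uint32_t v1, uint32_t v2)
    {
        uint32_t mask = -(uint32_t)(c1 < c2);   /* setcond + neg: 0 or 0xffffffff */
        return (v1 & mask) | (v2 & ~mask);      /* and, andc, or */
    }

A frontend can therefore emit, e.g., tcg_gen_movcond_i32(TCG_COND_LTU, dst, a, b, x, y) and get dst = (a < b unsigned ? x : y) whether or not the backend provides the opcode.
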
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 8e06d03..dbb0e39 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -36,7 +36,7 @@
 
 DEF(discard, 1, 0, 0, 0)
 
-DEF(set_label, 0, 0, 1, 0)
+DEF(set_label, 0, 0, 1, TCG_OPF_BB_END)
 DEF(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */
 DEF(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
 DEF(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
@@ -51,6 +51,7 @@
 DEF(mov_i32, 1, 1, 0, 0)
 DEF(movi_i32, 1, 0, 1, 0)
 DEF(setcond_i32, 1, 2, 1, 0)
+DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
 /* load/store */
 DEF(ld8u_i32, 1, 1, 1, 0)
 DEF(ld8s_i32, 1, 1, 1, 0)
@@ -107,6 +108,7 @@
 DEF(mov_i64, 1, 1, 0, IMPL64)
 DEF(movi_i64, 1, 0, 1, IMPL64)
 DEF(setcond_i64, 1, 2, 1, IMPL64)
+DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
 /* load/store */
 DEF(ld8u_i64, 1, 1, 1, IMPL64)
 DEF(ld8s_i64, 1, 1, 1, IMPL64)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index a4e7f42..b3c2650 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -89,7 +89,6 @@
                        tcg_target_long arg2);
 static int tcg_target_const_match(tcg_target_long val,
                                   const TCGArgConstraint *arg_ct);
-static int tcg_target_get_call_iarg_regs_count(int flags);
 
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
@@ -937,11 +936,7 @@
                                                        args[nb_oargs + i]));
                 }
             }
-        } else if (c == INDEX_op_movi_i32 
-#if TCG_TARGET_REG_BITS == 64
-                   || c == INDEX_op_movi_i64
-#endif
-                   ) {
+        } else if (c == INDEX_op_movi_i32 || c == INDEX_op_movi_i64) {
             tcg_target_ulong val;
             TCGHelperInfo *th;
 
@@ -991,17 +986,13 @@
             }
             switch (c) {
             case INDEX_op_brcond_i32:
-#if TCG_TARGET_REG_BITS == 32
-            case INDEX_op_brcond2_i32:
-#elif TCG_TARGET_REG_BITS == 64
-            case INDEX_op_brcond_i64:
-#endif
             case INDEX_op_setcond_i32:
-#if TCG_TARGET_REG_BITS == 32
+            case INDEX_op_movcond_i32:
+            case INDEX_op_brcond2_i32:
             case INDEX_op_setcond2_i32:
-#elif TCG_TARGET_REG_BITS == 64
+            case INDEX_op_brcond_i64:
             case INDEX_op_setcond_i64:
-#endif
+            case INDEX_op_movcond_i64:
                 if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) {
                     qemu_log(",%s", cond_name[args[k++]]);
                 } else {
@@ -1297,11 +1288,6 @@
                 args--;
             }
             break;
-        case INDEX_op_set_label:
-            args--;
-            /* mark end of basic block */
-            tcg_la_bb_end(s, dead_temps);
-            break;
         case INDEX_op_debug_insn_start:
             args -= def->nb_args;
             break;
@@ -1463,7 +1449,8 @@
 {
     TCGTemp *ts;
     ts = &s->temps[temp];
-#ifndef __sparc_v9__ /* Sparc64 stack is accessed with offset of 2047 */
+#if !(defined(__sparc__) && TCG_TARGET_REG_BITS == 64)
+    /* Sparc64 stack is accessed with offset of 2047 */
     s->current_frame_offset = (s->current_frame_offset +
                                (tcg_target_long)sizeof(tcg_target_long) - 1) &
         ~(sizeof(tcg_target_long) - 1);
@@ -1866,7 +1853,7 @@
 
     flags = args[nb_oargs + nb_iargs];
 
-    nb_regs = tcg_target_get_call_iarg_regs_count(flags);
+    nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
     if (nb_regs > nb_params)
         nb_regs = nb_params;
 
@@ -2108,16 +2095,12 @@
 #endif
         switch(opc) {
         case INDEX_op_mov_i32:
-#if TCG_TARGET_REG_BITS == 64
         case INDEX_op_mov_i64:
-#endif
             dead_args = s->op_dead_args[op_index];
             tcg_reg_alloc_mov(s, def, args, dead_args);
             break;
         case INDEX_op_movi_i32:
-#if TCG_TARGET_REG_BITS == 64
         case INDEX_op_movi_i64:
-#endif
             tcg_reg_alloc_movi(s, args);
             break;
         case INDEX_op_debug_insn_start:
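
With every backend's tcg_target_get_call_iarg_regs_count() hook removed, tcg.c now sizes the argument-register table directly, so the count can never drift from the array definition. The idiom is QEMU's ARRAY_SIZE macro; a self-contained sketch with an illustrative table:

    #include <stddef.h>

    /* QEMU already provides ARRAY_SIZE; repeated here so the sketch stands alone. */
    #define ARRAY_SIZE(x)  (sizeof(x) / sizeof((x)[0]))

    static const int example_call_iarg_regs[6] = { 8, 9, 10, 11, 12, 13 };  /* %o0..%o5, illustrative */

    static size_t nb_call_iarg_regs(void)
    {
        return ARRAY_SIZE(example_call_iarg_regs);   /* 6, derived from the table itself */
    }
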
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7a72729..48a56f0 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -79,6 +79,7 @@
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #ifndef TCG_TARGET_deposit_i32_valid
@@ -343,7 +344,7 @@
 
     /* goto_tb support */
     uint8_t *code_buf;
-    unsigned long *tb_next;
+    uintptr_t *tb_next;
     uint16_t *tb_next_offset;
     uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */
 
@@ -459,11 +460,6 @@
 void tcg_temp_free_i64(TCGv_i64 arg);
 char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg);
 
-static inline bool tcg_arg_is_local(TCGContext *s, TCGArg arg)
-{
-    return s->temps[arg].temp_local;
-}
-
 #if defined(CONFIG_DEBUG_TCG)
 /* If you call tcg_clear_temp_count() at the start of a section of
  * code which is not supposed to leak any TCG temporaries, then
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 003244c..3f4a24b 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -863,12 +863,6 @@
     return arg_ct->ct & TCG_CT_CONST;
 }
 
-/* Maximum number of register used for input function arguments. */
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return ARRAY_SIZE(tcg_target_call_iarg_regs);
-}
-
 static void tcg_target_init(TCGContext *s)
 {
 #if defined(CONFIG_DEBUG_TCG_INTERPRETER)
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 30a0f21..6d89495 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -75,6 +75,7 @@
 #define TCG_TARGET_HAS_not_i32          1
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_rot_i32          1
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_bswap16_i64      1
@@ -98,6 +99,7 @@
 #define TCG_TARGET_HAS_not_i64          1
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_rot_i64          1
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
 /* Offset to user memory in user mode. */
diff --git a/trace-events b/trace-events
index b25ae1c..f5b5097 100644
--- a/trace-events
+++ b/trace-events
@@ -243,9 +243,12 @@
 
 # hw/usb/hcd-ehci.c
 usb_ehci_reset(void) "=== RESET ==="
-usb_ehci_mmio_readl(uint32_t addr, const char *str, uint32_t val) "rd mmio %04x [%s] = %x"
-usb_ehci_mmio_writel(uint32_t addr, const char *str, uint32_t val) "wr mmio %04x [%s] = %x"
-usb_ehci_mmio_change(uint32_t addr, const char *str, uint32_t new, uint32_t old) "ch mmio %04x [%s] = %x (old: %x)"
+usb_ehci_opreg_read(uint32_t addr, const char *str, uint32_t val) "rd mmio %04x [%s] = %x"
+usb_ehci_opreg_write(uint32_t addr, const char *str, uint32_t val) "wr mmio %04x [%s] = %x"
+usb_ehci_opreg_change(uint32_t addr, const char *str, uint32_t new, uint32_t old) "ch mmio %04x [%s] = %x (old: %x)"
+usb_ehci_portsc_read(uint32_t addr, uint32_t port, uint32_t val) "rd mmio %04x [port %d] = %x"
+usb_ehci_portsc_write(uint32_t addr, uint32_t port, uint32_t val) "wr mmio %04x [port %d] = %x"
+usb_ehci_portsc_change(uint32_t addr, uint32_t port, uint32_t new, uint32_t old) "ch mmio %04x [port %d] = %x (old: %x)"
 usb_ehci_usbsts(const char *sts, int state) "usbsts %s %d"
 usb_ehci_state(const char *schedule, const char *state) "%s schedule %s"
 usb_ehci_qh_ptrs(void *q, uint32_t addr, uint32_t nxt, uint32_t c_qtd, uint32_t n_qtd, uint32_t a_qtd) "q %p - QH @ %08x: next %08x qtds %08x,%08x,%08x"
@@ -932,8 +935,9 @@
 qxl_interface_update_area_complete_overflow(int qid, int max) "%d max=%d"
 qxl_interface_update_area_complete_schedule_bh(int qid, uint32_t num_dirty) "%d #dirty=%d"
 qxl_io_destroy_primary_ignored(int qid, const char *mode) "%d %s"
+qxl_io_log(int qid, const uint8_t *log_buf) "%d %s"
 qxl_io_read_unexpected(int qid) "%d"
-qxl_io_unexpected_vga_mode(int qid, uint32_t io_port, const char *desc) "%d 0x%x (%s)"
+qxl_io_unexpected_vga_mode(int qid, uint64_t addr, uint64_t val, const char *desc) "%d 0x%"PRIx64"=%"PRIu64" (%s)"
 qxl_io_write(int qid, const char *mode, uint64_t addr, uint64_t val, unsigned size, int async) "%d %s addr=%"PRIu64 " val=%"PRIu64" size=%u async=%d"
 qxl_memslot_add_guest(int qid, uint32_t slot_id, uint64_t guest_start, uint64_t guest_end) "%d %u: guest phys 0x%"PRIx64 " - 0x%" PRIx64
 qxl_post_load(int qid, const char *mode) "%d %s"
@@ -964,7 +968,7 @@
 qxl_spice_destroy_surface_wait_complete(int qid, uint32_t id) "%d sid=%d"
 qxl_spice_destroy_surface_wait(int qid, uint32_t id, int async) "%d sid=%d async=%d"
 qxl_spice_flush_surfaces_async(int qid, uint32_t surface_count, uint32_t num_free_res) "%d s#=%d, res#=%d"
-qxl_spice_monitors_config(int id) "%d"
+qxl_spice_monitors_config(int qid) "%d"
 qxl_spice_loadvm_commands(int qid, void *ext, uint32_t count) "%d ext=%p count=%d"
 qxl_spice_oom(int qid) "%d"
 qxl_spice_reset_cursor(int qid) "%d"
@@ -973,6 +977,12 @@
 qxl_spice_update_area(int qid, uint32_t surface_id, uint32_t left, uint32_t right, uint32_t top, uint32_t bottom) "%d sid=%d [%d,%d,%d,%d]"
 qxl_spice_update_area_rest(int qid, uint32_t num_dirty_rects, uint32_t clear_dirty_region) "%d #d=%d clear=%d"
 qxl_surfaces_dirty(int qid, int surface, int offset, int size) "%d surface=%d offset=%d size=%d"
+qxl_send_events(int qid, uint32_t events) "%d %d"
+qxl_set_guest_bug(int qid) "%d"
+qxl_interrupt_client_monitors_config(int qid, int num_heads, void *heads) "%d %d %p"
+qxl_client_monitors_config_unsupported_by_guest(int qid, uint32_t int_mask, void *client_monitors_config) "%d %X %p"
+qxl_client_monitors_config_capped(int qid, int requested, int limit) "%d %d %d"
+qxl_client_monitors_config_crc(int qid, unsigned size, uint32_t crc32) "%d %u %u"
 
 # hw/qxl-render.c
 qxl_render_blit_guest_primary_initialized(void) ""
diff --git a/ui/spice-display.c b/ui/spice-display.c
index 99bc665..50fbefb 100644
--- a/ui/spice-display.c
+++ b/ui/spice-display.c
@@ -164,34 +164,31 @@
 #endif
 }
 
-static SimpleSpiceUpdate *qemu_spice_create_update(SimpleSpiceDisplay *ssd)
+static void qemu_spice_create_one_update(SimpleSpiceDisplay *ssd,
+                                         QXLRect *rect)
 {
     SimpleSpiceUpdate *update;
     QXLDrawable *drawable;
     QXLImage *image;
     QXLCommand *cmd;
-    uint8_t *src, *dst;
-    int by, bw, bh;
+    uint8_t *src, *mirror, *dst;
+    int by, bw, bh, offset, bytes;
     struct timespec time_space;
 
-    if (qemu_spice_rect_is_empty(&ssd->dirty)) {
-        return NULL;
-    };
-
     trace_qemu_spice_create_update(
-           ssd->dirty.left, ssd->dirty.right,
-           ssd->dirty.top, ssd->dirty.bottom);
+           rect->left, rect->right,
+           rect->top, rect->bottom);
 
     update   = g_malloc0(sizeof(*update));
     drawable = &update->drawable;
     image    = &update->image;
     cmd      = &update->ext.cmd;
 
-    bw       = ssd->dirty.right - ssd->dirty.left;
-    bh       = ssd->dirty.bottom - ssd->dirty.top;
+    bw       = rect->right - rect->left;
+    bh       = rect->bottom - rect->top;
     update->bitmap = g_malloc(bw * bh * 4);
 
-    drawable->bbox            = ssd->dirty;
+    drawable->bbox            = *rect;
     drawable->clip.type       = SPICE_CLIP_TYPE_NONE;
     drawable->effect          = QXL_EFFECT_OPAQUE;
     drawable->release_info.id = (uintptr_t)update;
@@ -219,31 +216,103 @@
     image->bitmap.palette = 0;
     image->bitmap.format = SPICE_BITMAP_FMT_32BIT;
 
-    if (ssd->conv == NULL) {
-        PixelFormat dst = qemu_default_pixelformat(32);
-        ssd->conv = qemu_pf_conv_get(&dst, &ssd->ds->surface->pf);
-        assert(ssd->conv);
-    }
-
-    src = ds_get_data(ssd->ds) +
-        ssd->dirty.top * ds_get_linesize(ssd->ds) +
-        ssd->dirty.left * ds_get_bytes_per_pixel(ssd->ds);
+    offset =
+        rect->top * ds_get_linesize(ssd->ds) +
+        rect->left * ds_get_bytes_per_pixel(ssd->ds);
+    bytes = ds_get_bytes_per_pixel(ssd->ds) * bw;
+    src = ds_get_data(ssd->ds) + offset;
+    mirror = ssd->ds_mirror + offset;
     dst = update->bitmap;
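+    /* For each scanline in the rectangle: refresh the mirror copy of the
+     * guest data, then convert the mirrored pixels into the 32bpp update
+     * bitmap that is handed to the spice server. */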
     for (by = 0; by < bh; by++) {
-        qemu_pf_conv_run(ssd->conv, dst, src, bw);
+        memcpy(mirror, src, bytes);
+        qemu_pf_conv_run(ssd->conv, dst, mirror, bw);
         src += ds_get_linesize(ssd->ds);
+        mirror += ds_get_linesize(ssd->ds);
         dst += image->bitmap.stride;
     }
 
     cmd->type = QXL_CMD_DRAW;
     cmd->data = (uintptr_t)drawable;
 
+    QTAILQ_INSERT_TAIL(&ssd->updates, update, next);
+}
+
+static void qemu_spice_create_update(SimpleSpiceDisplay *ssd)
+{
+    static const int blksize = 32;
+    int blocks = (ds_get_width(ssd->ds) + blksize - 1) / blksize;
+    int dirty_top[blocks];
+    int y, yoff, x, xoff, blk, bw;
+    int bpp = ds_get_bytes_per_pixel(ssd->ds);
+    uint8_t *guest, *mirror;
+
+    if (qemu_spice_rect_is_empty(&ssd->dirty)) {
+        return;
+    }
+
+    if (ssd->conv == NULL) {
+        PixelFormat dst = qemu_default_pixelformat(32);
+        ssd->conv = qemu_pf_conv_get(&dst, &ssd->ds->surface->pf);
+        assert(ssd->conv);
+    }
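+    /* Lazily allocate a shadow copy of the guest surface; it is compared
+     * against the live guest data below to find the blocks that changed. */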
+    if (ssd->ds_mirror == NULL) {
+        int size = ds_get_height(ssd->ds) * ds_get_linesize(ssd->ds);
+        ssd->ds_mirror = g_malloc0(size);
+    }
+
+    for (blk = 0; blk < blocks; blk++) {
+        dirty_top[blk] = -1;
+    }
+
+    guest = ds_get_data(ssd->ds);
+    mirror = ssd->ds_mirror;
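+    /* Scan the dirty rectangle in blksize-wide columns.  For every column,
+     * remember the first row whose guest data differs from the mirror and
+     * emit one update rectangle when a run of changed rows ends. */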
+    for (y = ssd->dirty.top; y < ssd->dirty.bottom; y++) {
+        yoff = y * ds_get_linesize(ssd->ds);
+        for (x = ssd->dirty.left; x < ssd->dirty.right; x += blksize) {
+            xoff = x * bpp;
+            blk = x / blksize;
+            bw = MIN(blksize, ssd->dirty.right - x);
+            if (memcmp(guest + yoff + xoff,
+                       mirror + yoff + xoff,
+                       bw * bpp) == 0) {
+                if (dirty_top[blk] != -1) {
+                    QXLRect update = {
+                        .top    = dirty_top[blk],
+                        .bottom = y,
+                        .left   = x,
+                        .right  = x + bw,
+                    };
+                    qemu_spice_create_one_update(ssd, &update);
+                    dirty_top[blk] = -1;
+                }
+            } else {
+                if (dirty_top[blk] == -1) {
+                    dirty_top[blk] = y;
+                }
+            }
+        }
+    }
+
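+    /* Emit updates for columns that are still dirty when the bottom of
+     * the dirty rectangle is reached. */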
+    for (x = ssd->dirty.left; x < ssd->dirty.right; x += blksize) {
+        blk = x / blksize;
+        bw = MIN(blksize, ssd->dirty.right - x);
+        if (dirty_top[blk] != -1) {
+            QXLRect update = {
+                .top    = dirty_top[blk],
+                .bottom = ssd->dirty.bottom,
+                .left   = x,
+                .right  = x + bw,
+            };
+            qemu_spice_create_one_update(ssd, &update);
+            dirty_top[blk] = -1;
+        }
+    }
+
     memset(&ssd->dirty, 0, sizeof(ssd->dirty));
-    return update;
 }
 
 /*
- * Called from spice server thread context (via interface_release_ressource)
+ * Called from spice server thread context (via interface_release_resource)
  * We do *not* hold the global qemu mutex here, so extra care is needed
  * when calling qemu functions.  QEMU interfaces used:
  *    - g_free (underlying glibc free is re-entrant).
@@ -315,6 +384,7 @@
 {
     ssd->ds = ds;
     qemu_mutex_init(&ssd->lock);
+    QTAILQ_INIT(&ssd->updates);
     ssd->mouse_x = -1;
     ssd->mouse_y = -1;
     if (ssd->num_surfaces == 0) {
@@ -345,16 +415,20 @@
 
 void qemu_spice_display_resize(SimpleSpiceDisplay *ssd)
 {
+    SimpleSpiceUpdate *update;
+
     dprint(1, "%s:\n", __FUNCTION__);
 
     memset(&ssd->dirty, 0, sizeof(ssd->dirty));
     qemu_pf_conv_put(ssd->conv);
     ssd->conv = NULL;
+    g_free(ssd->ds_mirror);
+    ssd->ds_mirror = NULL;
 
     qemu_mutex_lock(&ssd->lock);
-    if (ssd->update != NULL) {
-        qemu_spice_destroy_update(ssd, ssd->update);
-        ssd->update = NULL;
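+    /* Throw away any updates still queued for the old surface. */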
+    while ((update = QTAILQ_FIRST(&ssd->updates)) != NULL) {
+        QTAILQ_REMOVE(&ssd->updates, update, next);
+        qemu_spice_destroy_update(ssd, update);
     }
     qemu_mutex_unlock(&ssd->lock);
     qemu_spice_destroy_host_primary(ssd);
@@ -384,8 +458,8 @@
     vga_hw_update();
 
     qemu_mutex_lock(&ssd->lock);
-    if (ssd->update == NULL) {
-        ssd->update = qemu_spice_create_update(ssd);
+    if (QTAILQ_EMPTY(&ssd->updates)) {
+        qemu_spice_create_update(ssd);
         ssd->notify++;
     }
     qemu_spice_cursor_refresh_unlocked(ssd);
@@ -442,9 +516,9 @@
     dprint(3, "%s:\n", __FUNCTION__);
 
     qemu_mutex_lock(&ssd->lock);
-    if (ssd->update != NULL) {
-        update = ssd->update;
-        ssd->update = NULL;
+    update = QTAILQ_FIRST(&ssd->updates);
+    if (update != NULL) {
+        QTAILQ_REMOVE(&ssd->updates, update, next);
         *ext = update->ext;
         ret = true;
     }
diff --git a/ui/spice-display.h b/ui/spice-display.h
index 512ab78..dea41c1 100644
--- a/ui/spice-display.h
+++ b/ui/spice-display.h
@@ -72,6 +72,7 @@
 
 struct SimpleSpiceDisplay {
     DisplayState *ds;
+    uint8_t *ds_mirror;
     void *buf;
     int bufsize;
     QXLWorker *worker;
@@ -92,7 +93,7 @@
      * to them must be protected by the lock.
      */
     QemuMutex lock;
-    SimpleSpiceUpdate *update;
+    QTAILQ_HEAD(, SimpleSpiceUpdate) updates;
     QEMUCursor *cursor;
     int mouse_x, mouse_y;
 };
@@ -102,6 +103,7 @@
     QXLImage image;
     QXLCommandExt ext;
     uint8_t *bitmap;
+    QTAILQ_ENTRY(SimpleSpiceUpdate) next;
 };
 
 int qemu_spice_rect_is_empty(const QXLRect* r);