Merge remote-tracking branch 'kwolf/for-anthony' into staging

# By Paolo Bonzini (7) and others
# Via Kevin Wolf
* kwolf/for-anthony: (22 commits)
  pc: add compatibility machine types for 1.4
  blockdev: enable discard by default
  qemu-nbd: add --discard option
  blockdev: add discard suboption to -drive
  block: implement BDRV_O_UNMAP
  block: complete all IOs before .bdrv_truncate
  coroutine: trim down nesting level in perf_nesting test
  coroutine: move pooling to common code
  qemu-iotests: Test qcow2 image creation options
  qemu-iotests: Add qemu-img compare test
  qemu-img: Add compare subcommand
  qemu-img: Add "Quiet mode" option
  block: Add synchronous wrapper for bdrv_co_is_allocated_above
  block: refuse negative iops and bps values
  block: use Error in do_check_io_limits()
  qcow2: support compressed clusters in BlockFragInfo
  qemu-img: add compressed clusters to BlockFragInfo
  qemu-img: fix missing space in qemu-img check output
  qcow2: record fragmentation statistics during check
  qcow2: introduce check_refcounts_l1/l2() flags
  ...
diff --git a/arch_init.c b/arch_init.c
index 8da868b..8daeafa 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -414,6 +414,7 @@
     if (end_time > start_time + 1000) {
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
+        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
         start_time = end_time;
         num_dirty_pages_period = 0;
     }
diff --git a/configure b/configure
index 0eb25dd..dcaa67c 100755
--- a/configure
+++ b/configure
@@ -284,7 +284,7 @@
 # default flags for all hosts
 QEMU_CFLAGS="-fno-strict-aliasing $QEMU_CFLAGS"
 QEMU_CFLAGS="-Wall -Wundef -Wwrite-strings -Wmissing-prototypes $QEMU_CFLAGS"
-QEMU_CFLAGS="-Wredundant-decls $QEMU_CFLAGS"
+QEMU_CFLAGS="-Wstrict-prototypes -Wredundant-decls $QEMU_CFLAGS"
 QEMU_CFLAGS="-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE $QEMU_CFLAGS"
 QEMU_INCLUDES="-I. -I\$(SRC_PATH) -I\$(SRC_PATH)/include"
 if test "$debug_info" = "yes"; then
@@ -1645,7 +1645,7 @@
 
 if test "$gtk" != "no"; then
     if $pkg_config --exists 'gtk+-2.0 >= 2.18.0' && \
-       $pkg_config --exists 'vte >= 0.26.0'; then
+       $pkg_config --exists 'vte >= 0.24.0'; then
 	gtk_cflags=`$pkg_config --cflags gtk+-2.0 2>/dev/null`
 	gtk_libs=`$pkg_config --libs gtk+-2.0 2>/dev/null`
 	vte_cflags=`$pkg_config --cflags vte 2>/dev/null`
@@ -3115,20 +3115,27 @@
 fi
 
 ########################################
-# check whether we can disable the -Wunused-but-set-variable
-# option with a pragma (this is needed to silence a warning in
-# some versions of the valgrind VALGRIND_STACK_DEREGISTER macro.)
-# This test has to be compiled with -Werror as otherwise an
-# unknown pragma is only a warning.
+# check whether we can disable warning options with a pragma (this is needed
+# to silence warnings in the headers of some versions of external libraries).
+# This test has to be compiled with -Werror as otherwise an unknown pragma is
+# only a warning.
+#
+# If we can't selectively disable warnings in the code, disable -Werror so that
+# the build doesn't fail anyway.
+
 pragma_disable_unused_but_set=no
 cat > $TMPC << EOF
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+
 int main(void) {
     return 0;
 }
 EOF
 if compile_prog "-Werror" "" ; then
     pragma_diagnostic_available=yes
+else
+    werror=no
 fi
 
 ########################################
diff --git a/coroutine-sigaltstack.c b/coroutine-sigaltstack.c
index b4d1762..3de0bb3 100644
--- a/coroutine-sigaltstack.c
+++ b/coroutine-sigaltstack.c
@@ -36,7 +36,7 @@
 typedef struct {
     Coroutine base;
     void *stack;
-    jmp_buf env;
+    sigjmp_buf env;
 } CoroutineUContext;
 
 /**
@@ -50,7 +50,7 @@
     CoroutineUContext leader;
 
     /** Information for the signal handler (trampoline) */
-    jmp_buf tr_reenter;
+    sigjmp_buf tr_reenter;
     volatile sig_atomic_t tr_called;
     void *tr_handler;
 } CoroutineThreadState;
@@ -95,8 +95,8 @@
 static void coroutine_bootstrap(CoroutineUContext *self, Coroutine *co)
 {
     /* Initialize longjmp environment and switch back the caller */
-    if (!setjmp(self->env)) {
-        longjmp(*(jmp_buf *)co->entry_arg, 1);
+    if (!sigsetjmp(self->env, 0)) {
+        siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
     }
 
     while (true) {
@@ -125,14 +125,14 @@
     /*
      * Here we have to do a bit of a ping pong between the caller, given that
      * this is a signal handler and we have to do a return "soon". Then the
-     * caller can reestablish everything and do a longjmp here again.
+     * caller can reestablish everything and do a siglongjmp here again.
      */
-    if (!setjmp(coTS->tr_reenter)) {
+    if (!sigsetjmp(coTS->tr_reenter, 0)) {
         return;
     }
 
     /*
-     * Ok, the caller has longjmp'ed back to us, so now prepare
+     * Ok, the caller has siglongjmp'ed back to us, so now prepare
      * us for the real machine state switching. We have to jump
      * into another function here to get a new stack context for
      * the auto variables (which have to be auto-variables
@@ -159,7 +159,7 @@
 
     /* The way to manipulate stack is with the sigaltstack function. We
      * prepare a stack, with it delivering a signal to ourselves and then
-     * put setjmp/longjmp where needed.
+     * put sigsetjmp/siglongjmp where needed.
      * This has been done keeping coroutine-ucontext as a model and with the
      * pth ideas (GNU Portable Threads). See coroutine-ucontext for the basics
      * of the coroutines and see pth_mctx.c (from the pth project) for the
@@ -200,7 +200,7 @@
 
     /*
      * Now transfer control onto the signal stack and set it up.
-     * It will return immediately via "return" after the setjmp()
+     * It will return immediately via "return" after the sigsetjmp()
      * was performed. Be careful here with race conditions.  The
      * signal can be delivered the first time sigsuspend() is
      * called.
@@ -241,8 +241,8 @@
      * type-conversion warnings related to the `volatile' qualifier and
      * the fact that `jmp_buf' usually is an array type.
      */
-    if (!setjmp(old_env)) {
-        longjmp(coTS->tr_reenter, 1);
+    if (!sigsetjmp(old_env, 0)) {
+        siglongjmp(coTS->tr_reenter, 1);
     }
 
     /*
@@ -270,9 +270,9 @@
 
     s->current = to_;
 
-    ret = setjmp(from->env);
+    ret = sigsetjmp(from->env, 0);
     if (ret == 0) {
-        longjmp(to->env, action);
+        siglongjmp(to->env, action);
     }
     return ret;
 }
diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c
index 6f8ffa8..867a662 100644
--- a/coroutine-ucontext.c
+++ b/coroutine-ucontext.c
@@ -37,7 +37,7 @@
 typedef struct {
     Coroutine base;
     void *stack;
-    jmp_buf env;
+    sigjmp_buf env;
 
 #ifdef CONFIG_VALGRIND_H
     unsigned int valgrind_stack_id;
@@ -110,8 +110,8 @@
     co = &self->base;
 
     /* Initialize longjmp environment and switch back the caller */
-    if (!setjmp(self->env)) {
-        longjmp(*(jmp_buf *)co->entry_arg, 1);
+    if (!sigsetjmp(self->env, 0)) {
+        siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
     }
 
     while (true) {
@@ -125,14 +125,15 @@
     const size_t stack_size = 1 << 20;
     CoroutineUContext *co;
     ucontext_t old_uc, uc;
-    jmp_buf old_env;
+    sigjmp_buf old_env;
     union cc_arg arg = {0};
 
-    /* The ucontext functions preserve signal masks which incurs a system call
-     * overhead.  setjmp()/longjmp() does not preserve signal masks but only
-     * works on the current stack.  Since we need a way to create and switch to
-     * a new stack, use the ucontext functions for that but setjmp()/longjmp()
-     * for everything else.
+    /* The ucontext functions preserve signal masks which incurs a
+     * system call overhead.  sigsetjmp(buf, 0)/siglongjmp() does not
+     * preserve signal masks but only works on the current stack.
+     * Since we need a way to create and switch to a new stack, use
+     * the ucontext functions for that but sigsetjmp()/siglongjmp() for
+     * everything else.
      */
 
     if (getcontext(&uc) == -1) {
@@ -158,8 +159,8 @@
     makecontext(&uc, (void (*)(void))coroutine_trampoline,
                 2, arg.i[0], arg.i[1]);
 
-    /* swapcontext() in, longjmp() back out */
-    if (!setjmp(old_env)) {
+    /* swapcontext() in, siglongjmp() back out */
+    if (!sigsetjmp(old_env, 0)) {
         swapcontext(&old_uc, &uc);
     }
     return &co->base;
@@ -201,9 +202,9 @@
 
     s->current = to_;
 
-    ret = setjmp(from->env);
+    ret = sigsetjmp(from->env, 0);
     if (ret == 0) {
-        longjmp(to->env, action);
+        siglongjmp(to->env, action);
     }
     return ret;
 }
diff --git a/cpu-exec.c b/cpu-exec.c
index 9fcfe9e..afbe497 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -35,7 +35,7 @@
     CPUState *cpu = ENV_GET_CPU(env);
 
     cpu->current_tb = NULL;
-    longjmp(env->jmp_env, 1);
+    siglongjmp(env->jmp_env, 1);
 }
 
 /* exit the current TB from a signal handler. The host registers are
@@ -47,7 +47,7 @@
     /* XXX: restore cpu registers saved in host registers */
 
     env->exception_index = -1;
-    longjmp(env->jmp_env, 1);
+    siglongjmp(env->jmp_env, 1);
 }
 #endif
 
@@ -234,7 +234,7 @@
 
     /* prepare setjmp context for exception handling */
     for(;;) {
-        if (setjmp(env->jmp_env) == 0) {
+        if (sigsetjmp(env->jmp_env, 0) == 0) {
             /* if an exception is pending, we execute it here */
             if (env->exception_index >= 0) {
                 if (env->exception_index >= EXCP_INTERRUPT) {
diff --git a/disas/i386.c b/disas/i386.c
index 3b006b1..73cc06f 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -226,7 +226,7 @@
   bfd_byte the_buffer[MAX_MNEM_SIZE];
   bfd_vma insn_start;
   int orig_sizeflag;
-  jmp_buf bailout;
+  sigjmp_buf bailout;
 };
 
 enum address_mode
@@ -303,7 +303,7 @@
 	 STATUS.  */
       if (priv->max_fetched == priv->the_buffer)
 	(*info->memory_error_func) (status, start, info);
-      longjmp (priv->bailout, 1);
+      siglongjmp(priv->bailout, 1);
     }
   else
     priv->max_fetched = addr;
@@ -3661,7 +3661,7 @@
   start_codep = priv.the_buffer;
   codep = priv.the_buffer;
 
-  if (setjmp (priv.bailout) != 0)
+  if (sigsetjmp(priv.bailout, 0) != 0)
     {
       const char *name;
 
@@ -4720,7 +4720,8 @@
 	  buf[0] = '0';
 	  buf[1] = 'x';
           snprintf_vma (tmp, sizeof(tmp), disp);
-	  for (i = 0; tmp[i] == '0' && tmp[i + 1]; i++);
+          for (i = 0; tmp[i] == '0' && tmp[i + 1]; i++) {
+          }
           pstrcpy (buf + 2, bufsize - 2, tmp + i);
 	}
       else
diff --git a/disas/m68k.c b/disas/m68k.c
index c950241..cc0db96 100644
--- a/disas/m68k.c
+++ b/disas/m68k.c
@@ -624,7 +624,7 @@
   bfd_byte *max_fetched;
   bfd_byte the_buffer[MAXLEN];
   bfd_vma insn_start;
-  jmp_buf bailout;
+  sigjmp_buf bailout;
 };
 
 /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive)
@@ -644,7 +644,7 @@
   if (status != 0)
     {
       (*info->memory_error_func) (status, start, info);
-      longjmp (priv->bailout, 1);
+      siglongjmp(priv->bailout, 1);
     }
   else
     priv->max_fetched = addr;
@@ -1912,9 +1912,10 @@
   priv.max_fetched = priv.the_buffer;
   priv.insn_start = memaddr;
 
-  if (setjmp (priv.bailout) != 0)
-    /* Error return.  */
-    return -1;
+  if (sigsetjmp(priv.bailout, 0) != 0) {
+      /* Error return.  */
+      return -1;
+  }
 
   switch (info->mach)
     {
diff --git a/hw/grlib_apbuart.c b/hw/grlib_apbuart.c
index 760bed0..ba1685a 100644
--- a/hw/grlib_apbuart.c
+++ b/hw/grlib_apbuart.c
@@ -75,7 +75,6 @@
     CharDriverState *chr;
 
     /* registers */
-    uint32_t receive;
     uint32_t status;
     uint32_t control;
 
@@ -136,12 +135,14 @@
 {
     UART *uart = opaque;
 
-    uart_add_to_fifo(uart, buf, size);
+    if (uart->control & UART_RECEIVE_ENABLE) {
+        uart_add_to_fifo(uart, buf, size);
 
-    uart->status |= UART_DATA_READY;
+        uart->status |= UART_DATA_READY;
 
-    if (uart->control & UART_RECEIVE_INTERRUPT) {
-        qemu_irq_pulse(uart->irq);
+        if (uart->control & UART_RECEIVE_INTERRUPT) {
+            qemu_irq_pulse(uart->irq);
+        }
     }
 }
 
@@ -193,8 +194,15 @@
     switch (addr) {
     case DATA_OFFSET:
     case DATA_OFFSET + 3:       /* When only one byte write */
-        c = value & 0xFF;
-        qemu_chr_fe_write(uart->chr, &c, 1);
+        /* Transmit when character device available and transmitter enabled */
+        if ((uart->chr) && (uart->control & UART_TRANSMIT_ENABLE)) {
+            c = value & 0xFF;
+            qemu_chr_fe_write(uart->chr, &c, 1);
+            /* Generate interrupt */
+            if (uart->control & UART_TRANSMIT_INTERRUPT) {
+                qemu_irq_pulse(uart->irq);
+            }
+        }
         return;
 
     case STATUS_OFFSET:
@@ -242,30 +250,44 @@
     return 0;
 }
 
-static Property grlib_gptimer_properties[] = {
+static void grlib_apbuart_reset(DeviceState *d)
+{
+    UART *uart = container_of(d, UART, busdev.qdev);
+
+    /* Transmitter FIFO and shift registers are always empty in QEMU */
+    uart->status =  UART_TRANSMIT_FIFO_EMPTY | UART_TRANSMIT_SHIFT_EMPTY;
+    /* Everything is off */
+    uart->control = 0;
+    /* Flush receive FIFO */
+    uart->len = 0;
+    uart->current = 0;
+}
+
+static Property grlib_apbuart_properties[] = {
     DEFINE_PROP_CHR("chrdev", UART, chr),
     DEFINE_PROP_END_OF_LIST(),
 };
 
-static void grlib_gptimer_class_init(ObjectClass *klass, void *data)
+static void grlib_apbuart_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
     k->init = grlib_apbuart_init;
-    dc->props = grlib_gptimer_properties;
+    dc->reset = grlib_apbuart_reset;
+    dc->props = grlib_apbuart_properties;
 }
 
-static const TypeInfo grlib_gptimer_info = {
+static const TypeInfo grlib_apbuart_info = {
     .name          = "grlib,apbuart",
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(UART),
-    .class_init    = grlib_gptimer_class_init,
+    .class_init    = grlib_apbuart_class_init,
 };
 
-static void grlib_gptimer_register_types(void)
+static void grlib_apbuart_register_types(void)
 {
-    type_register_static(&grlib_gptimer_info);
+    type_register_static(&grlib_apbuart_info);
 }
 
-type_init(grlib_gptimer_register_types)
+type_init(grlib_apbuart_register_types)
diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index 5796102..07afdee 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -1152,8 +1152,8 @@
 
     if (sctx->sct == -1) {
         xhci_dma_read_u32s(epctx->xhci, sctx->pctx, ctx, sizeof(ctx));
-        fprintf(stderr, "%s: init sctx #%d @ %lx: %08x %08x\n", __func__,
-                streamid, sctx->pctx, ctx[0], ctx[1]);
+        fprintf(stderr, "%s: init sctx #%d @ " DMA_ADDR_FMT ": %08x %08x\n",
+                __func__, streamid, sctx->pctx, ctx[0], ctx[1]);
         sct = (ctx[0] >> 1) & 0x07;
         if (epctx->lsa && sct != 1) {
             *cc_error = CC_INVALID_STREAM_TYPE_ERROR;
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index ae64590..3dc9656 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -184,7 +184,7 @@
     struct GDBRegisterState *gdb_regs;                                  \
                                                                         \
     /* Core interrupt code */                                           \
-    jmp_buf jmp_env;                                                    \
+    sigjmp_buf jmp_env;                                                 \
     int exception_index;                                                \
                                                                         \
     CPUArchState *next_cpu; /* next CPU sharing TB cache */                 \
diff --git a/include/migration/migration.h b/include/migration/migration.h
index a8c9639..d121409 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -51,6 +51,7 @@
     int64_t downtime;
     int64_t expected_downtime;
     int64_t dirty_pages_rate;
+    int64_t dirty_bytes_rate;
     bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
     int64_t xbzrle_cache_size;
     bool complete;
diff --git a/include/qemu/log.h b/include/qemu/log.h
index 5a46555..4527003 100644
--- a/include/qemu/log.h
+++ b/include/qemu/log.h
@@ -126,14 +126,6 @@
     qemu_logfile = f;
 }
 
-/* Set up a new log file, only if none is set */
-static inline void qemu_log_try_set_file(FILE *f)
-{
-    if (!qemu_logfile) {
-        qemu_logfile = f;
-    }
-}
-
 /* define log items */
 typedef struct QEMULogItem {
     int mask;
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index bf9edeb..71f5fa0 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -63,6 +63,14 @@
 # undef setjmp
 # define setjmp(env) _setjmp(env, NULL)
 #endif
+/* QEMU uses sigsetjmp()/siglongjmp() as the portable way to specify
+ * "longjmp and don't touch the signal masks". Since we know that the
+ * savemask parameter will always be zero we can safely define these
+ * in terms of setjmp/longjmp on Win32.
+ */
+#define sigjmp_buf jmp_buf
+#define sigsetjmp(env, savemask) setjmp(env)
+#define siglongjmp(env, val) longjmp(env, val)
 
 /* Declaration of ffs() is missing in MinGW's strings.h. */
 int ffs(int i);
diff --git a/migration.c b/migration.c
index b1ebb01..11725ae 100644
--- a/migration.c
+++ b/migration.c
@@ -658,6 +658,7 @@
 {
     MigrationState *s = opaque;
     int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t sleep_time = 0;
     int64_t max_size = 0;
     bool last_round = false;
     int ret;
@@ -673,7 +674,7 @@
     qemu_mutex_unlock_iothread();
 
     while (true) {
-        int64_t current_time = qemu_get_clock_ms(rt_clock);
+        int64_t current_time;
         uint64_t pending_size;
 
         qemu_mutex_lock_iothread();
@@ -727,22 +728,30 @@
             }
         }
         qemu_mutex_unlock_iothread();
+        current_time = qemu_get_clock_ms(rt_clock);
         if (current_time >= initial_time + BUFFER_DELAY) {
             uint64_t transferred_bytes = s->bytes_xfer;
-            uint64_t time_spent = current_time - initial_time;
+            uint64_t time_spent = current_time - initial_time - sleep_time;
             double bandwidth = transferred_bytes / time_spent;
             max_size = bandwidth * migrate_max_downtime() / 1000000;
 
             DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
                     " bandwidth %g max_size %" PRId64 "\n",
                     transferred_bytes, time_spent, bandwidth, max_size);
+            /* If we haven't sent anything, we don't want to recalculate;
+               10000 bytes is a small enough threshold for our purposes. */
+            if (s->dirty_bytes_rate && transferred_bytes > 10000) {
+                s->expected_downtime = s->dirty_bytes_rate / bandwidth;
+            }
 
             s->bytes_xfer = 0;
+            sleep_time = 0;
             initial_time = current_time;
         }
         if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
             /* usleep expects microseconds */
             g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
+            sleep_time += qemu_get_clock_ms(rt_clock) - current_time;
         }
         ret = buffered_flush(s);
         if (ret < 0) {
@@ -774,6 +783,8 @@
     s->buffer = NULL;
     s->buffer_size = 0;
     s->buffer_capacity = 0;
+    /* This is a best first approximation; convert ns to ms. */
+    s->expected_downtime = max_downtime/1000000;
 
     s->xfer_limit = s->bandwidth_limit / XFER_LIMIT_RATIO;
     s->complete = false;
diff --git a/monitor.c b/monitor.c
index 6a0f257..32a6e74 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2740,7 +2740,7 @@
 /*******************************************************************/
 
 static const char *pch;
-static jmp_buf expr_env;
+static sigjmp_buf expr_env;
 
 #define MD_TLONG 0
 #define MD_I32   1
@@ -3135,7 +3135,7 @@
 static void expr_error(Monitor *mon, const char *msg)
 {
     monitor_printf(mon, "%s\n", msg);
-    longjmp(expr_env, 1);
+    siglongjmp(expr_env, 1);
 }
 
 /* return 0 if OK, -1 if not found */
@@ -3345,7 +3345,7 @@
 static int get_expr(Monitor *mon, int64_t *pval, const char **pp)
 {
     pch = *pp;
-    if (setjmp(expr_env)) {
+    if (sigsetjmp(expr_env, 0)) {
         *pp = pch;
         return -1;
     }
diff --git a/po/de_DE.po b/po/de_DE.po
index cb74d7c..8755783 100644
--- a/po/de_DE.po
+++ b/po/de_DE.po
@@ -24,10 +24,6 @@
 msgid "_View"
 msgstr "_Ansicht"
 
-#: ../ui/gtk.c:1002
-msgid "_Full Screen"
-msgstr "Voll_bild"
-
 #: ../ui/gtk.c:1029
 msgid "Zoom To _Fit"
 msgstr "Auf _Fenstergröße skalieren"
diff --git a/po/it.po b/po/it.po
index 2b23491..7d77fff 100644
--- a/po/it.po
+++ b/po/it.po
@@ -24,10 +24,6 @@
 msgid "_View"
 msgstr "_Visualizza"
 
-#: ../ui/gtk.c:1002
-msgid "_Full Screen"
-msgstr "_Schermo intero"
-
 #: ../ui/gtk.c:1029
 msgid "Zoom To _Fit"
 msgstr "Adatta alla _finestra"
diff --git a/po/messages.po b/po/messages.po
index a90cd6f..191e81c 100644
--- a/po/messages.po
+++ b/po/messages.po
@@ -24,10 +24,6 @@
 msgid "_View"
 msgstr ""
 
-#: ../ui/gtk.c:1002
-msgid "_Full Screen"
-msgstr ""
-
 #: ../ui/gtk.c:1029
 msgid "Zoom To _Fit"
 msgstr ""
diff --git a/target-alpha/helper.h b/target-alpha/helper.h
index eac3041..3321fde 100644
--- a/target-alpha/helper.h
+++ b/target-alpha/helper.h
@@ -9,7 +9,6 @@
 DEF_HELPER_FLAGS_3(sublv, TCG_CALL_NO_WG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(mullv, TCG_CALL_NO_WG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(mulqv, TCG_CALL_NO_WG, i64, env, i64, i64)
-DEF_HELPER_FLAGS_2(umulh, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 
 DEF_HELPER_FLAGS_1(ctpop, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(ctlz, TCG_CALL_NO_RWG_SE, i64, i64)
diff --git a/target-alpha/int_helper.c b/target-alpha/int_helper.c
index c9b42b6..51ccd41 100644
--- a/target-alpha/int_helper.c
+++ b/target-alpha/int_helper.c
@@ -22,13 +22,6 @@
 #include "qemu/host-utils.h"
 
 
-uint64_t helper_umulh(uint64_t op1, uint64_t op2)
-{
-    uint64_t tl, th;
-    mulu64(&tl, &th, op1, op2);
-    return th;
-}
-
 uint64_t helper_ctpop(uint64_t arg)
 {
     return ctpop64(arg);
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index f687b95..f8f7695 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -1390,7 +1390,6 @@
         tcg_temp_free(tmp1);                                          \
     }                                                                 \
 }
-ARITH3(umulh)
 ARITH3(cmpbge)
 ARITH3(minub8)
 ARITH3(minsb8)
@@ -2426,7 +2425,24 @@
             break;
         case 0x30:
             /* UMULH */
-            gen_umulh(ra, rb, rc, islit, lit);
+            {
+                TCGv low;
+                if (unlikely(rc == 31)){
+                    break;
+                }
+                if (ra == 31) {
+                    tcg_gen_movi_i64(cpu_ir[rc], 0);
+                    break;
+                }
+                low = tcg_temp_new();
+                if (islit) {
+                    tcg_gen_movi_tl(low, lit);
+                    tcg_gen_mulu2_i64(low, cpu_ir[rc], cpu_ir[ra], low);
+                } else {
+                    tcg_gen_mulu2_i64(low, cpu_ir[rc], cpu_ir[ra], cpu_ir[rb]);
+                }
+                tcg_temp_free(low);
+            }
             break;
         case 0x40:
             /* MULL/V */
diff --git a/target-arm/helper.c b/target-arm/helper.c
index e63da57..e97e1a5 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -2893,11 +2893,6 @@
     return (a & mask) | (b & ~mask);
 }
 
-uint32_t HELPER(logicq_cc)(uint64_t val)
-{
-    return (val >> 32) | (val != 0);
-}
-
 /* VFP support.  We follow the convention used for VFP instructions:
    Single precision routines have a "s" suffix, double precision a
    "d" suffix.  */
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 8544f82..63ae13a 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -46,8 +46,6 @@
 
 DEF_HELPER_FLAGS_2(usad8, TCG_CALL_NO_RWG_SE, i32, i32, i32)
 
-DEF_HELPER_1(logicq_cc, i32, i64)
-
 DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE,
                    i32, i32, i32, i32)
 DEF_HELPER_2(exception, void, env, i32)
@@ -142,9 +140,6 @@
 DEF_HELPER_2(rsqrte_u32, i32, i32, env)
 DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 
-DEF_HELPER_3(adc_cc, i32, env, i32, i32)
-DEF_HELPER_3(sbc_cc, i32, env, i32, i32)
-
 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
 DEF_HELPER_3(shr_cc, i32, env, i32, i32)
 DEF_HELPER_3(sar_cc, i32, env, i32, i32)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 99610d7..a522313 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -315,36 +315,6 @@
    The only way to do that in TCG is a conditional branch, which clobbers
    all our temporaries.  For now implement these as helper functions.  */
 
-uint32_t HELPER(adc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
-{
-    uint32_t result;
-    if (!env->CF) {
-        result = a + b;
-        env->CF = result < a;
-    } else {
-        result = a + b + 1;
-        env->CF = result <= a;
-    }
-    env->VF = (a ^ b ^ -1) & (a ^ result);
-    env->NF = env->ZF = result;
-    return result;
-}
-
-uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
-{
-    uint32_t result;
-    if (!env->CF) {
-        result = a - b - 1;
-        env->CF = a > b;
-    } else {
-        result = a - b;
-        env->CF = a >= b;
-    }
-    env->VF = (a ^ b) & (a ^ result);
-    env->NF = env->ZF = result;
-    return result;
-}
-
 /* Similarly for variable shift instructions.  */
 
 uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index a8893f7..f2f649d 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -305,35 +305,41 @@
     return a;
 }
 
-/* FIXME: Most targets have native widening multiplication.
-   It would be good to use that instead of a full wide multiply.  */
 /* 32x32->64 multiply.  Marks inputs as dead.  */
 static TCGv_i64 gen_mulu_i64_i32(TCGv a, TCGv b)
 {
-    TCGv_i64 tmp1 = tcg_temp_new_i64();
-    TCGv_i64 tmp2 = tcg_temp_new_i64();
+    TCGv lo = tcg_temp_new_i32();
+    TCGv hi = tcg_temp_new_i32();
+    TCGv_i64 ret;
 
-    tcg_gen_extu_i32_i64(tmp1, a);
+    tcg_gen_mulu2_i32(lo, hi, a, b);
     tcg_temp_free_i32(a);
-    tcg_gen_extu_i32_i64(tmp2, b);
     tcg_temp_free_i32(b);
-    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-    tcg_temp_free_i64(tmp2);
-    return tmp1;
+
+    ret = tcg_temp_new_i64();
+    tcg_gen_concat_i32_i64(ret, lo, hi);
+    tcg_temp_free(lo);
+    tcg_temp_free(hi);
+
+    return ret;
 }
 
 static TCGv_i64 gen_muls_i64_i32(TCGv a, TCGv b)
 {
-    TCGv_i64 tmp1 = tcg_temp_new_i64();
-    TCGv_i64 tmp2 = tcg_temp_new_i64();
+    TCGv lo = tcg_temp_new_i32();
+    TCGv hi = tcg_temp_new_i32();
+    TCGv_i64 ret;
 
-    tcg_gen_ext_i32_i64(tmp1, a);
+    tcg_gen_muls2_i32(lo, hi, a, b);
     tcg_temp_free_i32(a);
-    tcg_gen_ext_i32_i64(tmp2, b);
     tcg_temp_free_i32(b);
-    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-    tcg_temp_free_i64(tmp2);
-    return tmp1;
+
+    ret = tcg_temp_new_i64();
+    tcg_gen_concat_i32_i64(ret, lo, hi);
+    tcg_temp_free(lo);
+    tcg_temp_free(hi);
+
+    return ret;
 }
 
 /* Swap low and high halfwords.  */
@@ -404,12 +410,39 @@
 /* dest = T0 + T1. Compute C, N, V and Z flags */
 static void gen_add_CC(TCGv dest, TCGv t0, TCGv t1)
 {
-    TCGv tmp;
-    tcg_gen_add_i32(cpu_NF, t0, t1);
+    TCGv tmp = tcg_temp_new_i32();
+    tcg_gen_movi_i32(tmp, 0);
+    tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, t1, tmp);
     tcg_gen_mov_i32(cpu_ZF, cpu_NF);
-    tcg_gen_setcond_i32(TCG_COND_LTU, cpu_CF, cpu_NF, t0);
     tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
-    tmp = tcg_temp_new_i32();
+    tcg_gen_xor_i32(tmp, t0, t1);
+    tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+    tcg_temp_free_i32(tmp);
+    tcg_gen_mov_i32(dest, cpu_NF);
+}
+
+/* dest = T0 + T1 + CF.  Compute C, N, V and Z flags */
+static void gen_adc_CC(TCGv dest, TCGv t0, TCGv t1)
+{
+    TCGv tmp = tcg_temp_new_i32();
+    if (TCG_TARGET_HAS_add2_i32) {
+        tcg_gen_movi_i32(tmp, 0);
+        tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp);
+        tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp);
+    } else {
+        TCGv_i64 q0 = tcg_temp_new_i64();
+        TCGv_i64 q1 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(q0, t0);
+        tcg_gen_extu_i32_i64(q1, t1);
+        tcg_gen_add_i64(q0, q0, q1);
+        tcg_gen_extu_i32_i64(q1, cpu_CF);
+        tcg_gen_add_i64(q0, q0, q1);
+        tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0);
+        tcg_temp_free_i64(q0);
+        tcg_temp_free_i64(q1);
+    }
+    tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+    tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
     tcg_gen_xor_i32(tmp, t0, t1);
     tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
     tcg_temp_free_i32(tmp);
@@ -431,6 +464,15 @@
     tcg_gen_mov_i32(dest, cpu_NF);
 }
 
+/* dest = T0 + ~T1 + CF.  Compute C, N, V and Z flags */
+static void gen_sbc_CC(TCGv dest, TCGv t0, TCGv t1)
+{
+    TCGv tmp = tcg_temp_new_i32();
+    tcg_gen_not_i32(tmp, t1);
+    gen_adc_CC(dest, t0, tmp);
+    tcg_temp_free(tmp);
+}
+
 #define GEN_SHIFT(name)                                               \
 static void gen_##name(TCGv dest, TCGv t0, TCGv t1)                   \
 {                                                                     \
@@ -6427,13 +6469,11 @@
     tcg_temp_free_i64(tmp);
 }
 
-/* Set N and Z flags from a 64-bit value.  */
-static void gen_logicq_cc(TCGv_i64 val)
+/* Set N and Z flags from hi|lo.  */
+static void gen_logicq_cc(TCGv lo, TCGv hi)
 {
-    TCGv tmp = tcg_temp_new_i32();
-    gen_helper_logicq_cc(tmp, val);
-    gen_logic_CC(tmp);
-    tcg_temp_free_i32(tmp);
+    tcg_gen_mov_i32(cpu_NF, hi);
+    tcg_gen_or_i32(cpu_ZF, lo, hi);
 }
 
 /* Load/Store exclusive instructions are implemented by remembering
@@ -7070,7 +7110,7 @@
             break;
         case 0x05:
             if (set_cc) {
-                gen_helper_adc_cc(tmp, cpu_env, tmp, tmp2);
+                gen_adc_CC(tmp, tmp, tmp2);
             } else {
                 gen_add_carry(tmp, tmp, tmp2);
             }
@@ -7078,7 +7118,7 @@
             break;
         case 0x06:
             if (set_cc) {
-                gen_helper_sbc_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sbc_CC(tmp, tmp, tmp2);
             } else {
                 gen_sub_carry(tmp, tmp, tmp2);
             }
@@ -7086,7 +7126,7 @@
             break;
         case 0x07:
             if (set_cc) {
-                gen_helper_sbc_cc(tmp, cpu_env, tmp2, tmp);
+                gen_sbc_CC(tmp, tmp2, tmp);
             } else {
                 gen_sub_carry(tmp, tmp2, tmp);
             }
@@ -7213,18 +7253,22 @@
                         tmp = load_reg(s, rs);
                         tmp2 = load_reg(s, rm);
                         if (insn & (1 << 22)) {
-                            tmp64 = gen_muls_i64_i32(tmp, tmp2);
+                            tcg_gen_muls2_i32(tmp, tmp2, tmp, tmp2);
                         } else {
-                            tmp64 = gen_mulu_i64_i32(tmp, tmp2);
+                            tcg_gen_mulu2_i32(tmp, tmp2, tmp, tmp2);
                         }
                         if (insn & (1 << 21)) { /* mult accumulate */
-                            gen_addq(s, tmp64, rn, rd);
+                            TCGv al = load_reg(s, rn);
+                            TCGv ah = load_reg(s, rd);
+                            tcg_gen_add2_i32(tmp, tmp2, tmp, tmp2, al, ah);
+                            tcg_temp_free(al);
+                            tcg_temp_free(ah);
                         }
                         if (insn & (1 << 20)) {
-                            gen_logicq_cc(tmp64);
+                            gen_logicq_cc(tmp, tmp2);
                         }
-                        gen_storeq_reg(s, rn, rd, tmp64);
-                        tcg_temp_free_i64(tmp64);
+                        store_reg(s, rn, tmp);
+                        store_reg(s, rd, tmp2);
                         break;
                     default:
                         goto illegal_op;
@@ -7907,15 +7951,16 @@
         break;
     case 10: /* adc */
         if (conds)
-            gen_helper_adc_cc(t0, cpu_env, t0, t1);
+            gen_adc_CC(t0, t0, t1);
         else
             gen_adc(t0, t1);
         break;
     case 11: /* sbc */
-        if (conds)
-            gen_helper_sbc_cc(t0, cpu_env, t0, t1);
-        else
+        if (conds) {
+            gen_sbc_CC(t0, t0, t1);
+        } else {
             gen_sub_carry(t0, t0, t1);
+        }
         break;
     case 13: /* sub */
         if (conds)
@@ -9225,16 +9270,18 @@
             }
             break;
         case 0x5: /* adc */
-            if (s->condexec_mask)
+            if (s->condexec_mask) {
                 gen_adc(tmp, tmp2);
-            else
-                gen_helper_adc_cc(tmp, cpu_env, tmp, tmp2);
+            } else {
+                gen_adc_CC(tmp, tmp, tmp2);
+            }
             break;
         case 0x6: /* sbc */
-            if (s->condexec_mask)
+            if (s->condexec_mask) {
                 gen_sub_carry(tmp, tmp, tmp2);
-            else
-                gen_helper_sbc_cc(tmp, cpu_env, tmp, tmp2);
+            } else {
+                gen_sbc_CC(tmp, tmp, tmp2);
+            }
             break;
         case 0x7: /* ror */
             if (s->condexec_mask) {
diff --git a/target-cris/translate.c b/target-cris/translate.c
index 04a5379..14c167f 100644
--- a/target-cris/translate.c
+++ b/target-cris/translate.c
@@ -340,46 +340,6 @@
     tcg_temp_free(t_31);
 }
 
-/* 64-bit signed mul, lower result in d and upper in d2.  */
-static void t_gen_muls(TCGv d, TCGv d2, TCGv a, TCGv b)
-{
-    TCGv_i64 t0, t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-
-    tcg_gen_ext_i32_i64(t0, a);
-    tcg_gen_ext_i32_i64(t1, b);
-    tcg_gen_mul_i64(t0, t0, t1);
-
-    tcg_gen_trunc_i64_i32(d, t0);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_i32(d2, t0);
-
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
-}
-
-/* 64-bit unsigned muls, lower result in d and upper in d2.  */
-static void t_gen_mulu(TCGv d, TCGv d2, TCGv a, TCGv b)
-{
-    TCGv_i64 t0, t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-
-    tcg_gen_extu_i32_i64(t0, a);
-    tcg_gen_extu_i32_i64(t1, b);
-    tcg_gen_mul_i64(t0, t0, t1);
-
-    tcg_gen_trunc_i64_i32(d, t0);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_i32(d2, t0);
-
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
-}
-
 static void t_gen_cris_dstep(TCGv d, TCGv a, TCGv b)
 {
     int l1;
@@ -832,10 +792,10 @@
         gen_helper_lz(dst, b);
         break;
     case CC_OP_MULS:
-        t_gen_muls(dst, cpu_PR[PR_MOF], a, b);
+        tcg_gen_muls2_tl(dst, cpu_PR[PR_MOF], a, b);
         break;
     case CC_OP_MULU:
-        t_gen_mulu(dst, cpu_PR[PR_MOF], a, b);
+        tcg_gen_mulu2_tl(dst, cpu_PR[PR_MOF], a, b);
         break;
     case CC_OP_DSTEP:
         t_gen_cris_dstep(dst, a, b);
@@ -3215,8 +3175,6 @@
     int num_insns;
     int max_insns;
 
-    qemu_log_try_set_file(stderr);
-
     if (env->pregs[PR_VR] == 32) {
         dc->decoder = crisv32_decoder;
         dc->clear_locked_irq = 0;
diff --git a/target-i386/cc_helper.c b/target-i386/cc_helper.c
index 9422003..9daa1a0 100644
--- a/target-i386/cc_helper.c
+++ b/target-i386/cc_helper.c
@@ -75,243 +75,247 @@
 
 #endif
 
-static int compute_all_eflags(CPUX86State *env)
+static target_ulong compute_all_adcx(target_ulong dst, target_ulong src1,
+                                     target_ulong src2)
 {
-    return CC_SRC;
+    return (src1 & ~CC_C) | (dst * CC_C);
 }
 
-static int compute_c_eflags(CPUX86State *env)
+static target_ulong compute_all_adox(target_ulong dst, target_ulong src1,
+                                     target_ulong src2)
 {
-    return CC_SRC & CC_C;
+    return (src1 & ~CC_O) | (src2 * CC_O);
 }
 
-uint32_t helper_cc_compute_all(CPUX86State *env, int op)
+static target_ulong compute_all_adcox(target_ulong dst, target_ulong src1,
+                                      target_ulong src2)
+{
+    return (src1 & ~(CC_C | CC_O)) | (dst * CC_C) | (src2 * CC_O);
+}
+
+target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
+                                   target_ulong src2, int op)
 {
     switch (op) {
     default: /* should never happen */
         return 0;
 
     case CC_OP_EFLAGS:
-        return compute_all_eflags(env);
+        return src1;
+    case CC_OP_CLR:
+        return CC_Z;
 
     case CC_OP_MULB:
-        return compute_all_mulb(env);
+        return compute_all_mulb(dst, src1);
     case CC_OP_MULW:
-        return compute_all_mulw(env);
+        return compute_all_mulw(dst, src1);
     case CC_OP_MULL:
-        return compute_all_mull(env);
+        return compute_all_mull(dst, src1);
 
     case CC_OP_ADDB:
-        return compute_all_addb(env);
+        return compute_all_addb(dst, src1);
     case CC_OP_ADDW:
-        return compute_all_addw(env);
+        return compute_all_addw(dst, src1);
     case CC_OP_ADDL:
-        return compute_all_addl(env);
+        return compute_all_addl(dst, src1);
 
     case CC_OP_ADCB:
-        return compute_all_adcb(env);
+        return compute_all_adcb(dst, src1, src2);
     case CC_OP_ADCW:
-        return compute_all_adcw(env);
+        return compute_all_adcw(dst, src1, src2);
     case CC_OP_ADCL:
-        return compute_all_adcl(env);
+        return compute_all_adcl(dst, src1, src2);
 
     case CC_OP_SUBB:
-        return compute_all_subb(env);
+        return compute_all_subb(dst, src1);
     case CC_OP_SUBW:
-        return compute_all_subw(env);
+        return compute_all_subw(dst, src1);
     case CC_OP_SUBL:
-        return compute_all_subl(env);
+        return compute_all_subl(dst, src1);
 
     case CC_OP_SBBB:
-        return compute_all_sbbb(env);
+        return compute_all_sbbb(dst, src1, src2);
     case CC_OP_SBBW:
-        return compute_all_sbbw(env);
+        return compute_all_sbbw(dst, src1, src2);
     case CC_OP_SBBL:
-        return compute_all_sbbl(env);
+        return compute_all_sbbl(dst, src1, src2);
 
     case CC_OP_LOGICB:
-        return compute_all_logicb(env);
+        return compute_all_logicb(dst, src1);
     case CC_OP_LOGICW:
-        return compute_all_logicw(env);
+        return compute_all_logicw(dst, src1);
     case CC_OP_LOGICL:
-        return compute_all_logicl(env);
+        return compute_all_logicl(dst, src1);
 
     case CC_OP_INCB:
-        return compute_all_incb(env);
+        return compute_all_incb(dst, src1);
     case CC_OP_INCW:
-        return compute_all_incw(env);
+        return compute_all_incw(dst, src1);
     case CC_OP_INCL:
-        return compute_all_incl(env);
+        return compute_all_incl(dst, src1);
 
     case CC_OP_DECB:
-        return compute_all_decb(env);
+        return compute_all_decb(dst, src1);
     case CC_OP_DECW:
-        return compute_all_decw(env);
+        return compute_all_decw(dst, src1);
     case CC_OP_DECL:
-        return compute_all_decl(env);
+        return compute_all_decl(dst, src1);
 
     case CC_OP_SHLB:
-        return compute_all_shlb(env);
+        return compute_all_shlb(dst, src1);
     case CC_OP_SHLW:
-        return compute_all_shlw(env);
+        return compute_all_shlw(dst, src1);
     case CC_OP_SHLL:
-        return compute_all_shll(env);
+        return compute_all_shll(dst, src1);
 
     case CC_OP_SARB:
-        return compute_all_sarb(env);
+        return compute_all_sarb(dst, src1);
     case CC_OP_SARW:
-        return compute_all_sarw(env);
+        return compute_all_sarw(dst, src1);
     case CC_OP_SARL:
-        return compute_all_sarl(env);
+        return compute_all_sarl(dst, src1);
+
+    case CC_OP_BMILGB:
+        return compute_all_bmilgb(dst, src1);
+    case CC_OP_BMILGW:
+        return compute_all_bmilgw(dst, src1);
+    case CC_OP_BMILGL:
+        return compute_all_bmilgl(dst, src1);
+
+    case CC_OP_ADCX:
+        return compute_all_adcx(dst, src1, src2);
+    case CC_OP_ADOX:
+        return compute_all_adox(dst, src1, src2);
+    case CC_OP_ADCOX:
+        return compute_all_adcox(dst, src1, src2);
 
 #ifdef TARGET_X86_64
     case CC_OP_MULQ:
-        return compute_all_mulq(env);
-
+        return compute_all_mulq(dst, src1);
     case CC_OP_ADDQ:
-        return compute_all_addq(env);
-
+        return compute_all_addq(dst, src1);
     case CC_OP_ADCQ:
-        return compute_all_adcq(env);
-
+        return compute_all_adcq(dst, src1, src2);
     case CC_OP_SUBQ:
-        return compute_all_subq(env);
-
+        return compute_all_subq(dst, src1);
     case CC_OP_SBBQ:
-        return compute_all_sbbq(env);
-
+        return compute_all_sbbq(dst, src1, src2);
     case CC_OP_LOGICQ:
-        return compute_all_logicq(env);
-
+        return compute_all_logicq(dst, src1);
     case CC_OP_INCQ:
-        return compute_all_incq(env);
-
+        return compute_all_incq(dst, src1);
     case CC_OP_DECQ:
-        return compute_all_decq(env);
-
+        return compute_all_decq(dst, src1);
     case CC_OP_SHLQ:
-        return compute_all_shlq(env);
-
+        return compute_all_shlq(dst, src1);
     case CC_OP_SARQ:
-        return compute_all_sarq(env);
+        return compute_all_sarq(dst, src1);
+    case CC_OP_BMILGQ:
+        return compute_all_bmilgq(dst, src1);
 #endif
     }
 }
 
 uint32_t cpu_cc_compute_all(CPUX86State *env, int op)
 {
-    return helper_cc_compute_all(env, op);
+    return helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, op);
 }
 
-uint32_t helper_cc_compute_c(CPUX86State *env, int op)
+target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1,
+                                 target_ulong src2, int op)
 {
     switch (op) {
     default: /* should never happen */
+    case CC_OP_LOGICB:
+    case CC_OP_LOGICW:
+    case CC_OP_LOGICL:
+    case CC_OP_LOGICQ:
+    case CC_OP_CLR:
         return 0;
 
     case CC_OP_EFLAGS:
-        return compute_c_eflags(env);
-
-    case CC_OP_MULB:
-        return compute_c_mull(env);
-    case CC_OP_MULW:
-        return compute_c_mull(env);
-    case CC_OP_MULL:
-        return compute_c_mull(env);
-
-    case CC_OP_ADDB:
-        return compute_c_addb(env);
-    case CC_OP_ADDW:
-        return compute_c_addw(env);
-    case CC_OP_ADDL:
-        return compute_c_addl(env);
-
-    case CC_OP_ADCB:
-        return compute_c_adcb(env);
-    case CC_OP_ADCW:
-        return compute_c_adcw(env);
-    case CC_OP_ADCL:
-        return compute_c_adcl(env);
-
-    case CC_OP_SUBB:
-        return compute_c_subb(env);
-    case CC_OP_SUBW:
-        return compute_c_subw(env);
-    case CC_OP_SUBL:
-        return compute_c_subl(env);
-
-    case CC_OP_SBBB:
-        return compute_c_sbbb(env);
-    case CC_OP_SBBW:
-        return compute_c_sbbw(env);
-    case CC_OP_SBBL:
-        return compute_c_sbbl(env);
-
-    case CC_OP_LOGICB:
-        return compute_c_logicb();
-    case CC_OP_LOGICW:
-        return compute_c_logicw();
-    case CC_OP_LOGICL:
-        return compute_c_logicl();
+    case CC_OP_SARB:
+    case CC_OP_SARW:
+    case CC_OP_SARL:
+    case CC_OP_SARQ:
+    case CC_OP_ADOX:
+        return src1 & 1;
 
     case CC_OP_INCB:
-        return compute_c_incl(env);
     case CC_OP_INCW:
-        return compute_c_incl(env);
     case CC_OP_INCL:
-        return compute_c_incl(env);
-
+    case CC_OP_INCQ:
     case CC_OP_DECB:
-        return compute_c_incl(env);
     case CC_OP_DECW:
-        return compute_c_incl(env);
     case CC_OP_DECL:
-        return compute_c_incl(env);
+    case CC_OP_DECQ:
+        return src1;
+
+    case CC_OP_MULB:
+    case CC_OP_MULW:
+    case CC_OP_MULL:
+    case CC_OP_MULQ:
+        return src1 != 0;
+
+    case CC_OP_ADCX:
+    case CC_OP_ADCOX:
+        return dst;
+
+    case CC_OP_ADDB:
+        return compute_c_addb(dst, src1);
+    case CC_OP_ADDW:
+        return compute_c_addw(dst, src1);
+    case CC_OP_ADDL:
+        return compute_c_addl(dst, src1);
+
+    case CC_OP_ADCB:
+        return compute_c_adcb(dst, src1, src2);
+    case CC_OP_ADCW:
+        return compute_c_adcw(dst, src1, src2);
+    case CC_OP_ADCL:
+        return compute_c_adcl(dst, src1, src2);
+
+    case CC_OP_SUBB:
+        return compute_c_subb(dst, src1);
+    case CC_OP_SUBW:
+        return compute_c_subw(dst, src1);
+    case CC_OP_SUBL:
+        return compute_c_subl(dst, src1);
+
+    case CC_OP_SBBB:
+        return compute_c_sbbb(dst, src1, src2);
+    case CC_OP_SBBW:
+        return compute_c_sbbw(dst, src1, src2);
+    case CC_OP_SBBL:
+        return compute_c_sbbl(dst, src1, src2);
 
     case CC_OP_SHLB:
-        return compute_c_shlb(env);
+        return compute_c_shlb(dst, src1);
     case CC_OP_SHLW:
-        return compute_c_shlw(env);
+        return compute_c_shlw(dst, src1);
     case CC_OP_SHLL:
-        return compute_c_shll(env);
+        return compute_c_shll(dst, src1);
 
-    case CC_OP_SARB:
-        return compute_c_sarl(env);
-    case CC_OP_SARW:
-        return compute_c_sarl(env);
-    case CC_OP_SARL:
-        return compute_c_sarl(env);
+    case CC_OP_BMILGB:
+        return compute_c_bmilgb(dst, src1);
+    case CC_OP_BMILGW:
+        return compute_c_bmilgw(dst, src1);
+    case CC_OP_BMILGL:
+        return compute_c_bmilgl(dst, src1);
 
 #ifdef TARGET_X86_64
-    case CC_OP_MULQ:
-        return compute_c_mull(env);
-
     case CC_OP_ADDQ:
-        return compute_c_addq(env);
-
+        return compute_c_addq(dst, src1);
     case CC_OP_ADCQ:
-        return compute_c_adcq(env);
-
+        return compute_c_adcq(dst, src1, src2);
     case CC_OP_SUBQ:
-        return compute_c_subq(env);
-
+        return compute_c_subq(dst, src1);
     case CC_OP_SBBQ:
-        return compute_c_sbbq(env);
-
-    case CC_OP_LOGICQ:
-        return compute_c_logicq();
-
-    case CC_OP_INCQ:
-        return compute_c_incl(env);
-
-    case CC_OP_DECQ:
-        return compute_c_incl(env);
-
+        return compute_c_sbbq(dst, src1, src2);
     case CC_OP_SHLQ:
-        return compute_c_shlq(env);
-
-    case CC_OP_SARQ:
-        return compute_c_sarl(env);
+        return compute_c_shlq(dst, src1);
+    case CC_OP_BMILGQ:
+        return compute_c_bmilgq(dst, src1);
 #endif
     }
 }
@@ -326,7 +330,7 @@
 {
     uint32_t eflags;
 
-    eflags = helper_cc_compute_all(env, CC_OP);
+    eflags = cpu_cc_compute_all(env, CC_OP);
     eflags |= (DF & DF_MASK);
     eflags |= env->eflags & ~(VM_MASK | RF_MASK);
     return eflags;
diff --git a/target-i386/cc_helper_template.h b/target-i386/cc_helper_template.h
index 1f94e11..607311f 100644
--- a/target-i386/cc_helper_template.h
+++ b/target-i386/cc_helper_template.h
@@ -18,258 +18,223 @@
  */
 
 #define DATA_BITS (1 << (3 + SHIFT))
-#define SIGN_MASK (((target_ulong)1) << (DATA_BITS - 1))
 
 #if DATA_BITS == 8
 #define SUFFIX b
 #define DATA_TYPE uint8_t
-#define DATA_MASK 0xff
 #elif DATA_BITS == 16
 #define SUFFIX w
 #define DATA_TYPE uint16_t
-#define DATA_MASK 0xffff
 #elif DATA_BITS == 32
 #define SUFFIX l
 #define DATA_TYPE uint32_t
-#define DATA_MASK 0xffffffff
 #elif DATA_BITS == 64
 #define SUFFIX q
 #define DATA_TYPE uint64_t
-#define DATA_MASK 0xffffffffffffffffULL
 #else
 #error unhandled operand size
 #endif
 
+#define SIGN_MASK (((DATA_TYPE)1) << (DATA_BITS - 1))
+
 /* dynamic flags computation */
 
-static int glue(compute_all_add, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src2 = dst - src1;
 
-    src1 = CC_SRC;
-    src2 = CC_DST - CC_SRC;
-    cf = (DATA_TYPE)CC_DST < (DATA_TYPE)src1;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+    cf = dst < src1;
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & CC_A;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_add, SUFFIX)(CPUX86State *env)
+static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf;
-    target_long src1;
-
-    src1 = CC_SRC;
-    cf = (DATA_TYPE)CC_DST < (DATA_TYPE)src1;
-    return cf;
+    return dst < src1;
 }
 
-static int glue(compute_all_adc, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
+                                         DATA_TYPE src3)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src2 = dst - src1 - src3;
 
-    src1 = CC_SRC;
-    src2 = CC_DST - CC_SRC - 1;
-    cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+    cf = (src3 ? dst <= src1 : dst < src1);
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & 0x10;
+    zf = (dst == 0) << 6;
+    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
+    of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_adc, SUFFIX)(CPUX86State *env)
+static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
+                                       DATA_TYPE src3)
 {
-    int cf;
-    target_long src1;
-
-    src1 = CC_SRC;
-    cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1;
-    return cf;
+    return src3 ? dst <= src1 : dst < src1;
 }
 
-static int glue(compute_all_sub, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src1 = dst + src2;
 
-    src1 = CC_DST + CC_SRC;
-    src2 = CC_SRC;
-    cf = (DATA_TYPE)src1 < (DATA_TYPE)src2;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+    cf = src1 < src2;
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & CC_A;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_sub, SUFFIX)(CPUX86State *env)
+static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
-    int cf;
-    target_long src1, src2;
+    DATA_TYPE src1 = dst + src2;
 
-    src1 = CC_DST + CC_SRC;
-    src2 = CC_SRC;
-    cf = (DATA_TYPE)src1 < (DATA_TYPE)src2;
-    return cf;
+    return src1 < src2;
 }
 
-static int glue(compute_all_sbb, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
+                                         DATA_TYPE src3)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src1 = dst + src2 + src3;
 
-    src1 = CC_DST + CC_SRC + 1;
-    src2 = CC_SRC;
-    cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O;
+    cf = (src3 ? src1 <= src2 : src1 < src2);
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & 0x10;
+    zf = (dst == 0) << 6;
+    sf = lshift(dst, 8 - DATA_BITS) & 0x80;
+    of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_sbb, SUFFIX)(CPUX86State *env)
+static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
+                                       DATA_TYPE src3)
 {
-    int cf;
-    target_long src1, src2;
+    DATA_TYPE src1 = dst + src2 + src3;
 
-    src1 = CC_DST + CC_SRC + 1;
-    src2 = CC_SRC;
-    cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2;
-    return cf;
+    return (src3 ? src1 <= src2 : src1 < src2);
 }
 
-static int glue(compute_all_logic, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
 
     cf = 0;
-    pf = parity_table[(uint8_t)CC_DST];
+    pf = parity_table[(uint8_t)dst];
     af = 0;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_logic, SUFFIX)(void)
-{
-    return 0;
-}
-
-static int glue(compute_all_inc, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src2;
 
-    src1 = CC_DST - 1;
+    cf = src1;
+    src1 = dst - 1;
     src2 = 1;
-    cf = CC_SRC;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = ((CC_DST & DATA_MASK) == SIGN_MASK) << 11;
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & CC_A;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = (dst == SIGN_MASK) * CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-#if DATA_BITS == 32
-static int glue(compute_c_inc, SUFFIX)(CPUX86State *env)
-{
-    return CC_SRC;
-}
-#endif
-
-static int glue(compute_all_dec, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
-    target_long src1, src2;
+    DATA_TYPE src2;
 
-    src1 = CC_DST + 1;
+    cf = src1;
+    src1 = dst + 1;
     src2 = 1;
-    cf = CC_SRC;
-    pf = parity_table[(uint8_t)CC_DST];
-    af = (CC_DST ^ src1 ^ src2) & 0x10;
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = ((CC_DST & DATA_MASK) == ((target_ulong)SIGN_MASK - 1)) << 11;
+    pf = parity_table[(uint8_t)dst];
+    af = (dst ^ src1 ^ src2) & CC_A;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = (dst == SIGN_MASK - 1) * CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_all_shl, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
 
-    cf = (CC_SRC >> (DATA_BITS - 1)) & CC_C;
-    pf = parity_table[(uint8_t)CC_DST];
+    cf = (src1 >> (DATA_BITS - 1)) & CC_C;
+    pf = parity_table[(uint8_t)dst];
     af = 0; /* undefined */
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    /* of is defined if shift count == 1 */
-    of = lshift(CC_SRC ^ CC_DST, 12 - DATA_BITS) & CC_O;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    /* of is defined iff shift count == 1 */
+    of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_c_shl, SUFFIX)(CPUX86State *env)
+static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    return (CC_SRC >> (DATA_BITS - 1)) & CC_C;
+    return (src1 >> (DATA_BITS - 1)) & CC_C;
 }
 
-#if DATA_BITS == 32
-static int glue(compute_c_sar, SUFFIX)(CPUX86State *env)
-{
-    return CC_SRC & 1;
-}
-#endif
-
-static int glue(compute_all_sar, SUFFIX)(CPUX86State *env)
+static int glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     int cf, pf, af, zf, sf, of;
 
-    cf = CC_SRC & 1;
-    pf = parity_table[(uint8_t)CC_DST];
+    cf = src1 & 1;
+    pf = parity_table[(uint8_t)dst];
     af = 0; /* undefined */
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    /* of is defined if shift count == 1 */
-    of = lshift(CC_SRC ^ CC_DST, 12 - DATA_BITS) & CC_O;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    /* of is defined iff shift count == 1 */
+    of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
-#if DATA_BITS == 32
-static int glue(compute_c_mul, SUFFIX)(CPUX86State *env)
-{
-    int cf;
-
-    cf = (CC_SRC != 0);
-    return cf;
-}
-#endif
-
 /* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
-   CF are modified and it is slower to do that. */
-static int glue(compute_all_mul, SUFFIX)(CPUX86State *env)
+   CF are modified and it is slower to do that.  Note as well that we
+   don't truncate SRC1 for computing carry to DATA_TYPE.  */
+static int glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
 {
     int cf, pf, af, zf, sf, of;
 
-    cf = (CC_SRC != 0);
-    pf = parity_table[(uint8_t)CC_DST];
+    cf = (src1 != 0);
+    pf = parity_table[(uint8_t)dst];
     af = 0; /* undefined */
-    zf = ((DATA_TYPE)CC_DST == 0) << 6;
-    sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80;
-    of = cf << 11;
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = cf * CC_O;
     return cf | pf | af | zf | sf | of;
 }
 
+static int glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+{
+    int cf, pf, af, zf, sf, of;
+
+    cf = (src1 == 0);
+    pf = 0; /* undefined */
+    af = 0; /* undefined */
+    zf = (dst == 0) * CC_Z;
+    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    of = 0;
+    return cf | pf | af | zf | sf | of;
+}
+
+static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+{
+    return src1 == 0;
+}
+
 #undef DATA_BITS
 #undef SIGN_MASK
 #undef DATA_TYPE
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index dfcf86e..5582e5f 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -389,10 +389,15 @@
           CPUID_VME, CPUID_DTS, CPUID_SS, CPUID_HT, CPUID_TM, CPUID_PBE */
 #define TCG_EXT_FEATURES (CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | \
           CPUID_EXT_SSSE3 | CPUID_EXT_CX16 | CPUID_EXT_POPCNT | \
-          CPUID_EXT_HYPERVISOR)
+          CPUID_EXT_MOVBE | CPUID_EXT_HYPERVISOR)
           /* missing:
-          CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_EST,
-          CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_XSAVE */
+          CPUID_EXT_PCLMULQDQ, CPUID_EXT_DTES64, CPUID_EXT_DSCPL,
+          CPUID_EXT_VMX, CPUID_EXT_SMX, CPUID_EXT_EST, CPUID_EXT_TM2,
+          CPUID_EXT_CID, CPUID_EXT_FMA, CPUID_EXT_XTPR, CPUID_EXT_PDCM,
+          CPUID_EXT_PCID, CPUID_EXT_DCA, CPUID_EXT_SSE41, CPUID_EXT_SSE42,
+          CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_AES,
+          CPUID_EXT_XSAVE, CPUID_EXT_OSXSAVE, CPUID_EXT_AVX,
+          CPUID_EXT_F16C, CPUID_EXT_RDRAND */
 #define TCG_EXT2_FEATURES ((TCG_FEATURES & CPUID_EXT2_AMD_ALIASES) | \
           CPUID_EXT2_NX | CPUID_EXT2_MMXEXT | CPUID_EXT2_RDTSCP | \
           CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT)
@@ -401,7 +406,12 @@
 #define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \
           CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A)
 #define TCG_SVM_FEATURES 0
-#define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP)
+#define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \
+          CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX)
+          /* missing:
+          CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
+          CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
+          CPUID_7_0_EBX_RDSEED */
 
 /* built-in CPU model definitions
  */
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 7577e4f..493dda8 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -582,7 +582,7 @@
 #define CPU_INTERRUPT_TPR       CPU_INTERRUPT_TGT_INT_3
 
 
-enum {
+typedef enum {
     CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
     CC_OP_EFLAGS,  /* all cc are explicitly computed, CC_SRC = flags */
 
@@ -636,8 +636,19 @@
     CC_OP_SARL,
     CC_OP_SARQ,
 
+    CC_OP_BMILGB, /* Z,S via CC_DST, C = SRC==0; O=0; P,A undefined */
+    CC_OP_BMILGW,
+    CC_OP_BMILGL,
+    CC_OP_BMILGQ,
+
+    CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
+    CC_OP_ADOX, /* CC_DST = O, CC_SRC = rest.  */
+    CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
+
+    CC_OP_CLR, /* Z set, all other flags clear.  */
+
     CC_OP_NB,
-};
+} CCOp;
 
 typedef struct SegmentCache {
     uint32_t selector;
@@ -725,8 +736,9 @@
                         stored elsewhere */
 
     /* emulator internal eflags handling */
-    target_ulong cc_src;
     target_ulong cc_dst;
+    target_ulong cc_src;
+    target_ulong cc_src2;
     uint32_t cc_op;
     int32_t df; /* D flag : 1 if D = 0, -1 if D = 1 */
     uint32_t hflags; /* TB flags, see HF_xxx constants. These flags
@@ -764,7 +776,6 @@
     XMMReg xmm_regs[CPU_NB_REGS];
     XMMReg xmm_t0;
     MMXReg mmx_t0;
-    target_ulong cc_tmp; /* temporary for rcr/rcl */
 
     /* sysenter registers */
     uint32_t sysenter_cs;
@@ -1117,9 +1128,10 @@
 #define EIP (env->eip)
 #define DF  (env->df)
 
-#define CC_SRC (env->cc_src)
-#define CC_DST (env->cc_dst)
-#define CC_OP  (env->cc_op)
+#define CC_DST  (env->cc_dst)
+#define CC_SRC  (env->cc_src)
+#define CC_SRC2 (env->cc_src2)
+#define CC_OP   (env->cc_op)
 
 /* n must be a constant to be efficient */
 static inline target_long lshift(target_long x, int n)
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 4bf9db7..82a731c 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -55,7 +55,7 @@
 /***********************************************************/
 /* x86 debug */
 
-static const char *cc_op_str[] = {
+static const char *cc_op_str[CC_OP_NB] = {
     "DYNAMIC",
     "EFLAGS",
 
@@ -108,6 +108,17 @@
     "SARW",
     "SARL",
     "SARQ",
+
+    "BMILGB",
+    "BMILGW",
+    "BMILGL",
+    "BMILGQ",
+
+    "ADCX",
+    "ADOX",
+    "ADCOX",
+
+    "CLR",
 };
 
 static void
diff --git a/target-i386/helper.h b/target-i386/helper.h
index 9ed720d..26a0cc8 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -1,7 +1,7 @@
 #include "exec/def-helper.h"
 
-DEF_HELPER_FLAGS_2(cc_compute_all, TCG_CALL_NO_SE, i32, env, int)
-DEF_HELPER_FLAGS_2(cc_compute_c, TCG_CALL_NO_SE, i32, env, int)
+DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
+DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
 
 DEF_HELPER_0(lock, void)
 DEF_HELPER_0(unlock, void)
@@ -19,6 +19,7 @@
 DEF_HELPER_3(imulq_T0_T1, tl, env, tl, tl)
 DEF_HELPER_2(divq_EAX, void, env, tl)
 DEF_HELPER_2(idivq_EAX, void, env, tl)
+DEF_HELPER_FLAGS_2(umulh, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 #endif
 
 DEF_HELPER_2(aam, void, env, int)
@@ -193,9 +194,11 @@
 DEF_HELPER_3(frstor, void, env, tl, int)
 DEF_HELPER_3(fxsave, void, env, tl, int)
 DEF_HELPER_3(fxrstor, void, env, tl, int)
-DEF_HELPER_1(bsf, tl, tl)
-DEF_HELPER_1(bsr, tl, tl)
-DEF_HELPER_2(lzcnt, tl, tl, int)
+
+DEF_HELPER_FLAGS_1(clz, TCG_CALL_NO_RWG_SE, tl, tl)
+DEF_HELPER_FLAGS_1(ctz, TCG_CALL_NO_RWG_SE, tl, tl)
+DEF_HELPER_FLAGS_2(pdep, TCG_CALL_NO_RWG_SE, tl, tl, tl)
+DEF_HELPER_FLAGS_2(pext, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 
 /* MMX/SSE */
 
diff --git a/target-i386/int_helper.c b/target-i386/int_helper.c
index 84b812d..3b56075 100644
--- a/target-i386/int_helper.c
+++ b/target-i386/int_helper.c
@@ -385,6 +385,13 @@
     CC_SRC = r1;
 }
 
+target_ulong helper_umulh(target_ulong t0, target_ulong t1)
+{
+    uint64_t h, l;
+    mulu64(&l, &h, t0, t1);
+    return h;
+}
+
 void helper_imulq_EAX_T0(CPUX86State *env, target_ulong t0)
 {
     uint64_t r0, r1;
@@ -440,45 +447,49 @@
 }
 #endif
 
+#if TARGET_LONG_BITS == 32
+# define ctztl  ctz32
+# define clztl  clz32
+#else
+# define ctztl  ctz64
+# define clztl  clz64
+#endif
+
 /* bit operations */
-target_ulong helper_bsf(target_ulong t0)
+target_ulong helper_ctz(target_ulong t0)
 {
-    int count;
-    target_ulong res;
-
-    res = t0;
-    count = 0;
-    while ((res & 1) == 0) {
-        count++;
-        res >>= 1;
-    }
-    return count;
+    return ctztl(t0);
 }
 
-target_ulong helper_lzcnt(target_ulong t0, int wordsize)
+target_ulong helper_clz(target_ulong t0)
 {
-    int count;
-    target_ulong res, mask;
-
-    if (wordsize > 0 && t0 == 0) {
-        return wordsize;
-    }
-    res = t0;
-    count = TARGET_LONG_BITS - 1;
-    mask = (target_ulong)1 << (TARGET_LONG_BITS - 1);
-    while ((res & mask) == 0) {
-        count--;
-        res <<= 1;
-    }
-    if (wordsize > 0) {
-        return wordsize - 1 - count;
-    }
-    return count;
+    return clztl(t0);
 }
 
-target_ulong helper_bsr(target_ulong t0)
+target_ulong helper_pdep(target_ulong src, target_ulong mask)
 {
-    return helper_lzcnt(t0, 0);
+    target_ulong dest = 0;      /* bits not selected by mask stay 0 */
+    int i, o;                   /* i: next src bit, o: destination bit */
+
+    for (i = 0; mask != 0; i++) {   /* one pass per set bit of mask */
+        o = ctztl(mask);        /* presumably index of lowest set bit */
+        mask &= mask - 1;       /* clear the lowest set bit */
+        dest |= ((src >> i) & 1) << o;  /* src bit i -> dest bit o */
+    }
+    return dest;
+}
+
+target_ulong helper_pext(target_ulong src, target_ulong mask)
+{   /* Pack the src bits selected by mask into the low bits of dest.  */
+    target_ulong dest = 0;
+    int i, o;                   /* i: source bit, o: next output bit */
+
+    for (o = 0; mask != 0; o++) {   /* one pass per set bit of mask */
+        i = ctztl(mask);        /* presumably index of lowest set bit */
+        mask &= mask - 1;       /* clear the lowest set bit */
+        dest |= ((src >> i) & 1) << o;  /* src bit i -> dest bit o */
+    }
+    return dest;
 }
 
 #define SHIFT 0
diff --git a/target-i386/shift_helper_template.h b/target-i386/shift_helper_template.h
index dda0da3..cf91a2d 100644
--- a/target-i386/shift_helper_template.h
+++ b/target-i386/shift_helper_template.h
@@ -55,7 +55,7 @@
     count = rclb_table[count];
 #endif
     if (count) {
-        eflags = helper_cc_compute_all(env, CC_OP);
+        eflags = env->cc_src;
         t0 &= DATA_MASK;
         src = t0;
         res = (t0 << count) | ((target_ulong)(eflags & CC_C) << (count - 1));
@@ -63,11 +63,9 @@
             res |= t0 >> (DATA_BITS + 1 - count);
         }
         t0 = res;
-        env->cc_tmp = (eflags & ~(CC_C | CC_O)) |
+        env->cc_src = (eflags & ~(CC_C | CC_O)) |
             (lshift(src ^ t0, 11 - (DATA_BITS - 1)) & CC_O) |
             ((src >> (DATA_BITS - count)) & CC_C);
-    } else {
-        env->cc_tmp = -1;
     }
     return t0;
 }
@@ -86,7 +84,7 @@
     count = rclb_table[count];
 #endif
     if (count) {
-        eflags = helper_cc_compute_all(env, CC_OP);
+        eflags = env->cc_src;
         t0 &= DATA_MASK;
         src = t0;
         res = (t0 >> count) |
@@ -95,11 +93,9 @@
             res |= t0 << (DATA_BITS + 1 - count);
         }
         t0 = res;
-        env->cc_tmp = (eflags & ~(CC_C | CC_O)) |
+        env->cc_src = (eflags & ~(CC_C | CC_O)) |
             (lshift(src ^ t0, 11 - (DATA_BITS - 1)) & CC_O) |
             ((src >> (count - 1)) & CC_C);
-    } else {
-        env->cc_tmp = -1;
     }
     return t0;
 }
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 112c310..605cd88 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -23,6 +23,7 @@
 #include <inttypes.h>
 #include <signal.h>
 
+#include "qemu/host-utils.h"
 #include "cpu.h"
 #include "disas/disas.h"
 #include "tcg-op.h"
@@ -36,6 +37,7 @@
 #define PREFIX_LOCK   0x04
 #define PREFIX_DATA   0x08
 #define PREFIX_ADR    0x10
+#define PREFIX_VEX    0x20
 
 #ifdef TARGET_X86_64
 #define CODE64(s) ((s)->code64)
@@ -47,21 +49,29 @@
 #define REX_B(s) 0
 #endif
 
+#ifdef TARGET_X86_64
+# define ctztl  ctz64
+# define clztl  clz64
+#else
+# define ctztl  ctz32
+# define clztl  clz32
+#endif
+
 //#define MACRO_TEST   1
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_tmp;
+static TCGv cpu_A0;
+static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT;
 static TCGv_i32 cpu_cc_op;
 static TCGv cpu_regs[CPU_NB_REGS];
 /* local temps */
-static TCGv cpu_T[2], cpu_T3;
+static TCGv cpu_T[2];
 /* local register indexes (only used inside old micro ops) */
 static TCGv cpu_tmp0, cpu_tmp4;
 static TCGv_ptr cpu_ptr0, cpu_ptr1;
 static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
 static TCGv_i64 cpu_tmp1_i64;
-static TCGv cpu_tmp5;
 
 static uint8_t gen_opc_cc_op[OPC_BUF_SIZE];
 
@@ -88,8 +98,11 @@
     int code64; /* 64 bit code segment */
     int rex_x, rex_b;
 #endif
+    int vex_l;  /* vex vector length */
+    int vex_v;  /* vex vvvv register, without 1's complement.  */
     int ss32;   /* 32 bit stack segment */
-    int cc_op;  /* current CC operation */
+    CCOp cc_op;  /* current CC operation */
+    bool cc_op_dirty;
     int addseg; /* non zero if either DS/ES/SS have a non zero base */
     int f_st;   /* currently unused */
     int vm86;   /* vm86 mode */
@@ -113,6 +126,7 @@
 static void gen_eob(DisasContext *s);
 static void gen_jmp(DisasContext *s, target_ulong eip);
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
+static void gen_op(DisasContext *s1, int op, int ot, int d);
 
 /* i386 arith/logic operations */
 enum {
@@ -173,6 +187,79 @@
     OR_A0, /* temporary register used when doing address evaluation */
 };
 
+enum {     /* One bit per CC global; OR-ed together in cc_op_live[].  */
+    USES_CC_DST  = 1,   /* cpu_cc_dst */
+    USES_CC_SRC  = 2,   /* cpu_cc_src */
+    USES_CC_SRC2 = 4,   /* cpu_cc_src2 */
+    USES_CC_SRCT = 8,   /* cpu_cc_srcT */
+};
+
+/* Indexed by CCOp X: USES_CC_* mask of globals live once CC_OP is X.  */
+static const uint8_t cc_op_live[CC_OP_NB] = {
+    [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_EFLAGS] = USES_CC_SRC,       /* only cpu_cc_src is needed */
+    [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRCT,
+    [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
+    [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SHLB ... CC_OP_SHLQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_BMILGB ... CC_OP_BMILGQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_CLR] = 0,    /* no CC global is live */
+};
+
+static void set_cc_op(DisasContext *s, CCOp op)
+{   /* Make 'op' the current CC operation tracked in s->cc_op.  */
+    int dead;   /* USES_CC_* bits live under the old op but not under op */
+
+    if (s->cc_op == op) {
+        return;     /* nothing to do; cc_op_dirty is left unchanged */
+    }
+
+    /* Discard CC computation that will no longer be used.  */
+    dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    if (dead & USES_CC_DST) {
+        tcg_gen_discard_tl(cpu_cc_dst);
+    }
+    if (dead & USES_CC_SRC) {
+        tcg_gen_discard_tl(cpu_cc_src);
+    }
+    if (dead & USES_CC_SRC2) {
+        tcg_gen_discard_tl(cpu_cc_src2);
+    }
+    if (dead & USES_CC_SRCT) {
+        tcg_gen_discard_tl(cpu_cc_srcT);
+    }
+
+    if (op == CC_OP_DYNAMIC) {
+        /* The DYNAMIC setting is translator only, and should never be
+           stored.  Thus we always consider it clean.  */
+        s->cc_op_dirty = false;
+    } else {
+        /* Discard any computed CC_OP value (see shifts).  */
+        if (s->cc_op == CC_OP_DYNAMIC) {
+            tcg_gen_discard_i32(cpu_cc_op);
+        }
+        s->cc_op_dirty = true;  /* gen_update_cc_op() will store it */
+    }
+    s->cc_op = op;
+}
+
+static void gen_update_cc_op(DisasContext *s)
+{   /* Emit a store of s->cc_op into cpu_cc_op, but only when dirty.  */
+    if (s->cc_op_dirty) {
+        tcg_gen_movi_i32(cpu_cc_op, s->cc_op);
+        s->cc_op_dirty = false;     /* cpu_cc_op now matches s->cc_op */
+    }
+}
+
 static inline void gen_op_movl_T0_0(void)
 {
     tcg_gen_movi_tl(cpu_T[0], 0);
@@ -323,17 +410,17 @@
 static inline void gen_op_mov_reg_A0(int size, int reg)
 {
     switch(size) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], cpu_A0, 0, 16);
         break;
     default: /* XXX this shouldn't be reached;  abort? */
-    case 1:
+    case OT_WORD:
         /* For x86_64, this sets the higher half of register to zero.
            For i386, this is equivalent to a mov. */
         tcg_gen_ext32u_tl(cpu_regs[reg], cpu_A0);
         break;
 #ifdef TARGET_X86_64
-    case 2:
+    case OT_LONG:
         tcg_gen_mov_tl(cpu_regs[reg], cpu_A0);
         break;
 #endif
@@ -398,11 +485,11 @@
 static inline void gen_op_add_reg_im(int size, int reg, int32_t val)
 {
     switch(size) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_addi_tl(cpu_tmp0, cpu_regs[reg], val);
         tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], cpu_tmp0, 0, 16);
         break;
-    case 1:
+    case OT_WORD:
         tcg_gen_addi_tl(cpu_tmp0, cpu_regs[reg], val);
         /* For x86_64, this sets the higher half of register to zero.
            For i386, this is equivalent to a nop. */
@@ -410,7 +497,7 @@
         tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
         break;
 #ifdef TARGET_X86_64
-    case 2:
+    case OT_LONG:
         tcg_gen_addi_tl(cpu_regs[reg], cpu_regs[reg], val);
         break;
 #endif
@@ -420,11 +507,11 @@
 static inline void gen_op_add_reg_T0(int size, int reg)
 {
     switch(size) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_add_tl(cpu_tmp0, cpu_regs[reg], cpu_T[0]);
         tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], cpu_tmp0, 0, 16);
         break;
-    case 1:
+    case OT_WORD:
         tcg_gen_add_tl(cpu_tmp0, cpu_regs[reg], cpu_T[0]);
         /* For x86_64, this sets the higher half of register to zero.
            For i386, this is equivalent to a nop. */
@@ -432,18 +519,13 @@
         tcg_gen_mov_tl(cpu_regs[reg], cpu_tmp0);
         break;
 #ifdef TARGET_X86_64
-    case 2:
+    case OT_LONG:
         tcg_gen_add_tl(cpu_regs[reg], cpu_regs[reg], cpu_T[0]);
         break;
 #endif
     }
 }
 
-static inline void gen_op_set_cc_op(int32_t val)
-{
-    tcg_gen_movi_i32(cpu_cc_op, val);
-}
-
 static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
 {
     tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
@@ -506,14 +588,14 @@
 {
     int mem_index = (idx >> 2) - 1;
     switch(idx & 3) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_qemu_ld8s(cpu_T[0], cpu_A0, mem_index);
         break;
-    case 1:
+    case OT_WORD:
         tcg_gen_qemu_ld16s(cpu_T[0], cpu_A0, mem_index);
         break;
     default:
-    case 2:
+    case OT_LONG:
         tcg_gen_qemu_ld32s(cpu_T[0], cpu_A0, mem_index);
         break;
     }
@@ -523,17 +605,17 @@
 {
     int mem_index = (idx >> 2) - 1;
     switch(idx & 3) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_qemu_ld8u(t0, a0, mem_index);
         break;
-    case 1:
+    case OT_WORD:
         tcg_gen_qemu_ld16u(t0, a0, mem_index);
         break;
-    case 2:
+    case OT_LONG:
         tcg_gen_qemu_ld32u(t0, a0, mem_index);
         break;
     default:
-    case 3:
+    case OT_QUAD:
         /* Should never happen on 32-bit targets.  */
 #ifdef TARGET_X86_64
         tcg_gen_qemu_ld64(t0, a0, mem_index);
@@ -562,17 +644,17 @@
 {
     int mem_index = (idx >> 2) - 1;
     switch(idx & 3) {
-    case 0:
+    case OT_BYTE:
         tcg_gen_qemu_st8(t0, a0, mem_index);
         break;
-    case 1:
+    case OT_WORD:
         tcg_gen_qemu_st16(t0, a0, mem_index);
         break;
-    case 2:
+    case OT_LONG:
         tcg_gen_qemu_st32(t0, a0, mem_index);
         break;
     default:
-    case 3:
+    case OT_QUAD:
         /* Should never happen on 32-bit targets.  */
 #ifdef TARGET_X86_64
         tcg_gen_qemu_st64(t0, a0, mem_index);
@@ -659,38 +741,45 @@
     tcg_gen_shli_tl(cpu_T[0], cpu_T[0], ot);
 };
 
+static TCGv gen_ext_tl(TCGv dst, TCGv src, int size, bool sign)
+{   /* Extend src into dst per 'size'; returns the TCGv holding the result.  */
+    switch (size) {
+    case OT_BYTE:
+        if (sign) {
+            tcg_gen_ext8s_tl(dst, src);
+        } else {
+            tcg_gen_ext8u_tl(dst, src);
+        }
+        return dst;
+    case OT_WORD:
+        if (sign) {
+            tcg_gen_ext16s_tl(dst, src);
+        } else {
+            tcg_gen_ext16u_tl(dst, src);
+        }
+        return dst;
+#ifdef TARGET_X86_64   /* otherwise OT_LONG takes the default path */
+    case OT_LONG:
+        if (sign) {
+            tcg_gen_ext32s_tl(dst, src);
+        } else {
+            tcg_gen_ext32u_tl(dst, src);
+        }
+        return dst;
+#endif
+    default:
+        return src;     /* nothing emitted; dst is not written */
+    }
+}
+
 static void gen_extu(int ot, TCGv reg)
 {
-    switch(ot) {
-    case OT_BYTE:
-        tcg_gen_ext8u_tl(reg, reg);
-        break;
-    case OT_WORD:
-        tcg_gen_ext16u_tl(reg, reg);
-        break;
-    case OT_LONG:
-        tcg_gen_ext32u_tl(reg, reg);
-        break;
-    default:
-        break;
-    }
+    gen_ext_tl(reg, reg, ot, false);    /* zero-extend reg in place */
 }
 
 static void gen_exts(int ot, TCGv reg)
 {
-    switch(ot) {
-    case OT_BYTE:
-        tcg_gen_ext8s_tl(reg, reg);
-        break;
-    case OT_WORD:
-        tcg_gen_ext16s_tl(reg, reg);
-        break;
-    case OT_LONG:
-        tcg_gen_ext32s_tl(reg, reg);
-        break;
-    default:
-        break;
-    }
+    gen_ext_tl(reg, reg, ot, true);     /* sign-extend reg in place */
 }
 
 static inline void gen_op_jnz_ecx(int size, int label1)
@@ -710,21 +799,31 @@
 static void gen_helper_in_func(int ot, TCGv v, TCGv_i32 n)
 {
     switch (ot) {
-    case 0: gen_helper_inb(v, n); break;
-    case 1: gen_helper_inw(v, n); break;
-    case 2: gen_helper_inl(v, n); break;
+    case OT_BYTE:
+        gen_helper_inb(v, n);
+        break;
+    case OT_WORD:
+        gen_helper_inw(v, n);
+        break;
+    case OT_LONG:
+        gen_helper_inl(v, n);
+        break;
     }
-
 }
 
 static void gen_helper_out_func(int ot, TCGv_i32 v, TCGv_i32 n)
 {
     switch (ot) {
-    case 0: gen_helper_outb(v, n); break;
-    case 1: gen_helper_outw(v, n); break;
-    case 2: gen_helper_outl(v, n); break;
+    case OT_BYTE:
+        gen_helper_outb(v, n);
+        break;
+    case OT_WORD:
+        gen_helper_outw(v, n);
+        break;
+    case OT_LONG:
+        gen_helper_outl(v, n);
+        break;
     }
-
 }
 
 static void gen_check_io(DisasContext *s, int ot, target_ulong cur_eip,
@@ -735,27 +834,25 @@
 
     state_saved = 0;
     if (s->pe && (s->cpl > s->iopl || s->vm86)) {
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(cur_eip);
         state_saved = 1;
         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
         switch (ot) {
-        case 0:
+        case OT_BYTE:
             gen_helper_check_iob(cpu_env, cpu_tmp2_i32);
             break;
-        case 1:
+        case OT_WORD:
             gen_helper_check_iow(cpu_env, cpu_tmp2_i32);
             break;
-        case 2:
+        case OT_LONG:
             gen_helper_check_iol(cpu_env, cpu_tmp2_i32);
             break;
         }
     }
     if(s->flags & HF_SVMI_MASK) {
         if (!state_saved) {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(cur_eip);
         }
         svm_flags |= (1 << (4 + ot));
@@ -778,17 +875,8 @@
     gen_op_add_reg_T0(s->aflag, R_EDI);
 }
 
-static inline void gen_update_cc_op(DisasContext *s)
-{
-    if (s->cc_op != CC_OP_DYNAMIC) {
-        gen_op_set_cc_op(s->cc_op);
-        s->cc_op = CC_OP_DYNAMIC;
-    }
-}
-
 static void gen_op_update1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
@@ -798,339 +886,393 @@
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static inline void gen_op_cmpl_T0_T1_cc(void)
+static void gen_op_update3_cc(TCGv reg)
 {
+    tcg_gen_mov_tl(cpu_cc_src2, reg);
     tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
 static inline void gen_op_testl_T0_T1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
 }
 
 static void gen_op_update_neg_cc(void)
 {
-    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-}
-
-/* compute eflags.C to reg */
-static void gen_compute_eflags_c(TCGv reg)
-{
-    gen_helper_cc_compute_c(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
+    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
+    tcg_gen_movi_tl(cpu_cc_srcT, 0);
 }
 
 /* compute all eflags to cc_src */
-static void gen_compute_eflags(TCGv reg)
+static void gen_compute_eflags(DisasContext *s)
 {
-    gen_helper_cc_compute_all(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
-}
+    TCGv zero, dst, src1, src2;
+    int live, dead;
 
-static inline void gen_setcc_slow_T0(DisasContext *s, int jcc_op)
-{
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
-    switch(jcc_op) {
-    case JCC_O:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 11);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_B:
-        gen_compute_eflags_c(cpu_T[0]);
-        break;
-    case JCC_Z:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 6);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_BE:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 6);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_S:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 7);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_P:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 2);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_L:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    default:
-    case JCC_LE:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp4, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 6); /* CC_Z */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
+    if (s->cc_op == CC_OP_EFLAGS) {
+        return;
+    }
+    if (s->cc_op == CC_OP_CLR) {
+        tcg_gen_movi_tl(cpu_cc_src, CC_Z);
+        set_cc_op(s, CC_OP_EFLAGS);
+        return;
+    }
+
+    TCGV_UNUSED(zero);
+    dst = cpu_cc_dst;
+    src1 = cpu_cc_src;
+    src2 = cpu_cc_src2;
+
+    /* Take care to not read values that are not live.  */
+    live = cc_op_live[s->cc_op] & ~USES_CC_SRCT;
+    dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2);
+    if (dead) {
+        zero = tcg_const_tl(0);
+        if (dead & USES_CC_DST) {
+            dst = zero;
+        }
+        if (dead & USES_CC_SRC) {
+            src1 = zero;
+        }
+        if (dead & USES_CC_SRC2) {
+            src2 = zero;
+        }
+    }
+
+    gen_update_cc_op(s);
+    gen_helper_cc_compute_all(cpu_cc_src, dst, src1, src2, cpu_cc_op);
+    set_cc_op(s, CC_OP_EFLAGS);
+
+    if (dead) {
+        tcg_temp_free(zero);
     }
 }
 
-/* return true if setcc_slow is not needed (WARNING: must be kept in
-   sync with gen_jcc1) */
-static int is_fast_jcc_case(DisasContext *s, int b)
+typedef struct CCPrepare {     /* a flag test, as built by gen_prepare_*() */
+    TCGCond cond;       /* TCG condition to evaluate */
+    TCGv reg;           /* first operand, e.g. cpu_cc_src or a temp */
+    TCGv reg2;          /* second operand; set together with use_reg2 */
+    target_ulong imm;   /* left 0 by the gen_prepare_eflags_*() initializers */
+    target_ulong mask;  /* -1 or bit(s) of reg to test -- TODO confirm use */
+    bool use_reg2;      /* presumably: compare reg with reg2, not imm */
+    bool no_setcond;    /* presumably: reg already holds 0/1 -- confirm */
+} CCPrepare;
+
+/* compute eflags.C to reg */
+static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
 {
-    int jcc_op;
-    jcc_op = (b >> 1) & 7;
-    switch(s->cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        if (jcc_op == JCC_O || jcc_op == JCC_P)
-            goto slow_jcc;
-        break;
+    TCGv t0, t1;
+    int size, shift;
 
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
+    switch (s->cc_op) {
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_SUBB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        /* If no temporary was used, be careful not to alias t1 and t0.  */
+        t0 = TCGV_EQUAL(t1, cpu_cc_src) ? cpu_tmp0 : reg;
+        tcg_gen_mov_tl(t0, cpu_cc_srcT);
+        gen_extu(size, t0);
+        goto add_sub;
 
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
+    case CC_OP_ADDB ... CC_OP_ADDQ:
+        /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_ADDB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+    add_sub:
+        return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0,
+                             .reg2 = t1, .mask = -1, .use_reg2 = true };
 
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
+    case CC_OP_LOGICB ... CC_OP_LOGICQ:
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
 
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
+    case CC_OP_INCB ... CC_OP_INCQ:
+    case CC_OP_DECB ... CC_OP_DECQ:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = -1, .no_setcond = true };
 
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        if (jcc_op != JCC_Z && jcc_op != JCC_S)
-            goto slow_jcc;
-        break;
+    case CC_OP_SHLB ... CC_OP_SHLQ:
+        /* (CC_SRC >> (DATA_BITS - 1)) & 1 */
+        size = s->cc_op - CC_OP_SHLB;
+        shift = (8 << size) - 1;
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = (target_ulong)1 << shift };
+
+    case CC_OP_MULB ... CC_OP_MULQ:
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = -1 };
+
+    case CC_OP_BMILGB ... CC_OP_BMILGQ:
+        size = s->cc_op - CC_OP_BMILGB;
+        t0 = gen_ext_tl(reg, cpu_cc_src, size, false);
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+
+    case CC_OP_ADCX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
+                             .mask = -1, .no_setcond = true };
+
+    case CC_OP_EFLAGS:
+    case CC_OP_SARB ... CC_OP_SARQ:
+        /* CC_SRC & 1 */
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = CC_C };
+
     default:
-    slow_jcc:
-        return 0;
+       /* The need to compute only C from CC_OP_DYNAMIC is important
+          in efficiently implementing e.g. INC at the start of a TB.  */
+       gen_update_cc_op(s);
+       gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src,
+                               cpu_cc_src2, cpu_cc_op);
+       return (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                            .mask = -1, .no_setcond = true };
     }
-    return 1;
 }
 
-/* generate a conditional jump to label 'l1' according to jump opcode
+/* Describe a test of eflags.P as a CCPrepare; 'reg' is not used here.  */
+static CCPrepare gen_prepare_eflags_p(DisasContext *s, TCGv reg)
+{
+    gen_compute_eflags(s);      /* brings all flags into cpu_cc_src */
+    return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                         .mask = CC_P };
+}
+
+/* Describe a test of eflags.S as a CCPrepare; 'reg' may serve as scratch.  */
+static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
+{
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);  /* brings all flags into cpu_cc_src */
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:           /* these test CC_S in cpu_cc_src directly */
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_S };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+    default:
+        {
+            /* assumes size-ordered groups of 4 from CC_OP_ADDB -- confirm */
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, true);
+            return (CCPrepare) { .cond = TCG_COND_LT, .reg = t0, .mask = -1 };
+        }
+    }
+}
+
+/* Describe a test of eflags.O as a CCPrepare; 'reg' is not used here.  */
+static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
+{
+    switch (s->cc_op) {
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:           /* tested straight from cpu_cc_src2 */
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
+                             .mask = -1, .no_setcond = true };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+    default:
+        gen_compute_eflags(s);  /* brings all flags into cpu_cc_src */
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_O };
+    }
+}
+
+/* Describe a test of eflags.Z as a CCPrepare; 'reg' may serve as scratch.  */
+static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
+{
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);  /* brings all flags into cpu_cc_src */
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:           /* these test CC_Z in cpu_cc_src directly */
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_Z };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_ALWAYS, .mask = -1 };
+    default:
+        {
+            /* assumes size-ordered groups of 4 from CC_OP_ADDB -- confirm */
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+            return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+        }
+    }
+}
+
+/* perform a conditional store into register 'reg' according to jump opcode
    value 'b'. In the fast case, T0 is guaranted not to be used. */
-static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1)
+static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
 {
     int inv, jcc_op, size, cond;
+    CCPrepare cc;
     TCGv t0;
 
     inv = b & 1;
     jcc_op = (b >> 1) & 7;
 
-    switch(cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        
-        size = cc_op - CC_OP_SUBB;
-        switch(jcc_op) {
-        case JCC_Z:
-        fast_jcc_z:
-            switch(size) {
-            case 0:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xff);
-                t0 = cpu_tmp0;
-                break;
-            case 1:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xffff);
-                t0 = cpu_tmp0;
-                break;
-#ifdef TARGET_X86_64
-            case 2:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0xffffffff);
-                t0 = cpu_tmp0;
-                break;
-#endif
-            default:
-                t0 = cpu_cc_dst;
-                break;
-            }
-            tcg_gen_brcondi_tl(inv ? TCG_COND_NE : TCG_COND_EQ, t0, 0, l1);
-            break;
-        case JCC_S:
-        fast_jcc_s:
-            switch(size) {
-            case 0:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80);
-                tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 
-                                   0, l1);
-                break;
-            case 1:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x8000);
-                tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 
-                                   0, l1);
-                break;
-#ifdef TARGET_X86_64
-            case 2:
-                tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80000000);
-                tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 
-                                   0, l1);
-                break;
-#endif
-            default:
-                tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, cpu_cc_dst, 
-                                   0, l1);
-                break;
-            }
-            break;
-            
-        case JCC_B:
-            cond = inv ? TCG_COND_GEU : TCG_COND_LTU;
-            goto fast_jcc_b;
+    switch (s->cc_op) {
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* We optimize relational operators for the cmp/jcc case.  */
+        size = s->cc_op - CC_OP_SUBB;
+        switch (jcc_op) {
         case JCC_BE:
-            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
-        fast_jcc_b:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
-            switch(size) {
-            case 0:
-                t0 = cpu_tmp0;
-                tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xff);
-                tcg_gen_andi_tl(t0, cpu_cc_src, 0xff);
-                break;
-            case 1:
-                t0 = cpu_tmp0;
-                tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xffff);
-                tcg_gen_andi_tl(t0, cpu_cc_src, 0xffff);
-                break;
-#ifdef TARGET_X86_64
-            case 2:
-                t0 = cpu_tmp0;
-                tcg_gen_andi_tl(cpu_tmp4, cpu_tmp4, 0xffffffff);
-                tcg_gen_andi_tl(t0, cpu_cc_src, 0xffffffff);
-                break;
-#endif
-            default:
-                t0 = cpu_cc_src;
-                break;
-            }
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
+            gen_extu(size, cpu_tmp4);
+            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+            cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         case JCC_L:
-            cond = inv ? TCG_COND_GE : TCG_COND_LT;
+            cond = TCG_COND_LT;
             goto fast_jcc_l;
         case JCC_LE:
-            cond = inv ? TCG_COND_GT : TCG_COND_LE;
+            cond = TCG_COND_LE;
         fast_jcc_l:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
-            switch(size) {
-            case 0:
-                t0 = cpu_tmp0;
-                tcg_gen_ext8s_tl(cpu_tmp4, cpu_tmp4);
-                tcg_gen_ext8s_tl(t0, cpu_cc_src);
-                break;
-            case 1:
-                t0 = cpu_tmp0;
-                tcg_gen_ext16s_tl(cpu_tmp4, cpu_tmp4);
-                tcg_gen_ext16s_tl(t0, cpu_cc_src);
-                break;
-#ifdef TARGET_X86_64
-            case 2:
-                t0 = cpu_tmp0;
-                tcg_gen_ext32s_tl(cpu_tmp4, cpu_tmp4);
-                tcg_gen_ext32s_tl(t0, cpu_cc_src);
-                break;
-#endif
-            default:
-                t0 = cpu_cc_src;
-                break;
-            }
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
+            gen_exts(size, cpu_tmp4);
+            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
+            cc = (CCPrepare) { .cond = cond, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         default:
             goto slow_jcc;
         }
         break;
-        
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-        
-    case CC_OP_ADCB:
-    case CC_OP_ADCW:
-    case CC_OP_ADCL:
-    case CC_OP_ADCQ:
-        
-    case CC_OP_SBBB:
-    case CC_OP_SBBW:
-    case CC_OP_SBBL:
-    case CC_OP_SBBQ:
-        
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-        
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
-        
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
-        
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        
-    case CC_OP_SARB:
-    case CC_OP_SARW:
-    case CC_OP_SARL:
-    case CC_OP_SARQ:
-        switch(jcc_op) {
-        case JCC_Z:
-            size = (cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_z;
-        case JCC_S:
-            size = (cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_s;
-        default:
-            goto slow_jcc;
-        }
-        break;
+
     default:
     slow_jcc:
-        gen_setcc_slow_T0(s, jcc_op);
-        tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, 
-                           cpu_T[0], 0, l1);
+        /* This actually generates good code for JC, JZ and JS.  */
+        switch (jcc_op) {
+        case JCC_O:
+            cc = gen_prepare_eflags_o(s, reg);
+            break;
+        case JCC_B:
+            cc = gen_prepare_eflags_c(s, reg);
+            break;
+        case JCC_Z:
+            cc = gen_prepare_eflags_z(s, reg);
+            break;
+        case JCC_BE:
+            gen_compute_eflags(s);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                               .mask = CC_Z | CC_C };
+            break;
+        case JCC_S:
+            cc = gen_prepare_eflags_s(s, reg);
+            break;
+        case JCC_P:
+            cc = gen_prepare_eflags_p(s, reg);
+            break;
+        case JCC_L:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S };
+            break;
+        default:
+        case JCC_LE:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S | CC_Z };
+            break;
+        }
         break;
     }
+
+    if (inv) {
+        cc.cond = tcg_invert_cond(cc.cond);
+    }
+    return cc;
+}
+
+static void gen_setcc1(DisasContext *s, int b, TCGv reg)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, reg);
+
+    if (cc.no_setcond) {
+        if (cc.cond == TCG_COND_EQ) {
+            tcg_gen_xori_tl(reg, cc.reg, 1);
+        } else {
+            tcg_gen_mov_tl(reg, cc.reg);
+        }
+        return;
+    }
+
+    if (cc.cond == TCG_COND_NE && !cc.use_reg2 && cc.imm == 0 &&
+        cc.mask != 0 && (cc.mask & (cc.mask - 1)) == 0) {
+        tcg_gen_shri_tl(reg, cc.reg, ctztl(cc.mask));
+        tcg_gen_andi_tl(reg, reg, 1);
+        return;
+    }
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(reg, cc.reg, cc.mask);
+        cc.reg = reg;
+    }
+    if (cc.use_reg2) {
+        tcg_gen_setcond_tl(cc.cond, reg, cc.reg, cc.reg2);
+    } else {
+        tcg_gen_setcondi_tl(cc.cond, reg, cc.reg, cc.imm);
+    }
+}
+
+static inline void gen_compute_eflags_c(DisasContext *s, TCGv reg)
+{
+    gen_setcc1(s, JCC_B << 1, reg);
+}
+
+/* generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranteed not to be used. */
+static inline void gen_jcc1_noeob(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
+    }
+}
+
+/* Generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranteed not to be used.
+   A translation block must end soon.  */
+static inline void gen_jcc1(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    gen_update_cc_op(s);
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    set_cc_op(s, CC_OP_DYNAMIC);
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
+    }
 }
 
 /* XXX: does not work with gdbstub "ice" single step - not a
@@ -1168,21 +1310,19 @@
 
 static inline void gen_scas(DisasContext *s, int ot)
 {
-    gen_op_mov_TN_reg(OT_LONG, 0, R_EAX);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_op(s, OP_CMPL, ot, R_EAX);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_EDI);
 }
 
 static inline void gen_cmps(DisasContext *s, int ot)
 {
-    gen_string_movl_A0_ESI(s);
-    gen_op_ld_T0_A0(ot + s->mem_index);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_string_movl_A0_ESI(s);
+    gen_op(s, OP_CMPL, ot, OR_TMP0);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_ESI);
     gen_op_add_reg_T0(s->aflag, R_EDI);
@@ -1256,8 +1396,8 @@
     l2 = gen_jz_ecx_string(s, next_eip);                                      \
     gen_ ## op(s, ot);                                                        \
     gen_op_add_reg_im(s->aflag, R_ECX, -1);                                   \
-    gen_op_set_cc_op(CC_OP_SUBB + ot);                                        \
-    gen_jcc1(s, CC_OP_SUBB + ot, (JCC_Z << 1) | (nz ^ 1), l2);                \
+    gen_update_cc_op(s);                                                      \
+    gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);                                 \
     if (!s->jmp_opt)                                                          \
         gen_op_jz_ecx(s->aflag, l2);                                          \
     gen_jmp(s, cur_eip);                                                      \
@@ -1337,38 +1477,26 @@
     }
     switch(op) {
     case OP_ADCL:
-        if (s1->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s1->cc_op);
-        gen_compute_eflags_c(cpu_tmp4);
+        gen_compute_eflags_c(s1, cpu_tmp4);
         tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot);
-        s1->cc_op = CC_OP_DYNAMIC;
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_ADCB + ot);
         break;
     case OP_SBBL:
-        if (s1->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s1->cc_op);
-        gen_compute_eflags_c(cpu_tmp4);
+        gen_compute_eflags_c(s1, cpu_tmp4);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot);
-        s1->cc_op = CC_OP_DYNAMIC;
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_SBBB + ot);
         break;
     case OP_ADDL:
         gen_op_addl_T0_T1();
@@ -1377,16 +1505,17 @@
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update2_cc();
-        s1->cc_op = CC_OP_ADDB + ot;
+        set_cc_op(s1, CC_OP_ADDB + ot);
         break;
     case OP_SUBL:
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update2_cc();
-        s1->cc_op = CC_OP_SUBB + ot;
+        set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     default:
     case OP_ANDL:
@@ -1396,7 +1525,7 @@
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_ORL:
         tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
@@ -1405,7 +1534,7 @@
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_XORL:
         tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
@@ -1414,11 +1543,13 @@
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_CMPL:
-        gen_op_cmpl_T0_T1_cc();
-        s1->cc_op = CC_OP_SUBB + ot;
+        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
+        tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+        set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     }
 }
@@ -1430,35 +1561,70 @@
         gen_op_mov_TN_reg(ot, 0, d);
     else
         gen_op_ld_T0_A0(ot + s1->mem_index);
-    if (s1->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s1->cc_op);
+    gen_compute_eflags_c(s1, cpu_cc_src);
     if (c > 0) {
         tcg_gen_addi_tl(cpu_T[0], cpu_T[0], 1);
-        s1->cc_op = CC_OP_INCB + ot;
+        set_cc_op(s1, CC_OP_INCB + ot);
     } else {
         tcg_gen_addi_tl(cpu_T[0], cpu_T[0], -1);
-        s1->cc_op = CC_OP_DECB + ot;
+        set_cc_op(s1, CC_OP_DECB + ot);
     }
     if (d != OR_TMP0)
         gen_op_mov_reg_T0(ot, d);
     else
         gen_op_st_T0_A0(ot + s1->mem_index);
-    gen_compute_eflags_c(cpu_cc_src);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
+static void gen_shift_flags(DisasContext *s, int ot, TCGv result, TCGv shm1,
+                            TCGv count, bool is_right)
+{
+    TCGv_i32 z32, s32, oldop;
+    TCGv z_tl;
+
+    /* Store the results into the CC variables.  If we know that the
+       variable must be dead, store unconditionally.  Otherwise we'll
+       need to not disrupt the current contents.  */
+    z_tl = tcg_const_tl(0);
+    if (cc_op_live[s->cc_op] & USES_CC_DST) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
+                           result, cpu_cc_dst);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_dst, result);
+    }
+    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
+                           shm1, cpu_cc_src);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_src, shm1);
+    }
+    tcg_temp_free(z_tl);
+
+    /* Get the two potential CC_OP values into temporaries.  */
+    tcg_gen_movi_i32(cpu_tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
+    if (s->cc_op == CC_OP_DYNAMIC) {
+        oldop = cpu_cc_op;
+    } else {
+        tcg_gen_movi_i32(cpu_tmp3_i32, s->cc_op);
+        oldop = cpu_tmp3_i32;
+    }
+
+    /* Conditionally store the CC_OP value.  */
+    z32 = tcg_const_i32(0);
+    s32 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(s32, count);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, cpu_tmp2_i32, oldop);
+    tcg_temp_free_i32(z32);
+    tcg_temp_free_i32(s32);
+
+    /* The CC_OP value is no longer predictable.  */
+    set_cc_op(s, CC_OP_DYNAMIC);
+}
+
 static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
                             int is_right, int is_arith)
 {
-    target_ulong mask;
-    int shift_label;
-    TCGv t0, t1, t2;
-
-    if (ot == OT_QUAD) {
-        mask = 0x3f;
-    } else {
-        mask = 0x1f;
-    }
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0) {
@@ -1467,25 +1633,22 @@
         gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-
-    tcg_gen_andi_tl(t2, cpu_T[1], mask);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
+    tcg_gen_subi_tl(cpu_tmp0, cpu_T[1], 1);
 
     if (is_right) {
         if (is_arith) {
             gen_exts(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_sar_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         } else {
             gen_extu(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         }
     } else {
-        tcg_gen_mov_tl(t0, cpu_T[0]);
-        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], t2);
+        tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
     }
 
     /* store */
@@ -1495,52 +1658,13 @@
         gen_op_mov_reg_T0(ot, op1);
     }
 
-    /* update eflags if non zero shift */
-    if (s->cc_op != CC_OP_DYNAMIC) {
-        gen_op_set_cc_op(s->cc_op);
-    }
-
-    tcg_gen_mov_tl(t1, cpu_T[0]);
-
-    shift_label = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, shift_label);
-
-    tcg_gen_addi_tl(t2, t2, -1);
-    tcg_gen_mov_tl(cpu_cc_dst, t1);
-
-    if (is_right) {
-        if (is_arith) {
-            tcg_gen_sar_tl(cpu_cc_src, t0, t2);
-        } else {
-            tcg_gen_shr_tl(cpu_cc_src, t0, t2);
-        }
-    } else {
-        tcg_gen_shl_tl(cpu_cc_src, t0, t2);
-    }
-
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
-    } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
-    }
-
-    gen_set_label(shift_label);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, cpu_T[1], is_right);
 }
 
 static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
                             int is_right, int is_arith)
 {
-    int mask;
-    
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1576,10 +1700,7 @@
     if (op2 != 0) {
         tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4);
         tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        if (is_right)
-            s->cc_op = CC_OP_SARB + ot;
-        else
-            s->cc_op = CC_OP_SHLB + ot;
+        set_cc_op(s, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
     }
 }
 
@@ -1591,187 +1712,180 @@
         tcg_gen_shri_tl(ret, arg1, -arg2);
 }
 
-static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, 
-                          int is_right)
+static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right)
 {
-    target_ulong mask;
-    int label1, label2, data_bits;
-    TCGv t0, t1, t2, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    TCGv_i32 t0, t1;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
 
-    tcg_gen_andi_tl(t1, t1, mask);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1);
-    
-    if (ot <= OT_WORD)
-        tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1);
-    else
-        tcg_gen_mov_tl(cpu_tmp0, t1);
-    
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t2, t0);
-
-    data_bits = 8 << ot;
-    /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX:
-       fix TCG definition) */
-    if (is_right) {
-        tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shl_tl(t0, t0, cpu_tmp0);
-    } else {
-        tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shr_tl(t0, t0, cpu_tmp0);
+    switch (ot) {
+    case OT_BYTE:
+        /* Replicate the 8-bit input so that a 32-bit rotate works.  */
+        tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]);
+        tcg_gen_muli_tl(cpu_T[0], cpu_T[0], 0x01010101);
+        goto do_long;
+    case OT_WORD:
+        /* Replicate the 16-bit input so that a 32-bit rotate works.  */
+        tcg_gen_deposit_tl(cpu_T[0], cpu_T[0], cpu_T[0], 16, 16);
+        goto do_long;
+    do_long:
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+        tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+        if (is_right) {
+            tcg_gen_rotr_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        } else {
+            tcg_gen_rotl_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        }
+        tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+        break;
+#endif
+    default:
+        if (is_right) {
+            tcg_gen_rotr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        } else {
+            tcg_gen_rotl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        }
+        break;
     }
-    tcg_gen_or_tl(t0, t0, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    
-    /* update eflags */
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
 
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2);
+    /* We'll need the flags computed into CC_SRC.  */
+    gen_compute_eflags(s);
 
-    gen_compute_eflags(cpu_cc_src);
-    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-    tcg_gen_xor_tl(cpu_tmp0, t2, t0);
-    tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-    tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+    /* The value that was "rotated out" is now present at the other end
+       of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+       since we've computed the flags into CC_SRC, these variables are
+       currently dead.  */
     if (is_right) {
-        tcg_gen_shri_tl(t0, t0, data_bits - 1);
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+        tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+    } else {
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
     }
-    tcg_gen_andi_tl(t0, t0, CC_C);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-    
-    tcg_gen_discard_tl(cpu_cc_dst);
-    tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        
-    gen_set_label(label2);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
+    tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+    tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
 
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    /* Now conditionally store the new CC_OP value.  If the shift count
+       is 0 we keep the CC_OP_EFLAGS setting so that only CC_SRC is live.
+       Otherwise reuse CC_OP_ADCOX, which has the C and O flags split out
+       exactly as we computed above.  */
+    t0 = tcg_const_i32(0);
+    t1 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t1, cpu_T[1]);
+    tcg_gen_movi_i32(cpu_tmp2_i32, CC_OP_ADCOX); 
+    tcg_gen_movi_i32(cpu_tmp3_i32, CC_OP_EFLAGS);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, t1, t0,
+                        cpu_tmp2_i32, cpu_tmp3_i32);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+
+    /* The CC_OP value is no longer predictable.  */ 
+    set_cc_op(s, CC_OP_DYNAMIC);
 }
 
 static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2,
                           int is_right)
 {
-    int mask;
-    int data_bits;
-    TCGv t0, t1, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    int shift;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t1, t0);
-
     op2 &= mask;
-    data_bits = 8 << ot;
     if (op2 != 0) {
-        int shift = op2 & ((1 << (3 + ot)) - 1);
-        if (is_right) {
-            tcg_gen_shri_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shli_tl(t0, t0, data_bits - shift);
+        switch (ot) {
+#ifdef TARGET_X86_64
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            if (is_right) {
+                tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            } else {
+                tcg_gen_rotli_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            }
+            tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+            break;
+#endif
+        default:
+            if (is_right) {
+                tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], op2);
+            } else {
+                tcg_gen_rotli_tl(cpu_T[0], cpu_T[0], op2);
+            }
+            break;
+        case OT_BYTE:
+            mask = 7;
+            goto do_shifts;
+        case OT_WORD:
+            mask = 15;
+        do_shifts:
+            shift = op2 & mask;
+            if (is_right) {
+                shift = mask + 1 - shift;
+            }
+            gen_extu(ot, cpu_T[0]);
+            tcg_gen_shli_tl(cpu_tmp0, cpu_T[0], shift);
+            tcg_gen_shri_tl(cpu_T[0], cpu_T[0], mask + 1 - shift);
+            tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
         }
-        else {
-            tcg_gen_shli_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shri_tl(t0, t0, data_bits - shift);
-        }
-        tcg_gen_or_tl(t0, t0, cpu_tmp4);
     }
 
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
 
     if (op2 != 0) {
-        /* update eflags */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        /* Compute the flags into CC_SRC.  */
+        gen_compute_eflags(s);
 
-        gen_compute_eflags(cpu_cc_src);
-        tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-        tcg_gen_xor_tl(cpu_tmp0, t1, t0);
-        tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+        /* The value that was "rotated out" is now present at the other end
+           of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+           since we've computed the flags into CC_SRC, these variables are
+           currently dead.  */
         if (is_right) {
-            tcg_gen_shri_tl(t0, t0, data_bits - 1);
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+            tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+        } else {
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
         }
-        tcg_gen_andi_tl(t0, t0, CC_C);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-
-        tcg_gen_discard_tl(cpu_cc_dst);
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        s->cc_op = CC_OP_EFLAGS;
+        tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+        tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+        set_cc_op(s, CC_OP_ADCOX);
     }
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(a0);
 }
 
 /* XXX: add faster immediate = 1 case */
 static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, 
                            int is_right)
 {
-    int label1;
-
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_compute_eflags(s);
+    assert(s->cc_op == CC_OP_EFLAGS);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1781,34 +1895,34 @@
     
     if (is_right) {
         switch (ot) {
-        case 0:
+        case OT_BYTE:
             gen_helper_rcrb(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
-        case 1:
+        case OT_WORD:
             gen_helper_rcrw(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
-        case 2:
+        case OT_LONG:
             gen_helper_rcrl(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
 #ifdef TARGET_X86_64
-        case 3:
+        case OT_QUAD:
             gen_helper_rcrq(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
 #endif
         }
     } else {
         switch (ot) {
-        case 0:
+        case OT_BYTE:
             gen_helper_rclb(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
-        case 1:
+        case OT_WORD:
             gen_helper_rclw(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
-        case 2:
+        case OT_LONG:
             gen_helper_rcll(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
 #ifdef TARGET_X86_64
-        case 3:
+        case OT_QUAD:
             gen_helper_rclq(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
             break;
 #endif
@@ -1819,146 +1933,92 @@
         gen_op_st_T0_A0(ot + s->mem_index);
     else
         gen_op_mov_reg_T0(ot, op1);
-
-    /* update eflags */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_cc_tmp, -1, label1);
-
-    tcg_gen_mov_tl(cpu_cc_src, cpu_cc_tmp);
-    tcg_gen_discard_tl(cpu_cc_dst);
-    tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        
-    gen_set_label(label1);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
 }
 
 /* XXX: add faster immediate case */
-static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, 
-                                int is_right)
+static void gen_shiftd_rm_T1(DisasContext *s, int ot, int op1,
+                             bool is_right, TCGv count_in)
 {
-    int label1, label2, data_bits;
-    target_ulong mask;
-    TCGv t0, t1, t2, a0;
-
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 63 : 31);
+    TCGv count;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_andi_tl(cpu_T3, cpu_T3, mask);
+    count = tcg_temp_new();
+    tcg_gen_andi_tl(count, count_in, mask);
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
-    tcg_gen_mov_tl(t2, cpu_T3);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
-    
-    tcg_gen_addi_tl(cpu_tmp5, t2, -1);
-    if (ot == OT_WORD) {
-        /* Note: we implement the Intel behaviour for shift count > 16 */
+    switch (ot) {
+    case OT_WORD:
+        /* Note: we implement the Intel behaviour for shift count > 16.
+           This means "shrdw C, B, A" shifts A:B:A >> C.  Build the B:A
+           portion by constructing it as a 32-bit value.  */
         if (is_right) {
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(cpu_tmp0, t1, 16);
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
-            tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            /* only needed if count > 16, but a test would complicate */
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5);
-
-            tcg_gen_shr_tl(t0, t0, t2);
-
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
+            tcg_gen_deposit_tl(cpu_tmp0, cpu_T[0], cpu_T[1], 16, 16);
+            tcg_gen_mov_tl(cpu_T[1], cpu_T[0]);
+            tcg_gen_mov_tl(cpu_T[0], cpu_tmp0);
         } else {
-            /* XXX: not optimal */
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(t1, t1, 16);
-            tcg_gen_or_tl(t1, t1, t0);
-            tcg_gen_ext32u_tl(t1, t1);
-            
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5);
-            tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0);
-            tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5);
-
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_deposit_tl(cpu_T[1], cpu_T[0], cpu_T[1], 16, 16);
         }
-    } else {
-        data_bits = 8 << ot;
+        /* FALLTHRU */
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        /* Concatenate the two 32-bit values and use a 64-bit shift.  */
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
         if (is_right) {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-
-            tcg_gen_shr_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shl_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
-            
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[0], cpu_T[1]);
+            tcg_gen_shr_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_i64(cpu_T[0], cpu_T[0], count);
         } else {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t1, t1);
-
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[1], cpu_T[0]);
+            tcg_gen_shl_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shl_i64(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shri_i64(cpu_tmp0, cpu_tmp0, 32);
+            tcg_gen_shri_i64(cpu_T[0], cpu_T[0], 32);
         }
+        break;
+#endif
+    default:
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
+        if (is_right) {
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
+        } else {
+            tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            if (ot == OT_WORD) {
+                /* Only needed if count > 16, for Intel behaviour.  */
+                tcg_gen_subfi_tl(cpu_tmp4, 33, count);
+                tcg_gen_shr_tl(cpu_tmp4, cpu_T[1], cpu_tmp4);
+                tcg_gen_or_tl(cpu_tmp0, cpu_tmp0, cpu_tmp4);
+            }
+
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shl_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shr_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
+        }
+        tcg_gen_movi_tl(cpu_tmp4, 0);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[1], count, cpu_tmp4,
+                           cpu_tmp4, cpu_T[1]);
+        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        break;
     }
-    tcg_gen_mov_tl(t1, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    
-    /* update eflags */
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
 
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2);
-
-    tcg_gen_mov_tl(cpu_cc_src, t1);
-    tcg_gen_mov_tl(cpu_cc_dst, t0);
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
-    } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
-    }
-    gen_set_label(label2);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, count, is_right);
+    tcg_temp_free(count);
 }
 
 static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
@@ -2362,24 +2422,21 @@
 static inline void gen_jcc(DisasContext *s, int b,
                            target_ulong val, target_ulong next_eip)
 {
-    int l1, l2, cc_op;
+    int l1, l2;
 
-    cc_op = s->cc_op;
-    gen_update_cc_op(s);
     if (s->jmp_opt) {
         l1 = gen_new_label();
-        gen_jcc1(s, cc_op, b, l1);
-        
+        gen_jcc1(s, b, l1);
+
         gen_goto_tb(s, 0, next_eip);
 
         gen_set_label(l1);
         gen_goto_tb(s, 1, val);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
-
         l1 = gen_new_label();
         l2 = gen_new_label();
-        gen_jcc1(s, cc_op, b, l1);
+        gen_jcc1(s, b, l1);
 
         gen_jmp_im(next_eip);
         tcg_gen_br(l2);
@@ -2391,32 +2448,32 @@
     }
 }
 
-static void gen_setcc(DisasContext *s, int b)
+static void gen_cmovcc1(CPUX86State *env, DisasContext *s, int ot, int b,
+                        int modrm, int reg)
 {
-    int inv, jcc_op, l1;
-    TCGv t0;
+    CCPrepare cc;
 
-    if (is_fast_jcc_case(s, b)) {
-        /* nominal case: we use a jump */
-        /* XXX: make it faster by adding new instructions in TCG */
-        t0 = tcg_temp_local_new();
-        tcg_gen_movi_tl(t0, 0);
-        l1 = gen_new_label();
-        gen_jcc1(s, s->cc_op, b ^ 1, l1);
-        tcg_gen_movi_tl(t0, 1);
-        gen_set_label(l1);
-        tcg_gen_mov_tl(cpu_T[0], t0);
-        tcg_temp_free(t0);
-    } else {
-        /* slow case: it is more efficient not to generate a jump,
-           although it is questionnable whether this optimization is
-           worth to */
-        inv = b & 1;
-        jcc_op = (b >> 1) & 7;
-        gen_setcc_slow_T0(s, jcc_op);
-        if (inv) {
-            tcg_gen_xori_tl(cpu_T[0], cpu_T[0], 1);
-        }
+    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); /* T0 = r/m operand */
+    /* Branch-free conditional move: select T0 or the old register value.  */
+    cc = gen_prepare_cc(s, b, cpu_T[1]);
+    if (cc.mask != -1) { /* compare only the bits selected by cc.mask */
+        TCGv t0 = tcg_temp_new();
+        tcg_gen_andi_tl(t0, cc.reg, cc.mask);
+        cc.reg = t0;
+    }
+    if (!cc.use_reg2) { /* right-hand side is the immediate cc.imm */
+        cc.reg2 = tcg_const_tl(cc.imm);
+    }
+
+    tcg_gen_movcond_tl(cc.cond, cpu_T[0], cc.reg, cc.reg2,
+                       cpu_T[0], cpu_regs[reg]);
+    gen_op_mov_reg_T0(ot, reg); /* unconditional write-back of the choice */
+    /* Free only the temporaries that were allocated above.  */
+    if (cc.mask != -1) {
+        tcg_temp_free(cc.reg);
+    }
+    if (!cc.use_reg2) {
+        tcg_temp_free(cc.reg2);
     }
 }
 
@@ -2442,8 +2499,7 @@
 {
     if (s->pe && !s->vm86) {
         /* XXX: optimize by finding processor state dynamically */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(cur_eip);
         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
         gen_helper_load_seg(cpu_env, tcg_const_i32(seg_reg), cpu_tmp2_i32);
@@ -2472,8 +2528,7 @@
     /* no SVM activated; fast case */
     if (likely(!(s->flags & HF_SVMI_MASK)))
         return;
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(pc_start - s->cs_base);
     gen_helper_svm_check_intercept_param(cpu_env, tcg_const_i32(type),
                                          tcg_const_i64(param));
@@ -2720,8 +2775,7 @@
 
 static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_raise_exception(cpu_env, tcg_const_i32(trapno));
     s->is_jmp = DISAS_TB_JUMP;
@@ -2732,8 +2786,7 @@
 static void gen_interrupt(DisasContext *s, int intno,
                           target_ulong cur_eip, target_ulong next_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_raise_interrupt(cpu_env, tcg_const_i32(intno),
                                tcg_const_i32(next_eip - cur_eip));
@@ -2742,8 +2795,7 @@
 
 static void gen_debug(DisasContext *s, target_ulong cur_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_debug(cpu_env);
     s->is_jmp = DISAS_TB_JUMP;
@@ -2753,8 +2805,7 @@
    if needed */
 static void gen_eob(DisasContext *s)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     if (s->tb->flags & HF_INHIBIT_IRQ_MASK) {
         gen_helper_reset_inhibit_irq(cpu_env);
     }
@@ -2775,8 +2826,9 @@
    direct call to the next block may occur */
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
 {
+    gen_update_cc_op(s);
+    set_cc_op(s, CC_OP_DYNAMIC);
     if (s->jmp_opt) {
-        gen_update_cc_op(s);
         gen_goto_tb(s, tb_num, eip);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
@@ -2912,8 +2964,9 @@
     [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
                (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
-    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
-    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
+    /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX.  */
+    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
+    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
 
     /* MMX ops and their SSE extensions */
     [0x60] = MMX_OP2(punpcklbw),
@@ -3794,11 +3847,13 @@
             reg = ((modrm >> 3) & 7) | rex_r;
             gen_op_mov_reg_T0(OT_LONG, reg);
             break;
+
         case 0x138:
-            if (s->prefix & PREFIX_REPNZ)
-                goto crc32;
         case 0x038:
             b = modrm;
+            if ((b & 0xf0) == 0xf0) {
+                goto do_0f_38_fx;
+            }
             modrm = cpu_ldub_code(env, s->pc++);
             rm = modrm & 7;
             reg = ((modrm >> 3) & 7) | rex_r;
@@ -3867,39 +3922,418 @@
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
 
-            if (b == 0x17)
-                s->cc_op = CC_OP_EFLAGS;
+            if (b == 0x17) {
+                set_cc_op(s, CC_OP_EFLAGS);
+            }
             break;
-        case 0x338: /* crc32 */
-        crc32:
-            b = modrm;
+
+        case 0x238:
+        case 0x338:
+        do_0f_38_fx:
+            /* Various integer extensions at 0f 38 f[0-f].  */
+            b = modrm | (b1 << 8);
             modrm = cpu_ldub_code(env, s->pc++);
             reg = ((modrm >> 3) & 7) | rex_r;
 
-            if (b != 0xf0 && b != 0xf1)
+            switch (b) {
+            case 0x3f0: /* crc32 Gd,Eb */
+            case 0x3f1: /* crc32 Gd,Ey */
+            do_crc32:
+                if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) {
+                    goto illegal_op;
+                }
+                if ((b & 0xff) == 0xf0) {
+                    ot = OT_BYTE;
+                } else if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                gen_op_mov_TN_reg(OT_LONG, 0, reg);
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
+                                 cpu_T[0], tcg_const_i32(8 << ot));
+
+                ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            case 0x1f0: /* crc32 or movbe */
+            case 0x1f1:
+                /* For these insns, the f3 prefix is supposed to have priority
+                   over the 66 prefix, but that's not what we implement above
+                   setting b1.  */
+                if (s->prefix & PREFIX_REPNZ) {
+                    goto do_crc32;
+                }
+                /* FALLTHRU */
+            case 0x0f0: /* movbe Gy,My */
+            case 0x0f1: /* movbe My,Gy */
+                if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) {
+                    goto illegal_op;
+                }
+                if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                /* Load the data incoming to the bswap.  Note that the TCG
+                   implementation of bswap requires the input be zero
+                   extended.  In the case of the loads, we simply know that
+                   gen_op_ld_v via gen_ldst_modrm does that already.  */
+                if ((b & 1) == 0) {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                } else {
+                    switch (ot) {
+                    case OT_WORD:
+                        tcg_gen_ext16u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    default:
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    case OT_QUAD:
+                        tcg_gen_mov_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    }
+                }
+
+                switch (ot) {
+                case OT_WORD:
+                    tcg_gen_bswap16_tl(cpu_T[0], cpu_T[0]);
+                    break;
+                default:
+                    tcg_gen_bswap32_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_bswap64_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#endif
+                }
+
+                if ((b & 1) == 0) {
+                    gen_op_mov_reg_T0(ot, reg);
+                } else {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 1);
+                }
+                break;
+
+            case 0x0f2: /* andn Gy, By, Ey: Gy = ~By & Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); /* T0 = Ey */
+                tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_regs[s->vex_v]);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_LOGICB + ot);
+                break;
+
+            case 0x0f7: /* bextr Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                {
+                    TCGv bound, zero;
+
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                    /* Extract START, and shift the operand.
+                       Shifts larger than operand size get zeros.  */
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_regs[s->vex_v]);
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_A0);
+
+                    bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    zero = tcg_const_tl(0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_T[0], cpu_A0, bound,
+                                       cpu_T[0], zero);
+                    tcg_temp_free(zero);
+
+                    /* Extract the LEN into a mask.  Lengths larger than
+                       operand size get all ones.  */
+                    tcg_gen_shri_tl(cpu_A0, cpu_regs[s->vex_v], 8);
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_A0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_A0, cpu_A0, bound,
+                                       cpu_A0, bound);
+                    tcg_temp_free(bound);
+                    tcg_gen_movi_tl(cpu_T[1], 1);
+                    tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_A0);
+                    tcg_gen_subi_tl(cpu_T[1], cpu_T[1], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+
+                    gen_op_mov_reg_T0(ot, reg);
+                    gen_op_update1_cc();
+                    set_cc_op(s, CC_OP_LOGICB + ot);
+                }
+                break;
+
+            case 0x0f5: /* bzhi Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_ext8u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                {
+                    TCGv bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    /* Note that since we're using BMILG (in order to get O
+                       cleared) we need to store the inverse into C.  */
+                    tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
+                                       cpu_T[1], bound);
+                    tcg_gen_movcond_tl(TCG_COND_GT, cpu_T[1], cpu_T[1],
+                                       bound, bound, cpu_T[1]);
+                    tcg_temp_free(bound);
+                }
+                tcg_gen_movi_tl(cpu_A0, -1);
+                tcg_gen_shl_tl(cpu_A0, cpu_A0, cpu_T[1]);
+                tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_A0);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
+
+            case 0x3f6: /* mulx By, Gy, rdx, Ey: Gy:By = rdx * Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); /* T0 = Ey */
+                switch (ot) {
+                    TCGv_i64 t0, t1;
+                default: /* OT_LONG: 32x32->64 product in i64 temporaries */
+                    t0 = tcg_temp_new_i64();
+                    t1 = tcg_temp_new_i64();
+#ifdef TARGET_X86_64
+                    tcg_gen_ext32u_i64(t0, cpu_T[0]);
+                    tcg_gen_ext32u_i64(t1, cpu_regs[R_EDX]);
+#else
+                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
+                    tcg_gen_extu_i32_i64(t1, cpu_regs[R_EDX]); /* t1 = EDX */
+#endif
+                    tcg_gen_mul_i64(t0, t0, t1);
+                    tcg_gen_trunc_i64_tl(cpu_T[0], t0); /* T0 = low half */
+                    tcg_gen_shri_i64(t0, t0, 32);
+                    tcg_gen_trunc_i64_tl(cpu_T[1], t0); /* T1 = high half */
+                    tcg_temp_free_i64(t0);
+                    tcg_temp_free_i64(t1);
+                    gen_op_mov_reg_T0(OT_LONG, s->vex_v); /* By = low */
+                    gen_op_mov_reg_T1(OT_LONG, reg); /* Gy = high, last */
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[R_EDX]);
+                    tcg_gen_mul_tl(cpu_regs[s->vex_v], cpu_T[0], cpu_T[1]);
+                    gen_helper_umulh(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                    break;
+#endif
+                }
+                break;
+
+            case 0x3f5: /* pdep Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pdep(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x2f5: /* pext Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pext(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x1f6: /* adcx Gy, Ey: carry chain kept in cc_dst */
+            case 0x2f6: /* adox Gy, Ey: carry chain kept in cc_src2 */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
+                    goto illegal_op;
+                } else {
+                    TCGv carry_in, carry_out, zero;
+                    int end_op;
+
+                    ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                    /* Re-use the carry-out from a previous round.  */
+                    TCGV_UNUSED(carry_in);
+                    carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
+                    switch (s->cc_op) {
+                    case CC_OP_ADCX:
+                        if (b == 0x1f6) {
+                            carry_in = cpu_cc_dst;
+                            end_op = CC_OP_ADCX;
+                        } else {
+                            end_op = CC_OP_ADCOX;
+                        }
+                        break;
+                    case CC_OP_ADOX:
+                        if (b == 0x1f6) {
+                            end_op = CC_OP_ADCOX;
+                        } else {
+                            carry_in = cpu_cc_src2;
+                            end_op = CC_OP_ADOX;
+                        }
+                        break;
+                    case CC_OP_ADCOX:
+                        end_op = CC_OP_ADCOX;
+                        carry_in = carry_out;
+                        break;
+                    default: /* no chain live: track only this insn's flag */
+                        end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
+                        break;
+                    }
+                    /* If we can't reuse carry-out, get it out of EFLAGS.  */
+                    if (TCGV_IS_UNUSED(carry_in)) {
+                        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+                            gen_compute_eflags(s);
+                        }
+                        carry_in = cpu_tmp0;
+                        tcg_gen_shri_tl(carry_in, cpu_cc_src,
+                                        ctz32(b == 0x1f6 ? CC_C : CC_O));
+                        tcg_gen_andi_tl(carry_in, carry_in, 1);
+                    }
+
+                    switch (ot) {
+#ifdef TARGET_X86_64
+                    case OT_LONG:
+                        /* If we know TL is 64-bit, and we want a 32-bit
+                           result, just do everything in 64-bit arithmetic.  */
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
+                        tcg_gen_ext32u_i64(cpu_T[0], cpu_T[0]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], cpu_regs[reg]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], carry_in);
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_T[0]);
+                        tcg_gen_shri_i64(carry_out, cpu_T[0], 32);
+                        break;
+#endif
+                    default:
+                        /* Otherwise compute the carry-out in two steps.  */
+                        zero = tcg_const_tl(0);
+                        tcg_gen_add2_tl(cpu_T[0], carry_out,
+                                        cpu_T[0], zero,
+                                        carry_in, zero);
+                        tcg_gen_add2_tl(cpu_regs[reg], carry_out,
+                                        cpu_regs[reg], carry_out,
+                                        cpu_T[0], zero);
+                        tcg_temp_free(zero);
+                        break;
+                    }
+                    set_cc_op(s, end_op);
+                }
+                break;
+
+            case 0x1f7: /* shlx Gy, Ey, By */
+            case 0x2f7: /* sarx Gy, Ey, By */
+            case 0x3f7: /* shrx Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                if (ot == OT_QUAD) {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 63);
+                } else {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 31);
+                }
+                if (b == 0x1f7) {
+                    tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else if (b == 0x2f7) {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            case 0x0f3:
+            case 0x1f3:
+            case 0x2f3:
+            case 0x3f3: /* Group 17 */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                /* Each op keeps the source Ey in cc_src and builds its
+                   result in T0; the shared tail stores By and the flags.  */
+                switch (reg & 7) {
+                case 1: /* blsr By,Ey: By = Ey & (Ey - 1) */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    break;
+
+                case 2: /* blsmsk By,Ey: By = Ey ^ (Ey - 1) */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    break;
+
+                case 3: /* blsi By, Ey: By = Ey & -Ey */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_neg_tl(cpu_T[0], cpu_T[0]);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    break;
+
+                default:
+                    goto illegal_op;
+                }
+                /* NOTE(review): BLSI sets CF for Ey != 0; verify BMILG.  */
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                gen_op_mov_reg_T0(ot, s->vex_v);
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
+
+            default:
                 goto illegal_op;
-            if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
-                goto illegal_op;
-
-            if (b == 0xf0)
-                ot = OT_BYTE;
-            else if (b == 0xf1 && s->dflag != 2)
-                if (s->prefix & PREFIX_DATA)
-                    ot = OT_WORD;
-                else
-                    ot = OT_LONG;
-            else
-                ot = OT_QUAD;
-
-            gen_op_mov_TN_reg(OT_LONG, 0, reg);
-            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-            gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
-            gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
-                             cpu_T[0], tcg_const_i32(8 << ot));
-
-            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
-            gen_op_mov_reg_T0(ot, reg);
+            }
             break;
+
         case 0x03a:
         case 0x13a:
             b = modrm;
@@ -4070,7 +4504,7 @@
             val = cpu_ldub_code(env, s->pc++);
 
             if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
 
                 if (s->dflag == 2)
                     /* The helper must use entire 64-bit gp registers */
@@ -4081,6 +4515,38 @@
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
+
+        case 0x33a:
+            /* Various integer extensions at 0f 3a f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* rorx Gy,Ey, Ib */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                b = cpu_ldub_code(env, s->pc++);
+                if (ot == OT_QUAD) {
+                    tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], b & 63);
+                } else {
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, b & 31);
+                    tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            default:
+                goto illegal_op;
+            }
+            break;
+
         default:
             goto illegal_op;
         }
@@ -4191,7 +4657,7 @@
             break;
         }
         if (b == 0x2e || b == 0x2f) {
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         }
     }
 }
@@ -4223,47 +4689,49 @@
     x86_64_hregs = 0;
 #endif
     s->rip_offset = 0; /* for relative ip address */
+    s->vex_l = 0;
+    s->vex_v = 0;
  next_byte:
     b = cpu_ldub_code(env, s->pc);
     s->pc++;
-    /* check prefixes */
+    /* Collect prefixes.  */
+    switch (b) {
+    case 0xf3:
+        prefixes |= PREFIX_REPZ;
+        goto next_byte;
+    case 0xf2:
+        prefixes |= PREFIX_REPNZ;
+        goto next_byte;
+    case 0xf0:
+        prefixes |= PREFIX_LOCK;
+        goto next_byte;
+    case 0x2e:
+        s->override = R_CS;
+        goto next_byte;
+    case 0x36:
+        s->override = R_SS;
+        goto next_byte;
+    case 0x3e:
+        s->override = R_DS;
+        goto next_byte;
+    case 0x26:
+        s->override = R_ES;
+        goto next_byte;
+    case 0x64:
+        s->override = R_FS;
+        goto next_byte;
+    case 0x65:
+        s->override = R_GS;
+        goto next_byte;
+    case 0x66:
+        prefixes |= PREFIX_DATA;
+        goto next_byte;
+    case 0x67:
+        prefixes |= PREFIX_ADR;
+        goto next_byte;
 #ifdef TARGET_X86_64
-    if (CODE64(s)) {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
-        case 0x40 ... 0x4f:
+    case 0x40 ... 0x4f:
+        if (CODE64(s)) {
             /* REX prefix */
             rex_w = (b >> 3) & 1;
             rex_r = (b & 0x4) << 1;
@@ -4272,58 +4740,85 @@
             x86_64_hregs = 1; /* select uniform byte register addressing */
             goto next_byte;
         }
+        break;
+#endif
+    case 0xc5: /* 2-byte VEX */
+    case 0xc4: /* 3-byte VEX */
+        /* VEX prefixes are only decoded for 32-bit code outside vm86 mode.
+           Otherwise the instruction is LES or LDS.  */
+        if (s->code32 && !s->vm86) {
+            static const int pp_prefix[4] = {
+                0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
+            };
+            int vex3, vex2 = cpu_ldub_code(env, s->pc);
+
+            if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
+                /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
+                   otherwise the instruction is LES or LDS.  */
+                break;
+            }
+            s->pc++;
+
+            /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
+            if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
+                            | PREFIX_LOCK | PREFIX_DATA)) {
+                goto illegal_op;
+            }
+#ifdef TARGET_X86_64
+            if (x86_64_hregs) { /* set when a REX prefix was decoded */
+                goto illegal_op;
+            }
+#endif
+            rex_r = (~vex2 >> 4) & 8;
+            if (b == 0xc5) { /* 2-byte form always implies a leading 0f */
+                vex3 = vex2;
+                b = cpu_ldub_code(env, s->pc++) | 0x100;
+            } else {
+#ifdef TARGET_X86_64
+                s->rex_x = (~vex2 >> 3) & 8;
+                s->rex_b = (~vex2 >> 2) & 8;
+#endif
+                vex3 = cpu_ldub_code(env, s->pc++);
+                rex_w = (vex3 >> 7) & 1;
+                switch (vex2 & 0x1f) {
+                case 0x01: /* Implied 0f leading opcode bytes.  */
+                    b = cpu_ldub_code(env, s->pc++) | 0x100;
+                    break;
+                case 0x02: /* Implied 0f 38 leading opcode bytes.  */
+                    b = 0x138;
+                    break;
+                case 0x03: /* Implied 0f 3a leading opcode bytes.  */
+                    b = 0x13a;
+                    break;
+                default:   /* Reserved for future use.  */
+                    goto illegal_op;
+                }
+            }
+            s->vex_v = (~vex3 >> 3) & 0xf; /* vvvv is stored inverted */
+            s->vex_l = (vex3 >> 2) & 1;
+            prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX; /* pp acts as 66/f3/f2 */
+        }
+        break;
+    }
+
+    /* Post-process prefixes.  */
+    if (prefixes & PREFIX_DATA) {
+        dflag ^= 1;
+    }
+    if (prefixes & PREFIX_ADR) {
+        aflag ^= 1;
+    }
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
         if (rex_w == 1) {
             /* 0x66 is ignored if rex.w is set */
             dflag = 2;
-        } else {
-            if (prefixes & PREFIX_DATA)
-                dflag ^= 1;
         }
-        if (!(prefixes & PREFIX_ADR))
+        if (!(prefixes & PREFIX_ADR)) {
             aflag = 2;
-    } else
-#endif
-    {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
         }
-        if (prefixes & PREFIX_DATA)
-            dflag ^= 1;
-        if (prefixes & PREFIX_ADR)
-            aflag ^= 1;
     }
+#endif
 
     s->prefix = prefixes;
     s->aflag = aflag;
@@ -4374,10 +4869,9 @@
                 } else if (op == OP_XORL && rm == reg) {
                 xor_zero:
                     /* xor reg, reg optimisation */
+                    set_cc_op(s, CC_OP_CLR);
                     gen_op_movl_T0_0();
-                    s->cc_op = CC_OP_LOGICB + ot;
                     gen_op_mov_reg_T0(ot, reg);
-                    gen_op_update1_cc();
                     break;
                 } else {
                     opreg = rm;
@@ -4490,7 +4984,7 @@
             val = insn_get(env, s, ot);
             gen_op_movl_T1_im(val);
             gen_op_testl_T0_T1_cc();
-            s->cc_op = CC_OP_LOGICB + ot;
+            set_cc_op(s, CC_OP_LOGICB + ot);
             break;
         case 2: /* not */
             tcg_gen_not_tl(cpu_T[0], cpu_T[0]);
@@ -4508,7 +5002,7 @@
                 gen_op_mov_reg_T0(ot, rm);
             }
             gen_op_update_neg_cc();
-            s->cc_op = CC_OP_SUBB + ot;
+            set_cc_op(s, CC_OP_SUBB + ot);
             break;
         case 4: /* mul */
             switch(ot) {
@@ -4521,7 +5015,7 @@
                 gen_op_mov_reg_T0(OT_WORD, R_EAX);
                 tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
                 tcg_gen_andi_tl(cpu_cc_src, cpu_T[0], 0xff00);
-                s->cc_op = CC_OP_MULB;
+                set_cc_op(s, CC_OP_MULB);
                 break;
             case OT_WORD:
                 gen_op_mov_TN_reg(OT_WORD, 1, R_EAX);
@@ -4534,7 +5028,7 @@
                 tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16);
                 gen_op_mov_reg_T0(OT_WORD, R_EDX);
                 tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                s->cc_op = CC_OP_MULW;
+                set_cc_op(s, CC_OP_MULW);
                 break;
             default:
             case OT_LONG:
@@ -4566,12 +5060,12 @@
                     tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
                 }
 #endif
-                s->cc_op = CC_OP_MULL;
+                set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
                 gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
-                s->cc_op = CC_OP_MULQ;
+                set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
             }
@@ -4588,7 +5082,7 @@
                 tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
                 tcg_gen_ext8s_tl(cpu_tmp0, cpu_T[0]);
                 tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                s->cc_op = CC_OP_MULB;
+                set_cc_op(s, CC_OP_MULB);
                 break;
             case OT_WORD:
                 gen_op_mov_TN_reg(OT_WORD, 1, R_EAX);
@@ -4602,7 +5096,7 @@
                 tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
                 tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16);
                 gen_op_mov_reg_T0(OT_WORD, R_EDX);
-                s->cc_op = CC_OP_MULW;
+                set_cc_op(s, CC_OP_MULW);
                 break;
             default:
             case OT_LONG:
@@ -4636,12 +5130,12 @@
                     tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
                 }
 #endif
-                s->cc_op = CC_OP_MULL;
+                set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
                 gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
-                s->cc_op = CC_OP_MULQ;
+                set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
             }
@@ -4761,8 +5255,7 @@
             gen_op_ldu_T0_A0(OT_WORD + s->mem_index);
         do_lcall:
             if (s->pe && !s->vm86) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
                 gen_helper_lcall_protected(cpu_env, cpu_tmp2_i32, cpu_T[1],
@@ -4788,8 +5281,7 @@
             gen_op_ldu_T0_A0(OT_WORD + s->mem_index);
         do_ljmp:
             if (s->pe && !s->vm86) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
                 gen_helper_ljmp_protected(cpu_env, cpu_tmp2_i32, cpu_T[1],
@@ -4822,7 +5314,7 @@
         gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_TN_reg(ot, 1, reg);
         gen_op_testl_T0_T1_cc();
-        s->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s, CC_OP_LOGICB + ot);
         break;
 
     case 0xa8: /* test eAX, Iv */
@@ -4836,7 +5328,7 @@
         gen_op_mov_TN_reg(ot, 0, OR_EAX);
         gen_op_movl_T1_im(val);
         gen_op_testl_T0_T1_cc();
-        s->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s, CC_OP_LOGICB + ot);
         break;
 
     case 0x98: /* CWDE/CBW */
@@ -4937,7 +5429,7 @@
             tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
         }
         gen_op_mov_reg_T0(ot, reg);
-        s->cc_op = CC_OP_MULB + ot;
+        set_cc_op(s, CC_OP_MULB + ot);
         break;
     case 0x1c0:
     case 0x1c1: /* xadd Ev, Gv */
@@ -4964,7 +5456,7 @@
             gen_op_mov_reg_T1(ot, reg);
         }
         gen_op_update2_cc();
-        s->cc_op = CC_OP_ADDB + ot;
+        set_cc_op(s, CC_OP_ADDB + ot);
         break;
     case 0x1b0:
     case 0x1b1: /* cmpxchg Ev, Gv */
@@ -4994,9 +5486,10 @@
                 rm = 0; /* avoid warning */
             }
             label1 = gen_new_label();
-            tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0);
+            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+            gen_extu(ot, t0);
             gen_extu(ot, t2);
-            tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
+            tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
             label2 = gen_new_label();
             if (mod == 3) {
                 gen_op_mov_reg_v(ot, R_EAX, t0);
@@ -5015,8 +5508,9 @@
             }
             gen_set_label(label2);
             tcg_gen_mov_tl(cpu_cc_src, t0);
-            tcg_gen_mov_tl(cpu_cc_dst, t2);
-            s->cc_op = CC_OP_SUBB + ot;
+            tcg_gen_mov_tl(cpu_cc_srcT, t2);
+            tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
+            set_cc_op(s, CC_OP_SUBB + ot);
             tcg_temp_free(t0);
             tcg_temp_free(t1);
             tcg_temp_free(t2);
@@ -5033,8 +5527,7 @@
             if (!(s->cpuid_ext_features & CPUID_EXT_CX16))
                 goto illegal_op;
             gen_jmp_im(pc_start - s->cs_base);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
             gen_helper_cmpxchg16b(cpu_env, cpu_A0);
         } else
@@ -5043,12 +5536,11 @@
             if (!(s->cpuid_features & CPUID_CX8))
                 goto illegal_op;
             gen_jmp_im(pc_start - s->cs_base);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
             gen_helper_cmpxchg8b(cpu_env, cpu_A0);
         }
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
 
         /**************************/
@@ -5452,13 +5944,11 @@
         }
         break;
     case 0xc4: /* les Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX3; see above.  */
         op = R_ES;
         goto do_lxx;
     case 0xc5: /* lds Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX2; see above.  */
         op = R_DS;
         goto do_lxx;
     case 0x1b2: /* lss Gv */
@@ -5569,12 +6059,12 @@
         gen_op_mov_TN_reg(ot, 1, reg);
 
         if (shift) {
-            val = cpu_ldub_code(env, s->pc++);
-            tcg_gen_movi_tl(cpu_T3, val);
+            TCGv imm = tcg_const_tl(cpu_ldub_code(env, s->pc++));
+            gen_shiftd_rm_T1(s, ot, opreg, op, imm);
+            tcg_temp_free(imm);
         } else {
-            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+            gen_shiftd_rm_T1(s, ot, opreg, op, cpu_regs[R_ECX]);
         }
-        gen_shiftd_rm_T1_T3(s, ot, opreg, op);
         break;
 
         /************************/
@@ -5717,8 +6207,7 @@
                 }
                 break;
             case 0x0c: /* fldenv mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fldenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5728,8 +6217,7 @@
                 gen_helper_fldcw(cpu_env, cpu_tmp2_i32);
                 break;
             case 0x0e: /* fnstenv mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fstenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5739,27 +6227,23 @@
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
             case 0x1d: /* fldt mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fldt_ST0(cpu_env, cpu_A0);
                 break;
             case 0x1f: /* fstpt mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fstt_ST0(cpu_env, cpu_A0);
                 gen_helper_fpop(cpu_env);
                 break;
             case 0x2c: /* frstor mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_frstor(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x2e: /* fnsave mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fsave(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5769,14 +6253,12 @@
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
             case 0x3c: /* fbld */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fbld_ST0(cpu_env, cpu_A0);
                 break;
             case 0x3e: /* fbstp */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fbst_ST0(cpu_env, cpu_A0);
                 gen_helper_fpop(cpu_env);
@@ -5814,8 +6296,7 @@
                 switch(rm) {
                 case 0: /* fnop */
                     /* check exceptions (FreeBSD FPU probe) */
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     gen_helper_fwait(cpu_env);
                     break;
@@ -5996,18 +6477,16 @@
                 }
                 break;
             case 0x1d: /* fucomi */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fucomi_ST0_FT0(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x1e: /* fcomi */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fcomi_ST0_FT0(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x28: /* ffree sti */
                 gen_helper_ffree_STN(cpu_env, tcg_const_i32(opreg));
@@ -6059,20 +6538,18 @@
                 }
                 break;
             case 0x3d: /* fucomip */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fucomi_ST0_FT0(cpu_env);
                 gen_helper_fpop(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x3e: /* fcomip */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fcomi_ST0_FT0(cpu_env);
                 gen_helper_fpop(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x10 ... 0x13: /* fcmovxx */
             case 0x18 ... 0x1b:
@@ -6086,7 +6563,7 @@
                     };
                     op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1);
                     l1 = gen_new_label();
-                    gen_jcc1(s, s->cc_op, op1, l1);
+                    gen_jcc1_noeob(s, op1, l1);
                     gen_helper_fmov_ST0_STN(cpu_env, tcg_const_i32(opreg));
                     gen_set_label(l1);
                 }
@@ -6150,7 +6627,6 @@
             gen_repz_scas(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0);
         } else {
             gen_scas(s, ot);
-            s->cc_op = CC_OP_SUBB + ot;
         }
         break;
 
@@ -6166,7 +6642,6 @@
             gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0);
         } else {
             gen_cmps(s, ot);
-            s->cc_op = CC_OP_SUBB + ot;
         }
         break;
     case 0x6c: /* insS */
@@ -6323,8 +6798,7 @@
         s->pc += 2;
     do_lret:
         if (s->pe && !s->vm86) {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_lret_protected(cpu_env, tcg_const_i32(s->dflag),
                                       tcg_const_i32(val));
@@ -6354,21 +6828,20 @@
         if (!s->pe) {
             /* real mode */
             gen_helper_iret_real(cpu_env, tcg_const_i32(s->dflag));
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         } else if (s->vm86) {
             if (s->iopl != 3) {
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
             } else {
                 gen_helper_iret_real(cpu_env, tcg_const_i32(s->dflag));
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
             }
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_iret_protected(cpu_env, tcg_const_i32(s->dflag),
                                       tcg_const_i32(s->pc - s->cs_base));
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         }
         gen_eob(s);
         break;
@@ -6455,44 +6928,14 @@
 
     case 0x190 ... 0x19f: /* setcc Gv */
         modrm = cpu_ldub_code(env, s->pc++);
-        gen_setcc(s, b);
+        gen_setcc1(s, b, cpu_T[0]);
         gen_ldst_modrm(env, s, modrm, OT_BYTE, OR_TMP0, 1);
         break;
     case 0x140 ... 0x14f: /* cmov Gv, Ev */
-        {
-            int l1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            mod = (modrm >> 6) & 3;
-            t0 = tcg_temp_local_new();
-            if (mod != 3) {
-                gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-                gen_op_ld_v(ot + s->mem_index, t0, cpu_A0);
-            } else {
-                rm = (modrm & 7) | REX_B(s);
-                gen_op_mov_v_reg(ot, t0, rm);
-            }
-#ifdef TARGET_X86_64
-            if (ot == OT_LONG) {
-                /* XXX: specific Intel behaviour ? */
-                l1 = gen_new_label();
-                gen_jcc1(s, s->cc_op, b ^ 1, l1);
-                tcg_gen_mov_tl(cpu_regs[reg], t0);
-                gen_set_label(l1);
-                tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
-            } else
-#endif
-            {
-                l1 = gen_new_label();
-                gen_jcc1(s, s->cc_op, b ^ 1, l1);
-                gen_op_mov_reg_v(ot, reg, t0);
-                gen_set_label(l1);
-            }
-            tcg_temp_free(t0);
-        }
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_cmovcc1(env, s, ot, b, modrm, reg);
         break;
 
         /************************/
@@ -6502,8 +6945,7 @@
         if (s->vm86 && s->iopl != 3) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_helper_read_eflags(cpu_T[0], cpu_env);
             gen_push_T0(s);
         }
@@ -6560,7 +7002,7 @@
                 }
             }
             gen_pop_update(s);
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             /* abort translation because TF/AC flag may change */
             gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
@@ -6570,44 +7012,30 @@
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
         gen_op_mov_TN_reg(OT_BYTE, 0, R_AH);
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
         tcg_gen_andi_tl(cpu_T[0], cpu_T[0], CC_S | CC_Z | CC_A | CC_P | CC_C);
         tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_T[0]);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x9f: /* lahf */
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_T[0]);
+        gen_compute_eflags(s);
         /* Note: gen_compute_eflags() only gives the condition codes */
-        tcg_gen_ori_tl(cpu_T[0], cpu_T[0], 0x02);
+        tcg_gen_ori_tl(cpu_T[0], cpu_cc_src, 0x02);
         gen_op_mov_reg_T0(OT_BYTE, R_AH);
         break;
     case 0xf5: /* cmc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xf8: /* clc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xf9: /* stc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xfc: /* cld */
         tcg_gen_movi_i32(cpu_tmp2_i32, 1);
@@ -6697,7 +7125,7 @@
             tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
             break;
         }
-        s->cc_op = CC_OP_SARB + ot;
+        set_cc_op(s, CC_OP_SARB + ot);
         if (op != 0) {
             if (mod != 3)
                 gen_op_st_T0_A0(ot + s->mem_index);
@@ -6707,81 +7135,88 @@
             tcg_gen_movi_tl(cpu_cc_dst, 0);
         }
         break;
-    case 0x1bc: /* bsf */
-    case 0x1bd: /* bsr */
-        {
-            int label1;
-            TCGv t0;
+    case 0x1bc: /* bsf / tzcnt */
+    case 0x1bd: /* bsr / lzcnt */
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+        gen_extu(ot, cpu_T[0]);
 
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
-            gen_extu(ot, cpu_T[0]);
-            t0 = tcg_temp_local_new();
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            if ((b & 1) && (prefixes & PREFIX_REPZ) &&
-                (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
-                switch(ot) {
-                case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(16)); break;
-                case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(32)); break;
-                case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(64)); break;
-                }
-                gen_op_mov_reg_T0(ot, reg);
+        /* Note that lzcnt and tzcnt are in different extensions.  */
+        if ((prefixes & PREFIX_REPZ)
+            && (b & 1
+                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+            int size = 8 << ot;
+            tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+            if (b & 1) {
+                /* For lzcnt, reduce the target_ulong result by the
+                   number of zeros that we expect to find at the top.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
             } else {
-                label1 = gen_new_label();
-                tcg_gen_movi_tl(cpu_cc_dst, 0);
-                tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
-                if (b & 1) {
-                    gen_helper_bsr(cpu_T[0], t0);
-                } else {
-                    gen_helper_bsf(cpu_T[0], t0);
-                }
-                gen_op_mov_reg_T0(ot, reg);
-                tcg_gen_movi_tl(cpu_cc_dst, 1);
-                gen_set_label(label1);
-                tcg_gen_discard_tl(cpu_cc_src);
-                s->cc_op = CC_OP_LOGICB + ot;
+                /* For tzcnt, a zero input must return the operand size:
+                   force all bits outside the operand size to 1.  */
+                target_ulong mask = (target_ulong)-2 << (size - 1);
+                tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
             }
-            tcg_temp_free(t0);
+            /* For lzcnt/tzcnt, C and Z bits are defined and are
+               related to the result.  */
+            gen_op_update1_cc();
+            set_cc_op(s, CC_OP_BMILGB + ot);
+        } else {
+            /* For bsr/bsf, only the Z bit is defined and it is related
+               to the input and not the result.  */
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+            set_cc_op(s, CC_OP_LOGICB + ot);
+            if (b & 1) {
+                /* For bsr, return the bit index of the first 1 bit,
+                   not the count of leading zeros.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
+            } else {
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
+            }
+            /* ??? The manual says that the output is undefined when the
+               input is zero, but real hardware leaves it unchanged, and
+               real programs appear to depend on that.  */
+            tcg_gen_movi_tl(cpu_tmp0, 0);
+            tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+                               cpu_regs[reg], cpu_T[0]);
         }
+        gen_op_mov_reg_T0(ot, reg);
         break;
         /************************/
         /* bcd */
     case 0x27: /* daa */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_daa(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x2f: /* das */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_das(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x37: /* aaa */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_aaa(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x3f: /* aas */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_aas(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0xd4: /* aam */
         if (CODE64(s))
@@ -6791,7 +7226,7 @@
             gen_exception(s, EXCP00_DIVZ, pc_start - s->cs_base);
         } else {
             gen_helper_aam(cpu_env, tcg_const_i32(val));
-            s->cc_op = CC_OP_LOGICB;
+            set_cc_op(s, CC_OP_LOGICB);
         }
         break;
     case 0xd5: /* aad */
@@ -6799,7 +7234,7 @@
             goto illegal_op;
         val = cpu_ldub_code(env, s->pc++);
         gen_helper_aad(cpu_env, tcg_const_i32(val));
-        s->cc_op = CC_OP_LOGICB;
+        set_cc_op(s, CC_OP_LOGICB);
         break;
         /************************/
         /* misc */
@@ -6821,8 +7256,7 @@
             (HF_MP_MASK | HF_TS_MASK)) {
             gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fwait(cpu_env);
         }
@@ -6841,8 +7275,7 @@
     case 0xce: /* into */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_into(cpu_env, tcg_const_i32(s->pc - pc_start));
         break;
@@ -6935,9 +7368,7 @@
     case 0xd6: /* salc */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags_c(cpu_T[0]);
+        gen_compute_eflags_c(s, cpu_T[0]);
         tcg_gen_neg_tl(cpu_T[0], cpu_T[0]);
         gen_op_mov_reg_T0(OT_BYTE, R_EAX);
         break;
@@ -6961,17 +7392,9 @@
             switch(b) {
             case 0: /* loopnz */
             case 1: /* loopz */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
                 gen_op_add_reg_im(s->aflag, R_ECX, -1);
                 gen_op_jz_ecx(s->aflag, l3);
-                gen_compute_eflags(cpu_tmp0);
-                tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_Z);
-                if (b == 0) {
-                    tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, l1);
-                } else {
-                    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, l1);
-                }
+                gen_jcc1(s, (JCC_Z << 1) | (b ^ 1), l1);
                 break;
             case 2: /* loop */
                 gen_op_add_reg_im(s->aflag, R_ECX, -1);
@@ -6998,8 +7421,7 @@
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             if (b & 2) {
                 gen_helper_rdmsr(cpu_env);
@@ -7009,8 +7431,7 @@
         }
         break;
     case 0x131: /* rdtsc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         if (use_icount)
             gen_io_start();
@@ -7021,8 +7442,7 @@
         }
         break;
     case 0x133: /* rdpmc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_rdpmc(cpu_env);
         break;
@@ -7068,15 +7488,15 @@
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_sysret(cpu_env, tcg_const_i32(s->dflag));
             /* condition codes are modified only in long mode */
-            if (s->lma)
-                s->cc_op = CC_OP_EFLAGS;
+            if (s->lma) {
+                set_cc_op(s, CC_OP_EFLAGS);
+            }
             gen_eob(s);
         }
         break;
 #endif
     case 0x1a2: /* cpuid */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_cpuid(cpu_env);
         break;
@@ -7084,8 +7504,7 @@
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_hlt(cpu_env, tcg_const_i32(s->pc - pc_start));
             s->is_jmp = DISAS_TB_JUMP;
@@ -7147,14 +7566,13 @@
             if (!s->pe || s->vm86)
                 goto illegal_op;
             gen_ldst_modrm(env, s, modrm, OT_WORD, OR_TMP0, 0);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             if (op == 4) {
                 gen_helper_verr(cpu_env, cpu_T[0]);
             } else {
                 gen_helper_verw(cpu_env, cpu_T[0]);
             }
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             break;
         default:
             goto illegal_op;
@@ -7186,8 +7604,7 @@
                     if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) ||
                         s->cpl != 0)
                         goto illegal_op;
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
 #ifdef TARGET_X86_64
                     if (s->aflag == 2) {
@@ -7247,8 +7664,7 @@
         case 2: /* lgdt */
         case 3: /* lidt */
             if (mod == 3) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 switch(rm) {
                 case 0: /* VMRUN */
@@ -7376,8 +7792,7 @@
                 if (s->cpl != 0) {
                     gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
                 } else {
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
                     gen_helper_invlpg(cpu_env, cpu_A0);
@@ -7410,8 +7825,7 @@
                 case 1: /* rdtscp */
                     if (!(s->cpuid_ext2_features & CPUID_EXT2_RDTSCP))
                         goto illegal_op;
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     if (use_icount)
                         gen_io_start();
@@ -7507,12 +7921,9 @@
            } else {
                 gen_op_mov_reg_v(ot, rm, t0);
             }
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
-            gen_compute_eflags(cpu_cc_src);
+            gen_compute_eflags(s);
             tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_Z);
             tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t2);
-            s->cc_op = CC_OP_EFLAGS;
             tcg_temp_free(t0);
             tcg_temp_free(t1);
             tcg_temp_free(t2);
@@ -7530,8 +7941,7 @@
             reg = ((modrm >> 3) & 7) | rex_r;
             gen_ldst_modrm(env, s, modrm, OT_WORD, OR_TMP0, 0);
             t0 = tcg_temp_local_new();
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             if (b == 0x102) {
                 gen_helper_lar(t0, cpu_env, cpu_T[0]);
             } else {
@@ -7542,7 +7952,7 @@
             tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1);
             gen_op_mov_reg_v(ot, reg, t0);
             gen_set_label(label1);
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             tcg_temp_free(t0);
         }
         break;
@@ -7596,8 +8006,7 @@
             case 3:
             case 4:
             case 8:
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 if (b & 2) {
                     gen_op_mov_TN_reg(ot, 0, rm);
@@ -7686,8 +8095,7 @@
                 break;
             }
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fxsave(cpu_env, cpu_A0, tcg_const_i32((s->dflag == 2)));
             break;
@@ -7700,8 +8108,7 @@
                 break;
             }
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fxrstor(cpu_env, cpu_A0,
                                tcg_const_i32((s->dflag == 2)));
@@ -7785,7 +8192,7 @@
         gen_helper_popcnt(cpu_T[0], cpu_env, cpu_T[0], tcg_const_i32(ot));
         gen_op_mov_reg_T0(ot, reg);
 
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x10e ... 0x10f:
         /* 3DNow! instructions, ignore prefixes */
@@ -7820,12 +8227,12 @@
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
     cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0,
                                        offsetof(CPUX86State, cc_op), "cc_op");
-    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
-                                    "cc_src");
     cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_dst),
                                     "cc_dst");
-    cpu_cc_tmp = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_tmp),
-                                    "cc_tmp");
+    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
+                                    "cc_src");
+    cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2),
+                                     "cc_src2");
 
 #ifdef TARGET_X86_64
     cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
@@ -7918,6 +8325,7 @@
     dc->tf = (flags >> TF_SHIFT) & 1;
     dc->singlestep_enabled = env->singlestep_enabled;
     dc->cc_op = CC_OP_DYNAMIC;
+    dc->cc_op_dirty = false;
     dc->cs_base = cs_base;
     dc->tb = tb;
     dc->popl_esp_hack = 0;
@@ -7951,16 +8359,15 @@
     cpu_T[0] = tcg_temp_new();
     cpu_T[1] = tcg_temp_new();
     cpu_A0 = tcg_temp_new();
-    cpu_T3 = tcg_temp_new();
 
     cpu_tmp0 = tcg_temp_new();
     cpu_tmp1_i64 = tcg_temp_new_i64();
     cpu_tmp2_i32 = tcg_temp_new_i32();
     cpu_tmp3_i32 = tcg_temp_new_i32();
     cpu_tmp4 = tcg_temp_new();
-    cpu_tmp5 = tcg_temp_new();
     cpu_ptr0 = tcg_temp_new_ptr();
     cpu_ptr1 = tcg_temp_new_ptr();
+    cpu_cc_srcT = tcg_temp_local_new();
 
     gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 6b87340..ccaf838 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -1012,8 +1012,6 @@
     int num_insns;
     int max_insns;
 
-    qemu_log_try_set_file(stderr);
-
     pc_start = tb->pc;
     dc->env = env;
     dc->tb = tb;
diff --git a/target-microblaze/translate.c b/target-microblaze/translate.c
index 12ea820..687b7d1 100644
--- a/target-microblaze/translate.c
+++ b/target-microblaze/translate.c
@@ -1734,8 +1734,6 @@
     int num_insns;
     int max_insns;
 
-    qemu_log_try_set_file(stderr);
-
     pc_start = tb->pc;
     dc->env = env;
     dc->tb = tb;
diff --git a/target-mips/dsp_helper.c b/target-mips/dsp_helper.c
index 96cb044..841f47b 100644
--- a/target-mips/dsp_helper.c
+++ b/target-mips/dsp_helper.c
@@ -652,7 +652,7 @@
         temp = 0x7FFF0000;
         set_DSPControl_overflow_flag(1, 21, env);
     } else {
-        temp = ((uint32_t)a * (uint32_t)b);
+        temp = (int16_t)a * (int16_t)b;
         temp = temp << 1;
     }
 
@@ -2689,7 +2689,7 @@
 target_ulong helper_##name(target_ulong rs, target_ulong rt,   \
                            CPUMIPSState *env)                  \
 {                                                              \
-    uint32_t rs_t, rt_t;                                       \
+    int32_t rs_t, rt_t;                                        \
     int32_t tempI;                                             \
     int64_t tempL;                                             \
                                                                \
diff --git a/target-mips/helper.h b/target-mips/helper.h
index cd48738..ed75e2c 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -24,8 +24,6 @@
 #ifdef TARGET_MIPS64
 DEF_HELPER_FLAGS_1(dclo, TCG_CALL_NO_RWG_SE, tl, tl)
 DEF_HELPER_FLAGS_1(dclz, TCG_CALL_NO_RWG_SE, tl, tl)
-DEF_HELPER_3(dmult, void, env, tl, tl)
-DEF_HELPER_3(dmultu, void, env, tl, tl)
 #endif
 
 DEF_HELPER_3(muls, tl, env, tl, tl)
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index 526f84f..45cbb2f 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -267,18 +267,6 @@
                        (uint64_t)(uint32_t)arg2);
 }
 
-#ifdef TARGET_MIPS64
-void helper_dmult(CPUMIPSState *env, target_ulong arg1, target_ulong arg2)
-{
-    muls64(&(env->active_tc.LO[0]), &(env->active_tc.HI[0]), arg1, arg2);
-}
-
-void helper_dmultu(CPUMIPSState *env, target_ulong arg1, target_ulong arg2)
-{
-    mulu64(&(env->active_tc.LO[0]), &(env->active_tc.HI[0]), arg1, arg2);
-}
-#endif
-
 #ifndef CONFIG_USER_ONLY
 
 static inline hwaddr do_translate_address(CPUMIPSState *env,
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 4ee9615..f10a533 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -2715,47 +2715,39 @@
         break;
     case OPC_MULT:
         {
-            TCGv_i64 t2 = tcg_temp_new_i64();
-            TCGv_i64 t3 = tcg_temp_new_i64();
+            TCGv_i32 t2 = tcg_temp_new_i32();
+            TCGv_i32 t3 = tcg_temp_new_i32();
             acc = ((ctx->opcode) >> 11) & 0x03;
             if (acc != 0) {
                 check_dsp(ctx);
             }
 
-            tcg_gen_ext_tl_i64(t2, t0);
-            tcg_gen_ext_tl_i64(t3, t1);
-            tcg_gen_mul_i64(t2, t2, t3);
-            tcg_temp_free_i64(t3);
-            tcg_gen_trunc_i64_tl(t0, t2);
-            tcg_gen_shri_i64(t2, t2, 32);
-            tcg_gen_trunc_i64_tl(t1, t2);
-            tcg_temp_free_i64(t2);
-            tcg_gen_ext32s_tl(cpu_LO[acc], t0);
-            tcg_gen_ext32s_tl(cpu_HI[acc], t1);
+            tcg_gen_trunc_tl_i32(t2, t0);
+            tcg_gen_trunc_tl_i32(t3, t1);
+            tcg_gen_muls2_i32(t2, t3, t2, t3);
+            tcg_gen_ext_i32_tl(cpu_LO[acc], t2);
+            tcg_gen_ext_i32_tl(cpu_HI[acc], t3);
+            tcg_temp_free_i32(t2);
+            tcg_temp_free_i32(t3);
         }
         opn = "mult";
         break;
     case OPC_MULTU:
         {
-            TCGv_i64 t2 = tcg_temp_new_i64();
-            TCGv_i64 t3 = tcg_temp_new_i64();
+            TCGv_i32 t2 = tcg_temp_new_i32();
+            TCGv_i32 t3 = tcg_temp_new_i32();
             acc = ((ctx->opcode) >> 11) & 0x03;
             if (acc != 0) {
                 check_dsp(ctx);
             }
 
-            tcg_gen_ext32u_tl(t0, t0);
-            tcg_gen_ext32u_tl(t1, t1);
-            tcg_gen_extu_tl_i64(t2, t0);
-            tcg_gen_extu_tl_i64(t3, t1);
-            tcg_gen_mul_i64(t2, t2, t3);
-            tcg_temp_free_i64(t3);
-            tcg_gen_trunc_i64_tl(t0, t2);
-            tcg_gen_shri_i64(t2, t2, 32);
-            tcg_gen_trunc_i64_tl(t1, t2);
-            tcg_temp_free_i64(t2);
-            tcg_gen_ext32s_tl(cpu_LO[acc], t0);
-            tcg_gen_ext32s_tl(cpu_HI[acc], t1);
+            tcg_gen_trunc_tl_i32(t2, t0);
+            tcg_gen_trunc_tl_i32(t3, t1);
+            tcg_gen_mulu2_i32(t2, t3, t2, t3);
+            tcg_gen_ext_i32_tl(cpu_LO[acc], t2);
+            tcg_gen_ext_i32_tl(cpu_HI[acc], t3);
+            tcg_temp_free_i32(t2);
+            tcg_temp_free_i32(t3);
         }
         opn = "multu";
         break;
@@ -2791,11 +2783,11 @@
         opn = "ddivu";
         break;
     case OPC_DMULT:
-        gen_helper_dmult(cpu_env, t0, t1);
+        tcg_gen_muls2_i64(cpu_LO[0], cpu_HI[0], t0, t1);
         opn = "dmult";
         break;
     case OPC_DMULTU:
-        gen_helper_dmultu(cpu_env, t0, t1);
+        tcg_gen_mulu2_i64(cpu_LO[0], cpu_HI[0], t0, t1);
         opn = "dmultu";
         break;
 #endif
diff --git a/target-openrisc/translate.c b/target-openrisc/translate.c
index 1e1b30c..23e853e 100644
--- a/target-openrisc/translate.c
+++ b/target-openrisc/translate.c
@@ -1670,8 +1670,6 @@
     int num_insns;
     int max_insns;
 
-    qemu_log_try_set_file(stderr);
-
     pc_start = tb->pc;
     dc->tb = tb;
 
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index 8c081db..20f4565 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -941,8 +941,11 @@
     /* CFAR */
     target_ulong cfar;
 #endif
-    /* XER */
+    /* XER (with SO, OV, CA split out) */
     target_ulong xer;
+    target_ulong so;
+    target_ulong ov;
+    target_ulong ca;
     /* Reservation address */
     target_ulong reserve_addr;
     /* Reservation value */
@@ -1268,9 +1271,9 @@
 #define XER_CA  29
 #define XER_CMP  8
 #define XER_BC   0
-#define xer_so  ((env->xer >> XER_SO)  &    1)
-#define xer_ov  ((env->xer >> XER_OV)  &    1)
-#define xer_ca  ((env->xer >> XER_CA)  &    1)
+#define xer_so  (env->so)
+#define xer_ov  (env->ov)
+#define xer_ca  (env->ca)
 #define xer_cmp ((env->xer >> XER_CMP) & 0xFF)
 #define xer_bc  ((env->xer >> XER_BC)  & 0x7F)
 
@@ -2087,6 +2090,19 @@
 
 /*****************************************************************************/
 
+static inline target_ulong cpu_read_xer(CPUPPCState *env)
+{
+    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) | (env->ca << XER_CA);
+}
+
+static inline void cpu_write_xer(CPUPPCState *env, target_ulong xer)
+{
+    env->so = (xer >> XER_SO) & 1;
+    env->ov = (xer >> XER_OV) & 1;
+    env->ca = (xer >> XER_CA) & 1;
+    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
+}
+
 static inline void cpu_get_tb_cpu_state(CPUPPCState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
 {
diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 18e0394..fcf372a 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -30,8 +30,6 @@
 DEF_HELPER_5(lscbx, tl, env, tl, i32, i32, i32)
 
 #if defined(TARGET_PPC64)
-DEF_HELPER_FLAGS_2(mulhd, TCG_CALL_NO_RWG_SE, i64, i64, i64)
-DEF_HELPER_FLAGS_2(mulhdu, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_3(mulldo, i64, env, i64, i64)
 #endif
 
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 783079d..54eca9b 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -25,24 +25,6 @@
 /* Fixed point operations helpers */
 #if defined(TARGET_PPC64)
 
-/* multiply high word */
-uint64_t helper_mulhd(uint64_t arg1, uint64_t arg2)
-{
-    uint64_t tl, th;
-
-    muls64(&tl, &th, arg1, arg2);
-    return th;
-}
-
-/* multiply high word unsigned */
-uint64_t helper_mulhdu(uint64_t arg1, uint64_t arg2)
-{
-    uint64_t tl, th;
-
-    mulu64(&tl, &th, arg1, arg2);
-    return th;
-}
-
 uint64_t helper_mulldo(CPUPPCState *env, uint64_t arg1, uint64_t arg2)
 {
     int64_t th;
@@ -51,9 +33,9 @@
     muls64(&tl, (uint64_t *)&th, arg1, arg2);
     /* If th != 0 && th != -1, then we had an overflow */
     if (likely((uint64_t)(th + 1) <= 1)) {
-        env->xer &= ~(1 << XER_OV);
+        env->ov = 0;
     } else {
-        env->xer |= (1 << XER_OV) | (1 << XER_SO);
+        env->so = env->ov = 1;
     }
     return (int64_t)tl;
 }
@@ -82,21 +64,17 @@
             shift &= 0x1f;
             ret = (int32_t)value >> shift;
             if (likely(ret >= 0 || (value & ((1 << shift) - 1)) == 0)) {
-                env->xer &= ~(1 << XER_CA);
+                env->ca = 0;
             } else {
-                env->xer |= (1 << XER_CA);
+                env->ca = 1;
             }
         } else {
             ret = (int32_t)value;
-            env->xer &= ~(1 << XER_CA);
+            env->ca = 0;
         }
     } else {
         ret = (int32_t)value >> 31;
-        if (ret) {
-            env->xer |= (1 << XER_CA);
-        } else {
-            env->xer &= ~(1 << XER_CA);
-        }
+        env->ca = (ret != 0);
     }
     return (target_long)ret;
 }
@@ -112,21 +90,17 @@
             shift &= 0x3f;
             ret = (int64_t)value >> shift;
             if (likely(ret >= 0 || (value & ((1 << shift) - 1)) == 0)) {
-                env->xer &= ~(1 << XER_CA);
+                env->ca = 0;
             } else {
-                env->xer |= (1 << XER_CA);
+                env->ca = 1;
             }
         } else {
             ret = (int64_t)value;
-            env->xer &= ~(1 << XER_CA);
+            env->ca = 0;
         }
     } else {
         ret = (int64_t)value >> 63;
-        if (ret) {
-            env->xer |= (1 << XER_CA);
-        } else {
-            env->xer &= ~(1 << XER_CA);
-        }
+        env->ca = (ret != 0);
     }
     return ret;
 }
@@ -206,16 +180,16 @@
 
     if (((int32_t)tmp == INT32_MIN && (int32_t)arg2 == (int32_t)-1) ||
         (int32_t)arg2 == 0) {
-        env->xer |= (1 << XER_OV) | (1 << XER_SO);
+        env->so = env->ov = 1;
         env->spr[SPR_MQ] = 0;
         return INT32_MIN;
     } else {
         env->spr[SPR_MQ] = tmp % arg2;
         tmp /= (int32_t)arg2;
         if ((int32_t)tmp != tmp) {
-            env->xer |= (1 << XER_OV) | (1 << XER_SO);
+            env->so = env->ov = 1;
         } else {
-            env->xer &= ~(1 << XER_OV);
+            env->ov = 0;
         }
         return tmp;
     }
@@ -239,11 +213,11 @@
 {
     if (((int32_t)arg1 == INT32_MIN && (int32_t)arg2 == (int32_t)-1) ||
         (int32_t)arg2 == 0) {
-        env->xer |= (1 << XER_OV) | (1 << XER_SO);
+        env->so = env->ov = 1;
         env->spr[SPR_MQ] = 0;
         return INT32_MIN;
     } else {
-        env->xer &= ~(1 << XER_OV);
+        env->ov = 0;
         env->spr[SPR_MQ] = (int32_t)arg1 % (int32_t)arg2;
         return (int32_t)arg1 / (int32_t)arg2;
     }
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 2c64c63..8e64416 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -464,7 +464,7 @@
 
     regs.ctr = env->ctr;
     regs.lr  = env->lr;
-    regs.xer = env->xer;
+    regs.xer = cpu_read_xer(env);
     regs.msr = env->msr;
     regs.pc = env->nip;
 
@@ -566,7 +566,7 @@
 
     env->ctr = regs.ctr;
     env->lr = regs.lr;
-    env->xer = regs.xer;
+    cpu_write_xer(env, regs.xer);
     env->msr = regs.msr;
     env->nip = regs.pc;
 
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index e014c0c..708a840 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -7,6 +7,7 @@
     CPUPPCState *env = (CPUPPCState *)opaque;
     unsigned int i, j;
     uint32_t fpscr;
+    target_ulong xer;
 
     for (i = 0; i < 32; i++)
         qemu_put_betls(f, &env->gpr[i]);
@@ -18,7 +19,8 @@
     qemu_put_betls(f, &env->ctr);
     for (i = 0; i < 8; i++)
         qemu_put_be32s(f, &env->crf[i]);
-    qemu_put_betls(f, &env->xer);
+    xer = cpu_read_xer(env);
+    qemu_put_betls(f, &xer);
     qemu_put_betls(f, &env->reserve_addr);
     qemu_put_betls(f, &env->msr);
     for (i = 0; i < 4; i++)
@@ -93,6 +95,7 @@
     unsigned int i, j;
     target_ulong sdr1;
     uint32_t fpscr;
+    target_ulong xer;
 
     for (i = 0; i < 32; i++)
         qemu_get_betls(f, &env->gpr[i]);
@@ -104,7 +107,8 @@
     qemu_get_betls(f, &env->ctr);
     for (i = 0; i < 8; i++)
         qemu_get_be32s(f, &env->crf[i]);
-    qemu_get_betls(f, &env->xer);
+    qemu_get_betls(f, &xer);
+    cpu_write_xer(env, xer);
     qemu_get_betls(f, &env->reserve_addr);
     qemu_get_betls(f, &env->msr);
     for (i = 0; i < 4; i++)
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 2ac5794..80d5366 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -66,7 +66,7 @@
 #if defined(TARGET_PPC64)
 static TCGv cpu_cfar;
 #endif
-static TCGv cpu_xer;
+static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca;
 static TCGv cpu_reserve;
 static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
@@ -158,6 +158,12 @@
 
     cpu_xer = tcg_global_mem_new(TCG_AREG0,
                                  offsetof(CPUPPCState, xer), "xer");
+    cpu_so = tcg_global_mem_new(TCG_AREG0,
+                                offsetof(CPUPPCState, so), "SO");
+    cpu_ov = tcg_global_mem_new(TCG_AREG0,
+                                offsetof(CPUPPCState, ov), "OV");
+    cpu_ca = tcg_global_mem_new(TCG_AREG0,
+                                offsetof(CPUPPCState, ca), "CA");
 
     cpu_reserve = tcg_global_mem_new(TCG_AREG0,
                                      offsetof(CPUPPCState, reserve_addr),
@@ -590,35 +596,33 @@
 
 static inline void gen_op_cmp(TCGv arg0, TCGv arg1, int s, int crf)
 {
-    int l1, l2, l3;
+    TCGv t0 = tcg_temp_new();
+    TCGv_i32 t1 = tcg_temp_new_i32();
 
-    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_xer);
-    tcg_gen_shri_i32(cpu_crf[crf], cpu_crf[crf], XER_SO);
-    tcg_gen_andi_i32(cpu_crf[crf], cpu_crf[crf], 1);
+    tcg_gen_trunc_tl_i32(cpu_crf[crf], cpu_so);
 
-    l1 = gen_new_label();
-    l2 = gen_new_label();
-    l3 = gen_new_label();
-    if (s) {
-        tcg_gen_brcond_tl(TCG_COND_LT, arg0, arg1, l1);
-        tcg_gen_brcond_tl(TCG_COND_GT, arg0, arg1, l2);
-    } else {
-        tcg_gen_brcond_tl(TCG_COND_LTU, arg0, arg1, l1);
-        tcg_gen_brcond_tl(TCG_COND_GTU, arg0, arg1, l2);
-    }
-    tcg_gen_ori_i32(cpu_crf[crf], cpu_crf[crf], 1 << CRF_EQ);
-    tcg_gen_br(l3);
-    gen_set_label(l1);
-    tcg_gen_ori_i32(cpu_crf[crf], cpu_crf[crf], 1 << CRF_LT);
-    tcg_gen_br(l3);
-    gen_set_label(l2);
-    tcg_gen_ori_i32(cpu_crf[crf], cpu_crf[crf], 1 << CRF_GT);
-    gen_set_label(l3);
+    tcg_gen_setcond_tl((s ? TCG_COND_LT: TCG_COND_LTU), t0, arg0, arg1);
+    tcg_gen_trunc_tl_i32(t1, t0);
+    tcg_gen_shli_i32(t1, t1, CRF_LT);
+    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+
+    tcg_gen_setcond_tl((s ? TCG_COND_GT: TCG_COND_GTU), t0, arg0, arg1);
+    tcg_gen_trunc_tl_i32(t1, t0);
+    tcg_gen_shli_i32(t1, t1, CRF_GT);
+    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+
+    tcg_gen_setcond_tl(TCG_COND_EQ, t0, arg0, arg1);
+    tcg_gen_trunc_tl_i32(t1, t0);
+    tcg_gen_shli_i32(t1, t1, CRF_EQ);
+    tcg_gen_or_i32(cpu_crf[crf], cpu_crf[crf], t1);
+
+    tcg_temp_free(t0);
+    tcg_temp_free_i32(t1);
 }
 
 static inline void gen_op_cmpi(TCGv arg0, target_ulong arg1, int s, int crf)
 {
-    TCGv t0 = tcg_const_local_tl(arg1);
+    TCGv t0 = tcg_const_tl(arg1);
     gen_op_cmp(arg0, t0, s, crf);
     tcg_temp_free(t0);
 }
@@ -627,8 +631,8 @@
 static inline void gen_op_cmp32(TCGv arg0, TCGv arg1, int s, int crf)
 {
     TCGv t0, t1;
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
     if (s) {
         tcg_gen_ext32s_tl(t0, arg0);
         tcg_gen_ext32s_tl(t1, arg1);
@@ -643,7 +647,7 @@
 
 static inline void gen_op_cmpi32(TCGv arg0, target_ulong arg1, int s, int crf)
 {
-    TCGv t0 = tcg_const_local_tl(arg1);
+    TCGv t0 = tcg_const_tl(arg1);
     gen_op_cmp32(arg0, t0, s, crf);
     tcg_temp_free(t0);
 }
@@ -742,120 +746,59 @@
 static inline void gen_op_arith_compute_ov(DisasContext *ctx, TCGv arg0,
                                            TCGv arg1, TCGv arg2, int sub)
 {
-    int l1;
-    TCGv t0;
+    TCGv t0 = tcg_temp_new();
 
-    l1 = gen_new_label();
-    /* Start with XER OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
-    t0 = tcg_temp_local_new();
-    tcg_gen_xor_tl(t0, arg0, arg1);
-#if defined(TARGET_PPC64)
-    if (!ctx->sf_mode)
-        tcg_gen_ext32s_tl(t0, t0);
-#endif
-    if (sub)
-        tcg_gen_brcondi_tl(TCG_COND_LT, t0, 0, l1);
-    else
-        tcg_gen_brcondi_tl(TCG_COND_GE, t0, 0, l1);
+    tcg_gen_xor_tl(cpu_ov, arg0, arg1);
     tcg_gen_xor_tl(t0, arg1, arg2);
-#if defined(TARGET_PPC64)
-    if (!ctx->sf_mode)
-        tcg_gen_ext32s_tl(t0, t0);
-#endif
-    if (sub)
-        tcg_gen_brcondi_tl(TCG_COND_GE, t0, 0, l1);
-    else
-        tcg_gen_brcondi_tl(TCG_COND_LT, t0, 0, l1);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
-    gen_set_label(l1);
-    tcg_temp_free(t0);
-}
-
-static inline void gen_op_arith_compute_ca(DisasContext *ctx, TCGv arg1,
-                                           TCGv arg2, int sub)
-{
-    int l1 = gen_new_label();
-
-#if defined(TARGET_PPC64)
-    if (!(ctx->sf_mode)) {
-        TCGv t0, t1;
-        t0 = tcg_temp_new();
-        t1 = tcg_temp_new();
-
-        tcg_gen_ext32u_tl(t0, arg1);
-        tcg_gen_ext32u_tl(t1, arg2);
-        if (sub) {
-            tcg_gen_brcond_tl(TCG_COND_GTU, t0, t1, l1);
-        } else {
-            tcg_gen_brcond_tl(TCG_COND_GEU, t0, t1, l1);
-        }
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, 1 << XER_CA);
-        gen_set_label(l1);
-        tcg_temp_free(t0);
-        tcg_temp_free(t1);
-    } else
-#endif
-    {
-        if (sub) {
-            tcg_gen_brcond_tl(TCG_COND_GTU, arg1, arg2, l1);
-        } else {
-            tcg_gen_brcond_tl(TCG_COND_GEU, arg1, arg2, l1);
-        }
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, 1 << XER_CA);
-        gen_set_label(l1);
+    if (sub) {
+        tcg_gen_and_tl(cpu_ov, cpu_ov, t0);
+    } else {
+        tcg_gen_andc_tl(cpu_ov, cpu_ov, t0);
     }
+    tcg_temp_free(t0);
+#if defined(TARGET_PPC64)
+    if (!ctx->sf_mode) {
+        tcg_gen_ext32s_tl(cpu_ov, cpu_ov);
+    }
+#endif
+    tcg_gen_shri_tl(cpu_ov, cpu_ov, TARGET_LONG_BITS - 1);
+    tcg_gen_or_tl(cpu_so, cpu_so, cpu_ov);
 }
 
 /* Common add function */
 static inline void gen_op_arith_add(DisasContext *ctx, TCGv ret, TCGv arg1,
-                                    TCGv arg2, int add_ca, int compute_ca,
-                                    int compute_ov)
+                                    TCGv arg2, bool add_ca, bool compute_ca,
+                                    bool compute_ov, bool compute_rc0)
 {
-    TCGv t0, t1;
+    TCGv t0 = ret;
 
-    if ((!compute_ca && !compute_ov) ||
-        (!TCGV_EQUAL(ret,arg1) && !TCGV_EQUAL(ret, arg2)))  {
-        t0 = ret;
-    } else {
-        t0 = tcg_temp_local_new();
+    if (((compute_ca && add_ca) || compute_ov)
+        && (TCGV_EQUAL(ret, arg1) || TCGV_EQUAL(ret, arg2)))  {
+        t0 = tcg_temp_new();
     }
 
-    if (add_ca) {
-        t1 = tcg_temp_local_new();
-        tcg_gen_andi_tl(t1, cpu_xer, (1 << XER_CA));
-        tcg_gen_shri_tl(t1, t1, XER_CA);
-    } else {
-        TCGV_UNUSED(t1);
-    }
-
-    if (compute_ca && compute_ov) {
-        /* Start with XER CA and OV disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~((1 << XER_CA) | (1 << XER_OV)));
-    } else if (compute_ca) {
-        /* Start with XER CA disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-    } else if (compute_ov) {
-        /* Start with XER OV disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
-    }
-
-    tcg_gen_add_tl(t0, arg1, arg2);
-
     if (compute_ca) {
-        gen_op_arith_compute_ca(ctx, t0, arg1, 0);
+        TCGv zero = tcg_const_tl(0);
+        if (add_ca) {
+            tcg_gen_add2_tl(t0, cpu_ca, arg1, zero, cpu_ca, zero);
+            tcg_gen_add2_tl(t0, cpu_ca, t0, cpu_ca, arg2, zero);
+        } else {
+            tcg_gen_add2_tl(t0, cpu_ca, arg1, zero, arg2, zero);
+        }
+        tcg_temp_free(zero);
+    } else {
+        tcg_gen_add_tl(t0, arg1, arg2);
+        if (add_ca) {
+            tcg_gen_add_tl(t0, t0, cpu_ca);
+        }
     }
-    if (add_ca) {
-        tcg_gen_add_tl(t0, t0, t1);
-        gen_op_arith_compute_ca(ctx, t0, t1, 0);
-        tcg_temp_free(t1);
-    }
+
     if (compute_ov) {
         gen_op_arith_compute_ov(ctx, t0, arg1, arg2, 0);
     }
-
-    if (unlikely(Rc(ctx->opcode) != 0))
+    if (unlikely(compute_rc0)) {
         gen_set_Rc0(ctx, t0);
+    }
 
     if (!TCGV_EQUAL(t0, ret)) {
         tcg_gen_mov_tl(ret, t0);
@@ -864,21 +807,21 @@
 }
 /* Add functions with two operands */
 #define GEN_INT_ARITH_ADD(name, opc3, add_ca, compute_ca, compute_ov)         \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
+static void glue(gen_, name)(DisasContext *ctx)                               \
 {                                                                             \
     gen_op_arith_add(ctx, cpu_gpr[rD(ctx->opcode)],                           \
                      cpu_gpr[rA(ctx->opcode)], cpu_gpr[rB(ctx->opcode)],      \
-                     add_ca, compute_ca, compute_ov);                         \
+                     add_ca, compute_ca, compute_ov, Rc(ctx->opcode));        \
 }
 /* Add functions with one operand and one immediate */
 #define GEN_INT_ARITH_ADD_CONST(name, opc3, const_val,                        \
                                 add_ca, compute_ca, compute_ov)               \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
+static void glue(gen_, name)(DisasContext *ctx)                               \
 {                                                                             \
-    TCGv t0 = tcg_const_local_tl(const_val);                                  \
+    TCGv t0 = tcg_const_tl(const_val);                                        \
     gen_op_arith_add(ctx, cpu_gpr[rD(ctx->opcode)],                           \
                      cpu_gpr[rA(ctx->opcode)], t0,                            \
-                     add_ca, compute_ca, compute_ov);                         \
+                     add_ca, compute_ca, compute_ov, Rc(ctx->opcode));        \
     tcg_temp_free(t0);                                                        \
 }
 
@@ -906,40 +849,27 @@
         /* li case */
         tcg_gen_movi_tl(cpu_gpr[rD(ctx->opcode)], simm);
     } else {
-        tcg_gen_addi_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], simm);
+        tcg_gen_addi_tl(cpu_gpr[rD(ctx->opcode)],
+                        cpu_gpr[rA(ctx->opcode)], simm);
     }
 }
 /* addic  addic.*/
-static inline void gen_op_addic(DisasContext *ctx, TCGv ret, TCGv arg1,
-                                int compute_Rc0)
+static inline void gen_op_addic(DisasContext *ctx, bool compute_rc0)
 {
-    target_long simm = SIMM(ctx->opcode);
-
-    /* Start with XER CA and OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-
-    if (likely(simm != 0)) {
-        TCGv t0 = tcg_temp_local_new();
-        tcg_gen_addi_tl(t0, arg1, simm);
-        gen_op_arith_compute_ca(ctx, t0, arg1, 0);
-        tcg_gen_mov_tl(ret, t0);
-        tcg_temp_free(t0);
-    } else {
-        tcg_gen_mov_tl(ret, arg1);
-    }
-    if (compute_Rc0) {
-        gen_set_Rc0(ctx, ret);
-    }
+    TCGv c = tcg_const_tl(SIMM(ctx->opcode));
+    gen_op_arith_add(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)],
+                     c, 0, 1, 0, compute_rc0);
+    tcg_temp_free(c);
 }
 
 static void gen_addic(DisasContext *ctx)
 {
-    gen_op_addic(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], 0);
+    gen_op_addic(ctx, 0);
 }
 
 static void gen_addic_(DisasContext *ctx)
 {
-    gen_op_addic(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], 1);
+    gen_op_addic(ctx, 1);
 }
 
 /* addis */
@@ -951,7 +881,8 @@
         /* lis case */
         tcg_gen_movi_tl(cpu_gpr[rD(ctx->opcode)], simm << 16);
     } else {
-        tcg_gen_addi_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], simm << 16);
+        tcg_gen_addi_tl(cpu_gpr[rD(ctx->opcode)],
+                        cpu_gpr[rA(ctx->opcode)], simm << 16);
     }
 }
 
@@ -976,7 +907,7 @@
         tcg_gen_divu_i32(t0, t0, t1);
     }
     if (compute_ov) {
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+        tcg_gen_movi_tl(cpu_ov, 0);
     }
     tcg_gen_br(l2);
     gen_set_label(l1);
@@ -986,7 +917,8 @@
         tcg_gen_movi_i32(t0, 0);
     }
     if (compute_ov) {
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+        tcg_gen_movi_tl(cpu_ov, 1);
+        tcg_gen_movi_tl(cpu_so, 1);
     }
     gen_set_label(l2);
     tcg_gen_extu_i32_tl(ret, t0);
@@ -1027,7 +959,7 @@
         tcg_gen_divu_i64(ret, arg1, arg2);
     }
     if (compute_ov) {
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+        tcg_gen_movi_tl(cpu_ov, 0);
     }
     tcg_gen_br(l2);
     gen_set_label(l1);
@@ -1037,7 +969,8 @@
         tcg_gen_movi_i64(ret, 0);
     }
     if (compute_ov) {
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+        tcg_gen_movi_tl(cpu_ov, 1);
+        tcg_gen_movi_tl(cpu_so, 1);
     }
     gen_set_label(l2);
     if (unlikely(Rc(ctx->opcode) != 0))
@@ -1061,24 +994,15 @@
 /* mulhw  mulhw. */
 static void gen_mulhw(DisasContext *ctx)
 {
-    TCGv_i64 t0, t1;
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
 
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-#if defined(TARGET_PPC64)
-    tcg_gen_ext32s_tl(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_ext32s_tl(t1, cpu_gpr[rB(ctx->opcode)]);
-    tcg_gen_mul_i64(t0, t0, t1);
-    tcg_gen_shri_i64(cpu_gpr[rD(ctx->opcode)], t0, 32);
-#else
-    tcg_gen_ext_tl_i64(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_ext_tl_i64(t1, cpu_gpr[rB(ctx->opcode)]);
-    tcg_gen_mul_i64(t0, t0, t1);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_tl(cpu_gpr[rD(ctx->opcode)], t0);
-#endif
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
+    tcg_gen_trunc_tl_i32(t0, cpu_gpr[rA(ctx->opcode)]);
+    tcg_gen_trunc_tl_i32(t1, cpu_gpr[rB(ctx->opcode)]);
+    tcg_gen_muls2_i32(t0, t1, t0, t1);  /* signed: t0 = low, t1 = high word */
+    tcg_gen_extu_i32_tl(cpu_gpr[rD(ctx->opcode)], t1);  /* rD = high word */
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
     if (unlikely(Rc(ctx->opcode) != 0))
         gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
 }
@@ -1086,24 +1010,15 @@
 /* mulhwu  mulhwu.  */
 static void gen_mulhwu(DisasContext *ctx)
 {
-    TCGv_i64 t0, t1;
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
 
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-#if defined(TARGET_PPC64)
-    tcg_gen_ext32u_i64(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_ext32u_i64(t1, cpu_gpr[rB(ctx->opcode)]);
-    tcg_gen_mul_i64(t0, t0, t1);
-    tcg_gen_shri_i64(cpu_gpr[rD(ctx->opcode)], t0, 32);
-#else
-    tcg_gen_extu_tl_i64(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_extu_tl_i64(t1, cpu_gpr[rB(ctx->opcode)]);
-    tcg_gen_mul_i64(t0, t0, t1);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_trunc_i64_tl(cpu_gpr[rD(ctx->opcode)], t0);
-#endif
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
+    tcg_gen_trunc_tl_i32(t0, cpu_gpr[rA(ctx->opcode)]);
+    tcg_gen_trunc_tl_i32(t1, cpu_gpr[rB(ctx->opcode)]);
+    tcg_gen_mulu2_i32(t0, t1, t0, t1);  /* unsigned: t0 = low, t1 = high word */
+    tcg_gen_extu_i32_tl(cpu_gpr[rD(ctx->opcode)], t1);  /* rD = high word */
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
     if (unlikely(Rc(ctx->opcode) != 0))
         gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
 }
@@ -1121,34 +1036,21 @@
 /* mullwo  mullwo. */
 static void gen_mullwo(DisasContext *ctx)
 {
-    int l1;
-    TCGv_i64 t0, t1;
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
 
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-    l1 = gen_new_label();
-    /* Start with XER OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
-#if defined(TARGET_PPC64)
-    tcg_gen_ext32s_i64(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_ext32s_i64(t1, cpu_gpr[rB(ctx->opcode)]);
-#else
-    tcg_gen_ext_tl_i64(t0, cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_ext_tl_i64(t1, cpu_gpr[rB(ctx->opcode)]);
-#endif
-    tcg_gen_mul_i64(t0, t0, t1);
-#if defined(TARGET_PPC64)
-    tcg_gen_ext32s_i64(cpu_gpr[rD(ctx->opcode)], t0);
-    tcg_gen_brcond_i64(TCG_COND_EQ, t0, cpu_gpr[rD(ctx->opcode)], l1);
-#else
-    tcg_gen_trunc_i64_tl(cpu_gpr[rD(ctx->opcode)], t0);
-    tcg_gen_ext32s_i64(t1, t0);
-    tcg_gen_brcond_i64(TCG_COND_EQ, t0, t1, l1);
-#endif
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
-    gen_set_label(l1);
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
+    tcg_gen_trunc_tl_i32(t0, cpu_gpr[rA(ctx->opcode)]);
+    tcg_gen_trunc_tl_i32(t1, cpu_gpr[rB(ctx->opcode)]);
+    tcg_gen_muls2_i32(t0, t1, t0, t1);  /* t0 = low word, t1 = high word */
+    tcg_gen_ext_i32_tl(cpu_gpr[rD(ctx->opcode)], t0);  /* rD = low word */
+    /* Overflow iff the high word is not the sign-extension of the low word. */
+    tcg_gen_sari_i32(t0, t0, 31);  /* 0 or -1 from the low word's sign bit */
+    tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t1);
+    tcg_gen_extu_i32_tl(cpu_ov, t0);
+    tcg_gen_or_tl(cpu_so, cpu_so, cpu_ov);  /* SO is sticky: only ever set */
+
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
     if (unlikely(Rc(ctx->opcode) != 0))
         gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
 }
@@ -1159,19 +1061,31 @@
     tcg_gen_muli_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)],
                     SIMM(ctx->opcode));
 }
+
 #if defined(TARGET_PPC64)
-#define GEN_INT_ARITH_MUL_HELPER(name, opc3)                                  \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
-{                                                                             \
-    gen_helper_##name (cpu_gpr[rD(ctx->opcode)],                              \
-                       cpu_gpr[rA(ctx->opcode)], cpu_gpr[rB(ctx->opcode)]);   \
-    if (unlikely(Rc(ctx->opcode) != 0))                                       \
-        gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);                           \
-}
 /* mulhd  mulhd. */
-GEN_INT_ARITH_MUL_HELPER(mulhdu, 0x00);
+static void gen_mulhd(DisasContext *ctx)
+{
+    TCGv lo = tcg_temp_new();  /* receives the low half, which is discarded */
+    tcg_gen_muls2_tl(lo, cpu_gpr[rD(ctx->opcode)],
+                     cpu_gpr[rA(ctx->opcode)], cpu_gpr[rB(ctx->opcode)]);
+    tcg_temp_free(lo);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);  /* CR0 from high half */
+    }
+}
+
 /* mulhdu  mulhdu. */
-GEN_INT_ARITH_MUL_HELPER(mulhd, 0x02);
+static void gen_mulhdu(DisasContext *ctx)
+{
+    TCGv lo = tcg_temp_new();  /* receives the low half, which is discarded */
+    tcg_gen_mulu2_tl(lo, cpu_gpr[rD(ctx->opcode)],
+                     cpu_gpr[rA(ctx->opcode)], cpu_gpr[rB(ctx->opcode)]);
+    tcg_temp_free(lo);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);  /* CR0 from high half */
+    }
+}
 
 /* mulld  mulld. */
 static void gen_mulld(DisasContext *ctx)
@@ -1193,101 +1107,46 @@
 }
 #endif
 
-/* neg neg. nego nego. */
-static inline void gen_op_arith_neg(DisasContext *ctx, TCGv ret, TCGv arg1,
-                                    int ov_check)
-{
-    int l1 = gen_new_label();
-    int l2 = gen_new_label();
-    TCGv t0 = tcg_temp_local_new();
-#if defined(TARGET_PPC64)
-    if (ctx->sf_mode) {
-        tcg_gen_mov_tl(t0, arg1);
-        tcg_gen_brcondi_tl(TCG_COND_EQ, t0, INT64_MIN, l1);
-    } else
-#endif
-    {
-        tcg_gen_ext32s_tl(t0, arg1);
-        tcg_gen_brcondi_tl(TCG_COND_EQ, t0, INT32_MIN, l1);
-    }
-    tcg_gen_neg_tl(ret, arg1);
-    if (ov_check) {
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
-    }
-    tcg_gen_br(l2);
-    gen_set_label(l1);
-    tcg_gen_mov_tl(ret, t0);
-    if (ov_check) {
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
-    }
-    gen_set_label(l2);
-    tcg_temp_free(t0);
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, ret);
-}
-
-static void gen_neg(DisasContext *ctx)
-{
-    gen_op_arith_neg(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], 0);
-}
-
-static void gen_nego(DisasContext *ctx)
-{
-    gen_op_arith_neg(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], 1);
-}
-
 /* Common subf function */
 static inline void gen_op_arith_subf(DisasContext *ctx, TCGv ret, TCGv arg1,
-                                     TCGv arg2, int add_ca, int compute_ca,
-                                     int compute_ov)
+                                     TCGv arg2, bool add_ca, bool compute_ca,
+                                     bool compute_ov, bool compute_rc0)
 {
-    TCGv t0, t1;
+    TCGv t0 = ret;
 
-    if ((!compute_ca && !compute_ov) ||
-        (!TCGV_EQUAL(ret, arg1) && !TCGV_EQUAL(ret, arg2)))  {
-        t0 = ret;
-    } else {
-        t0 = tcg_temp_local_new();
+    if (((add_ca && compute_ca) || compute_ov)
+        && (TCGV_EQUAL(ret, arg1) || TCGV_EQUAL(ret, arg2)))  {
+        t0 = tcg_temp_new();
     }
 
     if (add_ca) {
-        t1 = tcg_temp_local_new();
-        tcg_gen_andi_tl(t1, cpu_xer, (1 << XER_CA));
-        tcg_gen_shri_tl(t1, t1, XER_CA);
-    } else {
-        TCGV_UNUSED(t1);
-    }
-
-    if (compute_ca && compute_ov) {
-        /* Start with XER CA and OV disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~((1 << XER_CA) | (1 << XER_OV)));
-    } else if (compute_ca) {
-        /* Start with XER CA disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-    } else if (compute_ov) {
-        /* Start with XER OV disabled, the most likely case */
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
-    }
-
-    if (add_ca) {
-        tcg_gen_not_tl(t0, arg1);
-        tcg_gen_add_tl(t0, t0, arg2);
-        gen_op_arith_compute_ca(ctx, t0, arg2, 0);
-        tcg_gen_add_tl(t0, t0, t1);
-        gen_op_arith_compute_ca(ctx, t0, t1, 0);
-        tcg_temp_free(t1);
-    } else {
-        tcg_gen_sub_tl(t0, arg2, arg1);
+        /* dest = ~arg1 + arg2 + ca.  */
         if (compute_ca) {
-            gen_op_arith_compute_ca(ctx, t0, arg2, 1);
+            TCGv zero, inv1 = tcg_temp_new();
+            tcg_gen_not_tl(inv1, arg1);
+            zero = tcg_const_tl(0);
+            tcg_gen_add2_tl(t0, cpu_ca, arg2, zero, cpu_ca, zero);
+            tcg_gen_add2_tl(t0, cpu_ca, t0, cpu_ca, inv1, zero);
+            tcg_temp_free(zero);
+            tcg_temp_free(inv1);
+        } else {
+            tcg_gen_sub_tl(t0, arg2, arg1);
+            tcg_gen_add_tl(t0, t0, cpu_ca);
+            tcg_gen_subi_tl(t0, t0, 1);
         }
+    } else {
+        if (compute_ca) {
+            tcg_gen_setcond_tl(TCG_COND_GEU, cpu_ca, arg2, arg1);
+        }
+        tcg_gen_sub_tl(t0, arg2, arg1);
     }
+
     if (compute_ov) {
         gen_op_arith_compute_ov(ctx, t0, arg1, arg2, 1);
     }
-
-    if (unlikely(Rc(ctx->opcode) != 0))
+    if (unlikely(compute_rc0)) {
         gen_set_Rc0(ctx, t0);
+    }
 
     if (!TCGV_EQUAL(t0, ret)) {
         tcg_gen_mov_tl(ret, t0);
@@ -1296,21 +1155,21 @@
 }
 /* Sub functions with Two operands functions */
 #define GEN_INT_ARITH_SUBF(name, opc3, add_ca, compute_ca, compute_ov)        \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
+static void glue(gen_, name)(DisasContext *ctx)                               \
 {                                                                             \
     gen_op_arith_subf(ctx, cpu_gpr[rD(ctx->opcode)],                          \
                       cpu_gpr[rA(ctx->opcode)], cpu_gpr[rB(ctx->opcode)],     \
-                      add_ca, compute_ca, compute_ov);                        \
+                      add_ca, compute_ca, compute_ov, Rc(ctx->opcode));       \
 }
 /* Sub functions with one operand and one immediate */
 #define GEN_INT_ARITH_SUBF_CONST(name, opc3, const_val,                       \
                                 add_ca, compute_ca, compute_ov)               \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
+static void glue(gen_, name)(DisasContext *ctx)                               \
 {                                                                             \
-    TCGv t0 = tcg_const_local_tl(const_val);                                  \
+    TCGv t0 = tcg_const_tl(const_val);                                        \
     gen_op_arith_subf(ctx, cpu_gpr[rD(ctx->opcode)],                          \
                       cpu_gpr[rA(ctx->opcode)], t0,                           \
-                      add_ca, compute_ca, compute_ov);                        \
+                      add_ca, compute_ca, compute_ov, Rc(ctx->opcode));       \
     tcg_temp_free(t0);                                                        \
 }
 /* subf  subf.  subfo  subfo. */
@@ -1332,15 +1191,29 @@
 /* subfic */
 static void gen_subfic(DisasContext *ctx)
 {
-    /* Start with XER CA and OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-    TCGv t0 = tcg_temp_local_new();
-    TCGv t1 = tcg_const_local_tl(SIMM(ctx->opcode));
-    tcg_gen_sub_tl(t0, t1, cpu_gpr[rA(ctx->opcode)]);
-    gen_op_arith_compute_ca(ctx, t0, t1, 1);
-    tcg_temp_free(t1);
-    tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], t0);
-    tcg_temp_free(t0);
+    TCGv c = tcg_const_tl(SIMM(ctx->opcode));  /* immediate is the minuend */
+    gen_op_arith_subf(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)],
+                      c, 0, 1, 0, 0);  /* no CA in, set CA, no OV, no Rc0 */
+    tcg_temp_free(c);
+}
+
+/* neg neg. nego nego. */
+static inline void gen_op_arith_neg(DisasContext *ctx, bool compute_ov)
+{
+    TCGv zero = tcg_const_tl(0);  /* neg is emitted as subf: rD = 0 - rA */
+    gen_op_arith_subf(ctx, cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)],
+                      zero, 0, 0, compute_ov, Rc(ctx->opcode));  /* CA unused */
+    tcg_temp_free(zero);
+}
+
+static void gen_neg(DisasContext *ctx)
+{
+    gen_op_arith_neg(ctx, 0);
+}
+
+static void gen_nego(DisasContext *ctx)
+{
+    gen_op_arith_neg(ctx, 1);
 }
 
 /***                            Integer logical                            ***/
@@ -1887,30 +1760,25 @@
 static void gen_srawi(DisasContext *ctx)
 {
     int sh = SH(ctx->opcode);
-    if (sh != 0) {
-        int l1, l2;
-        TCGv t0;
-        l1 = gen_new_label();
-        l2 = gen_new_label();
-        t0 = tcg_temp_local_new();
-        tcg_gen_ext32s_tl(t0, cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_brcondi_tl(TCG_COND_GE, t0, 0, l1);
-        tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)], (1ULL << sh) - 1);
-        tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, l1);
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, 1 << XER_CA);
-        tcg_gen_br(l2);
-        gen_set_label(l1);
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-        gen_set_label(l2);
-        tcg_gen_ext32s_tl(t0, cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_sari_tl(cpu_gpr[rA(ctx->opcode)], t0, sh);
-        tcg_temp_free(t0);
+    TCGv dst = cpu_gpr[rA(ctx->opcode)];
+    TCGv src = cpu_gpr[rS(ctx->opcode)];
+    if (sh == 0) {
+        tcg_gen_ext32s_tl(dst, src);  /* a zero shift still sign-extends */
+        tcg_gen_movi_tl(cpu_ca, 0);   /* nothing shifted out, so no carry */
     } else {
-        tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
+        TCGv t0;
+        tcg_gen_ext32s_tl(dst, src);  /* operate on the sign-extended word */
+        tcg_gen_andi_tl(cpu_ca, dst, (1ULL << sh) - 1);  /* shifted-out bits */
+        t0 = tcg_temp_new();
+        tcg_gen_sari_tl(t0, dst, TARGET_LONG_BITS - 1);  /* 0 or -1 by sign */
+        tcg_gen_and_tl(cpu_ca, cpu_ca, t0);  /* keep them only if negative */
+        tcg_temp_free(t0);
+        tcg_gen_setcondi_tl(TCG_COND_NE, cpu_ca, cpu_ca, 0);  /* CA = 0 or 1 */
+        tcg_gen_sari_tl(dst, dst, sh);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, dst);
+    }
 }
 
 /* srw & srw. */
@@ -1970,28 +1838,24 @@
 static inline void gen_sradi(DisasContext *ctx, int n)
 {
     int sh = SH(ctx->opcode) + (n << 5);
-    if (sh != 0) {
-        int l1, l2;
-        TCGv t0;
-        l1 = gen_new_label();
-        l2 = gen_new_label();
-        t0 = tcg_temp_local_new();
-        tcg_gen_brcondi_tl(TCG_COND_GE, cpu_gpr[rS(ctx->opcode)], 0, l1);
-        tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)], (1ULL << sh) - 1);
-        tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, l1);
-        tcg_gen_ori_tl(cpu_xer, cpu_xer, 1 << XER_CA);
-        tcg_gen_br(l2);
-        gen_set_label(l1);
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
-        gen_set_label(l2);
-        tcg_temp_free(t0);
-        tcg_gen_sari_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], sh);
+    TCGv dst = cpu_gpr[rA(ctx->opcode)];
+    TCGv src = cpu_gpr[rS(ctx->opcode)];
+    if (sh == 0) {
+        tcg_gen_mov_tl(dst, src);
+        tcg_gen_movi_tl(cpu_ca, 0);  /* nothing shifted out, so no carry */
     } else {
-        tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
+        TCGv t0;
+        tcg_gen_andi_tl(cpu_ca, src, (1ULL << sh) - 1);  /* shifted-out bits */
+        t0 = tcg_temp_new();
+        tcg_gen_sari_tl(t0, src, TARGET_LONG_BITS - 1);  /* 0 or -1 by sign */
+        tcg_gen_and_tl(cpu_ca, cpu_ca, t0);  /* keep them only if negative */
+        tcg_temp_free(t0);
+        tcg_gen_setcondi_tl(TCG_COND_NE, cpu_ca, cpu_ca, 0);  /* CA = 0 or 1 */
+        tcg_gen_sari_tl(dst, src, sh);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, dst);
+    }
 }
 
 static void gen_sradi0(DisasContext *ctx)
@@ -3176,9 +3040,7 @@
     {
         int l1;
 
-        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_xer);
-        tcg_gen_shri_i32(cpu_crf[0], cpu_crf[0], XER_SO);
-        tcg_gen_andi_i32(cpu_crf[0], cpu_crf[0], 1);
+        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
         l1 = gen_new_label();
         tcg_gen_brcond_tl(TCG_COND_NE, t0, cpu_reserve, l1);
         tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], 1 << CRF_EQ);
@@ -3219,9 +3081,7 @@
 #else
     {
         int l1;
-        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_xer);
-        tcg_gen_shri_i32(cpu_crf[0], cpu_crf[0], XER_SO);
-        tcg_gen_andi_i32(cpu_crf[0], cpu_crf[0], 1);
+        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
         l1 = gen_new_label();
         tcg_gen_brcond_tl(TCG_COND_NE, t0, cpu_reserve, l1);
         tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], 1 << CRF_EQ);
@@ -3797,12 +3657,55 @@
 
 /***                          Processor control                            ***/
 
+static void gen_read_xer(TCGv dst)  /* dst = cpu_xer with SO/OV/CA merged in */
+{
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    tcg_gen_mov_tl(dst, cpu_xer);  /* the remaining XER fields */
+    tcg_gen_shli_tl(t0, cpu_so, XER_SO);  /* move each flag to its XER bit */
+    tcg_gen_shli_tl(t1, cpu_ov, XER_OV);
+    tcg_gen_shli_tl(t2, cpu_ca, XER_CA);
+    tcg_gen_or_tl(t0, t0, t1);
+    tcg_gen_or_tl(dst, dst, t2);
+    tcg_gen_or_tl(dst, dst, t0);
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    tcg_temp_free(t2);
+}
+
+static void gen_write_xer(TCGv src)  /* split src into cpu_xer + SO/OV/CA */
+{
+    tcg_gen_andi_tl(cpu_xer, src,  /* cpu_xer keeps src minus the flag bits */
+                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
+    tcg_gen_shri_tl(cpu_so, src, XER_SO);  /* bring each flag down to bit 0 */
+    tcg_gen_shri_tl(cpu_ov, src, XER_OV);
+    tcg_gen_shri_tl(cpu_ca, src, XER_CA);
+    tcg_gen_andi_tl(cpu_so, cpu_so, 1);  /* and reduce it to exactly 0 or 1 */
+    tcg_gen_andi_tl(cpu_ov, cpu_ov, 1);
+    tcg_gen_andi_tl(cpu_ca, cpu_ca, 1);
+}
+
 /* mcrxr */
 static void gen_mcrxr(DisasContext *ctx)
 {
-    tcg_gen_trunc_tl_i32(cpu_crf[crfD(ctx->opcode)], cpu_xer);
-    tcg_gen_shri_i32(cpu_crf[crfD(ctx->opcode)], cpu_crf[crfD(ctx->opcode)], XER_CA);
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_SO | 1 << XER_OV | 1 << XER_CA));
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
+    TCGv_i32 dst = cpu_crf[crfD(ctx->opcode)];
+
+    tcg_gen_trunc_tl_i32(t0, cpu_so);  /* each flag is held as 0 or 1 */
+    tcg_gen_trunc_tl_i32(t1, cpu_ov);
+    tcg_gen_trunc_tl_i32(dst, cpu_ca);
+    tcg_gen_shli_i32(t0, t0, 3);    /* SO -> CR field bit 3 */
+    tcg_gen_shli_i32(t1, t1, 2);    /* OV -> CR field bit 2 */
+    tcg_gen_shli_i32(dst, dst, 1);  /* CA -> CR field bit 1; bit 0 stays 0 */
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_gen_or_i32(dst, dst, t1);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+    tcg_gen_movi_tl(cpu_so, 0);  /* mcrxr also clears the three XER flags */
+    tcg_gen_movi_tl(cpu_ov, 0);
+    tcg_gen_movi_tl(cpu_ca, 0);
 }
 
 /* mfcr mfocrf */
@@ -4532,10 +4435,11 @@
     int l2 = gen_new_label();
     int l3 = gen_new_label();
     /* Start with XER OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+    tcg_gen_movi_tl(cpu_ov, 0);
     tcg_gen_brcondi_tl(TCG_COND_GE, cpu_gpr[rA(ctx->opcode)], 0, l2);
     tcg_gen_brcondi_tl(TCG_COND_NE, cpu_gpr[rA(ctx->opcode)], 0x80000000, l1);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+    tcg_gen_movi_tl(cpu_ov, 1);
+    tcg_gen_movi_tl(cpu_so, 1);
     tcg_gen_br(l2);
     gen_set_label(l1);
     tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
@@ -4616,7 +4520,7 @@
     TCGv t1 = tcg_temp_new();
     TCGv t2 = tcg_temp_new();
     /* Start with XER OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+    tcg_gen_movi_tl(cpu_ov, 0);
     tcg_gen_brcond_tl(TCG_COND_GE, cpu_gpr[rB(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], l1);
     tcg_gen_sub_tl(t0, cpu_gpr[rB(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
     tcg_gen_xor_tl(t1, cpu_gpr[rB(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
@@ -4624,7 +4528,8 @@
     tcg_gen_andc_tl(t1, t1, t2);
     tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], t0);
     tcg_gen_brcondi_tl(TCG_COND_GE, t1, 0, l2);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+    tcg_gen_movi_tl(cpu_ov, 1);
+    tcg_gen_movi_tl(cpu_so, 1);
     tcg_gen_br(l2);
     gen_set_label(l1);
     tcg_gen_movi_tl(cpu_gpr[rD(ctx->opcode)], 0);
@@ -4742,7 +4647,7 @@
     TCGv_i64 t1 = tcg_temp_new_i64();
     TCGv t2 = tcg_temp_new();
     /* Start with XER OV disabled, the most likely case */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+    tcg_gen_movi_tl(cpu_ov, 0);
     tcg_gen_extu_tl_i64(t0, cpu_gpr[rA(ctx->opcode)]);
     tcg_gen_extu_tl_i64(t1, cpu_gpr[rB(ctx->opcode)]);
     tcg_gen_mul_i64(t0, t0, t1);
@@ -4752,7 +4657,8 @@
     tcg_gen_trunc_i64_tl(cpu_gpr[rD(ctx->opcode)], t1);
     tcg_gen_ext32s_i64(t1, t0);
     tcg_gen_brcond_i64(TCG_COND_EQ, t0, t1, l1);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+    tcg_gen_movi_tl(cpu_ov, 1);
+    tcg_gen_movi_tl(cpu_so, 1);
     gen_set_label(l1);
     tcg_temp_free_i64(t0);
     tcg_temp_free_i64(t1);
@@ -4788,7 +4694,7 @@
     tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
     gen_set_label(l2);
     /* nabs never overflows */
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+    tcg_gen_movi_tl(cpu_ov, 0);
     if (unlikely(Rc(ctx->opcode) != 0))
         gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
 }
@@ -4965,10 +4871,10 @@
     tcg_gen_shli_tl(t1, cpu_gpr[rS(ctx->opcode)], 32 - sh);
     tcg_gen_or_tl(t0, t0, t1);
     gen_store_spr(SPR_MQ, t0);
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
+    tcg_gen_movi_tl(cpu_ca, 0);
     tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, l1);
     tcg_gen_brcondi_tl(TCG_COND_GE, cpu_gpr[rS(ctx->opcode)], 0, l1);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_CA));
+    tcg_gen_movi_tl(cpu_ca, 1);
     gen_set_label(l1);
     tcg_gen_sari_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], sh);
     tcg_temp_free(t0);
@@ -4999,10 +4905,10 @@
     gen_set_label(l1);
     tcg_temp_free(t0);
     tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t1);
-    tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_CA));
+    tcg_gen_movi_tl(cpu_ca, 0);
     tcg_gen_brcondi_tl(TCG_COND_GE, t1, 0, l2);
     tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, l2);
-    tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_CA));
+    tcg_gen_movi_tl(cpu_ca, 1);
     gen_set_label(l2);
     tcg_temp_free(t1);
     tcg_temp_free(t2);
@@ -5571,7 +5477,7 @@
 
             if (opc3 & 0x10) {
                 /* Start with XER OV disabled, the most likely case */
-                tcg_gen_andi_tl(cpu_xer, cpu_xer, ~(1 << XER_OV));
+                tcg_gen_movi_tl(cpu_ov, 0);
             }
             if (opc3 & 0x01) {
                 /* Signed */
@@ -5594,7 +5500,8 @@
             }
             if (opc3 & 0x10) {
                 /* Check overflow */
-                tcg_gen_ori_tl(cpu_xer, cpu_xer, (1 << XER_OV) | (1 << XER_SO));
+                tcg_gen_movi_tl(cpu_ov, 1);
+                tcg_gen_movi_tl(cpu_so, 1);
             }
             gen_set_label(l1);
             tcg_gen_mov_tl(cpu_gpr[rt], t0);
@@ -5982,9 +5889,7 @@
     tcg_temp_free(t0);
     if (Rc(ctx->opcode)) {
         int l1 = gen_new_label();
-        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_xer);
-        tcg_gen_shri_i32(cpu_crf[0], cpu_crf[0], XER_SO);
-        tcg_gen_andi_i32(cpu_crf[0], cpu_crf[0], 1);
+        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
         tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_gpr[rD(ctx->opcode)], -1, l1);
         tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], 0x02);
         gen_set_label(l1);
@@ -6065,9 +5970,7 @@
     tcg_temp_free(t0);
     if (Rc(ctx->opcode)) {
         int l1 = gen_new_label();
-        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_xer);
-        tcg_gen_shri_i32(cpu_crf[0], cpu_crf[0], XER_SO);
-        tcg_gen_andi_i32(cpu_crf[0], cpu_crf[0], 1);
+        tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
         tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_gpr[rD(ctx->opcode)], -1, l1);
         tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], 0x02);
         gen_set_label(l1);
@@ -9416,7 +9319,7 @@
 
     cpu_fprintf(f, "NIP " TARGET_FMT_lx "   LR " TARGET_FMT_lx " CTR "
                 TARGET_FMT_lx " XER " TARGET_FMT_lx "\n",
-                env->nip, env->lr, env->ctr, env->xer);
+                env->nip, env->lr, env->ctr, cpu_read_xer(env));
     cpu_fprintf(f, "MSR " TARGET_FMT_lx " HID0 " TARGET_FMT_lx "  HF "
                 TARGET_FMT_lx " idx %d\n", env->msr, env->spr[SPR_HID0],
                 env->hflags, env->mmu_idx);
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index 5df2057..f5fc9b1 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -118,12 +118,12 @@
 /* XER */
 static void spr_read_xer (void *opaque, int gprn, int sprn)
 {
-    tcg_gen_mov_tl(cpu_gpr[gprn], cpu_xer);
+    gen_read_xer(cpu_gpr[gprn]);
 }
 
 static void spr_write_xer (void *opaque, int sprn, int gprn)
 {
-    tcg_gen_mov_tl(cpu_xer, cpu_gpr[gprn]);
+    gen_write_xer(cpu_gpr[gprn]);
 }
 
 /* LR */
diff --git a/target-s390x/helper.h b/target-s390x/helper.h
index dd90d93..0d80aa0 100644
--- a/target-s390x/helper.h
+++ b/target-s390x/helper.h
@@ -8,7 +8,6 @@
 DEF_HELPER_FLAGS_4(clc, TCG_CALL_NO_WG, i32, env, i32, i64, i64)
 DEF_HELPER_3(mvcl, i32, env, i32, i32)
 DEF_HELPER_FLAGS_4(clm, TCG_CALL_NO_WG, i32, env, i32, i32, i64)
-DEF_HELPER_FLAGS_3(mul128, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(divs32, TCG_CALL_NO_WG, s64, env, s64, s64)
 DEF_HELPER_FLAGS_3(divu32, TCG_CALL_NO_WG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(divs64, TCG_CALL_NO_WG, s64, env, s64, s64)
diff --git a/target-s390x/int_helper.c b/target-s390x/int_helper.c
index 6858301..af16b21 100644
--- a/target-s390x/int_helper.c
+++ b/target-s390x/int_helper.c
@@ -29,14 +29,6 @@
 #define HELPER_LOG(x...)
 #endif
 
-/* 64/64 -> 128 unsigned multiplication */
-uint64_t HELPER(mul128)(CPUS390XState *env, uint64_t v1, uint64_t v2)
-{
-    uint64_t reth;
-    mulu64(&env->retxl, &reth, v1, v2);
-    return reth;
-}
-
 /* 64/32 -> 32 signed division */
 int64_t HELPER(divs32)(CPUS390XState *env, int64_t a, int64_t b64)
 {
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index a57296c..bdf69a3 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -2566,8 +2566,7 @@
 
 static ExitStatus op_mul128(DisasContext *s, DisasOps *o)
 {
-    gen_helper_mul128(o->out, cpu_env, o->in1, o->in2);
-    return_low128(o->out2);
+    tcg_gen_mulu2_i64(o->out2, o->out, o->in1, o->in2);  /* out2=low, out=high */
     return NO_EXIT;
 }
 
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index c58d79a..d255066 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -833,36 +833,10 @@
         gen_helper_div1(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
 	return;
     case 0x300d:		/* dmuls.l Rm,Rn */
-	{
-	    TCGv_i64 tmp1 = tcg_temp_new_i64();
-	    TCGv_i64 tmp2 = tcg_temp_new_i64();
-
-	    tcg_gen_ext_i32_i64(tmp1, REG(B7_4));
-	    tcg_gen_ext_i32_i64(tmp2, REG(B11_8));
-	    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-	    tcg_gen_trunc_i64_i32(cpu_macl, tmp1);
-	    tcg_gen_shri_i64(tmp1, tmp1, 32);
-	    tcg_gen_trunc_i64_i32(cpu_mach, tmp1);
-
-	    tcg_temp_free_i64(tmp2);
-	    tcg_temp_free_i64(tmp1);
-	}
+        tcg_gen_muls2_i32(cpu_macl, cpu_mach, REG(B7_4), REG(B11_8));
 	return;
     case 0x3005:		/* dmulu.l Rm,Rn */
-	{
-	    TCGv_i64 tmp1 = tcg_temp_new_i64();
-	    TCGv_i64 tmp2 = tcg_temp_new_i64();
-
-	    tcg_gen_extu_i32_i64(tmp1, REG(B7_4));
-	    tcg_gen_extu_i32_i64(tmp2, REG(B11_8));
-	    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-	    tcg_gen_trunc_i64_i32(cpu_macl, tmp1);
-	    tcg_gen_shri_i64(tmp1, tmp1, 32);
-	    tcg_gen_trunc_i64_i32(cpu_mach, tmp1);
-
-	    tcg_temp_free_i64(tmp2);
-	    tcg_temp_free_i64(tmp1);
-	}
+        tcg_gen_mulu2_i32(cpu_macl, cpu_mach, REG(B7_4), REG(B11_8));
 	return;
     case 0x600e:		/* exts.b Rm,Rn */
 	tcg_gen_ext8s_i32(REG(B11_8), REG(B7_4));
diff --git a/target-sparc/cpu.c b/target-sparc/cpu.c
index ef52df6..50def61 100644
--- a/target-sparc/cpu.c
+++ b/target-sparc/cpu.c
@@ -580,13 +580,13 @@
         .fpu_version = 4 << 17, /* FPU version 4 (Meiko) */
         .mmu_version = 0xf3000000,
         .mmu_bm = 0x00000000,
-        .mmu_ctpr_mask = 0x007ffff0,
-        .mmu_cxr_mask = 0x0000003f,
+        .mmu_ctpr_mask = 0xfffffffc,
+        .mmu_cxr_mask = 0x000000ff,
         .mmu_sfsr_mask = 0xffffffff,
         .mmu_trcr_mask = 0xffffffff,
         .nwindows = 8,
         .features = CPU_DEFAULT_FEATURES | CPU_FEATURE_TA0_SHUTDOWN |
-        CPU_FEATURE_ASR17 | CPU_FEATURE_CACHE_CTRL,
+        CPU_FEATURE_ASR17 | CPU_FEATURE_CACHE_CTRL | CPU_FEATURE_POWERDOWN,
     },
 #endif
 };
diff --git a/target-sparc/cpu.h b/target-sparc/cpu.h
index 7389b03..a2f2cc8 100644
--- a/target-sparc/cpu.h
+++ b/target-sparc/cpu.h
@@ -270,6 +270,7 @@
 #define CPU_FEATURE_TA0_SHUTDOWN (1 << 14) /* Shutdown on "ta 0x0" */
 #define CPU_FEATURE_ASR17        (1 << 15)
 #define CPU_FEATURE_CACHE_CTRL   (1 << 16)
+#define CPU_FEATURE_POWERDOWN    (1 << 17)
 
 #ifndef TARGET_SPARC64
 #define CPU_DEFAULT_FEATURES (CPU_FEATURE_FLOAT | CPU_FEATURE_SWAP |  \
diff --git a/target-sparc/helper.c b/target-sparc/helper.c
index 91ecfc7..58e7efe 100644
--- a/target-sparc/helper.c
+++ b/target-sparc/helper.c
@@ -225,3 +225,14 @@
     cpu_restore_state(env, GETPC());
     helper_raise_exception(env, TT_TOVF);
 }
+
+#ifndef TARGET_SPARC64
+void helper_power_down(CPUSPARCState *env)  /* halt the CPU, leave the loop */
+{
+    env->halted = 1;
+    env->exception_index = EXCP_HLT;
+    env->pc = env->npc;      /* advance pc/npc by one instruction first */
+    env->npc = env->pc + 4;
+    cpu_loop_exit(env);      /* NOTE(review): presumably does not return */
+}
+#endif
diff --git a/target-sparc/helper.h b/target-sparc/helper.h
index cfcdab1..15f7328 100644
--- a/target-sparc/helper.h
+++ b/target-sparc/helper.h
@@ -4,6 +4,7 @@
 DEF_HELPER_1(rett, void, env)
 DEF_HELPER_2(wrpsr, void, env, tl)
 DEF_HELPER_1(rdpsr, tl, env)
+DEF_HELPER_1(power_down, void, env)
 #else
 DEF_HELPER_2(wrpil, void, env, tl)
 DEF_HELPER_2(wrpstate, void, env, tl)
diff --git a/target-sparc/ldst_helper.c b/target-sparc/ldst_helper.c
index 7decd66..6d767fb 100644
--- a/target-sparc/ldst_helper.c
+++ b/target-sparc/ldst_helper.c
@@ -514,6 +514,7 @@
 #endif
         break;
     case 3: /* MMU probe */
+    case 0x18: /* LEON3 MMU probe */
         {
             int mmulev;
 
@@ -528,6 +529,7 @@
         }
         break;
     case 4: /* read MMU regs */
+    case 0x19: /* LEON3 read MMU regs */
         {
             int reg = (addr >> 8) & 0x1f;
 
@@ -603,6 +605,7 @@
     case 0xf: /* D-cache data */
         break;
     case 0x20: /* MMU passthrough */
+    case 0x1c: /* LEON MMU passthrough */
         switch (size) {
         case 1:
             ret = ldub_phys(addr);
@@ -844,6 +847,7 @@
 #endif
         break;
     case 3: /* MMU flush */
+    case 0x18: /* LEON3 MMU flush */
         {
             int mmulev;
 
@@ -868,6 +872,7 @@
         }
         break;
     case 4: /* write MMU regs */
+    case 0x19: /* LEON3 write MMU regs */
         {
             int reg = (addr >> 8) & 0x1f;
             uint32_t oldreg;
@@ -996,6 +1001,7 @@
         }
         break;
     case 0x20: /* MMU passthrough */
+    case 0x1c: /* LEON MMU passthrough */
         {
             switch (size) {
             case 1:
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index ca75e1a..12276d5 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -448,19 +448,16 @@
     case CC_OP_ADD:
     case CC_OP_TADD:
     case CC_OP_TADDTV:
-#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
-        {
-            /* For 32-bit hosts, we can re-use the host's hardware carry
-               generation by using an ADD2 opcode.  We discard the low
-               part of the output.  Ideally we'd combine this operation
-               with the add that generated the carry in the first place.  */
-            TCGv dst_low = tcg_temp_new();
-            tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst,
-                            cpu_cc_src, src1, cpu_cc_src2, src2);
-            tcg_temp_free(dst_low);
+        if (TARGET_LONG_BITS == 32) {
+            /* We can re-use the host's hardware carry generation by using
+               an ADD2 opcode.  We discard the low part of the output.
+               Ideally we'd combine this operation with the add that
+               generated the carry in the first place.  */
+            carry = tcg_temp_new();
+            tcg_gen_add2_tl(carry, dst, cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(carry);
             goto add_done;
         }
-#endif
         carry_32 = gen_add32_carry32();
         break;
 
@@ -492,9 +489,7 @@
     tcg_temp_free(carry);
 #endif
 
-#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
  add_done:
-#endif
     if (update_cc) {
         tcg_gen_mov_tl(cpu_cc_src, src1);
         tcg_gen_mov_tl(cpu_cc_src2, src2);
@@ -554,19 +549,16 @@
     case CC_OP_SUB:
     case CC_OP_TSUB:
     case CC_OP_TSUBTV:
-#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
-        {
-            /* For 32-bit hosts, we can re-use the host's hardware carry
-               generation by using a SUB2 opcode.  We discard the low
-               part of the output.  Ideally we'd combine this operation
-               with the add that generated the carry in the first place.  */
-            TCGv dst_low = tcg_temp_new();
-            tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst,
-                            cpu_cc_src, src1, cpu_cc_src2, src2);
-            tcg_temp_free(dst_low);
+        if (TARGET_LONG_BITS == 32) {
+            /* We can re-use the host's hardware carry generation by using
+               a SUB2 opcode.  We discard the low part of the output.
+               Ideally we'd combine this operation with the add that
+               generated the carry in the first place.  */
+            carry = tcg_temp_new();
+            tcg_gen_sub2_tl(carry, dst, cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(carry);
             goto sub_done;
         }
-#endif
         carry_32 = gen_sub32_carry32();
         break;
 
@@ -592,9 +584,7 @@
     tcg_temp_free(carry);
 #endif
 
-#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
  sub_done:
-#endif
     if (update_cc) {
         tcg_gen_mov_tl(cpu_cc_src, src1);
         tcg_gen_mov_tl(cpu_cc_src2, src2);
@@ -652,39 +642,30 @@
 
 static inline void gen_op_multiply(TCGv dst, TCGv src1, TCGv src2, int sign_ext)
 {
-    TCGv_i32 r_src1, r_src2;
-    TCGv_i64 r_temp, r_temp2;
-
-    r_src1 = tcg_temp_new_i32();
-    r_src2 = tcg_temp_new_i32();
-
-    tcg_gen_trunc_tl_i32(r_src1, src1);
-    tcg_gen_trunc_tl_i32(r_src2, src2);
-
-    r_temp = tcg_temp_new_i64();
-    r_temp2 = tcg_temp_new_i64();
+#if TARGET_LONG_BITS == 32
+    if (sign_ext) {
+        tcg_gen_muls2_tl(dst, cpu_y, src1, src2);
+    } else {
+        tcg_gen_mulu2_tl(dst, cpu_y, src1, src2);
+    }
+#else
+    TCGv t0 = tcg_temp_new_i64();
+    TCGv t1 = tcg_temp_new_i64();
 
     if (sign_ext) {
-        tcg_gen_ext_i32_i64(r_temp, r_src2);
-        tcg_gen_ext_i32_i64(r_temp2, r_src1);
+        tcg_gen_ext32s_i64(t0, src1);
+        tcg_gen_ext32s_i64(t1, src2);
     } else {
-        tcg_gen_extu_i32_i64(r_temp, r_src2);
-        tcg_gen_extu_i32_i64(r_temp2, r_src1);
+        tcg_gen_ext32u_i64(t0, src1);
+        tcg_gen_ext32u_i64(t1, src2);
     }
 
-    tcg_gen_mul_i64(r_temp2, r_temp, r_temp2);
+    tcg_gen_mul_i64(dst, t0, t1);
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
 
-    tcg_gen_shri_i64(r_temp, r_temp2, 32);
-    tcg_gen_trunc_i64_tl(cpu_y, r_temp);
-    tcg_temp_free_i64(r_temp);
-    tcg_gen_andi_tl(cpu_y, cpu_y, 0xffffffff);
-
-    tcg_gen_trunc_i64_tl(dst, r_temp2);
-
-    tcg_temp_free_i64(r_temp2);
-
-    tcg_temp_free_i32(r_src1);
-    tcg_temp_free_i32(r_src2);
+    tcg_gen_shri_i64(cpu_y, dst, 32);
+#endif
 }
 
 static inline void gen_op_umul(TCGv dst, TCGv src1, TCGv src2)
@@ -3642,6 +3623,11 @@
                                                    in the SPARCv8
                                                    manual, nop on the
                                                    microSPARC II */
+                                if ((rd == 0x13) && (dc->def->features &
+                                                     CPU_FEATURE_POWERDOWN)) {
+                                    /* LEON3 power-down */
+                                    gen_helper_power_down(cpu_env);
+                                }
                                 break;
 #else
                             case 0x2: /* V9 wrccr */
diff --git a/target-unicore32/translate.c b/target-unicore32/translate.c
index f4498bc..d5039e2 100644
--- a/target-unicore32/translate.c
+++ b/target-unicore32/translate.c
@@ -267,37 +267,6 @@
     dead_tmp(tmp);
 }
 
-/* FIXME: Most targets have native widening multiplication.
-   It would be good to use that instead of a full wide multiply.  */
-/* 32x32->64 multiply.  Marks inputs as dead.  */
-static TCGv_i64 gen_mulu_i64_i32(TCGv a, TCGv b)
-{
-    TCGv_i64 tmp1 = tcg_temp_new_i64();
-    TCGv_i64 tmp2 = tcg_temp_new_i64();
-
-    tcg_gen_extu_i32_i64(tmp1, a);
-    dead_tmp(a);
-    tcg_gen_extu_i32_i64(tmp2, b);
-    dead_tmp(b);
-    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-    tcg_temp_free_i64(tmp2);
-    return tmp1;
-}
-
-static TCGv_i64 gen_muls_i64_i32(TCGv a, TCGv b)
-{
-    TCGv_i64 tmp1 = tcg_temp_new_i64();
-    TCGv_i64 tmp2 = tcg_temp_new_i64();
-
-    tcg_gen_ext_i32_i64(tmp1, a);
-    dead_tmp(a);
-    tcg_gen_ext_i32_i64(tmp2, b);
-    dead_tmp(b);
-    tcg_gen_mul_i64(tmp1, tmp1, tmp2);
-    tcg_temp_free_i64(tmp2);
-    return tmp1;
-}
-
 #define gen_set_CF(var) tcg_gen_st_i32(var, cpu_env, offsetof(CPUUniCore32State, CF))
 
 /* Set CF to the top bit of var.  */
@@ -1219,38 +1188,6 @@
     }
 }
 
-
-/* Store a 64-bit value to a register pair.  Clobbers val.  */
-static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val)
-{
-    TCGv tmp;
-    tmp = new_tmp();
-    tcg_gen_trunc_i64_i32(tmp, val);
-    store_reg(s, rlow, tmp);
-    tmp = new_tmp();
-    tcg_gen_shri_i64(val, val, 32);
-    tcg_gen_trunc_i64_i32(tmp, val);
-    store_reg(s, rhigh, tmp);
-}
-
-/* load and add a 64-bit value from a register pair.  */
-static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh)
-{
-    TCGv_i64 tmp;
-    TCGv tmpl;
-    TCGv tmph;
-
-    /* Load 64-bit value rd:rn.  */
-    tmpl = load_reg(s, rlow);
-    tmph = load_reg(s, rhigh);
-    tmp = tcg_temp_new_i64();
-    tcg_gen_concat_i32_i64(tmp, tmpl, tmph);
-    dead_tmp(tmpl);
-    dead_tmp(tmph);
-    tcg_gen_add_i64(val, val, tmp);
-    tcg_temp_free_i64(tmp);
-}
-
 /* data processing instructions */
 static void do_datap(CPUUniCore32State *env, DisasContext *s, uint32_t insn)
 {
@@ -1445,24 +1382,26 @@
 /* multiply */
 static void do_mult(CPUUniCore32State *env, DisasContext *s, uint32_t insn)
 {
-    TCGv tmp;
-    TCGv tmp2;
-    TCGv_i64 tmp64;
+    TCGv tmp, tmp2, tmp3, tmp4;
 
     if (UCOP_SET(27)) {
         /* 64 bit mul */
         tmp = load_reg(s, UCOP_REG_M);
         tmp2 = load_reg(s, UCOP_REG_N);
         if (UCOP_SET(26)) {
-            tmp64 = gen_muls_i64_i32(tmp, tmp2);
+            tcg_gen_muls2_i32(tmp, tmp2, tmp, tmp2);
         } else {
-            tmp64 = gen_mulu_i64_i32(tmp, tmp2);
+            tcg_gen_mulu2_i32(tmp, tmp2, tmp, tmp2);
         }
         if (UCOP_SET(25)) { /* mult accumulate */
-            gen_addq(s, tmp64, UCOP_REG_LO, UCOP_REG_HI);
+            tmp3 = load_reg(s, UCOP_REG_LO);
+            tmp4 = load_reg(s, UCOP_REG_HI);
+            tcg_gen_add2_i32(tmp, tmp2, tmp, tmp2, tmp3, tmp4);
+            dead_tmp(tmp3);
+            dead_tmp(tmp4);
         }
-        gen_storeq_reg(s, UCOP_REG_LO, UCOP_REG_HI, tmp64);
-        tcg_temp_free_i64(tmp64);
+        store_reg(s, UCOP_REG_LO, tmp);
+        store_reg(s, UCOP_REG_HI, tmp2);
     } else {
         /* 32 bit mul */
         tmp = load_reg(s, UCOP_REG_M);
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 7029ac4..11e06a3 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -1652,24 +1652,16 @@
             case 11: /*MULSHi*/
                 HAS_OPTION(XTENSA_OPTION_32_BIT_IMUL_HIGH);
                 {
-                    TCGv_i64 r = tcg_temp_new_i64();
-                    TCGv_i64 s = tcg_temp_new_i64();
-                    TCGv_i64 t = tcg_temp_new_i64();
+                    TCGv lo = tcg_temp_new();
 
                     if (OP2 == 10) {
-                        tcg_gen_extu_i32_i64(s, cpu_R[RRR_S]);
-                        tcg_gen_extu_i32_i64(t, cpu_R[RRR_T]);
+                        tcg_gen_mulu2_i32(lo, cpu_R[RRR_R],
+                                          cpu_R[RRR_S], cpu_R[RRR_T]);
                     } else {
-                        tcg_gen_ext_i32_i64(s, cpu_R[RRR_S]);
-                        tcg_gen_ext_i32_i64(t, cpu_R[RRR_T]);
+                        tcg_gen_muls2_i32(lo, cpu_R[RRR_R],
+                                          cpu_R[RRR_S], cpu_R[RRR_T]);
                     }
-                    tcg_gen_mul_i64(r, s, t);
-                    tcg_gen_shri_i64(r, r, 32);
-                    tcg_gen_trunc_i64_i32(cpu_R[RRR_R], r);
-
-                    tcg_temp_free_i64(r);
-                    tcg_temp_free_i64(s);
-                    tcg_temp_free_i64(t);
+                    tcg_temp_free(lo);
                 }
                 break;
 
@@ -2495,27 +2487,24 @@
                             tcg_gen_sari_i32(cpu_SR[ACCHI], cpu_SR[ACCLO], 31);
                         }
                     } else {
-                        TCGv_i32 res = tcg_temp_new_i32();
-                        TCGv_i64 res64 = tcg_temp_new_i64();
-                        TCGv_i64 tmp = tcg_temp_new_i64();
+                        TCGv_i32 lo = tcg_temp_new_i32();
+                        TCGv_i32 hi = tcg_temp_new_i32();
 
-                        tcg_gen_mul_i32(res, m1, m2);
-                        tcg_gen_ext_i32_i64(res64, res);
-                        tcg_gen_concat_i32_i64(tmp,
-                                cpu_SR[ACCLO], cpu_SR[ACCHI]);
+                        tcg_gen_mul_i32(lo, m1, m2);
+                        tcg_gen_sari_i32(hi, lo, 31);
                         if (op == MAC16_MULA) {
-                            tcg_gen_add_i64(tmp, tmp, res64);
+                            tcg_gen_add2_i32(cpu_SR[ACCLO], cpu_SR[ACCHI],
+                                             cpu_SR[ACCLO], cpu_SR[ACCHI],
+                                             lo, hi);
                         } else {
-                            tcg_gen_sub_i64(tmp, tmp, res64);
+                            tcg_gen_sub2_i32(cpu_SR[ACCLO], cpu_SR[ACCHI],
+                                             cpu_SR[ACCLO], cpu_SR[ACCHI],
+                                             lo, hi);
                         }
-                        tcg_gen_trunc_i64_i32(cpu_SR[ACCLO], tmp);
-                        tcg_gen_shri_i64(tmp, tmp, 32);
-                        tcg_gen_trunc_i64_i32(cpu_SR[ACCHI], tmp);
                         tcg_gen_ext8s_i32(cpu_SR[ACCHI], cpu_SR[ACCHI]);
 
-                        tcg_temp_free(res);
-                        tcg_temp_free_i64(res64);
-                        tcg_temp_free_i64(tmp);
+                        tcg_temp_free_i32(lo);
+                        tcg_temp_free_i32(hi);
                     }
                     tcg_temp_free(m1);
                     tcg_temp_free(m2);
diff --git a/tcg-runtime.c b/tcg-runtime.c
index abfc364..4b66e51 100644
--- a/tcg-runtime.c
+++ b/tcg-runtime.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 #include <stdint.h>
-
+#include "qemu/host-utils.h"
 #include "tcg/tcg-runtime.h"
 
 /* 32-bit helpers */
@@ -83,3 +83,17 @@
 {
     return arg1 % arg2;
 }
+
+uint64_t tcg_helper_muluh_i64(uint64_t arg1, uint64_t arg2)
+{
+    uint64_t l, h;
+    mulu64(&l, &h, arg1, arg2);
+    return h;
+}
+
+int64_t tcg_helper_mulsh_i64(int64_t arg1, int64_t arg2)
+{
+    uint64_t l, h;
+    muls64(&l, &h, arg1, arg2);
+    return h;
+}
diff --git a/tcg/README b/tcg/README
index ec1ac79..934e7af 100644
--- a/tcg/README
+++ b/tcg/README
@@ -361,6 +361,24 @@
 All this opcodes assume that the pointed host memory doesn't correspond
 to a global. In the latter case the behaviour is unpredictable.
 
+********* Multiword arithmetic support
+
+* add2_i32/i64 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
+* sub2_i32/i64 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
+
+Similar to add/sub, except that the double-word inputs T1 and T2 are
+formed from two single-word arguments, and the double-word output T0
+is returned in two single-word outputs.
+
+* mulu2_i32/i64 t0_low, t0_high, t1, t2
+
+Similar to mul, except two unsigned inputs T1 and T2 yielding the full
+double-word product T0.  The latter is returned in two single-word outputs.
+
+* muls2_i32/i64 t0_low, t0_high, t1, t2
+
+Similar to mulu2, except the two inputs T1 and T2 are signed.
+
 ********* 64-bit target on 32-bit host support
 
 The following opcodes are internal to TCG.  Thus they are to be implemented by
@@ -372,18 +390,6 @@
 Similar to brcond, except that the 64-bit values T0 and T1
 are formed from two 32-bit arguments.
 
-* add2_i32 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
-* sub2_i32 t0_low, t0_high, t1_low, t1_high, t2_low, t2_high
-
-Similar to add/sub, except that the 64-bit inputs T1 and T2 are
-formed from two 32-bit arguments, and the 64-bit output T0
-is returned in two 32-bit outputs.
-
-* mulu2_i32 t0_low, t0_high, t1, t2
-
-Similar to mul, except two 32-bit (unsigned) inputs T1 and T2 yielding
-the full 64-bit product T0.  The later is returned in two 32-bit outputs.
-
 * setcond2_i32 dest, t1_low, t1_high, t2_low, t2_high, cond
 
 Similar to setcond, except that the 64-bit values T1 and T2 are
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index d9c33d8..94c6ca4 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1647,6 +1647,9 @@
     case INDEX_op_mulu2_i32:
         tcg_out_umull32(s, COND_AL, args[0], args[1], args[2], args[3]);
         break;
+    case INDEX_op_muls2_i32:
+        tcg_out_smull32(s, COND_AL, args[0], args[1], args[2], args[3]);
+        break;
     /* XXX: Perhaps args[2] & 0x1f is wrong */
     case INDEX_op_shl_i32:
         c = const_args[2] ?
@@ -1798,6 +1801,7 @@
     { INDEX_op_sub_i32, { "r", "r", "rI" } },
     { INDEX_op_mul_i32, { "r", "r", "r" } },
     { INDEX_op_mulu2_i32, { "r", "r", "r", "r" } },
+    { INDEX_op_muls2_i32, { "r", "r", "r", "r" } },
     { INDEX_op_and_i32, { "r", "r", "rI" } },
     { INDEX_op_andc_i32, { "r", "r", "rI" } },
     { INDEX_op_or_i32, { "r", "r", "rI" } },
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 7083f3a..b6eed1f 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -75,6 +75,7 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_movcond_i32      1
+#define TCG_TARGET_HAS_muls2_i32        1
 
 enum {
     TCG_AREG0 = TCG_REG_R6,
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index e2754fe..ebd53d9 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -98,6 +98,7 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
+#define TCG_TARGET_HAS_muls2_i32        0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7aec304..9eec06c 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1922,6 +1922,37 @@
         tcg_out_qemu_st(s, args, 3);
         break;
 
+    OP_32_64(mulu2):
+        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
+        break;
+    OP_32_64(muls2):
+        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
+        break;
+    OP_32_64(add2):
+        if (const_args[4]) {
+            tgen_arithi(s, ARITH_ADD + rexw, args[0], args[4], 1);
+        } else {
+            tgen_arithr(s, ARITH_ADD + rexw, args[0], args[4]);
+        }
+        if (const_args[5]) {
+            tgen_arithi(s, ARITH_ADC + rexw, args[1], args[5], 1);
+        } else {
+            tgen_arithr(s, ARITH_ADC + rexw, args[1], args[5]);
+        }
+        break;
+    OP_32_64(sub2):
+        if (const_args[4]) {
+            tgen_arithi(s, ARITH_SUB + rexw, args[0], args[4], 1);
+        } else {
+            tgen_arithr(s, ARITH_SUB + rexw, args[0], args[4]);
+        }
+        if (const_args[5]) {
+            tgen_arithi(s, ARITH_SBB + rexw, args[1], args[5], 1);
+        } else {
+            tgen_arithr(s, ARITH_SBB + rexw, args[1], args[5]);
+        }
+        break;
+
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_brcond2_i32:
         tcg_out_brcond2(s, args, const_args, 0);
@@ -1929,33 +1960,6 @@
     case INDEX_op_setcond2_i32:
         tcg_out_setcond2(s, args, const_args);
         break;
-    case INDEX_op_mulu2_i32:
-        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_MUL, args[3]);
-        break;
-    case INDEX_op_add2_i32:
-        if (const_args[4]) {
-            tgen_arithi(s, ARITH_ADD, args[0], args[4], 1);
-        } else {
-            tgen_arithr(s, ARITH_ADD, args[0], args[4]);
-        }
-        if (const_args[5]) {
-            tgen_arithi(s, ARITH_ADC, args[1], args[5], 1);
-        } else {
-            tgen_arithr(s, ARITH_ADC, args[1], args[5]);
-        }
-        break;
-    case INDEX_op_sub2_i32:
-        if (const_args[4]) {
-            tgen_arithi(s, ARITH_SUB, args[0], args[4], 1);
-        } else {
-            tgen_arithr(s, ARITH_SUB, args[0], args[4]);
-        }
-        if (const_args[5]) {
-            tgen_arithi(s, ARITH_SBB, args[1], args[5], 1);
-        } else {
-            tgen_arithr(s, ARITH_SBB, args[1], args[5]);
-        }
-        break;
 #else /* TCG_TARGET_REG_BITS == 64 */
     case INDEX_op_movi_i64:
         tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
@@ -2078,10 +2082,12 @@
     { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
 #endif
 
-#if TCG_TARGET_REG_BITS == 32
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
+    { INDEX_op_muls2_i32, { "a", "d", "a", "r" } },
     { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
     { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
+
+#if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
 #else
@@ -2132,6 +2138,11 @@
 
     { INDEX_op_deposit_i64, { "Q", "0", "Q" } },
     { INDEX_op_movcond_i64, { "r", "r", "re", "r", "0" } },
+
+    { INDEX_op_mulu2_i64, { "a", "d", "a", "r" } },
+    { INDEX_op_muls2_i64, { "a", "d", "a", "r" } },
+    { INDEX_op_add2_i64, { "r", "r", "0", "1", "re", "re" } },
+    { INDEX_op_sub2_i64, { "r", "r", "0", "1", "re", "re" } },
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index e63db9c..e3f6bb9 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -92,6 +92,10 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
+#define TCG_TARGET_HAS_add2_i32         1
+#define TCG_TARGET_HAS_sub2_i32         1
+#define TCG_TARGET_HAS_mulu2_i32        1
+#define TCG_TARGET_HAS_muls2_i32        1
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -114,6 +118,10 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      1
 #define TCG_TARGET_HAS_movcond_i64      1
+#define TCG_TARGET_HAS_add2_i64         1
+#define TCG_TARGET_HAS_sub2_i64         1
+#define TCG_TARGET_HAS_mulu2_i64        1
+#define TCG_TARGET_HAS_muls2_i64        1
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index 7f3401e..e3d72ea 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -136,6 +136,14 @@
 #define TCG_TARGET_HAS_movcond_i64      1
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_deposit_i64      1
+#define TCG_TARGET_HAS_add2_i32         0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i32         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i32        0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muls2_i64        0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
 #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 78af664..0384bd3 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -87,6 +87,7 @@
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
+#define TCG_TARGET_HAS_muls2_i32        0
 
 /* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
 #if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 973d2d6..bc6e5c1 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -554,11 +554,12 @@
                 args[5] = tcg_invert_cond(args[5]);
             }
             break;
-        case INDEX_op_add2_i32:
+        CASE_OP_32_64(add2):
             swap_commutative(args[0], &args[2], &args[4]);
             swap_commutative(args[1], &args[3], &args[5]);
             break;
-        case INDEX_op_mulu2_i32:
+        CASE_OP_32_64(mulu2):
+        CASE_OP_32_64(muls2):
             swap_commutative(args[0], &args[2], &args[3]);
             break;
         case INDEX_op_brcond2_i32:
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 0fdad04..17a6bb3 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -94,6 +94,7 @@
 #define TCG_TARGET_HAS_nor_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
+#define TCG_TARGET_HAS_muls2_i32        0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 9b8e9a0..aa6a0f0 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -85,6 +85,10 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_add2_i32         0
+#define TCG_TARGET_HAS_sub2_i32         0
+#define TCG_TARGET_HAS_mulu2_i32        0
+#define TCG_TARGET_HAS_muls2_i32        0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rot_i64          0
@@ -106,6 +110,10 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
 #define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i64        0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index c87b413..40211e6 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -65,6 +65,10 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_add2_i32         0
+#define TCG_TARGET_HAS_sub2_i32         0
+#define TCG_TARGET_HAS_mulu2_i32        0
+#define TCG_TARGET_HAS_muls2_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -87,6 +91,10 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
 #define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i64        0
 #endif
 
 /* used for function call generation */
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 03db514..6d489fc 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -1327,6 +1327,8 @@
                              args[3], const_args[3],
                              args[4], const_args[4]);
         break;
+#endif
+
     case INDEX_op_add2_i32:
         tcg_out_addsub2(s, args[0], args[1], args[2], args[3],
                         args[4], const_args[4], args[5], const_args[5],
@@ -1342,7 +1344,6 @@
                        ARITH_UMUL);
         tcg_out_rdy(s, args[1]);
         break;
-#endif
 
     case INDEX_op_qemu_ld8u:
         tcg_out_qemu_ld(s, args, 0);
@@ -1511,10 +1512,11 @@
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rJ", "rJ" } },
+#endif
+
     { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rJ" } },
-#endif
 
 #if TCG_TARGET_REG_BITS == 64
     { INDEX_op_mov_i64, { "r", "r" } },
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 256f973..b5217be 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -102,6 +102,10 @@
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_movcond_i32      1
+#define TCG_TARGET_HAS_add2_i32         1
+#define TCG_TARGET_HAS_sub2_i32         1
+#define TCG_TARGET_HAS_mulu2_i32        1
+#define TCG_TARGET_HAS_muls2_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div_i64          1
@@ -124,6 +128,10 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
 #define TCG_TARGET_HAS_movcond_i64      1
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i64        0
 #endif
 
 #define TCG_AREG0 TCG_REG_I0
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 91c9d80..d70b2eb 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -2246,6 +2246,26 @@
     tcg_gen_deposit_i64(dest, low, high, 32, 32);
 }
 
+static inline void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
+{
+#if TCG_TARGET_REG_BITS == 32
+    tcg_gen_mov_i32(lo, TCGV_LOW(arg));
+    tcg_gen_mov_i32(hi, TCGV_HIGH(arg));
+#else
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    tcg_gen_trunc_i64_i32(lo, arg);
+    tcg_gen_shri_i64(t0, arg, 32);
+    tcg_gen_trunc_i64_i32(hi, t0);
+    tcg_temp_free_i64(t0);
+#endif
+}
+
+static inline void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg)
+{
+    tcg_gen_ext32u_i64(lo, arg);
+    tcg_gen_shri_i64(hi, arg, 32);
+}
+
 static inline void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret,
                                        TCGv_i32 c1, TCGv_i32 c2,
                                        TCGv_i32 v1, TCGv_i32 v2)
@@ -2312,6 +2332,204 @@
 #endif
 }
 
+static inline void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_add2_i32) {
+        tcg_gen_op6_i32(INDEX_op_add2_i32, rl, rh, al, ah, bl, bh);
+        /* Allow the optimizer room to replace add2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+static inline void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i32) {
+        tcg_gen_op6_i32(INDEX_op_sub2_i32, rl, rh, al, ah, bl, bh);
+        /* Allow the optimizer room to replace sub2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+static inline void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh,
+                                     TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i32) {
+        tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
+        /* Allow the optimizer room to replace mulu2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(t0, arg1);
+        tcg_gen_extu_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+static inline void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh,
+                                     TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i32) {
+        tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
+        /* Allow the optimizer room to replace muls2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_REG_BITS == 32 && TCG_TARGET_HAS_mulu2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        TCGv_i32 t2 = tcg_temp_new_i32();
+        TCGv_i32 t3 = tcg_temp_new_i32();
+        tcg_gen_op4_i32(INDEX_op_mulu2_i32, t0, t1, arg1, arg2);
+        /* Allow the optimizer room to replace mulu2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+        /* Adjust for negative inputs.  */
+        tcg_gen_sari_i32(t2, arg1, 31);
+        tcg_gen_sari_i32(t3, arg2, 31);
+        tcg_gen_and_i32(t2, t2, arg2);
+        tcg_gen_and_i32(t3, t3, arg1);
+        tcg_gen_sub_i32(rh, t1, t2);
+        tcg_gen_sub_i32(rh, rh, t3);
+        tcg_gen_mov_i32(rl, t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+        tcg_temp_free_i32(t2);
+        tcg_temp_free_i32(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t0, arg1);
+        tcg_gen_ext_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+static inline void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_add2_i64) {
+        tcg_gen_op6_i64(INDEX_op_add2_i64, rl, rh, al, ah, bl, bh);
+        /* Allow the optimizer room to replace add2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_add_i64(t0, al, bl);
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, al);
+        tcg_gen_add_i64(rh, ah, bh);
+        tcg_gen_add_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+static inline void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i64) {
+        tcg_gen_op6_i64(INDEX_op_sub2_i64, rl, rh, al, ah, bl, bh);
+        /* Allow the optimizer room to replace sub2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_sub_i64(t0, al, bl);
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, al, bl);
+        tcg_gen_sub_i64(rh, ah, bh);
+        tcg_gen_sub_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+/* Unsigned 64x64->128 multiply: rl gets the low half, rh the high half.  */
+static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
+                                     TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i64) {
+        tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
+        /* Allow the optimizer room to replace mulu2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        int sizemask = 0;
+        /* Return value and both arguments are 64-bit and unsigned.  */
+        sizemask |= tcg_gen_sizemask(0, 1, 0);
+        sizemask |= tcg_gen_sizemask(1, 1, 0);
+        sizemask |= tcg_gen_sizemask(2, 1, 0);
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        tcg_gen_helper64(tcg_helper_muluh_i64, sizemask, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+/* Signed 64x64->128 multiply: rl gets the low half, rh the high half.  */
+static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
+                                     TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i64) {
+        tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
+        /* Allow the optimizer room to replace muls2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_HAS_mulu2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+        TCGv_i64 t3 = tcg_temp_new_i64();
+        tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
+        /* Adjust for negative inputs.  */
+        tcg_gen_sari_i64(t2, arg1, 63);
+        tcg_gen_sari_i64(t3, arg2, 63);
+        tcg_gen_and_i64(t2, t2, arg2);
+        tcg_gen_and_i64(t3, t3, arg1);
+        tcg_gen_sub_i64(rh, t1, t2);
+        tcg_gen_sub_i64(rh, rh, t3);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_i64(t2);
+        tcg_temp_free_i64(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        int sizemask = 0;
+        /* Return value and both arguments are 64-bit and signed.  */
+        sizemask |= tcg_gen_sizemask(0, 1, 1);
+        sizemask |= tcg_gen_sizemask(1, 1, 1);
+        sizemask |= tcg_gen_sizemask(2, 1, 1);
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        tcg_gen_helper64(tcg_helper_mulsh_i64, sizemask, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
 /***************************************/
 /* QEMU specific operations. Their type depend on the QEMU CPU
    type. */
@@ -2625,6 +2843,7 @@
 #define tcg_gen_bswap32_tl tcg_gen_bswap32_i64
 #define tcg_gen_bswap64_tl tcg_gen_bswap64_i64
 #define tcg_gen_concat_tl_i64 tcg_gen_concat32_i64
+#define tcg_gen_extr_i64_tl tcg_gen_extr32_i64
 #define tcg_gen_andc_tl tcg_gen_andc_i64
 #define tcg_gen_eqv_tl tcg_gen_eqv_i64
 #define tcg_gen_nand_tl tcg_gen_nand_i64
@@ -2638,6 +2857,10 @@
 #define tcg_const_tl tcg_const_i64
 #define tcg_const_local_tl tcg_const_local_i64
 #define tcg_gen_movcond_tl tcg_gen_movcond_i64
+#define tcg_gen_add2_tl tcg_gen_add2_i64
+#define tcg_gen_sub2_tl tcg_gen_sub2_i64
+#define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
+#define tcg_gen_muls2_tl tcg_gen_muls2_i64
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -2697,6 +2920,7 @@
 #define tcg_gen_bswap16_tl tcg_gen_bswap16_i32
 #define tcg_gen_bswap32_tl tcg_gen_bswap32_i32
 #define tcg_gen_concat_tl_i64 tcg_gen_concat_i32_i64
+#define tcg_gen_extr_tl_i64 tcg_gen_extr_i32_i64 /* XXX: tcg_gen_extr_i64_tl? */
 #define tcg_gen_andc_tl tcg_gen_andc_i32
 #define tcg_gen_eqv_tl tcg_gen_eqv_i32
 #define tcg_gen_nand_tl tcg_gen_nand_i32
@@ -2710,6 +2934,10 @@
 #define tcg_const_tl tcg_const_i32
 #define tcg_const_local_tl tcg_const_local_i32
 #define tcg_gen_movcond_tl tcg_gen_movcond_i32
+#define tcg_gen_add2_tl tcg_gen_add2_i32
+#define tcg_gen_sub2_tl tcg_gen_sub2_i32
+#define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
+#define tcg_gen_muls2_tl tcg_gen_muls2_i32
 #endif
 
 #if TCG_TARGET_REG_BITS == 32
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 9651063..4246e9c 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -83,10 +83,11 @@
 
 DEF(brcond_i32, 0, 2, 2, TCG_OPF_BB_END)
 
-DEF(add2_i32, 2, 4, 0, IMPL(TCG_TARGET_REG_BITS == 32))
-DEF(sub2_i32, 2, 4, 0, IMPL(TCG_TARGET_REG_BITS == 32))
+DEF(add2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_add2_i32))
+DEF(sub2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_sub2_i32))
+DEF(mulu2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_mulu2_i32))
+DEF(muls2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_muls2_i32))
 DEF(brcond2_i32, 0, 4, 2, TCG_OPF_BB_END | IMPL(TCG_TARGET_REG_BITS == 32))
-DEF(mulu2_i32, 2, 2, 0, IMPL(TCG_TARGET_REG_BITS == 32))
 DEF(setcond2_i32, 1, 4, 1, IMPL(TCG_TARGET_REG_BITS == 32))
 
 DEF(ext8s_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ext8s_i32))
@@ -158,6 +159,11 @@
 DEF(nand_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_nand_i64))
 DEF(nor_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_nor_i64))
 
+DEF(add2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_add2_i64))
+DEF(sub2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_sub2_i64))
+DEF(mulu2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulu2_i64))
+DEF(muls2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_muls2_i64))
+
 /* QEMU specific */
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
 DEF(debug_insn_start, 0, 0, 2, 0)
diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
index 5615b13..a1ebef9 100644
--- a/tcg/tcg-runtime.h
+++ b/tcg/tcg-runtime.h
@@ -12,7 +12,9 @@
 int64_t tcg_helper_sar_i64(int64_t arg1, int64_t arg2);
 int64_t tcg_helper_div_i64(int64_t arg1, int64_t arg2);
 int64_t tcg_helper_rem_i64(int64_t arg1, int64_t arg2);
+int64_t tcg_helper_mulsh_i64(int64_t arg1, int64_t arg2);
 uint64_t tcg_helper_divu_i64(uint64_t arg1, uint64_t arg2);
 uint64_t tcg_helper_remu_i64(uint64_t arg1, uint64_t arg2);
+uint64_t tcg_helper_muluh_i64(uint64_t arg1, uint64_t arg2);
 
 #endif
diff --git a/tcg/tcg.c b/tcg/tcg.c
index c8a843e..1d8265e 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1217,7 +1217,7 @@
 static void tcg_liveness_analysis(TCGContext *s)
 {
     int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops;
-    TCGOpcode op;
+    TCGOpcode op, op_new;
     TCGArg *args;
     const TCGOpDef *def;
     uint8_t *dead_temps, *mem_temps;
@@ -1324,7 +1324,17 @@
             break;
 
         case INDEX_op_add2_i32:
+            op_new = INDEX_op_add_i32;
+            goto do_addsub2;
         case INDEX_op_sub2_i32:
+            op_new = INDEX_op_sub_i32;
+            goto do_addsub2;
+        case INDEX_op_add2_i64:
+            op_new = INDEX_op_add_i64;
+            goto do_addsub2;
+        case INDEX_op_sub2_i64:
+            op_new = INDEX_op_sub_i64;
+        do_addsub2:
             args -= 6;
             nb_iargs = 4;
             nb_oargs = 2;
@@ -1337,12 +1347,7 @@
                     goto do_remove;
                 }
                 /* Create the single operation plus nop.  */
-                if (op == INDEX_op_add2_i32) {
-                    op = INDEX_op_add_i32;
-                } else {
-                    op = INDEX_op_sub_i32;
-                }
-                s->gen_opc_buf[op_index] = op;
+                s->gen_opc_buf[op_index] = op = op_new;
                 args[1] = args[2];
                 args[2] = args[4];
                 assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
@@ -1354,6 +1359,13 @@
             goto do_not_remove;
 
         case INDEX_op_mulu2_i32:
+        case INDEX_op_muls2_i32:
+            op_new = INDEX_op_mul_i32;
+            goto do_mul2;
+        case INDEX_op_mulu2_i64:
+        case INDEX_op_muls2_i64:
+            op_new = INDEX_op_mul_i64;
+        do_mul2:
             args -= 4;
             nb_iargs = 2;
             nb_oargs = 2;
@@ -1362,7 +1374,7 @@
                 if (dead_temps[args[0]] && !mem_temps[args[0]]) {
                     goto do_remove;
                 }
-                s->gen_opc_buf[op_index] = op = INDEX_op_mul_i32;
+                s->gen_opc_buf[op_index] = op = op_new;
                 args[1] = args[2];
                 args[2] = args[3];
                 assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 51c8176..b195396 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -57,8 +57,8 @@
 #error unsupported
 #endif
 
-/* Turn some undef macros into false macros.  */
 #if TCG_TARGET_REG_BITS == 32
+/* Turn some undef macros into false macros.  */
 #define TCG_TARGET_HAS_div_i64          0
 #define TCG_TARGET_HAS_div2_i64         0
 #define TCG_TARGET_HAS_rot_i64          0
@@ -80,6 +80,14 @@
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
 #define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i64        0
+/* Turn some undef macros into true macros.  */
+#define TCG_TARGET_HAS_add2_i32         1
+#define TCG_TARGET_HAS_sub2_i32         1
+#define TCG_TARGET_HAS_mulu2_i32        1
 #endif
 
 #ifndef TCG_TARGET_deposit_i32_valid
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index a832f5c..1f17576 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -76,6 +76,7 @@
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_rot_i32          1
 #define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_muls2_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_bswap16_i64      1
@@ -100,6 +101,14 @@
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_rot_i64          1
 #define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_muls2_i64        0
+
+#define TCG_TARGET_HAS_add2_i32         0
+#define TCG_TARGET_HAS_sub2_i32         0
+#define TCG_TARGET_HAS_mulu2_i32        0
+#define TCG_TARGET_HAS_add2_i64         0
+#define TCG_TARGET_HAS_sub2_i64         0
+#define TCG_TARGET_HAS_mulu2_i64        0
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
 /* Number of registers available.
diff --git a/tests/check-qjson.c b/tests/check-qjson.c
index 32ffb43..ec85a0c 100644
--- a/tests/check-qjson.c
+++ b/tests/check-qjson.c
@@ -1,8 +1,10 @@
 /*
  * Copyright IBM, Corp. 2009
+ * Copyright (c) 2013 Red Hat Inc.
  *
  * Authors:
  *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Markus Armbruster <armbru@redhat.com>
  *
  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
  * See the COPYING.LIB file in the top-level directory.
@@ -131,6 +133,667 @@
     }
 }
 
+static void utf8_string(void)
+{
+    /*
+     * FIXME Current behavior for invalid UTF-8 sequences is
+     * incorrect.  This test expects current, incorrect results.
+     * They're all marked "bug:" below, and are to be replaced by
+     * correct ones as the bugs get fixed.
+     *
+     * The JSON parser rejects some invalid sequences, but accepts
+     * others without correcting the problem.
+     *
+     * The JSON formatter replaces some invalid sequences by U+FFFF (a
+     * noncharacter), and goes wonky for others.
+     *
+     * For both directions, we should either reject all invalid
+     * sequences, or minimize overlong sequences and replace all other
+     * invalid sequences by a suitable replacement character.  A
+     * common choice for replacement is U+FFFD.
+     *
+     * Problem: we can't easily deal with embedded U+0000.  Parsing
+     * the JSON string "this \\u0000 is fun" yields "this \0 is fun",
+     * which gets misinterpreted as NUL-terminated "this ".  We should
+     * consider using overlong encoding \xC0\x80 for U+0000 ("modified
+     * UTF-8").
+     *
+     * Test cases are scraped from Markus Kuhn's UTF-8 decoder
+     * capability and stress test at
+     * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+     */
+    static const struct {
+        const char *json_in;
+        const char *utf8_out;
+        const char *json_out;   /* defaults to @json_in */
+        const char *utf8_in;    /* defaults to @utf8_out */
+    } test_cases[] = {
+        /*
+         * Bug markers used here:
+         * - bug: not corrected
+         *   JSON parser fails to correct invalid sequence(s)
+         * - bug: rejected
+         *   JSON parser rejects invalid sequence(s)
+         *   We may choose to define this as a feature
+         * - bug: want "\"...\""
+         *   JSON formatter produces incorrect result, this is the
+         *   correct one, assuming replacement character U+FFFF
+         * - bug: want "..." (no \")
+         *   JSON parser produces incorrect result, this is the
+         *   correct one, assuming replacement character U+FFFF
+         *   We may choose to reject instead of replace
+         * Not marked explicitly, but trivial to find:
+         * - JSON formatter replacing invalid sequence by \\uFFFF is a
+         *   bug if we want it to fail for invalid sequences.
+         */
+
+        /* 1  Some correct UTF-8 text */
+        {
+            /* a bit of German */
+            "\"Falsches \xC3\x9C" "ben von Xylophonmusik qu\xC3\xA4lt"
+            " jeden gr\xC3\xB6\xC3\x9F" "eren Zwerg.\"",
+            "Falsches \xC3\x9C" "ben von Xylophonmusik qu\xC3\xA4lt"
+            " jeden gr\xC3\xB6\xC3\x9F" "eren Zwerg.",
+            "\"Falsches \\u00DCben von Xylophonmusik qu\\u00E4lt"
+            " jeden gr\\u00F6\\u00DFeren Zwerg.\"",
+        },
+        {
+            /* a bit of Greek */
+            "\"\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5\"",
+            "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5",
+            "\"\\u03BA\\u1F79\\u03C3\\u03BC\\u03B5\"",
+        },
+        /* 2  Boundary condition test cases */
+        /* 2.1  First possible sequence of a certain length */
+        /* 2.1.1  1 byte U+0000 */
+        {
+            "\"\\u0000\"",
+            "",                 /* bug: want overlong "\xC0\x80" */
+            "\"\"",             /* bug: want "\"\\u0000\"" */
+        },
+        /* 2.1.2  2 bytes U+0080 */
+        {
+            "\"\xC2\x80\"",
+            "\xC2\x80",
+            "\"\\u0080\"",
+        },
+        /* 2.1.3  3 bytes U+0800 */
+        {
+            "\"\xE0\xA0\x80\"",
+            "\xE0\xA0\x80",
+            "\"\\u0800\"",
+        },
+        /* 2.1.4  4 bytes U+10000 */
+        {
+            "\"\xF0\x90\x80\x80\"",
+            "\xF0\x90\x80\x80",
+            "\"\\u0400\\uFFFF\"", /* bug: want "\"\\uD800\\uDC00\"" */
+        },
+        /* 2.1.5  5 bytes U+200000 */
+        {
+            "\"\xF8\x88\x80\x80\x80\"",
+            NULL,                        /* bug: rejected */
+            "\"\\u8200\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xF8\x88\x80\x80\x80",
+        },
+        /* 2.1.6  6 bytes U+4000000 */
+        {
+            "\"\xFC\x84\x80\x80\x80\x80\"",
+            NULL,                               /* bug: rejected */
+            "\"\\uC100\\uFFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFC\x84\x80\x80\x80\x80",
+        },
+        /* 2.2  Last possible sequence of a certain length */
+        /* 2.2.1  1 byte U+007F */
+        {
+            "\"\x7F\"",
+            "\x7F",
+            "\"\177\"",
+        },
+        /* 2.2.2  2 bytes U+07FF */
+        {
+            "\"\xDF\xBF\"",
+            "\xDF\xBF",
+            "\"\\u07FF\"",
+        },
+        /* 2.2.3  3 bytes U+FFFF */
+        {
+            "\"\xEF\xBF\xBF\"",
+            "\xEF\xBF\xBF",
+            "\"\\uFFFF\"",
+        },
+        /* 2.2.4  4 bytes U+1FFFFF */
+        {
+            "\"\xF7\xBF\xBF\xBF\"",
+            NULL,                 /* bug: rejected */
+            "\"\\u7FFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xF7\xBF\xBF\xBF",
+        },
+        /* 2.2.5  5 bytes U+3FFFFFF */
+        {
+            "\"\xFB\xBF\xBF\xBF\xBF\"",
+            NULL,                        /* bug: rejected */
+            "\"\\uBFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFB\xBF\xBF\xBF\xBF",
+        },
+        /* 2.2.6  6 bytes U+7FFFFFFF */
+        {
+            "\"\xFD\xBF\xBF\xBF\xBF\xBF\"",
+            NULL,                               /* bug: rejected */
+            "\"\\uDFFF\\uFFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFD\xBF\xBF\xBF\xBF\xBF",
+        },
+        /* 2.3  Other boundary conditions */
+        {
+            /* U+D7FF */
+            "\"\xED\x9F\xBF\"",
+            "\xED\x9F\xBF",
+            "\"\\uD7FF\"",
+        },
+        {
+            /* U+E000 */
+            "\"\xEE\x80\x80\"",
+            "\xEE\x80\x80",
+            "\"\\uE000\"",
+        },
+        {
+            /* U+FFFD */
+            "\"\xEF\xBF\xBD\"",
+            "\xEF\xBF\xBD",
+            "\"\\uFFFD\"",
+        },
+        {
+            /* U+10FFFF */
+            "\"\xF4\x8F\xBF\xBF\"",
+            "\xF4\x8F\xBF\xBF",
+            "\"\\u43FF\\uFFFF\"", /* bug: want "\"\\uDBFF\\uDFFF\"" */
+        },
+        {
+            /* U+110000 */
+            "\"\xF4\x90\x80\x80\"",
+            "\xF4\x90\x80\x80",
+            "\"\\u4400\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+        },
+        /* 3  Malformed sequences */
+        /* 3.1  Unexpected continuation bytes */
+        /* 3.1.1  First continuation byte */
+        {
+            "\"\x80\"",
+            "\x80",             /* bug: not corrected */
+            "\"\\uFFFF\"",
+        },
+        /* 3.1.2  Last continuation byte */
+        {
+            "\"\xBF\"",
+            "\xBF",             /* bug: not corrected */
+            "\"\\uFFFF\"",
+        },
+        /* 3.1.3  2 continuation bytes */
+        {
+            "\"\x80\xBF\"",
+            "\x80\xBF",         /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.4  3 continuation bytes */
+        {
+            "\"\x80\xBF\x80\"",
+            "\x80\xBF\x80",     /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.5  4 continuation bytes */
+        {
+            "\"\x80\xBF\x80\xBF\"",
+            "\x80\xBF\x80\xBF", /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.6  5 continuation bytes */
+        {
+            "\"\x80\xBF\x80\xBF\x80\"",
+            "\x80\xBF\x80\xBF\x80", /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.7  6 continuation bytes */
+        {
+            "\"\x80\xBF\x80\xBF\x80\xBF\"",
+            "\x80\xBF\x80\xBF\x80\xBF", /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.8  7 continuation bytes */
+        {
+            "\"\x80\xBF\x80\xBF\x80\xBF\x80\"",
+            "\x80\xBF\x80\xBF\x80\xBF\x80", /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"",
+        },
+        /* 3.1.9  Sequence of all 64 possible continuation bytes */
+        {
+            "\"\x80\x81\x82\x83\x84\x85\x86\x87"
+            "\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
+            "\x90\x91\x92\x93\x94\x95\x96\x97"
+            "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
+            "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7"
+            "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
+            "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
+            "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\"",
+             /* bug: not corrected */
+            "\x80\x81\x82\x83\x84\x85\x86\x87"
+            "\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
+            "\x90\x91\x92\x93\x94\x95\x96\x97"
+            "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
+            "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7"
+            "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
+            "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
+            "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
+            "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+            "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\""
+        },
+        /* 3.2  Lonely start characters */
+        /* 3.2.1  All 32 first bytes of 2-byte sequences, followed by space */
+        {
+            "\"\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 "
+            "\xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF "
+            "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 "
+            "\xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF \"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF "
+            "\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF "
+            "\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF "
+            "\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \"",
+            "\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 "
+            "\xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF "
+            "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 "
+            "\xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
+        },
+        /* 3.2.2  All 16 first bytes of 3-byte sequences, followed by space */
+        {
+            "\"\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
+            "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF \"",
+            /* bug: not corrected */
+            "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 "
+            "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
+            "\"\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF "
+            "\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \"",
+        },
+        /* 3.2.3  All 8 first bytes of 4-byte sequences, followed by space */
+        {
+            "\"\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 \"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \\uFFFF \"",
+            "\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ",
+        },
+        /* 3.2.4  All 4 first bytes of 5-byte sequences, followed by space */
+        {
+            "\"\xF8 \xF9 \xFA \xFB \"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF \\uFFFF \\uFFFF \\uFFFF \"",
+            "\xF8 \xF9 \xFA \xFB ",
+        },
+        /* 3.2.5  All 2 first bytes of 6-byte sequences, followed by space */
+        {
+            "\"\xFC \xFD \"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF \\uFFFF \"",
+            "\xFC \xFD ",
+        },
+        /* 3.3  Sequences with last continuation byte missing */
+        /* 3.3.1  2-byte sequence with last byte missing (U+0000) */
+        {
+            "\"\xC0\"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF\"",
+            "\xC0",
+        },
+        /* 3.3.2  3-byte sequence with last byte missing (U+0000) */
+        {
+            "\"\xE0\x80\"",
+            "\xE0\x80",           /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+        },
+        /* 3.3.3  4-byte sequence with last byte missing (U+0000) */
+        {
+            "\"\xF0\x80\x80\"",
+            "\xF0\x80\x80",     /* bug: not corrected */
+            "\"\\u0000\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        /* 3.3.4  5-byte sequence with last byte missing (U+0000) */
+        {
+            /* invalid */
+            "\"\xF8\x80\x80\x80\"",
+            NULL,                   /* bug: rejected */
+            "\"\\u8000\\uFFFF\"",   /* bug: want "\"\\uFFFF\"" */
+            "\xF8\x80\x80\x80",
+        },
+        /* 3.3.5  6-byte sequence with last byte missing (U+0000) */
+        {
+            "\"\xFC\x80\x80\x80\x80\"",
+            NULL,                        /* bug: rejected */
+            "\"\\uC000\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFC\x80\x80\x80\x80",
+        },
+        /* 3.3.6  2-byte sequence with last byte missing (U+07FF) */
+        {
+            "\"\xDF\"",
+            "\xDF",             /* bug: not corrected */
+            "\"\\uFFFF\"",
+        },
+        /* 3.3.7  3-byte sequence with last byte missing (U+FFFF) */
+        {
+            "\"\xEF\xBF\"",
+            "\xEF\xBF",           /* bug: not corrected */
+            "\"\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+        },
+        /* 3.3.8  4-byte sequence with last byte missing (U+1FFFFF) */
+        {
+            "\"\xF7\xBF\xBF\"",
+            NULL,               /* bug: rejected */
+            "\"\\u7FFF\"",      /* bug: want "\"\\uFFFF\"" */
+            "\xF7\xBF\xBF",
+        },
+        /* 3.3.9  5-byte sequence with last byte missing (U+3FFFFFF) */
+        {
+            "\"\xFB\xBF\xBF\xBF\"",
+            NULL,                 /* bug: rejected */
+            "\"\\uBFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFB\xBF\xBF\xBF",
+        },
+        /* 3.3.10  6-byte sequence with last byte missing (U+7FFFFFFF) */
+        {
+            "\"\xFD\xBF\xBF\xBF\xBF\"",
+            NULL,                        /* bug: rejected */
+            "\"\\uDFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFD\xBF\xBF\xBF\xBF",
+        },
+        /* 3.4  Concatenation of incomplete sequences */
+        {
+            "\"\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80"
+            "\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF\"",
+            NULL,               /* bug: rejected */
+            /* bug: want "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF"
+               "\\uFFFF\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"" */
+            "\"\\u0020\\uFFFF\\u0000\\u8000\\uFFFF\\uC000\\uFFFF\\uFFFF"
+            "\\u07EF\\uFFFF\\u7FFF\\uBFFF\\uFFFF\\uDFFF\\uFFFF\\uFFFF\"",
+            "\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80"
+            "\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF",
+        },
+        /* 3.5  Impossible bytes */
+        {
+            "\"\xFE\"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF\"",
+            "\xFE",
+        },
+        {
+            "\"\xFF\"",
+            NULL,               /* bug: rejected */
+            "\"\\uFFFF\"",
+            "\xFF",
+        },
+        {
+            "\"\xFE\xFE\xFF\xFF\"",
+            NULL,                 /* bug: rejected */
+            /* bug: want "\"\\uFFFF\\uFFFF\\uFFFF\\uFFFF\"" */
+            "\"\\uEFBF\\uFFFF\"",
+            "\xFE\xFE\xFF\xFF",
+        },
+        /* 4  Overlong sequences */
+        /* 4.1  Overlong '/' */
+        {
+            "\"\xC0\xAF\"",
+            NULL,               /* bug: rejected */
+            "\"\\u002F\"",      /* bug: want "\"/\"" */
+            "\xC0\xAF",
+        },
+        {
+            "\"\xE0\x80\xAF\"",
+            "\xE0\x80\xAF",     /* bug: not corrected */
+            "\"\\u002F\"",      /* bug: want "\"/\"" */
+        },
+        {
+            "\"\xF0\x80\x80\xAF\"",
+            "\xF0\x80\x80\xAF",  /* bug: not corrected */
+            "\"\\u0000\\uFFFF\"" /* bug: want "\"/\"" */
+        },
+        {
+            "\"\xF8\x80\x80\x80\xAF\"",
+            NULL,                        /* bug: rejected */
+            "\"\\u8000\\uFFFF\\uFFFF\"", /* bug: want "\"/\"" */
+            "\xF8\x80\x80\x80\xAF",
+        },
+        {
+            "\"\xFC\x80\x80\x80\x80\xAF\"",
+            NULL,                               /* bug: rejected */
+            "\"\\uC000\\uFFFF\\uFFFF\\uFFFF\"", /* bug: want "\"/\"" */
+            "\xFC\x80\x80\x80\x80\xAF",
+        },
+        /* 4.2  Maximum overlong sequences */
+        {
+            /* \U+007F */
+            "\"\xC1\xBF\"",
+            NULL,               /* bug: rejected */
+            "\"\\u007F\"",      /* bug: want "\"\177\"" */
+            "\xC1\xBF",
+        },
+        {
+            /* \U+07FF */
+            "\"\xE0\x9F\xBF\"",
+            "\xE0\x9F\xBF",     /* bug: not corrected */
+            "\"\\u07FF\"",
+        },
+        {
+            /* \U+FFFF */
+            "\"\xF0\x8F\xBF\xBF\"",
+            "\xF0\x8F\xBF\xBF",   /* bug: not corrected */
+            "\"\\u03FF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+1FFFFF */
+            "\"\xF8\x87\xBF\xBF\xBF\"",
+            NULL,                        /* bug: rejected */
+            "\"\\u81FF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xF8\x87\xBF\xBF\xBF",
+        },
+        {
+            /* \U+3FFFFFF */
+            "\"\xFC\x83\xBF\xBF\xBF\xBF\"",
+            NULL,                               /* bug: rejected */
+            "\"\\uC0FF\\uFFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\uFFFF\"" */
+            "\xFC\x83\xBF\xBF\xBF\xBF",
+        },
+        /* 4.3  Overlong representation of the NUL character */
+        {
+            /* \U+0000 */
+            "\"\xC0\x80\"",
+            NULL,               /* bug: rejected */
+            "\"\\u0000\"",
+            "\xC0\x80",
+        },
+        {
+            /* \U+0000 */
+            "\"\xE0\x80\x80\"",
+            "\xE0\x80\x80",     /* bug: not corrected */
+            "\"\\u0000\"",
+        },
+        {
+            /* \U+0000 */
+            "\"\xF0\x80\x80\x80\"",
+            "\xF0\x80\x80\x80",   /* bug: not corrected */
+            "\"\\u0000\\uFFFF\"", /* bug: want "\"\\u0000\"" */
+        },
+        {
+            /* \U+0000 */
+            "\"\xF8\x80\x80\x80\x80\"",
+            NULL,                        /* bug: rejected */
+            "\"\\u8000\\uFFFF\\uFFFF\"", /* bug: want "\"\\u0000\"" */
+            "\xF8\x80\x80\x80\x80",
+        },
+        {
+            /* \U+0000 */
+            "\"\xFC\x80\x80\x80\x80\x80\"",
+            NULL,                               /* bug: rejected */
+            "\"\\uC000\\uFFFF\\uFFFF\\uFFFF\"", /* bug: want "\"\\u0000\"" */
+            "\xFC\x80\x80\x80\x80\x80",
+        },
+        /* 5  Illegal code positions */
+        /* 5.1  Single UTF-16 surrogates */
+        {
+            /* \U+D800 */
+            "\"\xED\xA0\x80\"",
+            "\xED\xA0\x80",     /* bug: not corrected */
+            "\"\\uD800\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DB7F */
+            "\"\xED\xAD\xBF\"",
+            "\xED\xAD\xBF",     /* bug: not corrected */
+            "\"\\uDB7F\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DB80 */
+            "\"\xED\xAE\x80\"",
+            "\xED\xAE\x80",     /* bug: not corrected */
+            "\"\\uDB80\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DBFF */
+            "\"\xED\xAF\xBF\"",
+            "\xED\xAF\xBF",     /* bug: not corrected */
+            "\"\\uDBFF\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DC00 */
+            "\"\xED\xB0\x80\"",
+            "\xED\xB0\x80",     /* bug: not corrected */
+            "\"\\uDC00\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DF80 */
+            "\"\xED\xBE\x80\"",
+            "\xED\xBE\x80",     /* bug: not corrected */
+            "\"\\uDF80\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        {
+            /* \U+DFFF */
+            "\"\xED\xBF\xBF\"",
+            "\xED\xBF\xBF",     /* bug: not corrected */
+            "\"\\uDFFF\"",      /* bug: want "\"\\uFFFF\"" */
+        },
+        /* 5.2  Paired UTF-16 surrogates */
+        {
+            /* \U+D800\U+DC00 */
+            "\"\xED\xA0\x80\xED\xB0\x80\"",
+            "\xED\xA0\x80\xED\xB0\x80", /* bug: not corrected */
+            "\"\\uD800\\uDC00\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+D800\U+DFFF */
+            "\"\xED\xA0\x80\xED\xBF\xBF\"",
+            "\xED\xA0\x80\xED\xBF\xBF", /* bug: not corrected */
+            "\"\\uD800\\uDFFF\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DB7F\U+DC00 */
+            "\"\xED\xAD\xBF\xED\xB0\x80\"",
+            "\xED\xAD\xBF\xED\xB0\x80", /* bug: not corrected */
+            "\"\\uDB7F\\uDC00\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DB7F\U+DFFF */
+            "\"\xED\xAD\xBF\xED\xBF\xBF\"",
+            "\xED\xAD\xBF\xED\xBF\xBF", /* bug: not corrected */
+            "\"\\uDB7F\\uDFFF\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DB80\U+DC00 */
+            "\"\xED\xAE\x80\xED\xB0\x80\"",
+            "\xED\xAE\x80\xED\xB0\x80", /* bug: not corrected */
+            "\"\\uDB80\\uDC00\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DB80\U+DFFF */
+            "\"\xED\xAE\x80\xED\xBF\xBF\"",
+            "\xED\xAE\x80\xED\xBF\xBF", /* bug: not corrected */
+            "\"\\uDB80\\uDFFF\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DBFF\U+DC00 */
+            "\"\xED\xAF\xBF\xED\xB0\x80\"",
+            "\xED\xAF\xBF\xED\xB0\x80", /* bug: not corrected */
+            "\"\\uDBFF\\uDC00\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        {
+            /* \U+DBFF\U+DFFF */
+            "\"\xED\xAF\xBF\xED\xBF\xBF\"",
+            "\xED\xAF\xBF\xED\xBF\xBF", /* bug: not corrected */
+            "\"\\uDBFF\\uDFFF\"", /* bug: want "\"\\uFFFF\\uFFFF\"" */
+        },
+        /* 5.3  Other illegal code positions */
+        {
+            /* \U+FFFE */
+            "\"\xEF\xBF\xBE\"",
+            "\xEF\xBF\xBE",     /* bug: not corrected */
+            "\"\\uFFFE\"",      /* bug: not corrected */
+        },
+        {
+            /* \U+FFFF */
+            "\"\xEF\xBF\xBF\"",
+            "\xEF\xBF\xBF",     /* bug: not corrected */
+            "\"\\uFFFF\"",      /* bug: not corrected */
+        },
+        {}
+    };
+    int i;
+    QObject *obj;
+    QString *str;
+    const char *json_in, *utf8_out, *utf8_in, *json_out;
+
+    for (i = 0; test_cases[i].json_in; i++) {
+        json_in = test_cases[i].json_in;
+        utf8_out = test_cases[i].utf8_out;
+        utf8_in = test_cases[i].utf8_in ?: test_cases[i].utf8_out;
+        json_out = test_cases[i].json_out ?: test_cases[i].json_in;
+
+        obj = qobject_from_json(json_in);
+        if (utf8_out) {
+            g_assert(obj);
+            g_assert(qobject_type(obj) == QTYPE_QSTRING);
+            str = qobject_to_qstring(obj);
+            g_assert_cmpstr(qstring_get_str(str), ==, utf8_out);
+        } else {
+            g_assert(!obj);
+        }
+        qobject_decref(obj);
+
+        obj = QOBJECT(qstring_from_str(utf8_in));
+        str = qobject_to_json(obj);
+        if (json_out) {
+            g_assert(str);
+            g_assert_cmpstr(qstring_get_str(str), ==, json_out);
+        } else {
+            g_assert(!str);
+        }
+        QDECREF(str);
+        qobject_decref(obj);
+
+        /*
+         * Disabled, because json_out currently contains the crap
+         * qobject_to_json() produces.
+         * FIXME Enable once these bugs have been fixed.
+         */
+        if (0 && json_out != json_in) {
+            obj = qobject_from_json(json_out);
+            g_assert(obj);
+            g_assert(qobject_type(obj) == QTYPE_QSTRING);
+            str = qobject_to_qstring(obj);
+            g_assert_cmpstr(qstring_get_str(str), ==, utf8_out);
+        }
+    }
+}
+
 static void vararg_string(void)
 {
     int i;
@@ -748,6 +1411,7 @@
 
     g_test_add_func("/literals/string/simple", simple_string);
     g_test_add_func("/literals/string/escaped", escaped_string);
+    g_test_add_func("/literals/string/utf8", utf8_string);
     g_test_add_func("/literals/string/single_quote", single_quote_string);
     g_test_add_func("/literals/string/vararg", vararg_string);
 
diff --git a/tests/tcg/mips/mips32-dspr2/mulq_rs_w.c b/tests/tcg/mips/mips32-dspr2/mulq_rs_w.c
index 669405f..7ba633b 100644
--- a/tests/tcg/mips/mips32-dspr2/mulq_rs_w.c
+++ b/tests/tcg/mips/mips32-dspr2/mulq_rs_w.c
@@ -8,7 +8,7 @@
 
     rs = 0x80001234;
     rt = 0x80004321;
-    result = 0x80005555;
+    result = 0x7FFFAAAB;
 
     __asm
         ("mulq_rs.w %0, %1, %2\n\t"
diff --git a/tests/tcg/mips/mips32-dspr2/mulq_s_ph.c b/tests/tcg/mips/mips32-dspr2/mulq_s_ph.c
index d0f7674..00e0155 100644
--- a/tests/tcg/mips/mips32-dspr2/mulq_s_ph.c
+++ b/tests/tcg/mips/mips32-dspr2/mulq_s_ph.c
@@ -6,6 +6,21 @@
     int rd, rs, rt, dsp;
     int result, resultdsp;
 
+    rs = 0x80000000;
+    rt = 0x0ffc0000;
+    result = 0xF0040000;
+    resultdsp = 0;
+
+    __asm
+        ("mulq_s.ph %0, %2, %3\n\t"
+         "rddsp %1\n\t"
+         : "=r"(rd), "=r"(dsp)
+         : "r"(rs), "r"(rt)
+        );
+    dsp = (dsp >> 21) & 0x01;
+    assert(rd  == result);
+    assert(dsp == resultdsp);
+
     rs = 0x80001234;
     rt = 0x80004321;
     result = 0x7FFF098B;
diff --git a/tests/tcg/mips/mips32-dspr2/mulq_s_w.c b/tests/tcg/mips/mips32-dspr2/mulq_s_w.c
index df148b7..9c2be06 100644
--- a/tests/tcg/mips/mips32-dspr2/mulq_s_w.c
+++ b/tests/tcg/mips/mips32-dspr2/mulq_s_w.c
@@ -8,7 +8,7 @@
 
     rs = 0x80001234;
     rt = 0x80004321;
-    result = 0x80005555;
+    result = 0x7FFFAAAB;
 
     __asm
         ("mulq_s.w %0, %1, %2\n\t"
diff --git a/tests/tcg/test-i386.c b/tests/tcg/test-i386.c
index 6dc730d..b05572b 100644
--- a/tests/tcg/test-i386.c
+++ b/tests/tcg/test-i386.c
@@ -209,7 +209,7 @@
 #define TEST_LEA16(STR)\
 {\
     asm(".code16 ; .byte 0x67 ; leal " STR ", %0 ; .code32"\
-        : "=wq" (res)\
+        : "=r" (res)\
         : "a" (eax), "b" (ebx), "c" (ecx), "d" (edx), "S" (esi), "D" (edi));\
     printf("lea %s = %08lx\n", STR, res);\
 }
@@ -925,7 +925,7 @@
 
 void test_fenv(void)
 {
-    struct QEMU_PACKED {
+    struct __attribute__((__packed__)) {
         uint16_t fpuc;
         uint16_t dummy1;
         uint16_t fpus;
@@ -935,7 +935,7 @@
         uint32_t ignored[4];
         long double fpregs[8];
     } float_env32;
-    struct QEMU_PACKED {
+    struct __attribute__((__packed__)) {
         uint16_t fpuc;
         uint16_t fpus;
         uint16_t fptag;
@@ -1280,7 +1280,7 @@
     struct {
         uint32_t offset;
         uint16_t seg;
-    } QEMU_PACKED segoff;
+    } __attribute__((__packed__)) segoff;
 
     ldt.entry_number = 1;
     ldt.base_addr = (unsigned long)&seg_data1;
@@ -1828,7 +1828,7 @@
     printf("lock nop exception:\n");
     if (setjmp(jmp_env) == 0) {
         /* now execute an invalid instruction */
-        asm volatile(".byte 0xf0, 0x90"); /* lock nop */
+        asm volatile(".byte 0xf0, 0x90");
     }
 
     printf("INT exception:\n");
diff --git a/ui/gtk.c b/ui/gtk.c
index 5f91de4..dcce36d 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -34,7 +34,18 @@
 #define GETTEXT_PACKAGE "qemu"
 #define LOCALEDIR "po"
 
+#include "qemu-common.h"
+
+#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
+/* Work around a -Wstrict-prototypes warning in GTK headers */
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#endif
 #include <gtk/gtk.h>
+#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
+#pragma GCC diagnostic error "-Wstrict-prototypes"
+#endif
+
+
 #include <gdk/gdkkeysyms.h>
 #include <glib/gi18n.h>
 #include <locale.h>
@@ -46,7 +57,6 @@
 #include <pty.h>
 #include <math.h>
 
-#include "qemu-common.h"
 #include "ui/console.h"
 #include "sysemu/sysemu.h"
 #include "qmp-commands.h"
@@ -81,8 +91,11 @@
 
     GtkAccelGroup *accel_group;
 
-    GtkWidget *file_menu_item;
-    GtkWidget *file_menu;
+    GtkWidget *machine_menu_item;
+    GtkWidget *machine_menu;
+    GtkWidget *pause_item;
+    GtkWidget *reset_item;
+    GtkWidget *powerdown_item;
     GtkWidget *quit_item;
 
     GtkWidget *view_menu_item;
@@ -118,6 +131,8 @@
     GdkCursor *null_cursor;
     Notifier mouse_mode_notifier;
     gboolean free_scale;
+
+    bool external_pause_update;
 } GtkDisplayState;
 
 static GtkDisplayState *global_state;
@@ -161,14 +176,19 @@
     const char *status = "";
     gchar *title;
     const char *grab = "";
+    bool is_paused = !runstate_is_running();
 
     if (gd_is_grab_active(s)) {
         grab = " - Press Ctrl+Alt+G to release grab";
     }
 
-    if (!runstate_is_running()) {
-        status = " [Stopped]";
+    if (is_paused) {
+        status = " [Paused]";
     }
+    s->external_pause_update = true;
+    gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(s->pause_item),
+                                   is_paused);
+    s->external_pause_update = false;
 
     if (qemu_name) {
         title = g_strdup_printf("QEMU (%s)%s%s", qemu_name, status, grab);
@@ -585,6 +605,30 @@
 
 /** Window Menu Actions **/
 
+static void gd_menu_pause(GtkMenuItem *item, void *opaque)
+{
+    GtkDisplayState *s = opaque;
+    /* "activate" handler of s->pause_item; opaque is the GtkDisplayState. */
+    if (s->external_pause_update) {
+        return; /* flag is set while the item is synced to the run state */
+    }
+    if (runstate_is_running()) {
+        qmp_stop(NULL);
+    } else {
+        qmp_cont(NULL);
+    }
+}
+
+static void gd_menu_reset(GtkMenuItem *item, void *opaque)
+{
+    qmp_system_reset(NULL); /* "activate" handler of s->reset_item */
+}
+
+static void gd_menu_powerdown(GtkMenuItem *item, void *opaque)
+{
+    qmp_system_powerdown(NULL); /* "activate" handler of s->powerdown_item */
+}
+
 static void gd_menu_quit(GtkMenuItem *item, void *opaque)
 {
     qmp_quit(NULL);
@@ -623,7 +667,7 @@
 {
     GtkDisplayState *s = opaque;
 
-    if (gtk_check_menu_item_get_active(GTK_CHECK_MENU_ITEM(s->full_screen_item))) {
+    if (!s->full_screen) {
         gtk_notebook_set_show_tabs(GTK_NOTEBOOK(s->notebook), FALSE);
         gtk_widget_set_size_request(s->menu_bar, 0, 0);
         gtk_widget_set_size_request(s->drawing_area, -1, -1);
@@ -858,7 +902,9 @@
     const char *label;
     char buffer[32];
     char path[32];
+#if VTE_CHECK_VERSION(0, 26, 0)
     VtePty *pty;
+#endif
     GIOChannel *chan;
     GtkWidget *scrolled_window;
     GtkAdjustment *vadjustment;
@@ -891,9 +937,12 @@
     cfmakeraw(&tty);
     tcsetattr(slave_fd, TCSAFLUSH, &tty);
 
+#if VTE_CHECK_VERSION(0, 26, 0)
     pty = vte_pty_new_foreign(master_fd, NULL);
-
     vte_terminal_set_pty_object(VTE_TERMINAL(vc->terminal), pty);
+#else
+    vte_terminal_set_pty(VTE_TERMINAL(vc->terminal), master_fd);
+#endif
 
     vte_terminal_set_scrollback_lines(VTE_TERMINAL(vc->terminal), -1);
 
@@ -953,6 +1002,12 @@
     g_signal_connect(s->drawing_area, "key-release-event",
                      G_CALLBACK(gd_key_event), s);
 
+    g_signal_connect(s->pause_item, "activate",
+                     G_CALLBACK(gd_menu_pause), s);
+    g_signal_connect(s->reset_item, "activate",
+                     G_CALLBACK(gd_menu_reset), s);
+    g_signal_connect(s->powerdown_item, "activate",
+                     G_CALLBACK(gd_menu_powerdown), s);
     g_signal_connect(s->quit_item, "activate",
                      G_CALLBACK(gd_menu_quit), s);
     g_signal_connect(s->full_screen_item, "activate",
@@ -986,21 +1041,38 @@
     int i;
 
     accel_group = gtk_accel_group_new();
-    s->file_menu = gtk_menu_new();
-    gtk_menu_set_accel_group(GTK_MENU(s->file_menu), accel_group);
-    s->file_menu_item = gtk_menu_item_new_with_mnemonic(_("_File"));
+    s->machine_menu = gtk_menu_new();
+    gtk_menu_set_accel_group(GTK_MENU(s->machine_menu), accel_group);
+    s->machine_menu_item = gtk_menu_item_new_with_mnemonic(_("_Machine"));
+
+    s->pause_item = gtk_check_menu_item_new_with_mnemonic(_("_Pause"));
+    gtk_menu_append(GTK_MENU(s->machine_menu), s->pause_item);
+
+    separator = gtk_separator_menu_item_new();
+    gtk_menu_append(GTK_MENU(s->machine_menu), separator);
+
+    s->reset_item = gtk_image_menu_item_new_with_mnemonic(_("_Reset"));
+    gtk_menu_append(GTK_MENU(s->machine_menu), s->reset_item);
+
+    s->powerdown_item = gtk_image_menu_item_new_with_mnemonic(_("Power _Down"));
+    gtk_menu_append(GTK_MENU(s->machine_menu), s->powerdown_item);
+
+    separator = gtk_separator_menu_item_new();
+    gtk_menu_append(GTK_MENU(s->machine_menu), separator);
 
     s->quit_item = gtk_image_menu_item_new_from_stock(GTK_STOCK_QUIT, NULL);
     gtk_stock_lookup(GTK_STOCK_QUIT, &item);
     gtk_menu_item_set_accel_path(GTK_MENU_ITEM(s->quit_item),
-                                 "<QEMU>/File/Quit");
-    gtk_accel_map_add_entry("<QEMU>/File/Quit", item.keyval, item.modifier);
+                                 "<QEMU>/Machine/Quit");
+    gtk_accel_map_add_entry("<QEMU>/Machine/Quit", item.keyval, item.modifier);
+    gtk_menu_append(GTK_MENU(s->machine_menu), s->quit_item);
 
     s->view_menu = gtk_menu_new();
     gtk_menu_set_accel_group(GTK_MENU(s->view_menu), accel_group);
     s->view_menu_item = gtk_menu_item_new_with_mnemonic(_("_View"));
 
-    s->full_screen_item = gtk_check_menu_item_new_with_mnemonic(_("_Full Screen"));
+    s->full_screen_item =
+        gtk_image_menu_item_new_from_stock(GTK_STOCK_FULLSCREEN, NULL);
     gtk_menu_item_set_accel_path(GTK_MENU_ITEM(s->full_screen_item),
                                  "<QEMU>/View/Full Screen");
     gtk_accel_map_add_entry("<QEMU>/View/Full Screen", GDK_KEY_f, GDK_CONTROL_MASK | GDK_MOD1_MASK);
@@ -1069,9 +1141,9 @@
     gtk_window_add_accel_group(GTK_WINDOW(s->window), accel_group);
     s->accel_group = accel_group;
 
-    gtk_menu_append(GTK_MENU(s->file_menu), s->quit_item);
-    gtk_menu_item_set_submenu(GTK_MENU_ITEM(s->file_menu_item), s->file_menu);
-    gtk_menu_shell_append(GTK_MENU_SHELL(s->menu_bar), s->file_menu_item);
+    gtk_menu_item_set_submenu(GTK_MENU_ITEM(s->machine_menu_item),
+                              s->machine_menu);
+    gtk_menu_shell_append(GTK_MENU_SHELL(s->menu_bar), s->machine_menu_item);
 
     gtk_menu_item_set_submenu(GTK_MENU_ITEM(s->view_menu_item), s->view_menu);
     gtk_menu_shell_append(GTK_MENU_SHELL(s->menu_bar), s->view_menu_item);
diff --git a/user-exec.c b/user-exec.c
index c71acbc..71bd6c5 100644
--- a/user-exec.c
+++ b/user-exec.c
@@ -70,7 +70,7 @@
 #endif
     }
     env1->exception_index = -1;
-    longjmp(env1->jmp_env, 1);
+    siglongjmp(env1->jmp_env, 1);
 }
 
 /* 'pc' is the host PC at which the exception was raised. 'address' is