Merge branch 'tcg-sparc' of git://repo.or.cz/qemu/rth

* 'tcg-sparc' of git://repo.or.cz/qemu/rth:
  tcg-sparc: Preserve branch destinations during retranslation
  tcg-sparc: Fix and enable direct TB chaining.
  tcg-sparc: Add %g/%o registers to alloc_order
  tcg-sparc: Use defines for temporaries.
  tcg-sparc: Mask shift immediates to avoid illegal insns.
  tcg-sparc: Clean up cruft stemming from attempts to use global registers.
  tcg-sparc: Change AREG0 in generated code to %i0.
  tcg-sparc: Support GUEST_BASE.
  tcg-sparc: Fix qemu_ld/st to handle 32-bit host.
  tcg-sparc: Assume v9 cpu always, i.e. force v8plus in 32-bit mode.
  tcg-sparc: Don't MAP_FIXED on top of the program
  tcg-sparc: Fix ADDX opcode.
  tcg-sparc: Hack in qemu_ld/st64 for 32-bit.
  linux-user: Use memcpy in get_user/put_user.
diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index 4902450..a1d489e 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -41,6 +41,13 @@
 #define SNAN_BIT_IS_ONE		0
 #endif
 
+#if defined(TARGET_XTENSA)
+/* Define for architectures which deviate from IEEE in not supporting
+ * signaling NaNs (so all NaNs are treated as quiet).
+ */
+#define NO_SIGNALING_NANS 1
+#endif
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated half-precision NaN.
 *----------------------------------------------------------------------------*/
@@ -57,7 +64,8 @@
 *----------------------------------------------------------------------------*/
 #if defined(TARGET_SPARC)
 const float32 float32_default_nan = const_float32(0x7FFFFFFF);
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
+#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) || \
+      defined(TARGET_XTENSA)
 const float32 float32_default_nan = const_float32(0x7FC00000);
 #elif SNAN_BIT_IS_ONE
 const float32 float32_default_nan = const_float32(0x7FBFFFFF);
@@ -127,6 +135,17 @@
     uint64_t high, low;
 } commonNaNT;
 
+#ifdef NO_SIGNALING_NANS
+int float16_is_quiet_nan(float16 a_)
+{
+    return float16_is_any_nan(a_);
+}
+
+int float16_is_signaling_nan(float16 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the half-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -156,6 +175,7 @@
     return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x1FF);
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the half-precision floating point value `a' is a
@@ -217,6 +237,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float32_is_quiet_nan(float32 a_)
+{
+    return float32_is_any_nan(a_);
+}
+
+int float32_is_signaling_nan(float32 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the single-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -246,6 +277,7 @@
     return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the single-precision floating point value `a' is a
@@ -372,7 +404,7 @@
         return 1;
     }
 }
-#elif defined(TARGET_PPC)
+#elif defined(TARGET_PPC) || defined(TARGET_XTENSA)
 static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
                    flag aIsLargerSignificand)
 {
@@ -586,6 +618,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float64_is_quiet_nan(float64 a_)
+{
+    return float64_is_any_nan(a_);
+}
+
+int float64_is_signaling_nan(float64 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the double-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -619,6 +662,7 @@
         && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the double-precision floating point value `a' is a
@@ -773,6 +817,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int floatx80_is_quiet_nan(floatx80 a_)
+{
+    return floatx80_is_any_nan(a_);
+}
+
+int floatx80_is_signaling_nan(floatx80 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the extended double-precision floating-point value `a' is a
 | quiet NaN; otherwise returns 0. This slightly differs from the same
@@ -816,6 +871,7 @@
         && ( a.low == aLow );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the extended double-precision floating point value
@@ -929,6 +985,17 @@
     }
 }
 
+#ifdef NO_SIGNALING_NANS
+int float128_is_quiet_nan(float128 a_)
+{
+    return float128_is_any_nan(a_);
+}
+
+int float128_is_signaling_nan(float128 a_)
+{
+    return 0;
+}
+#else
 /*----------------------------------------------------------------------------
 | Returns 1 if the quadruple-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -964,6 +1031,7 @@
         && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
 #endif
 }
+#endif
 
 /*----------------------------------------------------------------------------
 | Returns a quiet NaN if the quadruple-precision floating point value `a' is
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..d8999b3 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -219,7 +219,7 @@
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
-    float_muladd_negate_result = 3,
+    float_muladd_negate_result = 4,
 };
 
 /*----------------------------------------------------------------------------
@@ -251,6 +251,11 @@
 int float16_is_signaling_nan( float16 );
 float16 float16_maybe_silence_nan( float16 );
 
+INLINE int float16_is_any_nan(float16 a)
+{
+    return ((float16_val(a) & ~0x8000) > 0x7c00);
+}
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated half-precision NaN.
 *----------------------------------------------------------------------------*/
diff --git a/gdbstub.c b/gdbstub.c
index a91709f..d02ec75 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -1696,6 +1696,10 @@
         GET_REG32(env->uregs[reg->targno & 0xff]);
         break;
 
+    case 4: /*f*/
+        GET_REG32(float32_val(env->fregs[reg->targno & 0x0f]));
+        break;
+
     case 8: /*a*/
         GET_REG32(env->regs[reg->targno & 0x0f]);
         break;
@@ -1736,6 +1740,10 @@
         env->uregs[reg->targno & 0xff] = tmp;
         break;
 
+    case 4: /*f*/
+        env->fregs[reg->targno & 0x0f] = make_float32(tmp);
+        break;
+
     case 8: /*a*/
         env->regs[reg->targno & 0x0f] = tmp;
         break;
diff --git a/target-xtensa/cpu.h b/target-xtensa/cpu.h
index 177094a..7348277 100644
--- a/target-xtensa/cpu.h
+++ b/target-xtensa/cpu.h
@@ -36,6 +36,7 @@
 #include "config.h"
 #include "qemu-common.h"
 #include "cpu-defs.h"
+#include "fpu/softfloat.h"
 
 #define TARGET_HAS_ICE 1
 
@@ -325,6 +326,8 @@
     uint32_t sregs[256];
     uint32_t uregs[256];
     uint32_t phys_regs[MAX_NAREG];
+    float32 fregs[16];
+    float_status fp_status;
 
     xtensa_tlb_entry itlb[7][MAX_TLB_WAY_SIZE];
     xtensa_tlb_entry dtlb[10][MAX_TLB_WAY_SIZE];
@@ -465,6 +468,8 @@
 #define XTENSA_TBFLAG_LITBASE 0x8
 #define XTENSA_TBFLAG_DEBUG 0x10
 #define XTENSA_TBFLAG_ICOUNT 0x20
+#define XTENSA_TBFLAG_CPENABLE_MASK 0x3fc0
+#define XTENSA_TBFLAG_CPENABLE_SHIFT 6
 
 static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
         target_ulong *cs_base, int *flags)
@@ -488,6 +493,9 @@
             *flags |= XTENSA_TBFLAG_ICOUNT;
         }
     }
+    if (xtensa_option_enabled(env->config, XTENSA_OPTION_COPROCESSOR)) {
+        *flags |= env->sregs[CPENABLE] << XTENSA_TBFLAG_CPENABLE_SHIFT;
+    }
 }
 
 #include "cpu-all.h"
diff --git a/target-xtensa/helper.h b/target-xtensa/helper.h
index 152fec0..4cc0088 100644
--- a/target-xtensa/helper.h
+++ b/target-xtensa/helper.h
@@ -36,4 +36,25 @@
 DEF_HELPER_3(wsr_dbreaka, void, env, i32, i32)
 DEF_HELPER_3(wsr_dbreakc, void, env, i32, i32)
 
+DEF_HELPER_2(wur_fcr, void, env, i32)
+DEF_HELPER_FLAGS_1(abs_s, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
+DEF_HELPER_FLAGS_1(neg_s, TCG_CALL_CONST | TCG_CALL_PURE, f32, f32)
+DEF_HELPER_3(add_s, f32, env, f32, f32)
+DEF_HELPER_3(sub_s, f32, env, f32, f32)
+DEF_HELPER_3(mul_s, f32, env, f32, f32)
+DEF_HELPER_4(madd_s, f32, env, f32, f32, f32)
+DEF_HELPER_4(msub_s, f32, env, f32, f32, f32)
+DEF_HELPER_FLAGS_3(ftoi, TCG_CALL_CONST | TCG_CALL_PURE, i32, f32, i32, i32)
+DEF_HELPER_FLAGS_3(ftoui, TCG_CALL_CONST | TCG_CALL_PURE, i32, f32, i32, i32)
+DEF_HELPER_3(itof, f32, env, i32, i32)
+DEF_HELPER_3(uitof, f32, env, i32, i32)
+
+DEF_HELPER_4(un_s, void, env, i32, f32, f32)
+DEF_HELPER_4(oeq_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ueq_s, void, env, i32, f32, f32)
+DEF_HELPER_4(olt_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ult_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ole_s, void, env, i32, f32, f32)
+DEF_HELPER_4(ule_s, void, env, i32, f32, f32)
+
 #include "def-helper.h"
diff --git a/target-xtensa/op_helper.c b/target-xtensa/op_helper.c
index 2659c0e..ae0c099 100644
--- a/target-xtensa/op_helper.c
+++ b/target-xtensa/op_helper.c
@@ -771,3 +771,137 @@
     }
     env->sregs[DBREAKC + i] = v;
 }
+
+void HELPER(wur_fcr)(CPUXtensaState *env, uint32_t v)
+{
+    static const int rounding_mode[] = {
+        float_round_nearest_even,
+        float_round_to_zero,
+        float_round_up,
+        float_round_down,
+    };
+
+    env->uregs[FCR] = v & 0xfffff07f;
+    set_float_rounding_mode(rounding_mode[v & 3], &env->fp_status);
+}
+
+float32 HELPER(abs_s)(float32 v)
+{
+    return float32_abs(v);
+}
+
+float32 HELPER(neg_s)(float32 v)
+{
+    return float32_chs(v);
+}
+
+float32 HELPER(add_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_add(a, b, &env->fp_status);
+}
+
+float32 HELPER(sub_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_sub(a, b, &env->fp_status);
+}
+
+float32 HELPER(mul_s)(CPUXtensaState *env, float32 a, float32 b)
+{
+    return float32_mul(a, b, &env->fp_status);
+}
+
+float32 HELPER(madd_s)(CPUXtensaState *env, float32 a, float32 b, float32 c)
+{
+    return float32_muladd(b, c, a, 0,
+            &env->fp_status);
+}
+
+float32 HELPER(msub_s)(CPUXtensaState *env, float32 a, float32 b, float32 c)
+{
+    return float32_muladd(b, c, a, float_muladd_negate_product,
+            &env->fp_status);
+}
+
+uint32_t HELPER(ftoi)(float32 v, uint32_t rounding_mode, uint32_t scale)
+{
+    float_status fp_status = {0};
+
+    set_float_rounding_mode(rounding_mode, &fp_status);
+    return float32_to_int32(
+            float32_scalbn(v, scale, &fp_status), &fp_status);
+}
+
+uint32_t HELPER(ftoui)(float32 v, uint32_t rounding_mode, uint32_t scale)
+{
+    float_status fp_status = {0};
+    float32 res;
+
+    set_float_rounding_mode(rounding_mode, &fp_status);
+
+    res = float32_scalbn(v, scale, &fp_status);
+
+    if (float32_is_neg(v) && !float32_is_any_nan(v)) {
+        return float32_to_int32(res, &fp_status);
+    } else {
+        return float32_to_uint32(res, &fp_status);
+    }
+}
+
+float32 HELPER(itof)(CPUXtensaState *env, uint32_t v, uint32_t scale)
+{
+    return float32_scalbn(int32_to_float32(v, &env->fp_status),
+            (int32_t)scale, &env->fp_status);
+}
+
+float32 HELPER(uitof)(CPUXtensaState *env, uint32_t v, uint32_t scale)
+{
+    return float32_scalbn(uint32_to_float32(v, &env->fp_status),
+            (int32_t)scale, &env->fp_status);
+}
+
+static inline void set_br(CPUXtensaState *env, bool v, uint32_t br)
+{
+    if (v) {
+        env->sregs[BR] |= br;
+    } else {
+        env->sregs[BR] &= ~br;
+    }
+}
+
+void HELPER(un_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_unordered_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(oeq_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_eq_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ueq_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v == float_relation_equal || v == float_relation_unordered, br);
+}
+
+void HELPER(olt_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_lt_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ult_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v == float_relation_less || v == float_relation_unordered, br);
+}
+
+void HELPER(ole_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    set_br(env, float32_le_quiet(a, b, &env->fp_status), br);
+}
+
+void HELPER(ule_s)(CPUXtensaState *env, uint32_t br, float32 a, float32 b)
+{
+    int v = float32_compare_quiet(a, b, &env->fp_status);
+    set_br(env, v != float_relation_greater, br);
+}
diff --git a/target-xtensa/overlay_tool.h b/target-xtensa/overlay_tool.h
index a3a5650..e395053 100644
--- a/target-xtensa/overlay_tool.h
+++ b/target-xtensa/overlay_tool.h
@@ -58,6 +58,7 @@
     XCHAL_OPTION(XCHAL_HAVE_SEXT, XTENSA_OPTION_MISC_OP_SEXT) | \
     XCHAL_OPTION(XCHAL_HAVE_CLAMPS, XTENSA_OPTION_MISC_OP_CLAMPS) | \
     XCHAL_OPTION(XCHAL_HAVE_CP, XTENSA_OPTION_COPROCESSOR) | \
+    XCHAL_OPTION(XCHAL_HAVE_BOOLEANS, XTENSA_OPTION_BOOLEAN) | \
     XCHAL_OPTION(XCHAL_HAVE_FP, XTENSA_OPTION_FP_COPROCESSOR) | \
     XCHAL_OPTION(XCHAL_HAVE_RELEASE_SYNC, XTENSA_OPTION_MP_SYNCHRO) | \
     XCHAL_OPTION(XCHAL_HAVE_S32C1I, XTENSA_OPTION_CONDITIONAL_STORE) | \
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index b6643eb..ba3ffcb 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -65,11 +65,14 @@
     bool debug;
     bool icount;
     TCGv_i32 next_icount;
+
+    unsigned cpenable;
 } DisasContext;
 
 static TCGv_ptr cpu_env;
 static TCGv_i32 cpu_pc;
 static TCGv_i32 cpu_R[16];
+static TCGv_i32 cpu_FR[16];
 static TCGv_i32 cpu_SR[256];
 static TCGv_i32 cpu_UR[256];
 
@@ -155,6 +158,12 @@
         "ar8", "ar9", "ar10", "ar11",
         "ar12", "ar13", "ar14", "ar15",
     };
+    static const char * const fregnames[] = {
+        "f0", "f1", "f2", "f3",
+        "f4", "f5", "f6", "f7",
+        "f8", "f9", "f10", "f11",
+        "f12", "f13", "f14", "f15",
+    };
     int i;
 
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
@@ -167,6 +176,12 @@
                 regnames[i]);
     }
 
+    for (i = 0; i < 16; i++) {
+        cpu_FR[i] = tcg_global_mem_new_i32(TCG_AREG0,
+                offsetof(CPUXtensaState, fregs[i]),
+                fregnames[i]);
+    }
+
     for (i = 0; i < 256; ++i) {
         if (sregnames[i]) {
             cpu_SR[i] = tcg_global_mem_new_i32(TCG_AREG0,
@@ -318,6 +333,15 @@
     }
 }
 
+static void gen_check_cpenable(DisasContext *dc, unsigned cp)
+{
+    if (option_enabled(dc, XTENSA_OPTION_COPROCESSOR) &&
+            !(dc->cpenable & (1 << cp))) {
+        gen_exception_cause(dc, COPROCESSOR0_DISABLED + cp);
+        dc->is_jmp = DISAS_UPDATE;
+    }
+}
+
 static void gen_jump_slot(DisasContext *dc, TCGv dest, int slot)
 {
     tcg_gen_mov_i32(cpu_pc, dest);
@@ -566,6 +590,13 @@
     }
 }
 
+static void gen_wsr_cpenable(DisasContext *dc, uint32_t sr, TCGv_i32 v)
+{
+    tcg_gen_andi_i32(cpu_SR[sr], v, 0xff);
+    /* This can change tb->flags, so exit tb */
+    gen_jumpi_check_loop_end(dc, -1);
+}
+
 static void gen_wsr_intset(DisasContext *dc, uint32_t sr, TCGv_i32 v)
 {
     tcg_gen_andi_i32(cpu_SR[sr], v,
@@ -668,6 +699,7 @@
         [DBREAKA + 1] = gen_wsr_dbreaka,
         [DBREAKC] = gen_wsr_dbreakc,
         [DBREAKC + 1] = gen_wsr_dbreakc,
+        [CPENABLE] = gen_wsr_cpenable,
         [INTSET] = gen_wsr_intset,
         [INTCLEAR] = gen_wsr_intclear,
         [INTENABLE] = gen_wsr_intenable,
@@ -692,6 +724,23 @@
     }
 }
 
+static void gen_wur(uint32_t ur, TCGv_i32 s)
+{
+    switch (ur) {
+    case FCR:
+        gen_helper_wur_fcr(cpu_env, s);
+        break;
+
+    case FSR:
+        tcg_gen_andi_i32(cpu_UR[ur], s, 0xffffff80);
+        break;
+
+    default:
+        tcg_gen_mov_i32(cpu_UR[ur], s);
+        break;
+    }
+}
+
 static void gen_load_store_alignment(DisasContext *dc, int shift,
         TCGv_i32 addr, bool no_hw_alignment)
 {
@@ -1761,13 +1810,11 @@
 
             case 15: /*WUR*/
                 gen_window_check1(dc, RRR_T);
-                {
-                    if (uregnames[RSR_SR]) {
-                        tcg_gen_mov_i32(cpu_UR[RSR_SR], cpu_R[RRR_T]);
-                    } else {
-                        qemu_log("WUR %d not implemented, ", RSR_SR);
-                        TBD();
-                    }
+                if (uregnames[RSR_SR]) {
+                    gen_wur(RSR_SR, cpu_R[RRR_T]);
+                } else {
+                    qemu_log("WUR %d not implemented, ", RSR_SR);
+                    TBD();
                 }
                 break;
 
@@ -1815,8 +1862,34 @@
             break;
 
         case 8: /*LSCXp*/
-            HAS_OPTION(XTENSA_OPTION_COPROCESSOR);
-            TBD();
+            switch (OP2) {
+            case 0: /*LSXf*/
+            case 1: /*LSXUf*/
+            case 4: /*SSXf*/
+            case 5: /*SSXUf*/
+                HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
+                gen_window_check2(dc, RRR_S, RRR_T);
+                gen_check_cpenable(dc, 0);
+                {
+                    TCGv_i32 addr = tcg_temp_new_i32();
+                    tcg_gen_add_i32(addr, cpu_R[RRR_S], cpu_R[RRR_T]);
+                    gen_load_store_alignment(dc, 2, addr, false);
+                    if (OP2 & 0x4) {
+                        tcg_gen_qemu_st32(cpu_FR[RRR_R], addr, dc->cring);
+                    } else {
+                        tcg_gen_qemu_ld32u(cpu_FR[RRR_R], addr, dc->cring);
+                    }
+                    if (OP2 & 0x1) {
+                        tcg_gen_mov_i32(cpu_R[RRR_S], addr);
+                    }
+                    tcg_temp_free(addr);
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         case 9: /*LSC4*/
@@ -1854,12 +1927,213 @@
 
         case 10: /*FP0*/
             HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
-            TBD();
+            switch (OP2) {
+            case 0: /*ADD.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_add_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 1: /*SUB.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_sub_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 2: /*MUL.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_mul_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 4: /*MADD.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_madd_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_R], cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 5: /*MSUB.Sf*/
+                gen_check_cpenable(dc, 0);
+                gen_helper_msub_s(cpu_FR[RRR_R], cpu_env,
+                        cpu_FR[RRR_R], cpu_FR[RRR_S], cpu_FR[RRR_T]);
+                break;
+
+            case 8: /*ROUND.Sf*/
+            case 9: /*TRUNC.Sf*/
+            case 10: /*FLOOR.Sf*/
+            case 11: /*CEIL.Sf*/
+            case 14: /*UTRUNC.Sf*/
+                gen_window_check1(dc, RRR_R);
+                gen_check_cpenable(dc, 0);
+                {
+                    static const unsigned rounding_mode_const[] = {
+                        float_round_nearest_even,
+                        float_round_to_zero,
+                        float_round_down,
+                        float_round_up,
+                        [6] = float_round_to_zero,
+                    };
+                    TCGv_i32 rounding_mode = tcg_const_i32(
+                            rounding_mode_const[OP2 & 7]);
+                    TCGv_i32 scale = tcg_const_i32(RRR_T);
+
+                    if (OP2 == 14) {
+                        gen_helper_ftoui(cpu_R[RRR_R], cpu_FR[RRR_S],
+                                rounding_mode, scale);
+                    } else {
+                        gen_helper_ftoi(cpu_R[RRR_R], cpu_FR[RRR_S],
+                                rounding_mode, scale);
+                    }
+
+                    tcg_temp_free(rounding_mode);
+                    tcg_temp_free(scale);
+                }
+                break;
+
+            case 12: /*FLOAT.Sf*/
+            case 13: /*UFLOAT.Sf*/
+                gen_window_check1(dc, RRR_S);
+                gen_check_cpenable(dc, 0);
+                {
+                    TCGv_i32 scale = tcg_const_i32(-RRR_T);
+
+                    if (OP2 == 13) {
+                        gen_helper_uitof(cpu_FR[RRR_R], cpu_env,
+                                cpu_R[RRR_S], scale);
+                    } else {
+                        gen_helper_itof(cpu_FR[RRR_R], cpu_env,
+                                cpu_R[RRR_S], scale);
+                    }
+                    tcg_temp_free(scale);
+                }
+                break;
+
+            case 15: /*FP1OP*/
+                switch (RRR_T) {
+                case 0: /*MOV.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 1: /*ABS.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    gen_helper_abs_s(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 4: /*RFRf*/
+                    gen_window_check1(dc, RRR_R);
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_R[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                case 5: /*WFRf*/
+                    gen_window_check1(dc, RRR_S);
+                    gen_check_cpenable(dc, 0);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_R[RRR_S]);
+                    break;
+
+                case 6: /*NEG.Sf*/
+                    gen_check_cpenable(dc, 0);
+                    gen_helper_neg_s(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    break;
+
+                default: /*reserved*/
+                    RESERVED();
+                    break;
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         case 11: /*FP1*/
             HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
-            TBD();
+
+#define gen_compare(rel, br, a, b) \
+    do { \
+        TCGv_i32 bit = tcg_const_i32(1 << br); \
+        \
+        gen_check_cpenable(dc, 0); \
+        gen_helper_##rel(cpu_env, bit, cpu_FR[a], cpu_FR[b]); \
+        tcg_temp_free(bit); \
+    } while (0)
+
+            switch (OP2) {
+            case 1: /*UN.Sf*/
+                gen_compare(un_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 2: /*OEQ.Sf*/
+                gen_compare(oeq_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 3: /*UEQ.Sf*/
+                gen_compare(ueq_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 4: /*OLT.Sf*/
+                gen_compare(olt_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 5: /*ULT.Sf*/
+                gen_compare(ult_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 6: /*OLE.Sf*/
+                gen_compare(ole_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+            case 7: /*ULE.Sf*/
+                gen_compare(ule_s, RRR_R, RRR_S, RRR_T);
+                break;
+
+#undef gen_compare
+
+            case 8: /*MOVEQZ.Sf*/
+            case 9: /*MOVNEZ.Sf*/
+            case 10: /*MOVLTZ.Sf*/
+            case 11: /*MOVGEZ.Sf*/
+                gen_window_check1(dc, RRR_T);
+                gen_check_cpenable(dc, 0);
+                {
+                    static const TCGCond cond[] = {
+                        TCG_COND_NE,
+                        TCG_COND_EQ,
+                        TCG_COND_GE,
+                        TCG_COND_LT
+                    };
+                    int label = gen_new_label();
+                    tcg_gen_brcondi_i32(cond[OP2 - 8], cpu_R[RRR_T], 0, label);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    gen_set_label(label);
+                }
+                break;
+
+            case 12: /*MOVF.Sf*/
+            case 13: /*MOVT.Sf*/
+                HAS_OPTION(XTENSA_OPTION_BOOLEAN);
+                gen_check_cpenable(dc, 0);
+                {
+                    int label = gen_new_label();
+                    TCGv_i32 tmp = tcg_temp_new_i32();
+
+                    tcg_gen_andi_i32(tmp, cpu_SR[BR], 1 << RRR_T);
+                    tcg_gen_brcondi_i32(
+                            OP2 & 1 ? TCG_COND_EQ : TCG_COND_NE,
+                            tmp, 0, label);
+                    tcg_gen_mov_i32(cpu_FR[RRR_R], cpu_FR[RRR_S]);
+                    gen_set_label(label);
+                    tcg_temp_free(tmp);
+                }
+                break;
+
+            default: /*reserved*/
+                RESERVED();
+                break;
+            }
             break;
 
         default: /*reserved*/
@@ -2090,8 +2364,34 @@
         break;
 
     case 3: /*LSCIp*/
-        HAS_OPTION(XTENSA_OPTION_COPROCESSOR);
-        TBD();
+        switch (RRI8_R) {
+        case 0: /*LSIf*/
+        case 4: /*SSIf*/
+        case 8: /*LSIUf*/
+        case 12: /*SSIUf*/
+            HAS_OPTION(XTENSA_OPTION_FP_COPROCESSOR);
+            gen_window_check1(dc, RRI8_S);
+            gen_check_cpenable(dc, 0);
+            {
+                TCGv_i32 addr = tcg_temp_new_i32();
+                tcg_gen_addi_i32(addr, cpu_R[RRI8_S], RRI8_IMM8 << 2);
+                gen_load_store_alignment(dc, 2, addr, false);
+                if (RRI8_R & 0x4) {
+                    tcg_gen_qemu_st32(cpu_FR[RRI8_T], addr, dc->cring);
+                } else {
+                    tcg_gen_qemu_ld32u(cpu_FR[RRI8_T], addr, dc->cring);
+                }
+                if (RRI8_R & 0x8) {
+                    tcg_gen_mov_i32(cpu_R[RRI8_S], addr);
+                }
+                tcg_temp_free(addr);
+            }
+            break;
+
+        default: /*reserved*/
+            RESERVED();
+            break;
+        }
         break;
 
     case 4: /*MAC16d*/
@@ -2589,6 +2889,8 @@
     dc.ccount_delta = 0;
     dc.debug = tb->flags & XTENSA_TBFLAG_DEBUG;
     dc.icount = tb->flags & XTENSA_TBFLAG_ICOUNT;
+    dc.cpenable = (tb->flags & XTENSA_TBFLAG_CPENABLE_MASK) >>
+        XTENSA_TBFLAG_CPENABLE_SHIFT;
 
     init_litbase(&dc);
     init_sar_tracker(&dc);
@@ -2730,6 +3032,16 @@
         cpu_fprintf(f, "AR%02d=%08x%c", i, env->phys_regs[i],
                 (i % 4) == 3 ? '\n' : ' ');
     }
+
+    if (xtensa_option_enabled(env->config, XTENSA_OPTION_FP_COPROCESSOR)) {
+        cpu_fprintf(f, "\n");
+
+        for (i = 0; i < 16; ++i) {
+            cpu_fprintf(f, "F%02d=%08x (%+10.8e)%c", i,
+                    float32_val(env->fregs[i]),
+                    *(float *)&env->fregs[i], (i % 2) == 1 ? '\n' : ' ');
+        }
+    }
 }
 
 void restore_state_to_opc(CPUXtensaState *env, TranslationBlock *tb, int pc_pos)
diff --git a/tcg/README b/tcg/README
index d03ae05..33783ee 100644
--- a/tcg/README
+++ b/tcg/README
@@ -392,7 +392,8 @@
 
 Exit the current TB and jump to the TB index 'index' (constant) if the
 current TB was linked to this TB. Otherwise execute the next
-instructions.
+instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
+at most once with each slot index per TB.
 
 * qemu_ld8u t0, t1, flags
 qemu_ld8s t0, t1, flags
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index aed3b53..2bad0a2 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -145,12 +145,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/hppa/tcg-target.c b/tcg/hppa/tcg-target.c
index 5385d45..2c79c10 100644
--- a/tcg/hppa/tcg-target.c
+++ b/tcg/hppa/tcg-target.c
@@ -175,12 +175,6 @@
     *insn_ptr = insn;
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -912,6 +906,18 @@
     tcg_out_mov(s, TCG_TYPE_I32, ret, scratch);
 }
 
+static void tcg_out_movcond(TCGContext *s, int cond, TCGArg ret,
+                            TCGArg c1, TCGArg c2, int c2const,
+                            TCGArg v1, int v1const)
+{
+    tcg_out_comclr(s, tcg_invert_cond(cond), TCG_REG_R0, c1, c2, c2const);
+    if (v1const) {
+        tcg_out_movi(s, TCG_TYPE_I32, ret, v1);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_I32, ret, v1);
+    }
+}
+
 #if defined(CONFIG_SOFTMMU)
 #include "../../softmmu_defs.h"
 
@@ -1520,6 +1526,11 @@
                          args[3], const_args[3], args[4], const_args[4]);
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2], const_args[2],
+                        args[3], const_args[3]);
+        break;
+
     case INDEX_op_add2_i32:
         tcg_out_add2(s, args[0], args[1], args[2], args[3],
                      args[4], args[5], const_args[4]);
@@ -1628,6 +1639,10 @@
     { INDEX_op_setcond_i32, { "r", "rZ", "rI" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rI", "rI" } },
 
+    /* ??? We can actually support a signed 14-bit arg3, but we
+       only have existing constraints for a signed 11-bit.  */
+    { INDEX_op_movcond_i32, { "r", "rZ", "rI", "rI", "0" } },
+
     { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rI", "rZ" } },
     { INDEX_op_sub2_i32, { "r", "r", "rI", "rZ", "rK", "rZ" } },
 
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index 4defd28..5351353 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -96,7 +96,7 @@
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
-#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i32      1
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index aa1fa9f..122d636 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -75,9 +75,7 @@
     TCG_REG_R8,
     TCG_REG_R9,
 #else
-    TCG_REG_EAX,
-    TCG_REG_EDX,
-    TCG_REG_ECX
+    /* 32-bit mode uses a stack-based calling convention (GCC default). */
 #endif
 };
 
@@ -88,6 +86,18 @@
 #endif
 };
 
+/* Registers used with L constraint, which are the first argument
+   registers on x86_64, and two arbitrary call-clobbered registers on
+   i386. */
+#if TCG_TARGET_REG_BITS == 64
+# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
+# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
+# define TCG_REG_L2 tcg_target_call_iarg_regs[2]
+#else
+# define TCG_REG_L0 TCG_REG_EAX
+# define TCG_REG_L1 TCG_REG_EDX
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -114,16 +124,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    if (TCG_TARGET_REG_BITS == 64) {
-        return 6;
-    }
-
-    return 0;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -179,16 +179,16 @@
         /* qemu_ld/st address constraint */
     case 'L':
         ct->ct |= TCG_CT_REG;
-        if (TCG_TARGET_REG_BITS == 64) {
+#if TCG_TARGET_REG_BITS == 64
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[0]);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[1]);
-            tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[2]);
-        } else {
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L2);
+#else
             tcg_regset_set32(ct->u.regs, 0, 0xff);
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_EAX);
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_EDX);
-        }
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
+            tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
+#endif
         break;
 
     case 'e':
@@ -1029,8 +1029,8 @@
                                     uint8_t **label_ptr, int which)
 {
     const int addrlo = args[addrlo_idx];
-    const int r0 = tcg_target_call_iarg_regs[0];
-    const int r1 = tcg_target_call_iarg_regs[1];
+    const int r0 = TCG_REG_L0;
+    const int r1 = TCG_REG_L1;
     TCGType type = TCG_TYPE_I32;
     int rexw = 0;
 
@@ -1192,8 +1192,7 @@
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+    tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1226,14 +1225,10 @@
     tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
                  mem_index);
     /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
 #endif
 
     tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
@@ -1299,11 +1294,9 @@
                use the ADDR32 prefix.  For now, do nothing.  */
 
             if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64,
-                             tcg_target_call_iarg_regs[0], GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW,
-                            tcg_target_call_iarg_regs[0], base);
-                base = tcg_target_call_iarg_regs[0];
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
                 offset = 0;
             }
         }
@@ -1324,8 +1317,8 @@
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
-       means that the second argument reg is definitely free here.  */
-    int scratch = tcg_target_call_iarg_regs[1];
+       means that TCG_REG_L1 is definitely free here.  */
+    const int scratch = TCG_REG_L1;
 
     switch (sizeop) {
     case 0:
@@ -1398,8 +1391,7 @@
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
-    tcg_out_qemu_st_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+    tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1434,18 +1426,14 @@
     stack_adjust += 4;
 #else
     tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                tcg_target_call_iarg_regs[1], data_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+                TCG_REG_L1, data_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_L2, mem_index);
     stack_adjust = 0;
     /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
 #endif
 
     tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
@@ -1472,11 +1460,9 @@
                use the ADDR32 prefix.  For now, do nothing.  */
 
             if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64,
-                             tcg_target_call_iarg_regs[0], GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW,
-                            tcg_target_call_iarg_regs[0], base);
-                base = tcg_target_call_iarg_regs[0];
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
                 offset = 0;
             }
         }
@@ -2061,15 +2047,17 @@
 #if TCG_TARGET_REG_BITS == 32
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
-    tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[1], TCG_REG_ESP,
-               (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4);
+    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+    /* jmp *tb.  */
+    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
+		         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
+			 + stack_addend);
 #else
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
-#endif
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
-
     /* jmp *tb.  */
     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
+#endif
 
     /* TB epilogue */
     tb_ret_addr = s->code_ptr;
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b356d76..ace63ba 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -67,7 +67,11 @@
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
 #define TCG_TARGET_STACK_ALIGN 16
+#if defined(_WIN64)
+#define TCG_TARGET_CALL_STACK_OFFSET 32
+#else
 #define TCG_TARGET_CALL_STACK_OFFSET 0
+#endif
 
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 1745038..dc9c12c 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -176,12 +176,6 @@
     TCG_REG_R8
 };
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 8;
-}
-
 /*
  * opcode formation
  */
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 74db83d..f70910a 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -68,7 +68,7 @@
 #endif
 
 /* check if we really need so many registers :P */
-static const int tcg_target_reg_alloc_order[] = {
+static const TCGReg tcg_target_reg_alloc_order[] = {
     TCG_REG_S0,
     TCG_REG_S1,
     TCG_REG_S2,
@@ -94,14 +94,14 @@
     TCG_REG_V1
 };
 
-static const int tcg_target_call_iarg_regs[4] = {
+static const TCGReg tcg_target_call_iarg_regs[4] = {
     TCG_REG_A0,
     TCG_REG_A1,
     TCG_REG_A2,
     TCG_REG_A3
 };
 
-static const int tcg_target_call_oarg_regs[2] = {
+static const TCGReg tcg_target_call_oarg_regs[2] = {
     TCG_REG_V0,
     TCG_REG_V1
 };
@@ -185,12 +185,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 4;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -278,6 +272,8 @@
 enum {
     OPC_BEQ      = 0x04 << 26,
     OPC_BNE      = 0x05 << 26,
+    OPC_BLEZ     = 0x06 << 26,
+    OPC_BGTZ     = 0x07 << 26,
     OPC_ADDIU    = 0x09 << 26,
     OPC_SLTI     = 0x0A << 26,
     OPC_SLTIU    = 0x0B << 26,
@@ -298,12 +294,16 @@
     OPC_SPECIAL  = 0x00 << 26,
     OPC_SLL      = OPC_SPECIAL | 0x00,
     OPC_SRL      = OPC_SPECIAL | 0x02,
+    OPC_ROTR     = OPC_SPECIAL | (0x01 << 21) | 0x02,
     OPC_SRA      = OPC_SPECIAL | 0x03,
     OPC_SLLV     = OPC_SPECIAL | 0x04,
     OPC_SRLV     = OPC_SPECIAL | 0x06,
+    OPC_ROTRV    = OPC_SPECIAL | (0x01 <<  6) | 0x06,
     OPC_SRAV     = OPC_SPECIAL | 0x07,
     OPC_JR       = OPC_SPECIAL | 0x08,
     OPC_JALR     = OPC_SPECIAL | 0x09,
+    OPC_MOVZ     = OPC_SPECIAL | 0x0A,
+    OPC_MOVN     = OPC_SPECIAL | 0x0B,
     OPC_MFHI     = OPC_SPECIAL | 0x10,
     OPC_MFLO     = OPC_SPECIAL | 0x12,
     OPC_MULT     = OPC_SPECIAL | 0x18,
@@ -319,7 +319,13 @@
     OPC_SLT      = OPC_SPECIAL | 0x2A,
     OPC_SLTU     = OPC_SPECIAL | 0x2B,
 
+    OPC_REGIMM   = 0x01 << 26,
+    OPC_BLTZ     = OPC_REGIMM | (0x00 << 16),
+    OPC_BGEZ     = OPC_REGIMM | (0x01 << 16),
+
     OPC_SPECIAL3 = 0x1f << 26,
+    OPC_INS      = OPC_SPECIAL3 | 0x004,
+    OPC_WSBH     = OPC_SPECIAL3 | 0x0a0,
     OPC_SEB      = OPC_SPECIAL3 | 0x420,
     OPC_SEH      = OPC_SPECIAL3 | 0x620,
 };
@@ -327,7 +333,8 @@
 /*
  * Type reg
  */
-static inline void tcg_out_opc_reg(TCGContext *s, int opc, int rd, int rs, int rt)
+static inline void tcg_out_opc_reg(TCGContext *s, int opc,
+                                   TCGReg rd, TCGReg rs, TCGReg rt)
 {
     int32_t inst;
 
@@ -341,7 +348,8 @@
 /*
  * Type immediate
  */
-static inline void tcg_out_opc_imm(TCGContext *s, int opc, int rt, int rs, int imm)
+static inline void tcg_out_opc_imm(TCGContext *s, int opc,
+                                   TCGReg rt, TCGReg rs, TCGArg imm)
 {
     int32_t inst;
 
@@ -355,7 +363,8 @@
 /*
  * Type branch
  */
-static inline void tcg_out_opc_br(TCGContext *s, int opc, int rt, int rs)
+static inline void tcg_out_opc_br(TCGContext *s, int opc,
+                                  TCGReg rt, TCGReg rs)
 {
     /* We pay attention here to not modify the branch target by reading
        the existing value and using it again. This ensure that caches and
@@ -368,7 +377,8 @@
 /*
  * Type sa
  */
-static inline void tcg_out_opc_sa(TCGContext *s, int opc, int rd, int rt, int sa)
+static inline void tcg_out_opc_sa(TCGContext *s, int opc,
+                                  TCGReg rd, TCGReg rt, TCGArg sa)
 {
     int32_t inst;
 
@@ -407,38 +417,47 @@
     }
 }
 
-static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0x00ff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, ret, ret, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap16s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_reg(s, OPC_SEH, ret, 0, ret);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
     tcg_out_opc_sa(s, OPC_SRA, ret, ret, 16);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_sa(s, OPC_ROTR, ret, ret, 16);
+#else
     /* ret and arg must be different and can't be register at */
     if (ret == arg || ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
@@ -456,9 +475,10 @@
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_ext8s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_ext8s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
 #ifdef _MIPS_ARCH_MIPS32R2
     tcg_out_opc_reg(s, OPC_SEB, ret, 0, arg);
@@ -468,7 +488,7 @@
 #endif
 }
 
-static inline void tcg_out_ext16s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_ext16s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
 #ifdef _MIPS_ARCH_MIPS32R2
     tcg_out_opc_reg(s, OPC_SEH, ret, 0, arg);
@@ -478,8 +498,8 @@
 #endif
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int opc, int arg,
-                              int arg1, tcg_target_long arg2)
+static inline void tcg_out_ldst(TCGContext *s, int opc, TCGArg arg,
+                                TCGReg arg1, TCGArg arg2)
 {
     if (arg2 == (int16_t) arg2) {
         tcg_out_opc_imm(s, opc, arg, arg1, arg2);
@@ -502,7 +522,7 @@
     tcg_out_ldst(s, OPC_SW, arg, arg1, arg2);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
+static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val)
 {
     if (val == (int16_t)val) {
         tcg_out_opc_imm(s, OPC_ADDIU, reg, reg, val);
@@ -543,7 +563,7 @@
 #undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
 #define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
     tcg_out_movi(s, TCG_TYPE_I32, A, arg);
-DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, uint32_t arg)
+DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, TCGArg arg)
 #undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
 
 /* We don't use the macro for this one to avoid an unnecessary reg-reg
@@ -573,8 +593,8 @@
 #endif
 }
 
-static void tcg_out_brcond(TCGContext *s, TCGCond cond, int arg1,
-                           int arg2, int label_index)
+static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGArg arg1,
+                           TCGArg arg2, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
@@ -586,32 +606,48 @@
         tcg_out_opc_br(s, OPC_BNE, arg1, arg2);
         break;
     case TCG_COND_LT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
         tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
         tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_LE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
         tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
@@ -631,8 +667,9 @@
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_brcond2(TCGContext *s, TCGCond cond, int arg1,
-                            int arg2, int arg3, int arg4, int label_index)
+static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGArg arg1,
+                            TCGArg arg2, TCGArg arg3, TCGArg arg4,
+                            int label_index)
 {
     void *label_ptr;
 
@@ -694,8 +731,70 @@
     reloc_pc16(label_ptr, (tcg_target_long) s->code_ptr);
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, int ret,
-                            int arg1, int arg2)
+static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGArg c1, TCGArg c2, TCGArg v)
+{
+    switch (cond) {
+    case TCG_COND_EQ:
+        if (c1 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c2);
+        } else if (c2 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, c1);
+        } else {
+            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
+            tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        }
+        break;
+    case TCG_COND_NE:
+        if (c1 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c2);
+        } else if (c2 == 0) {
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, c1);
+        } else {
+            tcg_out_opc_reg(s, OPC_XOR, TCG_REG_AT, c1, c2);
+            tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        }
+        break;
+    case TCG_COND_LT:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LTU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GE:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GEU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c1, c2);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LE:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_LEU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVZ, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GT:
+        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    case TCG_COND_GTU:
+        tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, c2, c1);
+        tcg_out_opc_reg(s, OPC_MOVN, ret, v, TCG_REG_AT);
+        break;
+    default:
+        tcg_abort();
+        break;
+    }
+}
+
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGArg arg1, TCGArg arg2)
 {
     switch (cond) {
     case TCG_COND_EQ:
@@ -754,8 +853,8 @@
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_setcond2(TCGContext *s, TCGCond cond, int ret,
-                             int arg1, int arg2, int arg3, int arg4)
+static void tcg_out_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
+                             TCGArg arg1, TCGArg arg2, TCGArg arg3, TCGArg arg4)
 {
     switch (cond) {
     case TCG_COND_EQ:
@@ -842,18 +941,17 @@
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     void *label1_ptr, *label2_ptr;
     int arg_num;
-#endif
-#if TARGET_LONG_BITS == 64
-# if defined(CONFIG_SOFTMMU)
+    int mem_index, s_bits;
+    int addr_meml;
+# if TARGET_LONG_BITS == 64
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_memh;
 #endif
     data_regl = *args++;
     if (opc == 3)
@@ -861,11 +959,22 @@
     else
         data_regh = 0;
     addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
     addr_regh = *args++;
-#endif
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    addr_memh = 0;
+    addr_meml = 4;
+#  else
+    addr_memh = 4;
+    addr_meml = 0;
+#  endif
+# else
+    addr_meml = 0;
+# endif
     mem_index = *args;
     s_bits = opc & 3;
+#endif
 
     if (opc == 3) {
 #if defined(TCG_TARGET_WORDS_BIGENDIAN)
@@ -879,18 +988,6 @@
         data_reg1 = data_regl;
         data_reg2 = 0;
     }
-#if TARGET_LONG_BITS == 64
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-# else
-    addr_memh = 4;
-    addr_meml = 0;
-# endif
-#else
-    addr_meml = 0;
-#endif
-
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
@@ -1029,23 +1126,45 @@
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     uint8_t *label1_ptr, *label2_ptr;
     int arg_num;
+    int mem_index, s_bits;
+    int addr_meml;
 #endif
 #if TARGET_LONG_BITS == 64
 # if defined(CONFIG_SOFTMMU)
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_memh;
 #endif
-
     data_regl = *args++;
     if (opc == 3) {
         data_regh = *args++;
+    } else {
+        data_regh = 0;
+    }
+    addr_regl = *args++;
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
+    addr_regh = *args++;
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    addr_memh = 0;
+    addr_meml = 4;
+#  else
+    addr_memh = 4;
+    addr_meml = 0;
+#  endif
+# else
+    addr_meml = 0;
+# endif
+    mem_index = *args;
+    s_bits = opc;
+#endif
+
+    if (opc == 3) {
 #if defined(TCG_TARGET_WORDS_BIGENDIAN)
         data_reg1 = data_regh;
         data_reg2 = data_regl;
@@ -1056,23 +1175,7 @@
     } else {
         data_reg1 = data_regl;
         data_reg2 = 0;
-        data_regh = 0;
     }
-    addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
-    addr_regh = *args++;
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_memh = 0;
-    addr_meml = 4;
-# else
-    addr_memh = 4;
-    addr_meml = 0;
-# endif
-#else
-    addr_meml = 0;
-#endif
-    mem_index = *args;
-    s_bits = opc;
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
@@ -1157,7 +1260,8 @@
         break;
     case 1:
         if (TCG_NEED_BSWAP) {
-            tcg_out_bswap16(s, TCG_REG_T0, data_reg1);
+            tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_T0, data_reg1, 0xffff);
+            tcg_out_bswap16(s, TCG_REG_T0, TCG_REG_T0);
             tcg_out_opc_imm(s, OPC_SH, TCG_REG_T0, TCG_REG_A0, 0);
         } else {
             tcg_out_opc_imm(s, OPC_SH, data_reg1, TCG_REG_A0, 0);
@@ -1377,6 +1481,31 @@
             tcg_out_opc_reg(s, OPC_SRLV, args[0], args[2], args[1]);
         }
         break;
+    case INDEX_op_rotl_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], 0x20 - args[2]);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, 32);
+            tcg_out_opc_reg(s, OPC_SUBU, TCG_REG_AT, TCG_REG_AT, args[2]);
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], TCG_REG_AT, args[1]);
+        }
+        break;
+    case INDEX_op_rotr_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], args[2]);
+        } else {
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], args[2], args[1]);
+        }
+        break;
+
+    /* The bswap routines do not work on non-R2 CPUs. In that case
+       we let TCG generate the corresponding code. */
+    case INDEX_op_bswap16_i32:
+        tcg_out_bswap16(s, args[0], args[1]);
+        break;
+    case INDEX_op_bswap32_i32:
+        tcg_out_bswap32(s, args[0], args[1]);
+        break;
 
     case INDEX_op_ext8s_i32:
         tcg_out_ext8s(s, args[0], args[1]);
@@ -1385,6 +1514,11 @@
         tcg_out_ext16s(s, args[0], args[1]);
         break;
 
+    case INDEX_op_deposit_i32:
+        tcg_out_opc_imm(s, OPC_INS, args[0], args[2],
+                        ((args[3] + args[4] - 1) << 11) | (args[3] << 6));
+        break;
+
     case INDEX_op_brcond_i32:
         tcg_out_brcond(s, args[2], args[0], args[1], args[3]);
         break;
@@ -1392,6 +1526,10 @@
         tcg_out_brcond2(s, args[4], args[0], args[1], args[2], args[3], args[5]);
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args[5], args[0], args[1], args[2], args[3]);
+        break;
+
     case INDEX_op_setcond_i32:
         tcg_out_setcond(s, args[3], args[0], args[1], args[2]);
         break;
@@ -1453,34 +1591,42 @@
     { INDEX_op_st16_i32, { "rZ", "r" } },
     { INDEX_op_st_i32, { "rZ", "r" } },
 
-    { INDEX_op_add_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_add_i32, { "r", "rZ", "rJ" } },
     { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rZ" } },
     { INDEX_op_div_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_remu_i32, { "r", "rZ", "rZ" } },
-    { INDEX_op_sub_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_sub_i32, { "r", "rZ", "rJ" } },
 
-    { INDEX_op_and_i32, { "r", "rZ", "rIZ" } },
+    { INDEX_op_and_i32, { "r", "rZ", "rI" } },
     { INDEX_op_nor_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_not_i32, { "r", "rZ" } },
     { INDEX_op_or_i32, { "r", "rZ", "rIZ" } },
     { INDEX_op_xor_i32, { "r", "rZ", "rIZ" } },
 
-    { INDEX_op_shl_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_shr_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_sar_i32, { "r", "rZ", "riZ" } },
+    { INDEX_op_shl_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_shr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_sar_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotl_i32, { "r", "rZ", "ri" } },
+
+    { INDEX_op_bswap16_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "r", "r" } },
 
     { INDEX_op_ext8s_i32, { "r", "rZ" } },
     { INDEX_op_ext16s_i32, { "r", "rZ" } },
 
+    { INDEX_op_deposit_i32, { "r", "0", "rZ" } },
+
     { INDEX_op_brcond_i32, { "rZ", "rZ" } },
+    { INDEX_op_movcond_i32, { "r", "rZ", "rZ", "rZ", "0" } },
     { INDEX_op_setcond_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rZ", "rZ" } },
 
-    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
-    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
+    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
+    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_brcond2_i32, { "rZ", "rZ", "rZ", "rZ" } },
 
 #if TARGET_LONG_BITS == 32
@@ -1520,7 +1666,6 @@
     TCG_REG_S5,
     TCG_REG_S6,
     TCG_REG_S7,
-    TCG_REG_GP,
     TCG_REG_FP,
     TCG_REG_RA,       /* should be last for ABI compliance */
 };
@@ -1530,11 +1675,15 @@
 {
     int i, frame_size;
 
-    /* reserve some stack space */
+    /* reserve some stack space, also for TCG temps. */
     frame_size = ARRAY_SIZE(tcg_target_callee_save_regs) * 4
-                 + TCG_STATIC_CALL_ARGS_SIZE;
+                 + TCG_STATIC_CALL_ARGS_SIZE
+                 + CPU_TEMP_BUF_NLONGS * sizeof(long);
     frame_size = (frame_size + TCG_TARGET_STACK_ALIGN - 1) &
                  ~(TCG_TARGET_STACK_ALIGN - 1);
+    tcg_set_frame(s, TCG_REG_SP, ARRAY_SIZE(tcg_target_callee_save_regs) * 4
+                  + TCG_STATIC_CALL_ARGS_SIZE,
+                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 
     /* TB prologue */
     tcg_out_addi(s, TCG_REG_SP, -frame_size);
@@ -1586,8 +1735,7 @@
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_T0);   /* internal use */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);   /* return address */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);   /* stack pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP);   /* global pointer */
 
     tcg_add_target_add_op_defs(mips_op_defs);
-    tcg_set_frame(s, TCG_AREG0, offsetof(CPUArchState, temp_buf),
-                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 }
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 9c68a32..d147e70 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -80,17 +80,34 @@
 #define TCG_TARGET_HAS_div_i32          1
 #define TCG_TARGET_HAS_not_i32          1
 #define TCG_TARGET_HAS_nor_i32          1
-#define TCG_TARGET_HAS_rot_i32          0
 #define TCG_TARGET_HAS_ext8s_i32        1
 #define TCG_TARGET_HAS_ext16s_i32       1
-#define TCG_TARGET_HAS_bswap32_i32      0
-#define TCG_TARGET_HAS_bswap16_i32      0
 #define TCG_TARGET_HAS_andc_i32         0
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
-#define TCG_TARGET_HAS_deposit_i32      0
+
+/* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
+#if defined(_MIPS_ARCH_MIPS4) || defined(_MIPS_ARCH_MIPS32) || \
+    defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_LOONGSON2E) || \
+    defined(_MIPS_ARCH_LOONGSON2F)
+#define TCG_TARGET_HAS_movcond_i32      1
+#else
 #define TCG_TARGET_HAS_movcond_i32      0
+#endif
+
+/* optional instructions only implemented on MIPS32R2 */
+#ifdef _MIPS_ARCH_MIPS32R2
+#define TCG_TARGET_HAS_bswap16_i32      1
+#define TCG_TARGET_HAS_bswap32_i32      1
+#define TCG_TARGET_HAS_rot_i32          1
+#define TCG_TARGET_HAS_deposit_i32      1
+#else
+#define TCG_TARGET_HAS_bswap16_i32      0
+#define TCG_TARGET_HAS_bswap32_i32      0
+#define TCG_TARGET_HAS_rot_i32          0
+#define TCG_TARGET_HAS_deposit_i32      0
+#endif
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub  rd, zero, rt   */
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 1be7631..35532a1 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -39,8 +39,6 @@
     TCG_TEMP_UNDEF = 0,
     TCG_TEMP_CONST,
     TCG_TEMP_COPY,
-    TCG_TEMP_HAS_COPY,
-    TCG_TEMP_ANY
 } tcg_temp_state;
 
 struct tcg_temp_info {
@@ -52,39 +50,19 @@
 
 static struct tcg_temp_info temps[TCG_MAX_TEMPS];
 
-/* Reset TEMP's state to TCG_TEMP_ANY.  If TEMP was a representative of some
-   class of equivalent temp's, a new representative should be chosen in this
-   class. */
-static void reset_temp(TCGArg temp, int nb_temps, int nb_globals)
+/* Reset TEMP's state to TCG_TEMP_UNDEF.  If TEMP only had one copy, remove
+   the copy flag from the remaining temp.  */
+static void reset_temp(TCGArg temp)
 {
-    int i;
-    TCGArg new_base = (TCGArg)-1;
-    if (temps[temp].state == TCG_TEMP_HAS_COPY) {
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (i >= nb_globals) {
-                temps[i].state = TCG_TEMP_HAS_COPY;
-                new_base = i;
-                break;
-            }
+    if (temps[temp].state == TCG_TEMP_COPY) {
+        if (temps[temp].prev_copy == temps[temp].next_copy) {
+            temps[temps[temp].next_copy].state = TCG_TEMP_UNDEF;
+        } else {
+            temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
+            temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
         }
-        for (i = temps[temp].next_copy; i != temp; i = temps[i].next_copy) {
-            if (new_base == (TCGArg)-1) {
-                temps[i].state = TCG_TEMP_ANY;
-            } else {
-                temps[i].val = new_base;
-            }
-        }
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-    } else if (temps[temp].state == TCG_TEMP_COPY) {
-        temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
-        temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
-        new_base = temps[temp].val;
     }
-    temps[temp].state = TCG_TEMP_ANY;
-    if (new_base != (TCGArg)-1 && temps[new_base].next_copy == new_base) {
-        temps[new_base].state = TCG_TEMP_ANY;
-    }
+    temps[temp].state = TCG_TEMP_UNDEF;
 }
 
 static int op_bits(TCGOpcode op)
@@ -107,33 +85,83 @@
     }
 }
 
-static void tcg_opt_gen_mov(TCGArg *gen_args, TCGArg dst, TCGArg src,
-                            int nb_temps, int nb_globals)
+static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
 {
-        reset_temp(dst, nb_temps, nb_globals);
-        assert(temps[src].state != TCG_TEMP_COPY);
-        if (src >= nb_globals) {
-            assert(temps[src].state != TCG_TEMP_CONST);
-            if (temps[src].state != TCG_TEMP_HAS_COPY) {
-                temps[src].state = TCG_TEMP_HAS_COPY;
+    TCGArg i;
+
+    /* If this is already a global, we can't do better. */
+    if (temp < s->nb_globals) {
+        return temp;
+    }
+
+    /* Search for a global first. */
+    for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+        if (i < s->nb_globals) {
+            return i;
+        }
+    }
+
+    /* If it is not a local temp, search for a local temp among its copies. */
+    if (!s->temps[temp].temp_local) {
+        for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
+            if (s->temps[i].temp_local) {
+                return i;
+            }
+        }
+    }
+
+    /* Failure to find a better representation, return the same temp. */
+    return temp;
+}
+
+static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
+{
+    TCGArg i;
+
+    if (arg1 == arg2) {
+        return true;
+    }
+
+    if (temps[arg1].state != TCG_TEMP_COPY
+        || temps[arg2].state != TCG_TEMP_COPY) {
+        return false;
+    }
+
+    for (i = temps[arg1].next_copy ; i != arg1 ; i = temps[i].next_copy) {
+        if (i == arg2) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
+                            TCGArg dst, TCGArg src)
+{
+        reset_temp(dst);
+        assert(temps[src].state != TCG_TEMP_CONST);
+
+        if (s->temps[src].type == s->temps[dst].type) {
+            if (temps[src].state != TCG_TEMP_COPY) {
+                temps[src].state = TCG_TEMP_COPY;
                 temps[src].next_copy = src;
                 temps[src].prev_copy = src;
             }
             temps[dst].state = TCG_TEMP_COPY;
-            temps[dst].val = src;
             temps[dst].next_copy = temps[src].next_copy;
             temps[dst].prev_copy = src;
             temps[temps[dst].next_copy].prev_copy = dst;
             temps[src].next_copy = dst;
         }
+
         gen_args[0] = dst;
         gen_args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val,
-                             int nb_temps, int nb_globals)
+static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val)
 {
-        reset_temp(dst, nb_temps, nb_globals);
+        reset_temp(dst);
         temps[dst].state = TCG_TEMP_CONST;
         temps[dst].val = val;
         gen_args[0] = dst;
@@ -264,58 +292,88 @@
     return res;
 }
 
+/* Return 2 if the condition can't be simplified, and the result
+   of the condition (0 or 1) if it can.  */
 static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
                                        TCGArg y, TCGCond c)
 {
-    switch (op_bits(op)) {
-    case 32:
-        switch (c) {
-        case TCG_COND_EQ:
-            return (uint32_t)x == (uint32_t)y;
-        case TCG_COND_NE:
-            return (uint32_t)x != (uint32_t)y;
-        case TCG_COND_LT:
-            return (int32_t)x < (int32_t)y;
-        case TCG_COND_GE:
-            return (int32_t)x >= (int32_t)y;
-        case TCG_COND_LE:
-            return (int32_t)x <= (int32_t)y;
-        case TCG_COND_GT:
-            return (int32_t)x > (int32_t)y;
-        case TCG_COND_LTU:
-            return (uint32_t)x < (uint32_t)y;
-        case TCG_COND_GEU:
-            return (uint32_t)x >= (uint32_t)y;
-        case TCG_COND_LEU:
-            return (uint32_t)x <= (uint32_t)y;
-        case TCG_COND_GTU:
-            return (uint32_t)x > (uint32_t)y;
+    if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
+        switch (op_bits(op)) {
+        case 32:
+            switch (c) {
+            case TCG_COND_EQ:
+                return (uint32_t)temps[x].val == (uint32_t)temps[y].val;
+            case TCG_COND_NE:
+                return (uint32_t)temps[x].val != (uint32_t)temps[y].val;
+            case TCG_COND_LT:
+                return (int32_t)temps[x].val < (int32_t)temps[y].val;
+            case TCG_COND_GE:
+                return (int32_t)temps[x].val >= (int32_t)temps[y].val;
+            case TCG_COND_LE:
+                return (int32_t)temps[x].val <= (int32_t)temps[y].val;
+            case TCG_COND_GT:
+                return (int32_t)temps[x].val > (int32_t)temps[y].val;
+            case TCG_COND_LTU:
+                return (uint32_t)temps[x].val < (uint32_t)temps[y].val;
+            case TCG_COND_GEU:
+                return (uint32_t)temps[x].val >= (uint32_t)temps[y].val;
+            case TCG_COND_LEU:
+                return (uint32_t)temps[x].val <= (uint32_t)temps[y].val;
+            case TCG_COND_GTU:
+                return (uint32_t)temps[x].val > (uint32_t)temps[y].val;
+            }
+            break;
+        case 64:
+            switch (c) {
+            case TCG_COND_EQ:
+                return (uint64_t)temps[x].val == (uint64_t)temps[y].val;
+            case TCG_COND_NE:
+                return (uint64_t)temps[x].val != (uint64_t)temps[y].val;
+            case TCG_COND_LT:
+                return (int64_t)temps[x].val < (int64_t)temps[y].val;
+            case TCG_COND_GE:
+                return (int64_t)temps[x].val >= (int64_t)temps[y].val;
+            case TCG_COND_LE:
+                return (int64_t)temps[x].val <= (int64_t)temps[y].val;
+            case TCG_COND_GT:
+                return (int64_t)temps[x].val > (int64_t)temps[y].val;
+            case TCG_COND_LTU:
+                return (uint64_t)temps[x].val < (uint64_t)temps[y].val;
+            case TCG_COND_GEU:
+                return (uint64_t)temps[x].val >= (uint64_t)temps[y].val;
+            case TCG_COND_LEU:
+                return (uint64_t)temps[x].val <= (uint64_t)temps[y].val;
+            case TCG_COND_GTU:
+                return (uint64_t)temps[x].val > (uint64_t)temps[y].val;
+            }
+            break;
         }
-        break;
-    case 64:
+    } else if (temps_are_copies(x, y)) {
         switch (c) {
-        case TCG_COND_EQ:
-            return (uint64_t)x == (uint64_t)y;
-        case TCG_COND_NE:
-            return (uint64_t)x != (uint64_t)y;
-        case TCG_COND_LT:
-            return (int64_t)x < (int64_t)y;
-        case TCG_COND_GE:
-            return (int64_t)x >= (int64_t)y;
-        case TCG_COND_LE:
-            return (int64_t)x <= (int64_t)y;
         case TCG_COND_GT:
-            return (int64_t)x > (int64_t)y;
         case TCG_COND_LTU:
-            return (uint64_t)x < (uint64_t)y;
-        case TCG_COND_GEU:
-            return (uint64_t)x >= (uint64_t)y;
-        case TCG_COND_LEU:
-            return (uint64_t)x <= (uint64_t)y;
+        case TCG_COND_LT:
         case TCG_COND_GTU:
-            return (uint64_t)x > (uint64_t)y;
+        case TCG_COND_NE:
+            return 0;
+        case TCG_COND_GE:
+        case TCG_COND_GEU:
+        case TCG_COND_LE:
+        case TCG_COND_LEU:
+        case TCG_COND_EQ:
+            return 1;
         }
-        break;
+    } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
+        switch (c) {
+        case TCG_COND_LTU:
+            return 0;
+        case TCG_COND_GEU:
+            return 1;
+        default:
+            return 2;
+        }
+    } else {
+        return 2;
     }
 
     fprintf(stderr,
@@ -324,7 +382,6 @@
     tcg_abort();
 }
 
-
 /* Propagate constants and copies, fold constant expressions. */
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -338,10 +395,8 @@
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
-       If this temp is a copy of other ones then this equivalence class'
-       representative is kept in VALS' element.
-       If this temp is neither copy nor constant then corresponding VALS'
-       element is unused. */
+       If this temp is a copy of other ones then the other copies are
+       available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
@@ -353,11 +408,18 @@
         op = gen_opc_buf[op_index];
         def = &tcg_op_defs[op];
         /* Do copy propagation */
-        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS))) {
-            assert(op != INDEX_op_call);
+        if (op == INDEX_op_call) {
+            int nb_oargs = args[0] >> 16;
+            int nb_iargs = args[0] & 0xffff;
+            for (i = nb_oargs + 1; i < nb_oargs + nb_iargs + 1; i++) {
+                if (temps[args[i]].state == TCG_TEMP_COPY) {
+                    args[i] = find_better_copy(s, args[i]);
+                }
+            }
+        } else {
             for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
                 if (temps[args[i]].state == TCG_TEMP_COPY) {
-                    args[i] = temps[args[i]].val;
+                    args[i] = find_better_copy(s, args[i]);
                 }
             }
         }
@@ -372,7 +434,10 @@
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
+            /* Prefer the constant in second argument, and then the form
+               op a, a, b, which is better handled on non-RISC hosts. */
+            if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
+                && temps[args[2]].state != TCG_TEMP_CONST)) {
                 tmp = args[1];
                 args[1] = args[2];
                 args[2] = tmp;
@@ -429,7 +494,7 @@
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -456,14 +521,11 @@
             }
             if (temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
-                if ((temps[args[0]].state == TCG_TEMP_COPY
-                    && temps[args[0]].val == args[1])
-                    || args[0] == args[1]) {
+                if (temps_are_copies(args[0], args[1])) {
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(gen_args, args[0], args[1],
-                                    nb_temps, nb_globals);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
                 }
                 args += 3;
@@ -481,7 +543,7 @@
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tcg_opt_gen_movi(gen_args, args[0], 0, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
                 args += 3;
                 gen_args += 2;
                 continue;
@@ -495,13 +557,12 @@
         switch (op) {
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
-            if (args[1] == args[2]) {
-                if (args[1] == args[0]) {
+            if (temps_are_copies(args[1], args[2])) {
+                if (temps_are_copies(args[0], args[1])) {
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(gen_args, args[0], args[1], nb_temps,
-                                    nb_globals);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                     gen_args += 2;
                 }
                 args += 3;
@@ -512,21 +573,34 @@
             break;
         }
 
+        /* Simplify expression for "op r, a, a => movi r, 0" cases */
+        switch (op) {
+        CASE_OP_32_64(sub):
+        CASE_OP_32_64(xor):
+            if (temps_are_copies(args[1], args[2])) {
+                gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], 0);
+                gen_args += 2;
+                args += 3;
+                continue;
+            }
+            break;
+        default:
+            break;
+        }
+
         /* Propagate constants through copy operations and do constant
            folding.  Constants will be substituted to arguments by register
            allocator where needed and possible.  Also detect copies. */
         switch (op) {
         CASE_OP_32_64(mov):
-            if ((temps[args[1]].state == TCG_TEMP_COPY
-                && temps[args[1]].val == args[0])
-                || args[0] == args[1]) {
+            if (temps_are_copies(args[0], args[1])) {
                 args += 2;
                 gen_opc_buf[op_index] = INDEX_op_nop;
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(gen_args, args[0], args[1],
-                                nb_temps, nb_globals);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
                 gen_args += 2;
                 args += 2;
                 break;
@@ -538,7 +612,7 @@
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(gen_args, args[0], args[1], nb_temps, nb_globals);
+            tcg_opt_gen_movi(gen_args, args[0], args[1]);
             gen_args += 2;
             args += 2;
             break;
@@ -553,9 +627,9 @@
             if (temps[args[1]].state == TCG_TEMP_CONST) {
                 gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
             }
@@ -583,10 +657,10 @@
                 gen_opc_buf[op_index] = op_to_movi(op);
                 tmp = do_constant_folding(op, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -594,16 +668,34 @@
             }
             args += 3;
             break;
-        CASE_OP_32_64(setcond):
+        CASE_OP_32_64(deposit):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 gen_opc_buf[op_index] = op_to_movi(op);
-                tmp = do_constant_folding_cond(op, temps[args[1]].val,
-                                               temps[args[2]].val, args[3]);
-                tcg_opt_gen_movi(gen_args, args[0], tmp, nb_temps, nb_globals);
+                tmp = ((1ull << args[4]) - 1);
+                tmp = (temps[args[1]].val & ~(tmp << args[3]))
+                      | ((temps[args[2]].val & tmp) << args[3]);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
                 gen_args += 2;
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
+                gen_args[0] = args[0];
+                gen_args[1] = args[1];
+                gen_args[2] = args[2];
+                gen_args[3] = args[3];
+                gen_args[4] = args[4];
+                gen_args += 5;
+            }
+            args += 5;
+            break;
+        CASE_OP_32_64(setcond):
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
+            if (tmp != 2) {
+                gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], tmp);
+                gen_args += 2;
+            } else {
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -613,10 +705,9 @@
             args += 4;
             break;
         CASE_OP_32_64(brcond):
-            if (temps[args[0]].state == TCG_TEMP_CONST
-                && temps[args[1]].state == TCG_TEMP_CONST) {
-                if (do_constant_folding_cond(op, temps[args[0]].val,
-                                             temps[args[1]].val, args[2])) {
+            tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
+            if (tmp != 2) {
+                if (tmp) {
                     memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
                     gen_opc_buf[op_index] = INDEX_op_br;
                     gen_args[0] = args[3];
@@ -626,7 +717,7 @@
                 }
             } else {
                 memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -636,27 +727,21 @@
             args += 4;
             break;
         CASE_OP_32_64(movcond):
-            if (temps[args[1]].state == TCG_TEMP_CONST
-                && temps[args[2]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding_cond(op, temps[args[1]].val,
-                                               temps[args[2]].val, args[5]);
-                if (args[0] == args[4-tmp]
-                    || (temps[args[4-tmp]].state == TCG_TEMP_COPY
-                        && temps[args[4-tmp]].val == args[0])) {
+            tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
+            if (tmp != 2) {
+                if (temps_are_copies(args[0], args[4-tmp])) {
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
                     gen_opc_buf[op_index] = op_to_movi(op);
-                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val,
-                                     nb_temps, nb_globals);
+                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val);
                     gen_args += 2;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(gen_args, args[0], args[4-tmp],
-                                    nb_temps, nb_globals);
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[4-tmp]);
                     gen_args += 2;
                 }
             } else {
-                reset_temp(args[0], nb_temps, nb_globals);
+                reset_temp(args[0]);
                 gen_args[0] = args[0];
                 gen_args[1] = args[1];
                 gen_args[2] = args[2];
@@ -671,11 +756,11 @@
             nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
             if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
                 for (i = 0; i < nb_globals; i++) {
-                    reset_temp(i, nb_temps, nb_globals);
+                    reset_temp(i);
                 }
             }
             for (i = 0; i < (args[0] >> 16); i++) {
-                reset_temp(args[i + 1], nb_temps, nb_globals);
+                reset_temp(args[i + 1]);
             }
             i = nb_call_args + 3;
             while (i) {
@@ -694,7 +779,7 @@
                 memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
             } else {
                 for (i = 0; i < def->nb_oargs; i++) {
-                    reset_temp(args[i], nb_temps, nb_globals);
+                    reset_temp(args[i]);
                 }
             }
             for (i = 0; i < def->nb_args; i++) {
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 26c4b33..90c275d 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -221,12 +221,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return ARRAY_SIZE (tcg_target_call_iarg_regs);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
@@ -390,6 +384,7 @@
 #define ORC    XO31(412)
 #define EQV    XO31(284)
 #define NAND   XO31(476)
+#define ISEL   XO31( 15)
 
 #define LBZX   XO31( 87)
 #define LHZX   XO31(279)
@@ -1269,6 +1264,72 @@
         );
 }
 
+static void tcg_out_movcond (TCGContext *s, TCGCond cond,
+                             TCGArg dest,
+                             TCGArg c1, TCGArg c2,
+                             TCGArg v1, TCGArg v2,
+                             int const_c2)
+{
+    tcg_out_cmp (s, cond, c1, c2, const_c2, 7);
+
+    if (1) {
+        /* At least here on 7747A bit twiddling hacks are outperformed
+           by jumpy code (the testing was not scientific) */
+        if (dest == v2) {
+            cond = tcg_invert_cond (cond);
+            v2 = v1;
+        }
+        else {
+            if (dest != v1) {
+                tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
+            }
+        }
+        /* Branch forward over one insn */
+        tcg_out32 (s, tcg_to_bc[cond] | 8);
+        tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
+    }
+    else {
+        /* isel version, "if (1)" above should be replaced once a way
+           to figure out availability of isel on the underlying
+           hardware is found */
+        int tab, bc;
+
+        switch (cond) {
+        case TCG_COND_EQ:
+            tab = TAB (dest, v1, v2);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_NE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_LTU:
+        case TCG_COND_LT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_LT;
+            break;
+        case TCG_COND_GEU:
+        case TCG_COND_GE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_LT;
+            break;
+        case TCG_COND_LEU:
+        case TCG_COND_LE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_GT;
+            break;
+        case TCG_COND_GTU:
+        case TCG_COND_GT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_GT;
+            break;
+        default:
+            tcg_abort ();
+        }
+        tcg_out32 (s, ISEL | tab | ((bc + 28) << 6));
+    }
+}
+
 static void tcg_out_brcond (TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             int label_index)
@@ -1826,6 +1887,13 @@
             );
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond (s, args[5], args[0],
+                         args[1], args[2],
+                         args[3], args[4],
+                         const_args[2]);
+        break;
+
     default:
         tcg_dump_ops (s);
         tcg_abort ();
@@ -1922,6 +1990,7 @@
     { INDEX_op_ext16u_i32, { "r", "r" } },
 
     { INDEX_op_deposit_i32, { "r", "0", "r" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "r" } },
 
     { -1 },
 };
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 177eea1..3259d89 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -92,7 +92,7 @@
 #define TCG_TARGET_HAS_nand_i32         1
 #define TCG_TARGET_HAS_nor_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
-#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i32      1
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 337cd41..19944bc 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -208,12 +208,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static int tcg_target_get_call_iarg_regs_count (int flags)
-{
-    return ARRAY_SIZE (tcg_target_call_iarg_regs);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint (TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index aac11d9..3b90605 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -356,11 +356,6 @@
     }
 }
 
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return sizeof(tcg_target_call_iarg_regs) / sizeof(int);
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 876da4f..8fd7f86 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -155,12 +155,6 @@
     }
 }
 
-/* maximum number of register used for input function arguments */
-static inline int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return 6;
-}
-
 /* parse target specific constraints */
 static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 45a26f2..b3c2650 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -89,7 +89,6 @@
                        tcg_target_long arg2);
 static int tcg_target_const_match(tcg_target_long val,
                                   const TCGArgConstraint *arg_ct);
-static int tcg_target_get_call_iarg_regs_count(int flags);
 
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
@@ -937,11 +936,7 @@
                                                        args[nb_oargs + i]));
                 }
             }
-        } else if (c == INDEX_op_movi_i32 
-#if TCG_TARGET_REG_BITS == 64
-                   || c == INDEX_op_movi_i64
-#endif
-                   ) {
+        } else if (c == INDEX_op_movi_i32 || c == INDEX_op_movi_i64) {
             tcg_target_ulong val;
             TCGHelperInfo *th;
 
@@ -993,14 +988,11 @@
             case INDEX_op_brcond_i32:
             case INDEX_op_setcond_i32:
             case INDEX_op_movcond_i32:
-#if TCG_TARGET_REG_BITS == 32
             case INDEX_op_brcond2_i32:
             case INDEX_op_setcond2_i32:
-#else
             case INDEX_op_brcond_i64:
             case INDEX_op_setcond_i64:
             case INDEX_op_movcond_i64:
-#endif
                 if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) {
                     qemu_log(",%s", cond_name[args[k++]]);
                 } else {
@@ -1861,7 +1853,7 @@
 
     flags = args[nb_oargs + nb_iargs];
 
-    nb_regs = tcg_target_get_call_iarg_regs_count(flags);
+    nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
     if (nb_regs > nb_params)
         nb_regs = nb_params;
 
@@ -2103,16 +2095,12 @@
 #endif
         switch(opc) {
         case INDEX_op_mov_i32:
-#if TCG_TARGET_REG_BITS == 64
         case INDEX_op_mov_i64:
-#endif
             dead_args = s->op_dead_args[op_index];
             tcg_reg_alloc_mov(s, def, args, dead_args);
             break;
         case INDEX_op_movi_i32:
-#if TCG_TARGET_REG_BITS == 64
         case INDEX_op_movi_i64:
-#endif
             tcg_reg_alloc_movi(s, args);
             break;
         case INDEX_op_debug_insn_start:
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 003244c..3f4a24b 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -863,12 +863,6 @@
     return arg_ct->ct & TCG_CT_CONST;
 }
 
-/* Maximum number of register used for input function arguments. */
-static int tcg_target_get_call_iarg_regs_count(int flags)
-{
-    return ARRAY_SIZE(tcg_target_call_iarg_regs);
-}
-
 static void tcg_target_init(TCGContext *s)
 {
 #if defined(CONFIG_DEBUG_TCG_INTERPRETER)