TCG: Update to QEMU 2.0.0 version.

QEMU 2.0.0 was just released, so upgrade TCG to that version.
This matches the changes up to the following upstream commit:

  1a8e80d7e82aa385ad887dba5d039e399a18264b
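
Note: the new runtime feature probing in tcg_target_init() gates the
MOVBE, BMI1/BMI2 and (32-bit) CMOV fast paths. For reference, the
standalone sketch below (illustrative only, not part of the patch)
exercises the same <cpuid.h> pattern, guarding each bit_* macro just
as the backend does, since older cpuid.h versions lack some of them:

    /* probe.c: gcc -o probe probe.c (x86 host with GCC-style cpuid.h) */
    #include <stdbool.h>
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned a, b, c, d;
        unsigned max = __get_cpuid_max(0, 0);
        bool cmov = false, movbe = false, bmi1 = false, bmi2 = false;

        if (max >= 1) {
            /* Leaf 1: EDX carries CMOV, ECX carries MOVBE.  */
            __cpuid(1, a, b, c, d);
    #ifdef bit_CMOV
            cmov = (d & bit_CMOV) != 0;
    #endif
    #ifdef bit_MOVBE
            movbe = (c & bit_MOVBE) != 0;   /* cpuid.h >= GCC 4.6 */
    #endif
        }
        if (max >= 7) {
            /* Leaf 7, subleaf 0: EBX carries BMI1 and BMI2.  */
            __cpuid_count(7, 0, a, b, c, d);
    #ifdef bit_BMI
            bmi1 = (b & bit_BMI) != 0;
    #endif
    #ifdef bit_BMI2
            bmi2 = (b & bit_BMI2) != 0;
    #endif
        }
        printf("cmov=%d movbe=%d bmi1=%d bmi2=%d\n", cmov, movbe, bmi1, bmi2);
        return 0;
    }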

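The new tcg_out_vex_modrm() emits the two- and three-byte VEX prefixes
needed by ANDN and SHLX/SHRX/SARX. The sketch below (illustrative only;
all names are local to this note) models the same byte layout and, for
SHLX with dest=eax, src=ecx, count=edx, prints the expected encoding
c4 e2 69 f7 c1:

    /* vex.c: minimal model of the VEX prefix emission. */
    #include <stdint.h>
    #include <stdio.h>

    #define P_EXT     0x100     /* 0f escape */
    #define P_EXT38   0x200     /* 0f 38 escape */
    #define P_DATA16  0x400     /* 66 prefix */
    #define P_REXW    0x1000    /* 64-bit operand size */
    #define P_SIMDF3  0x10000   /* f3 prefix */
    #define P_SIMDF2  0x20000   /* f2 prefix */

    /* r = ModRM.reg, v = VEX.vvvv operand, rm = ModRM.rm; assumes an
       escape (P_EXT or P_EXT38) is present, as it is for all users. */
    static int vex_modrm(uint8_t *buf, int opc, int r, int v, int rm)
    {
        int n = 0, tmp;

        if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
            buf[n++] = 0xc4;                  /* three-byte VEX */
            tmp = (opc & P_EXT38) ? 2 : 1;    /* VEX.m-mmmm: map select */
            tmp |= 0x40;                      /* VEX.X (inverted, unused) */
            tmp |= (r & 8) ? 0 : 0x80;        /* VEX.R (inverted) */
            tmp |= (rm & 8) ? 0 : 0x20;       /* VEX.B (inverted) */
            buf[n++] = tmp;
            tmp = (opc & P_REXW) ? 0x80 : 0;  /* VEX.W */
        } else {
            buf[n++] = 0xc5;                  /* two-byte VEX, 0f implied */
            tmp = (r & 8) ? 0 : 0x80;         /* VEX.R (inverted) */
        }
        if (opc & P_DATA16) {
            tmp |= 1;                         /* VEX.pp = 66 */
        } else if (opc & P_SIMDF3) {
            tmp |= 2;                         /* VEX.pp = f3 */
        } else if (opc & P_SIMDF2) {
            tmp |= 3;                         /* VEX.pp = f2 */
        }
        tmp |= (~v & 15) << 3;                /* VEX.vvvv (inverted) */
        buf[n++] = tmp;
        buf[n++] = opc & 0xff;                /* opcode byte */
        buf[n++] = 0xc0 | ((r & 7) << 3) | (rm & 7);  /* reg-reg ModRM */
        return n;
    }

    int main(void)
    {
        uint8_t buf[5];
        int i, n = vex_modrm(buf, 0xf7 | P_EXT38 | P_DATA16, 0, 2, 1);
        for (i = 0; i < n; i++) {
            printf("%02x ", buf[i]);
        }
        printf("\n");
        return 0;
    }
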
Change-Id: I6da1ee5a59c3b636455abcbed23feb854366354a
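
Note for reviewers (not part of the upstream changes): the optimize.c
hunks split the shared shr/sar known-zero handling into explicit i32
and i64 cases, because shifting through tcg_target_long could compute
a wrong mask for 32-bit ops on a 64-bit host; non-64-bit results are
now also truncated to 32 bits. A small self-checking sketch of the
intended semantics (local names, not patch code):

    /* mask.c: "mask" tracks which result bits may still be set. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t mask = 0xffffffff00000000ull;  /* high half may be set */

        /* shr_i64: zeros shift in from the top. */
        assert((mask >> 32) == 0xffffffffull);

        /* sar_i64: a possibly-set sign bit smears downward, so the
           high bits stay possibly-set. */
        assert((uint64_t)((int64_t)mask >> 32) == ~0ull);

        /* sar_i32 must sign-extend from bit 31, not bit 63. */
        uint32_t mask32 = 0x80000000u;
        assert((uint32_t)((int32_t)mask32 >> 31) == 0xffffffffu);

        /* The new 'I' constraint accepts val iff ~val fits in a
           sign-extended 32-bit immediate, so andc by a constant can
           become and by ~constant. */
        int64_t val = ~0xffll;                  /* ~val == 0xff: accepted */
        assert(~val == (int32_t)~val);
        return 0;
    }
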
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4268345..ebfcd6d 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -94,6 +94,10 @@
 #endif
 };
 
+/* Constants we accept.  */
+#define TCG_CT_CONST_S32 0x100
+#define TCG_CT_CONST_U32 0x200
+#define TCG_CT_CONST_I32 0x400
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
    i386. */
@@ -105,18 +109,41 @@
 # define TCG_REG_L1 TCG_REG_EDX
 #endif
 
+/* The host compiler should supply <cpuid.h> to enable runtime features
+   detection, as we're not going to go so far as our own inline assembly.
+   If not available, default values will be assumed.  */
+#if defined(CONFIG_CPUID_H)
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
-   is available.  However, the host compiler must supply <cpuid.h>, as we're
-   not going to go so far as our own inline assembly.  */
+   is available.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
-#elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
+#elif defined(CONFIG_CPUID_H) && defined(bit_CMOV)
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif
 
+/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
+   going to attempt to determine at runtime whether movbe is available.  */
+#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
+static bool have_movbe;
+#else
+# define have_movbe 0
+#endif
+
+/* We need this symbol in tcg-target.h, and we can't properly conditionalize
+   it there.  Therefore we always define the variable.  */
+bool have_bmi1;
+
+#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
+static bool have_bmi2;
+#else
+# define have_bmi2 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -159,6 +186,7 @@
         tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
         break;
     case 'c':
+    case_c:
         ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
         break;
@@ -187,6 +215,7 @@
         tcg_regset_set32(ct->u.regs, 0, 0xf);
         break;
     case 'r':
+    case_r:
         ct->ct |= TCG_CT_REG;
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
@@ -194,6 +223,13 @@
             tcg_regset_set32(ct->u.regs, 0, 0xff);
         }
         break;
+    case 'C':
+        /* With SHRX et al, we need not use ECX as shift count register.  */
+        if (have_bmi2) {
+            goto case_r;
+        } else {
+            goto case_c;
+        }
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -213,6 +249,9 @@
     case 'Z':
         ct->ct |= TCG_CT_CONST_U32;
         break;
+    case 'I':
+        ct->ct |= TCG_CT_CONST_I32;
+        break;
 
     default:
         return -1;
@@ -236,6 +275,9 @@
     if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
         return 1;
     }
+    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
+        return 1;
+    }
     return 0;
 }
 
@@ -246,13 +288,14 @@
 #endif
 
 #define P_EXT		0x100		/* 0x0f opcode prefix */
-#define P_DATA16	0x200		/* 0x66 opcode prefix */
+#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
+#define P_DATA16        0x400           /* 0x66 opcode prefix */
 #if TCG_TARGET_REG_BITS == 64
-# define P_ADDR32	0x400		/* 0x67 opcode prefix */
-# define P_REXW		0x800		/* Set REX.W = 1 */
-# define P_REXB_R	0x1000		/* REG field as byte register */
-# define P_REXB_RM	0x2000		/* R/M field as byte register */
-# define P_GS           0x4000          /* gs segment override */
+# define P_ADDR32       0x800           /* 0x67 opcode prefix */
+# define P_REXW         0x1000          /* Set REX.W = 1 */
+# define P_REXB_R       0x2000          /* REG field as byte register */
+# define P_REXB_RM      0x4000          /* R/M field as byte register */
+# define P_GS           0x8000          /* gs segment override */
 #else
 # define P_ADDR32	0
 # define P_REXW		0
@@ -260,10 +303,13 @@
 # define P_REXB_RM	0
 # define P_GS           0
 #endif
+#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
+#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
+#define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
@@ -285,6 +331,8 @@
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz	(0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
 #define OPC_MOVSLQ	(0x63 | P_REXW)
@@ -299,6 +347,9 @@
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
+#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
+#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
 #define OPC_XCHG_ax_r32	(0x90)
 
@@ -387,7 +438,7 @@
     }
 
     rex = 0;
-    rex |= (opc & P_REXW) >> 8;		/* REX.W */
+    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
     rex |= (r & 8) >> 1;		/* REX.R */
     rex |= (x & 8) >> 2;		/* REX.X */
     rex |= (rm & 8) >> 3;		/* REX.B */
@@ -404,8 +455,11 @@
         tcg_out8(s, (uint8_t)(rex | 0x40));
     }
 
-    if (opc & P_EXT) {
+    if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -415,8 +469,11 @@
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
-    if (opc & P_EXT) {
+    if (opc & (P_EXT | P_EXT38)) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -432,6 +489,47 @@
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+    int tmp;
+
+    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+        /* Three byte VEX prefix.  */
+        tcg_out8(s, 0xc4);
+
+        /* VEX.m-mmmm */
+        if (opc & P_EXT38) {
+            tmp = 2;
+        } else if (opc & P_EXT) {
+            tmp = 1;
+        } else {
+            tcg_abort();
+        }
+        tmp |= 0x40;                       /* VEX.X */
+        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
+        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
+        tcg_out8(s, tmp);
+
+        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
+    } else {
+        /* Two byte VEX prefix.  */
+        tcg_out8(s, 0xc5);
+
+        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
+    }
+    /* VEX.pp */
+    if (opc & P_DATA16) {
+        tmp |= 1;                          /* 0x66 */
+    } else if (opc & P_SIMDF3) {
+        tmp |= 2;                          /* 0xf3 */
+    } else if (opc & P_SIMDF2) {
+        tmp |= 3;                          /* 0xf2 */
+    }
+    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
+    tcg_out8(s, tmp);
+    tcg_out8(s, opc);
+    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    We handle either RM and INDEX missing with a negative value.  In 64-bit
    mode for absolute addresses, ~RM is the size of the immediate operand
@@ -1342,7 +1440,14 @@
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (have_movbe && real_bswap) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1353,14 +1458,19 @@
         break;
     case MO_UW:
         tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
+        if (real_bswap) {
             tcg_out_rolw_8(s, datalo);
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
             tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
@@ -1368,16 +1478,18 @@
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1386,27 +1498,22 @@
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1482,17 +1589,24 @@
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
 
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (have_movbe && real_bswap) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }
 
     switch (memop & MO_SIZE) {
     case MO_8:
-        /* In 32-bit mode, 8-byte stores can only happen from [abcd]x.
+        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
            Use the scratch register if necessary.  */
         if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
@@ -1507,8 +1621,7 @@
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1516,7 +1629,7 @@
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1525,8 +1638,7 @@
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1535,8 +1647,13 @@
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -1603,7 +1720,7 @@
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    int c, rexw = 0;
+    int c, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
@@ -1739,6 +1856,16 @@
         }
         break;
 
+    OP_32_64(andc):
+        if (const_args[2]) {
+            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32,
+                        args[0], args[1]);
+            tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0);
+        } else {
+            tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]);
+        }
+        break;
+
     OP_32_64(mul):
         if (const_args[2]) {
             int32_t val;
@@ -1764,19 +1891,28 @@
 
     OP_32_64(shl):
         c = SHIFT_SHL;
-        goto gen_shift;
+        vexop = OPC_SHLX;
+        goto gen_shift_maybe_vex;
     OP_32_64(shr):
         c = SHIFT_SHR;
-        goto gen_shift;
+        vexop = OPC_SHRX;
+        goto gen_shift_maybe_vex;
     OP_32_64(sar):
         c = SHIFT_SAR;
-        goto gen_shift;
+        vexop = OPC_SARX;
+        goto gen_shift_maybe_vex;
     OP_32_64(rotl):
         c = SHIFT_ROL;
         goto gen_shift;
     OP_32_64(rotr):
         c = SHIFT_ROR;
         goto gen_shift;
+    gen_shift_maybe_vex:
+        if (have_bmi2 && !const_args[2]) {
+            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
+            break;
+        }
+        /* FALLTHRU */
     gen_shift:
         if (const_args[2]) {
             tcg_out_shifti(s, c + rexw, args[0], args[2]);
@@ -1967,10 +2103,11 @@
     { INDEX_op_and_i32, { "r", "0", "ri" } },
     { INDEX_op_or_i32, { "r", "0", "ri" } },
     { INDEX_op_xor_i32, { "r", "0", "ri" } },
+    { INDEX_op_andc_i32, { "r", "r", "ri" } },
 
-    { INDEX_op_shl_i32, { "r", "0", "ci" } },
-    { INDEX_op_shr_i32, { "r", "0", "ci" } },
-    { INDEX_op_sar_i32, { "r", "0", "ci" } },
+    { INDEX_op_shl_i32, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i32, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
 
@@ -1991,9 +2128,7 @@
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
 
     { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
-#if TCG_TARGET_HAS_movcond_i32
     { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
-#endif
 
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
     { INDEX_op_muls2_i32, { "a", "d", "a", "r" } },
@@ -2026,10 +2161,11 @@
     { INDEX_op_and_i64, { "r", "0", "reZ" } },
     { INDEX_op_or_i64, { "r", "0", "re" } },
     { INDEX_op_xor_i64, { "r", "0", "re" } },
+    { INDEX_op_andc_i64, { "r", "r", "rI" } },
 
-    { INDEX_op_shl_i64, { "r", "0", "ci" } },
-    { INDEX_op_shr_i64, { "r", "0", "ci" } },
-    { INDEX_op_sar_i64, { "r", "0", "ci" } },
+    { INDEX_op_shl_i64, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i64, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
 
@@ -2163,13 +2299,34 @@
 
 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
-#ifndef have_cmov
-    {
+#ifdef CONFIG_CPUID_H
-        unsigned a, b, c, d;
+    unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
+    int max = __get_cpuid_max(0, 0);
+
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+#ifndef have_cmov
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = (d & bit_CMOV) != 0;
+#endif
+#ifndef have_movbe
+        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
+           need to probe for it.  */
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
+    }
+
+    if (max >= 7) {
+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
+        __cpuid_count(7, 0, a, b, c, d);
+#ifdef bit_BMI
+        have_bmi1 = (b & bit_BMI) != 0;
+#endif
+#ifndef have_bmi2
+        have_bmi2 = (b & bit_BMI2) != 0;
+#endif
     }
 #endif
 
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 92c0fcd..20e4997 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -64,8 +64,6 @@
     TCG_REG_RDI = TCG_REG_EDI,
 } TCGReg;
 
-#define TCG_CT_CONST_S32 0x100
-#define TCG_CT_CONST_U32 0x200
 
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
@@ -76,6 +74,7 @@
 #define TCG_TARGET_CALL_STACK_OFFSET 0
 #endif
 
+extern bool have_bmi1;
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
 #define TCG_TARGET_HAS_rot_i32          1
@@ -87,7 +86,7 @@
 #define TCG_TARGET_HAS_bswap32_i32      1
 #define TCG_TARGET_HAS_neg_i32          1
 #define TCG_TARGET_HAS_not_i32          1
-#define TCG_TARGET_HAS_andc_i32         0
+#define TCG_TARGET_HAS_andc_i32         have_bmi1
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
@@ -115,7 +114,7 @@
 #define TCG_TARGET_HAS_bswap64_i64      1
 #define TCG_TARGET_HAS_neg_i64          1
 #define TCG_TARGET_HAS_not_i64          1
-#define TCG_TARGET_HAS_andc_i64         0
+#define TCG_TARGET_HAS_andc_i64         have_bmi1
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_eqv_i64          0
 #define TCG_TARGET_HAS_nand_i64         0
diff --git a/tcg/optimize.c b/tcg/optimize.c
index ef83229..29564f6 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -657,11 +657,68 @@
                 }
             }
             break;
+        CASE_OP_32_64(xor):
+        CASE_OP_32_64(nand):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == -1) {
+                i = 1;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(nor):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == 0) {
+                i = 1;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(andc):
+            if (temps[args[2]].state != TCG_TEMP_CONST
+                && temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[1]].val == -1) {
+                i = 2;
+                goto try_not;
+            }
+            break;
+        CASE_OP_32_64(orc):
+        CASE_OP_32_64(eqv):
+            if (temps[args[2]].state != TCG_TEMP_CONST
+                && temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[1]].val == 0) {
+                i = 2;
+                goto try_not;
+            }
+            break;
+        try_not:
+            {
+                TCGOpcode not_op;
+                bool have_not;
+
+                if (def->flags & TCG_OPF_64BIT) {
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                } else {
+                    not_op = INDEX_op_not_i32;
+                    have_not = TCG_TARGET_HAS_not_i32;
+                }
+                if (!have_not) {
+                    break;
+                }
+                s->gen_opc_buf[op_index] = not_op;
+                reset_temp(args[0]);
+                gen_args[0] = args[0];
+                gen_args[1] = args[i];
+                args += 3;
+                gen_args += 2;
+                continue;
+            }
         default:
             break;
         }
 
-        /* Simplify expression for "op r, a, 0 => mov r, a" cases */
+        /* Simplify expression for "op r, a, const => mov r, a" cases */
         switch (op) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -672,12 +729,23 @@
         CASE_OP_32_64(rotr):
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            if (temps[args[1]].state == TCG_TEMP_CONST) {
-                /* Proceed with possible constant folding. */
-                break;
-            }
-            if (temps[args[2]].state == TCG_TEMP_CONST
+        CASE_OP_32_64(andc):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0) {
+                goto do_mov3;
+            }
+            break;
+        CASE_OP_32_64(and):
+        CASE_OP_32_64(orc):
+        CASE_OP_32_64(eqv):
+            if (temps[args[1]].state != TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST
+                && temps[args[2]].val == -1) {
+                goto do_mov3;
+            }
+            break;
+        do_mov3:
                 if (temps_are_copies(args[0], args[1])) {
                     s->gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
@@ -687,13 +755,12 @@
                 }
                 args += 3;
                 continue;
-            }
-            break;
         default:
             break;
         }
 
-        /* Simplify using known-zero bits */
+        /* Simplify using known-zero bits. Currently only ops with a single
+           output argument are supported. */
         mask = -1;
         affected = -1;
         switch (op) {
@@ -728,16 +795,36 @@
             mask = temps[args[1]].mask & mask;
             break;
 
-        CASE_OP_32_64(sar):
+        CASE_OP_32_64(andc):
+            /* Known-zeros does not imply known-ones.  Therefore unless
+               args[2] is constant, we can't infer anything from it.  */
             if (temps[args[2]].state == TCG_TEMP_CONST) {
-                mask = ((tcg_target_long)temps[args[1]].mask
-                        >> temps[args[2]].val);
+                mask = ~temps[args[2]].mask;
+                goto and_const;
+            }
+            /* But we certainly know nothing outside args[1] may be set. */
+            mask = temps[args[1]].mask;
+            break;
+
+        case INDEX_op_sar_i32:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (int32_t)temps[args[1]].mask >> temps[args[2]].val;
+            }
+            break;
+        case INDEX_op_sar_i64:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (int64_t)temps[args[1]].mask >> temps[args[2]].val;
             }
             break;
 
-        CASE_OP_32_64(shr):
+        case INDEX_op_shr_i32:
             if (temps[args[2]].state == TCG_TEMP_CONST) {
-                mask = temps[args[1]].mask >> temps[args[2]].val;
+                mask = (uint32_t)temps[args[1]].mask >> temps[args[2]].val;
+            }
+            break;
+        case INDEX_op_shr_i64:
+            if (temps[args[2]].state == TCG_TEMP_CONST) {
+                mask = (uint64_t)temps[args[1]].mask >> temps[args[2]].val;
             }
             break;
 
@@ -771,10 +858,39 @@
             mask = temps[args[3]].mask | temps[args[4]].mask;
             break;
 
+        CASE_OP_32_64(ld8u):
+        case INDEX_op_qemu_ld8u:
+            mask = 0xff;
+            break;
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_qemu_ld16u:
+            mask = 0xffff;
+            break;
+        case INDEX_op_ld32u_i64:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_qemu_ld32u:
+#endif
+            mask = 0xffffffffu;
+            break;
+
+        CASE_OP_32_64(qemu_ld):
+            {
+                TCGMemOp mop = args[def->nb_oargs + def->nb_iargs];
+                if (!(mop & MO_SIGN)) {
+                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                }
+            }
+            break;
+
         default:
             break;
         }
 
+        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
+           results */
+        if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
+            mask &= 0xffffffffu;
+        }
         if (mask == 0) {
             assert(def->nb_oargs == 1);
             s->gen_opc_buf[op_index] = op_to_movi(op);
@@ -841,6 +957,7 @@
 
         /* Simplify expression for "op r, a, a => movi r, 0" cases */
         switch (op) {
+        CASE_OP_32_64(andc):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
@@ -1142,6 +1259,11 @@
             } else {
                 for (i = 0; i < def->nb_oargs; i++) {
                     reset_temp(args[i]);
+                    /* Save the corresponding known-zero bits mask for the
+                       first output argument (only one supported so far). */
+                    if (i == 0) {
+                        temps[args[i]].mask = mask;
+                    }
                 }
             }
             for (i = 0; i < def->nb_args; i++) {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 712438c..f1e0763 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -41,7 +41,7 @@
 #include "qemu/host-utils.h"
 #include "qemu/timer.h"
 
-/* Note: the long term plan is to reduce the dependancies on the QEMU
+/* Note: the long term plan is to reduce the dependencies on the QEMU
    CPU definitions. Currently they are used for qemu_ld/st
    instructions */
 #define NO_CPU_IO_DEFS
@@ -526,7 +526,7 @@
             ts->temp_local = temp_local;
             ts->name = NULL;
             ts++;
-            ts->base_type = TCG_TYPE_I32;
+            ts->base_type = type;
             ts->type = TCG_TYPE_I32;
             ts->temp_allocated = 1;
             ts->temp_local = temp_local;
@@ -586,7 +586,7 @@
     assert(ts->temp_allocated != 0);
     ts->temp_allocated = 0;
 
-    k = ts->type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
+    k = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
     set_bit(idx, s->free_temps[k].l);
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index c72af6c..f7efcb4 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -324,13 +324,16 @@
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
 /* call flags */
 /* Helper does not read globals (either directly or through an exception). It