tcg: Use softmmu fast path for unaligned accesses

This patch integrates several commits from QEMU ToT (tip of tree) to improve
the TCG infrastructure's handling of unaligned memory accesses. These updates
bring a performance gain on all host architectures that support misaligned
accesses in hardware.
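
At the core of these changes, the memory operation (including its alignment
bits) and the MMU index are packed into a single TCGMemOpIdx parameter that
is passed through to the softmmu helpers. A minimal sketch of the packing
scheme as used throughout the diff below (the exact definitions live in
tcg/tcg.h):

    typedef uint32_t TCGMemOpIdx;

    /* Pack a TCGMemOp (with its alignment bits) together with an MMU
     * index; the MMU index occupies the low 4 bits.
     */
    static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        return (op << 4) | idx;
    }

    static inline TCGMemOp get_memop(TCGMemOpIdx oi)
    {
        return oi >> 4;
    }

    static inline unsigned get_mmuidx(TCGMemOpIdx oi)
    {
        return oi & 15;
    }

This is why the helpers in softmmu_template.h now take a TCGMemOpIdx oi
instead of an int mmu_idx and recover both halves with get_memop() and
get_mmuidx().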

Makefile.target and Makefile.qemu2-sources.mk are updated to build the new
file tcg/tcg-op.c.

List of squashed commits from QEMU ToT (a sketch of the resulting fast-path
alignment check follows the list):

1f00b27 tcg: Improve the alignment check infrastructure
08b0b23 tcg/i386: omit a few REXW prefixes in softmmu code
9ee1490 tcg/aarch64: Use softmmu fast path for unaligned accesses
a5e3981 tcg/s390: Use softmmu fast path for unaligned accesses
68d45bb tcg/ppc: Improve unaligned load/store handling on 64-bit backend
8cc580f tcg/i386: use softmmu fast path for unaligned accesses
9615212 tcg/optimize: fix tcg_opt_gen_movi
80adb8f tcg/aarch64: use 32-bit offset for 32-bit softmmu emulation
ffc6372 tcg/aarch64: use 32-bit offset for 32-bit user-mode emulation
6c0f0c0 tcg/aarch64: add ext argument to tcg_out_insn_3310
ee8ba9e tcg/i386: Extend addresses for 32-bit guests
c19f47b tcg: fix dead computation for repeated input arguments
7e1df26 tcg: fix register allocation with two aliased dead inputs
59c4b7e tcg: Handle MO_AMASK in tcg_dump_ops
2b7ec66 tcg: Mask TCGMemOp appropriately for indexing
dfb3630 tcg: Add MO_ALIGN, MO_UNALN
3972ef6 tcg: Push merged memop+mmu_idx parameter to softmmu routines
59227d5 tcg: Merge memop and mmu_idx parameters to qemu_ld/st
15fc7da tcg: Remove unused opcodes
a4ce099 tcg: Implement insert_op_before
0c627cd tcg: Remove opcodes instead of noping them out
c45cb8b tcg: Put opcodes in a linked list
fe700ad tcg: Introduce tcg_op_buf_count and tcg_op_buf_full
0a7df5d tcg: Move emit of INDEX_op_end into gen_tb_end
3a13c3f tcg: Reduce ifdefs in tcg-op.c
951c630 tcg: Move some opcode generation functions out of line
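
Taken together, these commits let a backend fold the alignment check into
the TLB compare on the fast path. A rough sketch of the trick in C, using
the a_bits and tlb_mask names from the aarch64 hunk below; cmp and
tlb_entry are shorthand for the compare value and the
env->tlb_table[mmu_idx][index] lookup:

    /* get_alignment_bits() yields the required alignment of the access
     * in bits; values <= 0 mean no alignment trap is needed (the softmmu
     * helpers below check a_bits > 0).
     */
    int a_bits = get_alignment_bits(get_memop(oi));
    target_ulong cmp = addr, tlb_mask;

    if (a_bits >= 0) {
        /* Alignment required (or a byte access): fold the low alignment
         * bits into the TLB comparison mask, so a misaligned address
         * fails the compare and drops into the slow path -- no extra
         * branch on the hot path.  */
        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
    } else {
        /* Unaligned access allowed: compare using the last byte of the
         * access, so a page-crossing access still goes slow path.  */
        cmp += (1 << (get_memop(oi) & MO_SIZE)) - 1;
        tlb_mask = TARGET_PAGE_MASK;
    }

    if ((cmp & tlb_mask) == tlb_entry->addr_read) {
        /* Fast path: TLB hit, alignment satisfied, same page.  */
    } else {
        /* Slow path: TLB refill, or an alignment trap raised via
         * cpu_unaligned_access() in the softmmu helpers.  */
    }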

Change-Id: I88b46075a65b1f35d8aeb286fc5d7c8a6dae7d4c
Signed-off-by: Goran Ferenc <goran.ferenc@imgtec.com>
diff --git a/Makefile.target b/Makefile.target
index 36ac49c..523602b 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -83,7 +83,7 @@
 #########################################################
 # cpu emulator library
 obj-y = exec.o translate-all.o cpu-exec.o
-obj-y += tcg/tcg.o tcg/optimize.o
+obj-y += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
diff --git a/android-qemu2-glue/build/Makefile.qemu2-sources.mk b/android-qemu2-glue/build/Makefile.qemu2-sources.mk
index 08b3395..09bd237 100644
--- a/android-qemu2-glue/build/Makefile.qemu2-sources.mk
+++ b/android-qemu2-glue/build/Makefile.qemu2-sources.mk
@@ -341,6 +341,7 @@
     savevm.c \
     tcg/optimize.c \
     tcg/tcg.c \
+    tcg/tcg-op.c \
     translate-all.c \
     vl.c \
     xen-common-stub.c \
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index c085804..24aa095 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -325,14 +325,22 @@
 extern RAMList ram_list;
 
 /* Flags stored in the low bits of the TLB virtual address.  These are
-   defined so that fast path ram access is all zeros.  */
+ * defined so that fast path ram access is all zeros.
+ * The flags all must be between TARGET_PAGE_BITS and
+ * maximum address alignment bit.
+ */
 /* Zero if TLB entry is valid.  */
-#define TLB_INVALID_MASK   (1 << 3)
+#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS - 1))
 /* Set if TLB entry references a clean RAM page.  The iotlb entry will
    contain the page physical address.  */
-#define TLB_NOTDIRTY    (1 << 4)
+#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS - 2))
 /* Set if TLB entry is an IO callback.  */
-#define TLB_MMIO        (1 << 5)
+#define TLB_MMIO            (1 << (TARGET_PAGE_BITS - 3))
+
+/* Use this mask to check intersection with an alignment mask
+ * in a TCG backend.
+ */
+#define TLB_FLAGS_MASK  (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO)
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 ram_addr_t last_ram_offset(void);
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index da53395..79af464 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -11,8 +11,8 @@
 
 static inline void gen_tb_start(void)
 {
-    TCGv_i32 count;
-    TCGv_i32 flag;
+    TCGv_i32 count, flag, imm;
+    int i;
 
     exitreq_label = gen_new_label();
     flag = tcg_temp_new_i32();
@@ -28,9 +28,17 @@
     count = tcg_temp_local_new_i32();
     tcg_gen_ld_i32(count, cpu_env,
                    -ENV_OFFSET + offsetof(CPUState, icount_decr.u32));
+
+    imm = tcg_temp_new_i32();
+    tcg_gen_movi_i32(imm, 0xdeadbeef);
+
     /* This is a horrid hack to allow fixing up the value later.  */
-    icount_arg = tcg_ctx.gen_opparam_ptr + 1;
-    tcg_gen_subi_i32(count, count, 0xdeadbeef);
+    i = tcg_ctx.gen_last_op_idx;
+    i = tcg_ctx.gen_op_buf[i].args;
+    icount_arg = &tcg_ctx.gen_opparam_buf[i + 1];
+
+    tcg_gen_sub_i32(count, count, imm);
+    tcg_temp_free_i32(imm);
 
     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, icount_label);
     tcg_gen_st16_i32(count, cpu_env,
@@ -48,6 +56,9 @@
         gen_set_label(icount_label);
         tcg_gen_exit_tb((uintptr_t)tb + TB_EXIT_ICOUNT_EXPIRED);
     }
+
+    /* Terminate the linked list.  */
+    tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
 }
 
 static inline void gen_io_start(void)
diff --git a/softmmu_template.h b/softmmu_template.h
index 6b4e615..8ec7a0e 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -166,26 +166,27 @@
 #ifdef SOFTMMU_CODE_ACCESS
 static __attribute__((unused))
 #endif
-WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
-                            uintptr_t retaddr)
+WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
+                            TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
          != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                                 mmu_idx, retaddr);
-        }
-#endif
         if (!VICTIM_TLB_HIT(ADDR_READ)) {
             tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                      mmu_idx, retaddr);
@@ -216,16 +217,12 @@
         DATA_TYPE res1, res2;
         unsigned shift;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr);
-#endif
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
         /* Note the adjustment at the beginning of the function.
            Undo that for the recursion.  */
-        res1 = helper_le_ld_name(env, addr1, mmu_idx, retaddr + GETPC_ADJ);
-        res2 = helper_le_ld_name(env, addr2, mmu_idx, retaddr + GETPC_ADJ);
+        res1 = helper_le_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_le_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Little-endian combine.  */
@@ -233,14 +230,6 @@
         return res;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
 #if DATA_SIZE == 1
     res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
@@ -254,26 +243,27 @@
 #ifdef SOFTMMU_CODE_ACCESS
 static __attribute__((unused))
 #endif
-WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
-                            uintptr_t retaddr)
+WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
+                            TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
          != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                                 mmu_idx, retaddr);
-        }
-#endif
         if (!VICTIM_TLB_HIT(ADDR_READ)) {
             tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                      mmu_idx, retaddr);
@@ -304,16 +294,12 @@
         DATA_TYPE res1, res2;
         unsigned shift;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr);
-#endif
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
         /* Note the adjustment at the beginning of the function.
            Undo that for the recursion.  */
-        res1 = helper_be_ld_name(env, addr1, mmu_idx, retaddr + GETPC_ADJ);
-        res2 = helper_be_ld_name(env, addr2, mmu_idx, retaddr + GETPC_ADJ);
+        res1 = helper_be_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_be_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Big-endian combine.  */
@@ -321,14 +307,6 @@
         return res;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
     res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
     return res;
@@ -339,7 +317,8 @@
 glue(glue(helper_ld, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
                                          int mmu_idx)
 {
-    return helper_te_ld_name (env, addr, mmu_idx, GETRA());
+    TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
+    return helper_te_ld_name (env, addr, oi, GETRA());
 }
 
 #ifndef SOFTMMU_CODE_ACCESS
@@ -348,16 +327,16 @@
    avoid this for 64-bit data, or for 32-bit data on 32-bit host.  */
 #if DATA_SIZE * 8 < TCG_TARGET_REG_BITS
 WORD_TYPE helper_le_lds_name(CPUArchState *env, target_ulong addr,
-                             int mmu_idx, uintptr_t retaddr)
+                             TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return (SDATA_TYPE)helper_le_ld_name(env, addr, mmu_idx, retaddr);
+    return (SDATA_TYPE)helper_le_ld_name(env, addr, oi, retaddr);
 }
 
 # if DATA_SIZE > 1
 WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr,
-                             int mmu_idx, uintptr_t retaddr)
+                             TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return (SDATA_TYPE)helper_be_ld_name(env, addr, mmu_idx, retaddr);
+    return (SDATA_TYPE)helper_be_ld_name(env, addr, oi, retaddr);
 }
 # endif
 #endif
@@ -382,24 +361,25 @@
 }
 
 void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
-                       int mmu_idx, uintptr_t retaddr)
+                       TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr);
-        }
-#endif
         if (!VICTIM_TLB_HIT(addr_write)) {
             tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
         }
@@ -427,10 +407,6 @@
                      >= TARGET_PAGE_SIZE)) {
         int i;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr);
-#endif
         /* XXX: not efficient, but simple */
         /* Note: relies on the fact that tlb_fill() does not remove the
          * previous page from the TLB cache.  */
@@ -440,19 +416,11 @@
             /* Note the adjustment at the beginning of the function.
                Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            mmu_idx, retaddr + GETPC_ADJ);
+                                            oi, retaddr + GETPC_ADJ);
         }
         return;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
 #if DATA_SIZE == 1
     glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
@@ -463,24 +431,25 @@
 
 #if DATA_SIZE > 1
 void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
-                       int mmu_idx, uintptr_t retaddr)
+                       TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr);
-        }
-#endif
         if (!VICTIM_TLB_HIT(addr_write)) {
             tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
         }
@@ -508,10 +477,6 @@
                      >= TARGET_PAGE_SIZE)) {
         int i;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr);
-#endif
         /* XXX: not efficient, but simple */
         /* Note: relies on the fact that tlb_fill() does not remove the
          * previous page from the TLB cache.  */
@@ -521,19 +486,11 @@
             /* Note the adjustment at the beginning of the function.
                Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            mmu_idx, retaddr + GETPC_ADJ);
+                                            oi, retaddr + GETPC_ADJ);
         }
         return;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
     glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
 }
@@ -543,7 +500,8 @@
 glue(glue(helper_st, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
                                          DATA_TYPE val, int mmu_idx)
 {
-    helper_te_st_name(env, addr, val, mmu_idx, GETRA());
+    TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
+    helper_te_st_name(env, addr, val, oi, GETRA());
 }
 
 #endif /* !defined(SOFTMMU_CODE_ACCESS) */
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 76658a0..5bec586 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -2790,7 +2790,6 @@
     target_ulong pc_start;
     target_ulong pc_mask;
     uint32_t insn;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     ExitStatus ret;
@@ -2798,7 +2797,6 @@
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
     ctx.tb = tb;
     ctx.pc = pc_start;
@@ -2839,11 +2837,12 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
-                while (lj < j)
+                while (lj < j) {
                     tcg_ctx.gen_opc_instr_start[lj++] = 0;
+                }
             }
             tcg_ctx.gen_opc_pc[lj] = ctx.pc;
             tcg_ctx.gen_opc_instr_start[lj] = 1;
@@ -2881,7 +2880,7 @@
            or exhaust instruction count, stop generation.  */
         if (ret == NO_EXIT
             && ((ctx.pc & pc_mask) == 0
-                || tcg_ctx.gen_opc_ptr >= gen_opc_end
+                || tcg_op_buf_full()
                 || num_insns >= max_insns
                 || singlestep
                 || ctx.singlestep_enabled)) {
@@ -2912,12 +2911,13 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
-        while (lj <= j)
+        while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
+        }
     } else {
         tb->size = ctx.pc - pc_start;
         tb->icount = num_insns;
diff --git a/target-arm/helper.c b/target-arm/helper.c
index bf8d4b3..d436e34 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4993,13 +4993,15 @@
         int maxidx = DIV_ROUND_UP(blocklen, TARGET_PAGE_SIZE);
         void *hostaddr[maxidx];
         int try, i;
+        unsigned mmu_idx = cpu_mmu_index(env);
+        TCGMemOpIdx oi = make_memop_idx(MO_UB, mmu_idx);
 
         for (try = 0; try < 2; try++) {
 
             for (i = 0; i < maxidx; i++) {
                 hostaddr[i] = tlb_vaddr_to_host(env,
                                                 vaddr + TARGET_PAGE_SIZE * i,
-                                                1, cpu_mmu_index(env));
+                                                1, mmu_idx);
                 if (!hostaddr[i]) {
                     break;
                 }
@@ -5020,12 +5022,12 @@
              * this purpose use the actual register value passed to us
              * so that we get the fault address right.
              */
-            helper_ret_stb_mmu(env, vaddr_in, 0, cpu_mmu_index(env), GETRA());
+            helper_ret_stb_mmu(env, vaddr_in, 0, oi, GETRA());
             /* Now we can populate the other TLB entries, if any */
             for (i = 0; i < maxidx; i++) {
                 uint64_t va = vaddr + TARGET_PAGE_SIZE * i;
                 if (va != (vaddr_in & TARGET_PAGE_MASK)) {
-                    helper_ret_stb_mmu(env, va, 0, cpu_mmu_index(env), GETRA());
+                    helper_ret_stb_mmu(env, va, 0, oi, GETRA());
                 }
             }
         }
@@ -5042,7 +5044,7 @@
          *    bounce buffer was in use
          */
         for (i = 0; i < blocklen; i++) {
-            helper_ret_stb_mmu(env, vaddr + i, 0, cpu_mmu_index(env), GETRA());
+            helper_ret_stb_mmu(env, vaddr + i, 0, oi, GETRA());
         }
     }
 #else
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 80d2c07..1e4e4ef 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -10899,7 +10899,6 @@
     CPUARMState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     target_ulong next_page_start;
@@ -10910,8 +10909,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -10980,7 +10977,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -11030,7 +11027,7 @@
          * ensures prefetch aborts occur at the right place.
          */
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              !dc->ss_active &&
@@ -11090,7 +11087,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -11102,7 +11098,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index af51568..32d8505 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -10995,7 +10995,6 @@
     CPUARMState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     target_ulong next_page_start;
@@ -11016,8 +11015,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -11150,7 +11147,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -11216,7 +11213,7 @@
          * Also stop translation when a page boundary is reached.  This
          * ensures prefetch aborts occur at the right place.  */
         num_insns ++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              !dc->ss_active &&
@@ -11325,7 +11322,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -11337,7 +11333,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-cris/translate.c b/target-cris/translate.c
index 76406af..b4c4ca3 100644
--- a/target-cris/translate.c
+++ b/target-cris/translate.c
@@ -3116,7 +3116,6 @@
 {
     CPUState *cs = CPU(cpu);
     CPUCRISState *env = &cpu->env;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     unsigned int insn_len;
     int j, lj;
@@ -3142,8 +3141,6 @@
     dc->cpu = cpu;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->ppc = pc_start;
     dc->pc = pc_start;
@@ -3207,7 +3204,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -3291,7 +3288,7 @@
             break;
         }
     } while (!dc->is_jmp && !dc->cpustate_changed
-            && tcg_ctx.gen_opc_ptr < gen_opc_end
+            && !tcg_op_buf_full()
             && !singlestep
             && (dc->pc < next_page_start)
             && num_insns < max_insns);
@@ -3344,9 +3341,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -3361,8 +3358,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         log_target_disas(env, pc_start, dc->pc - pc_start,
                          env->pregs[PR_VR]);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 #endif
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 8af4ee9..899ca51 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7901,7 +7901,6 @@
     CPUX86State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     target_ulong pc_ptr;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj;
     uint64_t flags;
@@ -7970,8 +7969,6 @@
     cpu_ptr1 = tcg_temp_new_ptr();
     cpu_cc_srcT = tcg_temp_local_new();
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     pc_ptr = pc_start;
     lj = -1;
@@ -7992,7 +7989,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -8030,7 +8027,7 @@
             break;
         }
         /* if too long translation, stop generation too */
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end ||
+        if (tcg_op_buf_full() ||
             (pc_ptr - pc_start) >= (TARGET_PAGE_SIZE - 32) ||
             num_insns >= max_insns) {
             gen_jmp_im(pc_ptr - dc->cs_base);
@@ -8047,10 +8044,10 @@
         gen_io_end();
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     /* we don't forget to fill the last values */
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 8454e8b..79b67d1 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -1062,7 +1062,6 @@
     CPUState *cs = CPU(cpu);
     CPULM32State *env = &cpu->env;
     struct DisasContext ctx, *dc = &ctx;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, lj;
     uint32_t next_page_start;
@@ -1075,8 +1074,6 @@
     dc->num_watchpoints = cpu->num_watchpoints;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -1100,7 +1097,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -1124,7 +1121,7 @@
         num_insns++;
 
     } while (!dc->is_jmp
-         && tcg_ctx.gen_opc_ptr < gen_opc_end
+         && !tcg_op_buf_full()
          && !cs->singlestep_enabled
          && !singlestep
          && (dc->pc < next_page_start)
@@ -1158,9 +1155,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -1174,9 +1171,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         qemu_log("\n");
         log_target_disas(env, pc_start, dc->pc - pc_start, 0);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 }
diff --git a/target-m68k/translate.c b/target-m68k/translate.c
index efd4cfc..5c2f4d0 100644
--- a/target-m68k/translate.c
+++ b/target-m68k/translate.c
@@ -2980,7 +2980,6 @@
     CPUState *cs = CPU(cpu);
     CPUM68KState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj;
     target_ulong pc_start;
@@ -2993,8 +2992,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->env = env;
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
@@ -3026,7 +3023,7 @@
                 break;
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -3041,7 +3038,7 @@
         dc->insn_pc = dc->pc;
 	disas_m68k_insn(env, dc);
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              (pc_offset) < (TARGET_PAGE_SIZE - 32) &&
@@ -3075,7 +3072,6 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -3086,7 +3082,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-microblaze/translate.c b/target-microblaze/translate.c
index fd2b771..d37f235 100644
--- a/target-microblaze/translate.c
+++ b/target-microblaze/translate.c
@@ -1673,7 +1673,6 @@
 {
     CPUState *cs = CPU(cpu);
     CPUMBState *env = &cpu->env;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, lj;
     struct DisasContext ctx;
@@ -1688,8 +1687,6 @@
     dc->tb = tb;
     org_flags = dc->synced_flags = dc->tb_flags = tb->flags;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->jmp = 0;
     dc->delayed_branch = !!(dc->tb_flags & D_FLAG);
@@ -1732,7 +1729,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -1795,10 +1792,10 @@
             break;
         }
     } while (!dc->is_jmp && !dc->cpustate_changed
-         && tcg_ctx.gen_opc_ptr < gen_opc_end
-                 && !singlestep
-         && (dc->pc < next_page_start)
-                 && num_insns < max_insns);
+             && !tcg_op_buf_full()
+             && !singlestep
+             && (dc->pc < next_page_start)
+             && num_insns < max_insns);
 
     npc = dc->pc;
     if (dc->jmp == JMP_DIRECT || dc->jmp == JMP_DIRECT_CC) {
@@ -1846,9 +1843,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -1864,9 +1861,8 @@
 #if DISAS_GNU
         log_target_disas(env, pc_start, dc->pc - pc_start, 0);
 #endif
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 #endif
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 02e9495..1b93fc0 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -19766,7 +19766,6 @@
     CPUMIPSState *env = &cpu->env;
     DisasContext ctx;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     int num_insns;
@@ -19778,7 +19777,6 @@
         qemu_log("search pc %d\n", search_pc);
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.singlestep_enabled = cs->singlestep_enabled;
@@ -19822,7 +19820,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -19880,7 +19878,7 @@
         if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0)
             break;
 
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end) {
+        if (tcg_op_buf_full()) {
             break;
         }
 
@@ -19915,9 +19913,9 @@
     }
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-moxie/translate.c b/target-moxie/translate.c
index 4541b9b..cc9af66 100644
--- a/target-moxie/translate.c
+++ b/target-moxie/translate.c
@@ -827,14 +827,12 @@
     CPUState *cs = CPU(cpu);
     DisasContext ctx;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     CPUMoxieState *env = &cpu->env;
     int num_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.tb = tb;
@@ -857,7 +855,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -879,7 +877,7 @@
         if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0) {
             break;
         }
-    } while (ctx.bstate == BS_NONE && tcg_ctx.gen_opc_ptr < gen_opc_end);
+    } while (ctx.bstate == BS_NONE && !tcg_op_buf_full());
 
     if (cs->singlestep_enabled) {
         tcg_gen_movi_tl(cpu_pc, ctx.pc);
@@ -900,9 +898,9 @@
     }
  done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-openrisc/translate.c b/target-openrisc/translate.c
index 407bd97..fdc1a16 100644
--- a/target-openrisc/translate.c
+++ b/target-openrisc/translate.c
@@ -1642,7 +1642,6 @@
 {
     CPUState *cs = CPU(cpu);
     struct DisasContext ctx, *dc = &ctx;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, k;
     uint32_t next_page_start;
@@ -1652,7 +1651,6 @@
     pc_start = tb->pc;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     dc->is_jmp = DISAS_NEXT;
     dc->ppc = pc_start;
     dc->pc = pc_start;
@@ -1680,7 +1678,7 @@
     do {
         check_breakpoint(cpu, dc);
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (k < j) {
                 k++;
                 while (k < j) {
@@ -1721,7 +1719,7 @@
             }
         }
     } while (!dc->is_jmp
-             && tcg_ctx.gen_opc_ptr < gen_opc_end
+             && !tcg_op_buf_full()
              && !cs->singlestep_enabled
              && !singlestep
              && (dc->pc < next_page_start)
@@ -1759,9 +1757,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         k++;
         while (k <= j) {
             tcg_ctx.gen_opc_instr_start[k++] = 0;
@@ -1775,9 +1773,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         qemu_log("\n");
         log_target_disas(&cpu->env, pc_start, dc->pc - pc_start, 0);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 }
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index d381632..344cb5d 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -11273,14 +11273,12 @@
     DisasContext ctx, *ctxp = &ctx;
     opc_handler_t **table, *handler;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     int num_insns;
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.nip = pc_start;
     ctx.tb = tb;
     ctx.exception = POWERPC_EXCP_NONE;
@@ -11332,8 +11330,7 @@
     gen_tb_start();
     tcg_clear_temp_count();
     /* Set env in case of segfault during code fetch */
-    while (ctx.exception == POWERPC_EXCP_NONE
-            && tcg_ctx.gen_opc_ptr < gen_opc_end) {
+    while (ctx.exception == POWERPC_EXCP_NONE && !tcg_op_buf_full()) {
         if (unlikely(!QTAILQ_EMPTY(&cs->breakpoints))) {
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (bp->pc == ctx.nip) {
@@ -11343,7 +11340,7 @@
             }
         }
         if (unlikely(search_pc)) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -11449,9 +11446,9 @@
         tcg_gen_exit_tb(0);
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (unlikely(search_pc)) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index dbf1993..215ef2e 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -4750,7 +4750,6 @@
     DisasContext dc;
     target_ulong pc_start;
     uint64_t next_page_start;
-    uint16_t *gen_opc_end;
     int j, lj = -1;
     int num_insns, max_insns;
     CPUBreakpoint *bp;
@@ -4769,8 +4768,6 @@
     dc.cc_op = CC_OP_DYNAMIC;
     do_debug = dc.singlestep_enabled = cs->singlestep_enabled;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
 
     num_insns = 0;
@@ -4783,7 +4780,7 @@
 
     do {
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -4821,7 +4818,7 @@
            or exhaust instruction count, stop generation.  */
         if (status == NO_EXIT
             && (dc.pc >= next_page_start
-                || tcg_ctx.gen_opc_ptr >= gen_opc_end
+                || tcg_op_buf_full()
                 || num_insns >= max_insns
                 || singlestep
                 || cs->singlestep_enabled)) {
@@ -4856,9 +4853,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 3088edc..bc82044 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -1865,14 +1865,12 @@
     CPUSH4State *env = &cpu->env;
     DisasContext ctx;
     target_ulong pc_start;
-    static uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int i, ii;
     int num_insns;
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.flags = (uint32_t)tb->flags;
     ctx.bstate = BS_NONE;
@@ -1891,7 +1889,7 @@
     if (max_insns == 0)
         max_insns = CF_COUNT_MASK;
     gen_tb_start();
-    while (ctx.bstate == BS_NONE && tcg_ctx.gen_opc_ptr < gen_opc_end) {
+    while (ctx.bstate == BS_NONE && !tcg_op_buf_full()) {
         if (unlikely(!QTAILQ_EMPTY(&cs->breakpoints))) {
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (ctx.pc == bp->pc) {
@@ -1904,7 +1902,7 @@
 	    }
 	}
         if (search_pc) {
-            i = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            i = tcg_op_buf_count();
             if (ii < i) {
                 ii++;
                 while (ii < i)
@@ -1962,9 +1960,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        i = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        i = tcg_op_buf_count();
         ii++;
         while (ii <= i)
             tcg_ctx.gen_opc_instr_start[ii++] = 0;
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index 78c4e21..99b4f53 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -5245,7 +5245,6 @@
     CPUState *cs = CPU(cpu);
     CPUSPARCState *env = &cpu->env;
     target_ulong pc_start, last_pc;
-    uint16_t *gen_opc_end;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
     int j, lj = -1;
@@ -5265,7 +5264,6 @@
     dc->fpu_enabled = tb_fpu_enabled(tb->flags);
     dc->address_mask_32bit = tb_am_enabled(tb->flags);
     dc->singlestep = (cs->singlestep_enabled || singlestep);
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
@@ -5287,7 +5285,7 @@
         }
         if (spc) {
             qemu_log("Search PC...\n");
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -5320,7 +5318,7 @@
         if (dc->singlestep) {
             break;
         }
-    } while ((tcg_ctx.gen_opc_ptr < gen_opc_end) &&
+    } while (!tcg_op_buf_full() &&
              (dc->pc - pc_start) < (TARGET_PAGE_SIZE - 32) &&
              num_insns < max_insns);
 
@@ -5342,9 +5340,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (spc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-tricore/translate.c b/target-tricore/translate.c
index d5a9596..68e3c53 100644
--- a/target-tricore/translate.c
+++ b/target-tricore/translate.c
@@ -2430,7 +2430,6 @@
     DisasContext ctx;
     target_ulong pc_start;
     int num_insns;
-    uint16_t *gen_opc_end;
 
     if (search_pc) {
         qemu_log("search pc %d\n", search_pc);
@@ -2438,7 +2437,6 @@
 
     num_insns = 0;
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.tb = tb;
@@ -2454,7 +2452,7 @@
 
         num_insns++;
 
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end) {
+        if (tcg_op_buf_full()) {
             gen_save_pc(ctx.next_pc);
             tcg_gen_exit_tb(0);
             break;
@@ -2468,7 +2466,6 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
     if (search_pc) {
         printf("done_generating search pc\n");
     } else {
diff --git a/target-unicore32/translate.c b/target-unicore32/translate.c
index 653c225..e95c1e2 100644
--- a/target-unicore32/translate.c
+++ b/target-unicore32/translate.c
@@ -1877,7 +1877,6 @@
     CPUUniCore32State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     uint32_t next_page_start;
@@ -1891,8 +1890,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -1933,7 +1930,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -1965,7 +1962,7 @@
          * Also stop translation when a page boundary is reached.  This
          * ensures prefetch aborts occur at the right place.  */
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              dc->pc < next_page_start &&
@@ -2037,7 +2034,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -2048,7 +2044,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index badca19..7ee565c 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -2987,7 +2987,6 @@
     DisasContext dc;
     int insn_count = 0;
     int j, lj = -1;
-    uint16_t *gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     int max_insns = tb->cflags & CF_COUNT_MASK;
     uint32_t pc_start = tb->pc;
     uint32_t next_page_start =
@@ -3030,7 +3029,7 @@
         check_breakpoint(env, &dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -3081,7 +3080,7 @@
     } while (dc.is_jmp == DISAS_NEXT &&
             insn_count < max_insns &&
             dc.pc < next_page_start &&
-            tcg_ctx.gen_opc_ptr < gen_opc_end);
+            !tcg_op_buf_full());
 
     reset_litbase(&dc);
     reset_sar_tracker(&dc);
@@ -3097,7 +3096,6 @@
         gen_jumpi(&dc, dc.pc, 0);
     }
     gen_tb_end(tb, insn_count);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -3108,7 +3106,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         memset(tcg_ctx.gen_opc_instr_start + lj + 1, 0,
                 (j - lj) * sizeof(tcg_ctx.gen_opc_instr_start[0]));
     } else {
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 987c0bd..35920e1 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -280,7 +280,7 @@
     I3312_LDRSHX    = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
     I3312_LDRSWX    = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
 
-    I3312_TO_I3310  = 0x00206800,
+    I3312_TO_I3310  = 0x00200800,
     I3312_TO_I3313  = 0x01000000,
 
     /* Load/store register pair instructions.  */
@@ -496,13 +496,14 @@
 }
 
 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
-                              TCGReg rd, TCGReg base, TCGReg regoff)
+                              TCGReg rd, TCGReg base, TCGType ext,
+                              TCGReg regoff)
 {
     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
-    tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 | base << 5 | rd);
+    tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
+              0x4000 | ext << 13 | base << 5 | rd);
 }
 
-
 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
                               TCGReg rd, TCGReg rn, intptr_t offset)
 {
@@ -677,7 +678,7 @@
 
     /* Worst-case scenario, move offset to temp register, use reg offset.  */
     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
-    tcg_out_ldst_r(s, insn, rd, rn, TCG_REG_TMP);
+    tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
 }
 
 static inline void tcg_out_mov(TCGContext *s,
@@ -962,7 +963,7 @@
 
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
- *                                     int mmu_idx, uintptr_t ra)
+ *                                     TCGMemOpIdx oi, uintptr_t ra)
  */
 static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
@@ -975,7 +976,8 @@
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
- *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ *                                     uintxx_t val, TCGMemOpIdx oi,
+ *                                     uintptr_t ra)
  */
 static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
@@ -996,16 +998,17 @@
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp size = opc & MO_SIZE;
 
     reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
-    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
     tcg_out_adr(s, TCG_REG_X3, lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     if (opc & MO_SIGN) {
         tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
     } else {
@@ -1017,33 +1020,32 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp size = opc & MO_SIZE;
 
     reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
-    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
     tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
     tcg_out_adr(s, TCG_REG_X4, lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     tcg_out_goto(s, lb->raddr);
 }
 
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGType ext, TCGReg data_reg, TCGReg addr_reg,
-                                int mem_index, tcg_insn_unit *raddr,
-                                tcg_insn_unit *label_ptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->type = ext;
     label->datalo_reg = data_reg;
     label->addrlo_reg = addr_reg;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1052,14 +1054,30 @@
    slow path for the failure case, which will be patched later when finalizing
    the slow path. Generated code returns the host addend in X1,
    clobbers X0,X2,X3,TMP. */
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp s_bits,
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
                              tcg_insn_unit **label_ptr, int mem_index,
                              bool is_read)
 {
-    TCGReg base = TCG_AREG0;
     int tlb_offset = is_read ?
         offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
+    int a_bits = get_alignment_bits(opc);
+    TCGReg base = TCG_AREG0, x3;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned access, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+        x3 = addr_reg;
+    } else {
+        tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, addr_reg, (1 << (opc & MO_SIZE)) - 1);
+        tlb_mask = TARGET_PAGE_MASK;
+        x3 = TCG_REG_X3;
+    }
 
     /* Extract the TLB index from the address into X0.
        X0<CPU_TLB_BITS:0> =
@@ -1067,11 +1085,9 @@
     tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
                  TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
 
-    /* Store the page mask part of the address and the low s_bits into X3.
-       Later this allows checking for equality and alignment at the same time.
-       X3 = addr_reg & (PAGE_MASK | ((1 << s_bits) - 1)) */
-    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64, TCG_REG_X3,
-                     addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+    /* Store the page mask part of the address into X3.  */
+    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, x3, tlb_mask);
 
     /* Add any "high bits" from the tlb offset to the env address into X2,
        to take advantage of the LSL12 form of the ADDI instruction.
@@ -1110,51 +1126,52 @@
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp memop, TCGType ext,
-                                   TCGReg data_r, TCGReg addr_r, TCGReg off_r)
+                                   TCGReg data_r, TCGReg addr_r,
+                                   TCGType otype, TCGReg off_r)
 {
     const TCGMemOp bswap = memop & MO_BSWAP;
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
-        tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
         break;
     case MO_SB:
         tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
-                       data_r, addr_r, off_r);
+                       data_r, addr_r, otype, off_r);
         break;
     case MO_UW:
-        tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev16(s, data_r, data_r);
         }
         break;
     case MO_SW:
         if (bswap) {
-            tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
             tcg_out_rev16(s, data_r, data_r);
             tcg_out_sxt(s, ext, MO_16, data_r, data_r);
         } else {
-            tcg_out_ldst_r(s, ext ? I3312_LDRSHX : I3312_LDRSHW,
-                           data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
+                           data_r, addr_r, otype, off_r);
         }
         break;
     case MO_UL:
-        tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev32(s, data_r, data_r);
         }
         break;
     case MO_SL:
         if (bswap) {
-            tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
             tcg_out_rev32(s, data_r, data_r);
             tcg_out_sxt(s, TCG_TYPE_I64, MO_32, data_r, data_r);
         } else {
-            tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, otype, off_r);
         }
         break;
     case MO_Q:
-        tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev64(s, data_r, data_r);
         }
@@ -1165,34 +1182,35 @@
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp memop,
-                                   TCGReg data_r, TCGReg addr_r, TCGReg off_r)
+                                   TCGReg data_r, TCGReg addr_r,
+                                   TCGType otype, TCGReg off_r)
 {
     const TCGMemOp bswap = memop & MO_BSWAP;
 
     switch (memop & MO_SIZE) {
     case MO_8:
-        tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
         break;
     case MO_16:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev16(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, otype, off_r);
         break;
     case MO_32:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev32(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, otype, off_r);
         break;
     case MO_64:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev64(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, otype, off_r);
         break;
     default:
         tcg_abort();
@@ -1200,36 +1218,44 @@
 }
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp memop, TCGType ext, int mem_index)
+                            TCGMemOpIdx oi, TCGType ext)
 {
+    TCGMemOp memop = get_memop(oi);
+    const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
-    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, true, memop, ext, data_reg, addr_reg,
-                        mem_index, s->code_ptr, label_ptr);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 1);
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
+                           TCG_REG_X1, otype, addr_reg);
+    add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
+                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
 
 static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp memop, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp memop = get_memop(oi);
+    const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
-    tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, false, memop, s_bits == MO_64, data_reg, addr_reg,
-                        mem_index, s->code_ptr, label_ptr);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 0);
+    tcg_out_qemu_st_direct(s, memop, data_reg,
+                           TCG_REG_X1, otype, addr_reg);
+    add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE) == MO_64,
+                        data_reg, addr_reg, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
+    tcg_out_qemu_st_direct(s, memop, data_reg,
+                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -1518,11 +1544,11 @@
 
     case INDEX_op_qemu_ld_i32:
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, a0, a1, a2, ext, args[3]);
+        tcg_out_qemu_ld(s, a0, a1, a2, ext);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, REG0(0), a1, a2, args[3]);
+        tcg_out_qemu_st(s, REG0(0), a1, a2);
         break;
 
     case INDEX_op_bswap64_i64:
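
A note on the tcg_out_tlb_read() change above: the alignment check is folded
into the TLB tag comparison rather than emitted as a separate test. The
following stand-alone sketch shows the arithmetic, assuming TARGET_PAGE_BITS
== 12 and the get_alignment_bits() contract from commit 1f00b27 (non-negative
for a byte access or when an alignment check is required, negative when a
wider access may be unaligned); tlb_compare_value is a hypothetical name, not
a function in this patch:

    #include <stdint.h>

    /* Hypothetical helper mirroring the tlb_mask/x3 selection above:
       the value the fast path compares against the page-aligned TLB tag. */
    static uint64_t tlb_compare_value(uint64_t addr, int size_bits, int a_bits)
    {
        const uint64_t page_mask = ~(uint64_t)0xfff;   /* TARGET_PAGE_MASK */

        if (a_bits >= 0) {
            /* Keep the low alignment bits in the comparison: a misaligned
               address can never equal the page-aligned tag, so it drops
               through to the slow path. */
            return addr & (page_mask | ((1ULL << a_bits) - 1));
        }
        /* Unaligned access permitted: compare the page of the last byte,
           so only an access crossing a page boundary leaves the fast path. */
        return (addr + (1ULL << size_bits) - 1) & page_mask;
    }
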
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index e40301c..a078a22 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1223,20 +1223,19 @@
 /* Record the context of a call to the out of line helper code for the slow
    path for a load or store, so that we can later generate the correct
    helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi, TCGReg addrlo,
-                                TCGReg addrhi, int mem_index,
-                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
+                                TCGReg addrhi, tcg_insn_unit *raddr,
+                                tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1244,7 +1243,8 @@
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
     TCGReg argreg, datalo, datahi;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     void *func;
 
     reloc_pc24(lb->label_ptr[0], s->code_ptr);
@@ -1255,16 +1255,16 @@
     } else {
         argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
     }
-    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* For armv6 we can use the canonical unsigned helpers and minimize
        icache usage.  For pre-armv6, use the signed helpers since we do
        not have a single insn sign-extend.  */
     if (use_armv6_instructions) {
-        func = qemu_ld_helpers[opc & ~MO_SIGN];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_helpers[opc];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
         if (opc & MO_SIGN) {
             opc = MO_UL;
         }
@@ -1304,7 +1304,8 @@
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
     TCGReg argreg, datalo, datahi;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     reloc_pc24(lb->label_ptr[0], s->code_ptr);
 
@@ -1334,11 +1335,11 @@
         break;
     }
 
-    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* Tail-call to the helper, which will return to the fast path.  */
-    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc]);
+    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #endif /* SOFTMMU */
 
@@ -1465,6 +1466,7 @@
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1476,10 +1478,11 @@
     datahi = (is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
 
     /* This is a conditional BL only to load a pointer within this opcode into LR
@@ -1489,8 +1492,8 @@
 
     tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
 
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
@@ -1594,6 +1597,7 @@
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1605,10 +1609,11 @@
     datahi = (is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
 
     tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
@@ -1617,8 +1622,8 @@
     label_ptr = s->code_ptr;
     tcg_out_bl_noaddr(s, COND_NE);
 
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
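
The slow paths above now mask the TCGMemOp before indexing qemu_ld_helpers[]
and qemu_st_helpers[] (commit 2b7ec66): with alignment hints carried inside
the memop, a raw value could index past tables that are sized for the
(bswap, size) combinations only. A sketch of the bit layout this assumes,
with enum values reproduced from the tcg.h of this series (treat them as
illustrative):

    /* Illustrative TCGMemOp bit layout after commit 1f00b27. */
    enum {
        MO_SIZE   = 0x03,            /* 1, 2, 4 or 8 byte access     */
        MO_SIGN   = 0x04,            /* sign-extend the loaded value */
        MO_BSWAP  = 0x08,            /* host-reverse endianness      */
        MO_ASHIFT = 4,
        MO_AMASK  = 7 << MO_ASHIFT,  /* alignment requirement        */
        MO_SSIZE  = MO_SIZE | MO_SIGN,
    };

    /* The helper tables have entries for (bswap, size) at most, so the
       alignment bits must be stripped before indexing: */
    static unsigned st_helper_index(unsigned opc)
    {
        return opc & (MO_BSWAP | MO_SIZE);   /* stores ignore MO_SIGN too */
    }
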
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4133dcf..bde8a62 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1172,14 +1172,16 @@
    First argument register is clobbered.  */
 
 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                                    int mem_index, TCGMemOp s_bits,
+                                    int mem_index, TCGMemOp opc,
                                     tcg_insn_unit **label_ptr, int which)
 {
     const TCGReg r0 = TCG_REG_L0;
     const TCGReg r1 = TCG_REG_L1;
     TCGType ttype = TCG_TYPE_I32;
-    TCGType htype = TCG_TYPE_I32;
-    int trexw = 0, hrexw = 0;
+    TCGType tlbtype = TCG_TYPE_I32;
+    int trexw = 0, hrexw = 0, tlbrexw = 0;
+    int a_bits = get_alignment_bits(opc);
+    target_ulong tlb_mask;
 
     if (TCG_TARGET_REG_BITS == 64) {
         if (TARGET_LONG_BITS == 64) {
@@ -1187,20 +1189,32 @@
             trexw = P_REXW;
         }
         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
-            htype = TCG_TYPE_I64;
             hrexw = P_REXW;
+            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
+                tlbtype = TCG_TYPE_I64;
+                tlbrexw = P_REXW;
+            }
         }
     }
 
-    tcg_out_mov(s, htype, r0, addrlo);
-    tcg_out_mov(s, ttype, r1, addrlo);
+    tcg_out_mov(s, tlbtype, r0, addrlo);
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        tcg_out_mov(s, ttype, r1, addrlo);
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+    } else {
+        /* For an unaligned access, check that we don't cross pages using
+           the page address of the last byte.  */
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo,
+                             (1 << (opc & MO_SIZE)) - 1);
+        tlb_mask = TARGET_PAGE_MASK;
+    }
 
-    tcg_out_shifti(s, SHIFT_SHR + hrexw, r0,
+    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
-    tgen_arithi(s, ARITH_AND + trexw, r1,
-                TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
-    tgen_arithi(s, ARITH_AND + hrexw, r0,
+    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
+    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
                 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
     tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
@@ -1244,21 +1258,20 @@
  * Record the context of a call to the out of line helper code for the slow path
  * for a load or store, so that we can later generate the correct helper code
  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi,
                                 TCGReg addrlo, TCGReg addrhi,
-                                int mem_index, tcg_insn_unit *raddr,
+                                tcg_insn_unit *raddr,
                                 tcg_insn_unit **label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
@@ -1271,7 +1284,8 @@
  */
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg data_reg;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
@@ -1295,20 +1309,19 @@
             ofs += 4;
         }
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, oi);
         ofs += 4;
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
+        tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
     } else {
         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
-                     l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
                      (uintptr_t)l->raddr);
     }
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     data_reg = l->datalo_reg;
     switch (opc & MO_SSIZE) {
@@ -1354,7 +1367,8 @@
  */
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
     TCGReg retaddr;
@@ -1387,19 +1401,18 @@
             ofs += 4;
         }
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, oi);
         ofs += 4;
 
         retaddr = TCG_REG_EAX;
-        tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
-        tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
+        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
+        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
     } else {
         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                     tcg_target_call_iarg_regs[2], l->datalo_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                     l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
 
         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
             retaddr = tcg_target_call_iarg_regs[4];
@@ -1414,7 +1427,7 @@
 
     /* "Tail call" to the helper, with the return address back inline.  */
     tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, qemu_st_helpers[opc]);
+    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
@@ -1435,8 +1448,8 @@
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
-                                   TCGReg base, intptr_t ofs, int seg,
-                                   TCGMemOp memop)
+                                   TCGReg base, int index, intptr_t ofs,
+                                   int seg, TCGMemOp memop)
 {
     const TCGMemOp real_bswap = memop & MO_BSWAP;
     TCGMemOp bswap = real_bswap;
@@ -1449,13 +1462,16 @@
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
-        tcg_out_modrm_offset(s, OPC_MOVZBL + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_SB:
-        tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
+                                 base, index, 0, ofs);
         if (real_bswap) {
             tcg_out_rolw_8(s, datalo);
         }
@@ -1463,20 +1479,21 @@
     case MO_SW:
         if (real_bswap) {
             if (have_movbe) {
-                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                         datalo, base, index, 0, ofs);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
+                                         base, index, 0, ofs);
                 tcg_out_rolw_8(s, datalo);
             }
             tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
+                                     datalo, base, index, 0, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
@@ -1484,19 +1501,22 @@
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
         if (real_bswap) {
-            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                     base, index, 0, ofs);
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
             }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
+                                     base, index, 0, ofs);
         }
         break;
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
+                                     base, index, 0, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
@@ -1507,11 +1527,15 @@
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
-                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                         base, index, 0, ofs);
+                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+                                         base, index, 0, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+                                         base, index, 0, ofs + 4);
+                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                         base, index, 0, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1531,10 +1555,10 @@
 {
     TCGReg datalo, datahi, addrlo;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1542,43 +1566,52 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args++;
-    s_bits = opc & MO_SIZE;
+    mem_index = get_mmuidx(oi);
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
+    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
 
     /* Record the current context of a load into ldst label */
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
         TCGReg base = addrlo;
+        int index = -1;
         int seg = 0;
 
-        /* ??? We assume all operations have left us with register contents
-           that are zero extended.  So far this appears to be true.  If we
-           want to enforce this, we can either do an explicit zero-extension
-           here, or (if GUEST_BASE == 0, or a segment register is in use)
-           use the ADDR32 prefix.  For now, do nothing.  */
-        if (GUEST_BASE && guest_base_flags) {
+        /* For a 32-bit guest, the high 32 bits may contain garbage.
+           We can ignore them with the ADDR32 prefix if we're not using
+           a guest base, or when using segmentation.  Otherwise we
+           need to zero-extend manually.  */
+        if (GUEST_BASE == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
-        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
-            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
-            base = TCG_REG_L1;
-            offset = 0;
+            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+                seg |= P_ADDR32;
+            }
+        } else if (TCG_TARGET_REG_BITS == 64) {
+            if (TARGET_LONG_BITS == 32) {
+                tcg_out_ext32u(s, TCG_REG_L0, base);
+                base = TCG_REG_L0;
+            }
+            if (offset != GUEST_BASE) {
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+                index = TCG_REG_L1;
+                offset = 0;
+            }
         }
 
-        tcg_out_qemu_ld_direct(s, datalo, datahi, base, offset, seg, opc);
+        tcg_out_qemu_ld_direct(s, datalo, datahi,
+                               base, index, offset, seg, opc);
     }
 #endif
 }
@@ -1662,10 +1695,10 @@
 {
     TCGReg datalo, datahi, addrlo;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1673,40 +1706,50 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args++;
-    s_bits = opc & MO_SIZE;
+    mem_index = get_mmuidx(oi);
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
 
     /* Record the current context of a store into ldst label */
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
         TCGReg base = addrlo;
         int seg = 0;
 
-        /* ??? We assume all operations have left us with register contents
-           that are zero extended.  So far this appears to be true.  If we
-           want to enforce this, we can either do an explicit zero-extension
-           here, or (if GUEST_BASE == 0, or a segment register is in use)
-           use the ADDR32 prefix.  For now, do nothing.  */
-        if (GUEST_BASE && guest_base_flags) {
+        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
+        if (GUEST_BASE == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
-        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
-            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
-            base = TCG_REG_L1;
-            offset = 0;
+            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+                seg |= P_ADDR32;
+            }
+        } else if (TCG_TARGET_REG_BITS == 64) {
+            /* ??? Note that we can't use the same SIB addressing scheme
+               as for loads, since we require L0 free for bswap.  */
+            if (offset != GUEST_BASE) {
+                if (TARGET_LONG_BITS == 32) {
+                    tcg_out_ext32u(s, TCG_REG_L0, base);
+                    base = TCG_REG_L0;
+                }
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
+                base = TCG_REG_L1;
+                offset = 0;
+            } else if (TARGET_LONG_BITS == 32) {
+                tcg_out_ext32u(s, TCG_REG_L1, base);
+                base = TCG_REG_L1;
+            }
         }
 
         tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
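
On i386 the page-crossing trick costs a single LEA, which forms the address
of the last byte while the TLB index is computed in parallel in another
register. In plain C, the emitted mov/lea + and + cmp sequence evaluates the
following predicate (a sketch assuming TARGET_PAGE_BITS == 12;
tlb_fast_path_hit is a hypothetical name):

    #include <stdint.h>

    static int tlb_fast_path_hit(uint32_t addr, uint32_t tlb_tag,
                                 unsigned size, int a_bits)
    {
        const uint32_t page_mask = 0xfffff000;
        uint32_t cmp;

        if (a_bits >= 0) {
            cmp = addr & (page_mask | ((1u << a_bits) - 1)); /* mov; and */
        } else {
            cmp = (addr + size - 1) & page_mask;             /* lea; and */
        }
        return cmp == tlb_tag;                               /* cmp; jne */
    }
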
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 6bc9924..5984406 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -1639,14 +1639,16 @@
         OPC_LD1_M1, OPC_LD2_M1, OPC_LD4_M1, OPC_LD8_M1
     };
     int addr_reg, data_reg, mem_index;
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
     uint64_t fin1, fin2;
     tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
-    opc = args[2];
-    mem_index = args[3];
+    oi = args[2];
+    opc = get_memop(oi);
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     /* Read the TLB entry */
@@ -1674,7 +1676,7 @@
                    tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R56, TCG_AREG0),
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
-                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R58, mem_index));
+                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R58, oi));
     label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m1 (TCG_REG_P6, opc_ld_m1[s_bits],
@@ -1701,13 +1703,15 @@
     TCGReg addr_reg, data_reg;
     int mem_index;
     uint64_t pre1, pre2;
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
     tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
-    opc = args[2];
-    mem_index = args[3];
+    oi = args[2];
+    opc = get_memop(oi);
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     /* Note that we always use LE helper functions, so the bswap insns
@@ -1736,7 +1740,7 @@
                    tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R56, TCG_AREG0),
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
-                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, mem_index));
+                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, oi));
     label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m4 (TCG_REG_P6, opc_st_m4[s_bits],
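
Every backend in this patch now receives one TCGMemOpIdx operand and splits
it with get_memop()/get_mmuidx(), instead of carrying memop and mem_index as
separate opcode arguments. For reference, a sketch of the packing these
squashed commits add to tcg/tcg.h; the stand-in typedefs keep it
self-contained:

    typedef unsigned TCGMemOp;      /* stand-ins for the real tcg.h types */
    typedef unsigned TCGMemOpIdx;

    /* Memop in the high bits, mmu_idx in the low four bits. */
    static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        return (op << 4) | idx;     /* idx must fit in four bits */
    }

    static inline TCGMemOp get_memop(TCGMemOpIdx oi)
    {
        return oi >> 4;
    }

    static inline unsigned get_mmuidx(TCGMemOpIdx oi)
    {
        return oi & 15;
    }
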
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index b7f4d67..09fefe3 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -992,21 +992,19 @@
     tcg_out_opc_reg(s, OPC_ADDU, base, TCG_REG_A0, addrl);
 }
 
-static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi,
                                 TCGReg addrlo, TCGReg addrhi,
-                                int mem_index, void *raddr,
-                                tcg_insn_unit *label_ptr[2])
+                                void *raddr, tcg_insn_unit *label_ptr[2])
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS == 64) {
@@ -1016,7 +1014,8 @@
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg v0;
     int i;
 
@@ -1032,9 +1031,9 @@
     } else {
         i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
     }
-    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+    i = tcg_out_call_iarg_imm(s, i, oi);
     i = tcg_out_call_iarg_imm(s, i, (intptr_t)l->raddr);
-    tcg_out_call_int(s, qemu_ld_helpers[opc], false);
+    tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)], false);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 
@@ -1058,7 +1057,8 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     int i;
 
@@ -1090,13 +1090,13 @@
     default:
         tcg_abort();
     }
-    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+    i = tcg_out_call_iarg_imm(s, i, oi);
 
     /* Tail call to the store helper.  Thus force the return address
        computation to take place in the return address register.  */
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)l->raddr);
     i = tcg_out_call_iarg_reg(s, i, TCG_REG_RA);
-    tcg_out_call_int(s, qemu_st_helpers[opc], true);
+    tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)], true);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 }
@@ -1152,6 +1152,7 @@
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
     TCGReg data_regl, data_regh;
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
@@ -1166,17 +1167,18 @@
     data_regh = (is_64 ? *args++ : 0);
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
                      s_bits, label_ptr, 1);
     tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
-    add_qemu_ldst_label(s, 1, opc, data_regl, data_regh, addr_regl, addr_regh,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 1, oi, data_regl, data_regh, addr_regl, addr_regh,
+                        s->code_ptr, label_ptr);
 #else
     if (GUEST_BASE == 0 && data_regl != addr_regl) {
         base = addr_regl;
@@ -1281,6 +1283,7 @@
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
     TCGReg data_regl, data_regh, base;
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
@@ -1292,10 +1295,11 @@
     data_regh = (is_64 ? *args++ : 0);
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     s_bits = opc & 3;
 
     /* Note that we eliminated the helper's address argument,
@@ -1304,8 +1308,8 @@
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
                      s_bits, label_ptr, 0);
     tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
-    add_qemu_ldst_label(s, 0, opc, data_regl, data_regh, addr_regl, addr_regh,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh,
+                        s->code_ptr, label_ptr);
 #else
     if (GUEST_BASE == 0) {
         base = addr_regl;
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 34ae3c2..bbc86d0 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -67,6 +67,37 @@
     temps[temp].mask = -1;
 }
 
+static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
+                                TCGOpcode opc, int nargs)
+{
+    int oi = s->gen_next_op_idx;
+    int pi = s->gen_next_parm_idx;
+    int prev = old_op->prev;
+    int next = old_op - s->gen_op_buf;
+    TCGOp *new_op;
+
+    tcg_debug_assert(oi < OPC_BUF_SIZE);
+    tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE);
+    s->gen_next_op_idx = oi + 1;
+    s->gen_next_parm_idx = pi + nargs;
+
+    new_op = &s->gen_op_buf[oi];
+    *new_op = (TCGOp){
+        .opc = opc,
+        .args = pi,
+        .prev = prev,
+        .next = next
+    };
+    if (prev >= 0) {
+        s->gen_op_buf[prev].next = oi;
+    } else {
+        s->gen_first_op_idx = oi;
+    }
+    old_op->prev = oi;
+
+    return new_op;
+}
+
 /* Reset all temporaries, given that there are NB_TEMPS of them.  */
 static void reset_all_temps(int nb_temps)
 {
@@ -162,13 +193,13 @@
     return false;
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
+static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
                             TCGOpcode old_op, TCGArg dst, TCGArg src)
 {
     TCGOpcode new_op = op_to_mov(old_op);
     tcg_target_ulong mask;
 
-    s->gen_opc_buf[op_index] = new_op;
+    op->opc = new_op;
 
     reset_temp(dst);
     mask = temps[src].mask;
@@ -193,30 +224,30 @@
         temps[src].next_copy = dst;
     }
 
-    gen_args[0] = dst;
-    gen_args[1] = src;
+    args[0] = dst;
+    args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
+static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
                              TCGOpcode old_op, TCGArg dst, TCGArg val)
 {
     TCGOpcode new_op = op_to_movi(old_op);
     tcg_target_ulong mask;
 
-    s->gen_opc_buf[op_index] = new_op;
+    op->opc = new_op;
 
     reset_temp(dst);
     temps[dst].state = TCG_TEMP_CONST;
     temps[dst].val = val;
     mask = val;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
         /* High bits of the destination are now garbage.  */
         mask |= ~0xffffffffull;
     }
     temps[dst].mask = mask;
 
-    gen_args[0] = dst;
-    gen_args[1] = val;
+    args[0] = dst;
+    args[1] = val;
 }
 
 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
@@ -533,11 +564,9 @@
 }
 
 /* Propagate constants and copies, fold constant expressions. */
-static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
-                                    TCGArg *args, TCGOpDef *tcg_op_defs)
+static void tcg_constant_folding(TCGContext *s)
 {
-    int nb_ops, op_index, nb_temps, nb_globals;
-    TCGArg *gen_args;
+    int oi, oi_next, nb_temps, nb_globals;
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -548,24 +577,23 @@
     nb_globals = s->nb_globals;
     reset_all_temps(nb_temps);
 
-    nb_ops = tcg_opc_ptr - s->gen_opc_buf;
-    gen_args = args;
-    for (op_index = 0; op_index < nb_ops; op_index++) {
-        TCGOpcode op = s->gen_opc_buf[op_index];
-        const TCGOpDef *def = &tcg_op_defs[op];
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
         tcg_target_ulong mask, partmask, affected;
-        int nb_oargs, nb_iargs, nb_args, i;
+        int nb_oargs, nb_iargs, i;
         TCGArg tmp;
 
-        if (op == INDEX_op_call) {
-            *gen_args++ = tmp = *args++;
-            nb_oargs = tmp >> 16;
-            nb_iargs = tmp & 0xffff;
-            nb_args = nb_oargs + nb_iargs + def->nb_cargs;
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+
+        oi_next = op->next;
+        if (opc == INDEX_op_call) {
+            nb_oargs = op->callo;
+            nb_iargs = op->calli;
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            nb_args = def->nb_args;
         }
 
         /* Do copy propagation */
@@ -576,7 +604,7 @@
         }
 
         /* For commutative operations make constant second argument */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(mul):
         CASE_OP_32_64(and):
@@ -634,7 +662,7 @@
 
         /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
            and "sub r, 0, a => neg r, a" case.  */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(shl):
         CASE_OP_32_64(shr):
         CASE_OP_32_64(sar):
@@ -642,9 +670,7 @@
         CASE_OP_32_64(rotr):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                args += 3;
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -657,7 +683,7 @@
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (op == INDEX_op_sub_i32) {
+                if (opc == INDEX_op_sub_i32) {
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
                 } else {
@@ -669,12 +695,9 @@
                 }
                 if (temps[args[1]].state == TCG_TEMP_CONST
                     && temps[args[1]].val == 0) {
-                    s->gen_opc_buf[op_index] = neg_op;
+                    op->opc = neg_op;
                     reset_temp(args[0]);
-                    gen_args[0] = args[0];
-                    gen_args[1] = args[2];
-                    args += 3;
-                    gen_args += 2;
+                    args[1] = args[2];
                     continue;
                 }
             }
@@ -728,12 +751,9 @@
                 if (!have_not) {
                     break;
                 }
-                s->gen_opc_buf[op_index] = not_op;
+                op->opc = not_op;
                 reset_temp(args[0]);
-                gen_args[0] = args[0];
-                gen_args[1] = args[i];
-                args += 3;
-                gen_args += 2;
+                args[1] = args[i];
                 continue;
             }
         default:
@@ -741,7 +761,7 @@
         }
 
         /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(shl):
@@ -769,12 +789,10 @@
             break;
         do_mov3:
             if (temps_are_copies(args[0], args[1])) {
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
             } else {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
             }
-            args += 3;
             continue;
         default:
             break;
@@ -784,7 +802,7 @@
            output argument is supported. */
         mask = -1;
         affected = -1;
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(ext8s):
             if ((temps[args[1]].mask & 0x80) != 0) {
                 break;
@@ -900,7 +918,8 @@
 
         CASE_OP_32_64(qemu_ld):
             {
-                TCGMemOp mop = args[nb_oargs + nb_iargs];
+                TCGMemOpIdx oi = args[nb_oargs + nb_iargs];
+                TCGMemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
@@ -923,38 +942,31 @@
 
         if (partmask == 0) {
             assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-            args += nb_args;
-            gen_args += 2;
+            tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
             continue;
         }
         if (affected == 0) {
             assert(nb_oargs == 1);
             if (temps_are_copies(args[0], args[1])) {
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
             } else if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
             } else {
-                tcg_opt_gen_movi(s, op_index, gen_args, op,
+                tcg_opt_gen_movi(s, op, args, opc,
                                  args[0], temps[args[1]].val);
-                gen_args += 2;
             }
-            args += nb_args;
             continue;
         }
 
         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(and):
         CASE_OP_32_64(mul):
         CASE_OP_32_64(muluh):
         CASE_OP_32_64(mulsh):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                args += 3;
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -963,18 +975,15 @@
         }
 
         /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
             if (temps_are_copies(args[1], args[2])) {
                 if (temps_are_copies(args[0], args[1])) {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 } else {
-                    tcg_opt_gen_mov(s, op_index, gen_args, op,
-                                    args[0], args[1]);
-                    gen_args += 2;
+                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
                 }
-                args += 3;
                 continue;
             }
             break;
@@ -983,14 +992,12 @@
         }
 
         /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(andc):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                gen_args += 2;
-                args += 3;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -1001,17 +1008,14 @@
         /* Propagate constants through copy operations and do constant
            folding.  Constants will be substituted to arguments by register
            allocator where needed and possible.  Also detect copies. */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(mov):
             if (temps_are_copies(args[0], args[1])) {
-                args += 2;
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
-                args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
                 break;
             }
             /* Source argument is constant.  Rewrite the operation and
@@ -1019,9 +1023,7 @@
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], args[1]);
-            gen_args += 2;
-            args += 2;
+            tcg_opt_gen_movi(s, op, args, opc, args[0], args[1]);
             break;
 
         CASE_OP_32_64(not):
@@ -1033,20 +1035,16 @@
         case INDEX_op_ext32s_i64:
         case INDEX_op_ext32u_i64:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 2;
+                tmp = do_constant_folding(opc, temps[args[1]].val, 0);
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         case INDEX_op_trunc_shr_i32:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val, args[2]);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 3;
+                tmp = do_constant_folding(opc, temps[args[1]].val, args[2]);
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1075,11 +1073,9 @@
         CASE_OP_32_64(remu):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val,
+                tmp = do_constant_folding(opc, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 3;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1089,54 +1085,44 @@
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
                                 temps[args[2]].val);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 5;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
+            tmp = do_constant_folding_cond(opc, args[1], args[2], args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 4;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
+            tmp = do_constant_folding_cond(opc, args[0], args[1], args[2]);
             if (tmp != 2) {
                 if (tmp) {
                     reset_all_temps(nb_temps);
-                    s->gen_opc_buf[op_index] = INDEX_op_br;
-                    gen_args[0] = args[3];
-                    gen_args += 1;
+                    op->opc = INDEX_op_br;
+                    args[0] = args[3];
                 } else {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 }
-                args += 4;
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
+            tmp = do_constant_folding_cond(opc, args[1], args[2], args[5]);
             if (tmp != 2) {
                 if (temps_are_copies(args[0], args[4-tmp])) {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
-                    tcg_opt_gen_movi(s, op_index, gen_args, op,
+                    tcg_opt_gen_movi(s, op, args, opc,
                                      args[0], temps[args[4-tmp]].val);
-                    gen_args += 2;
                 } else {
-                    tcg_opt_gen_mov(s, op_index, gen_args, op,
-                                    args[0], args[4-tmp]);
-                    gen_args += 2;
+                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[4-tmp]);
                 }
-                args += 6;
                 break;
             }
             goto do_default;
@@ -1154,24 +1140,22 @@
                 uint64_t a = ((uint64_t)ah << 32) | al;
                 uint64_t b = ((uint64_t)bh << 32) | bl;
                 TCGArg rl, rh;
+                TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
+                TCGArg *args2 = &s->gen_opparam_buf[op2->args];
 
-                if (op == INDEX_op_add2_i32) {
+                if (opc == INDEX_op_add2_i32) {
                     a += b;
                 } else {
                     a -= b;
                 }
 
-                /* We emit the extra nop when we emit the add2/sub2.  */
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op_index, &gen_args[0],
-                                 op, rl, (uint32_t)a);
-                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
-                                 op, rh, (uint32_t)(a >> 32));
-                gen_args += 4;
-                args += 6;
+                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)a);
+                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(a >> 32));
+
+                /* We've done all we need to do with the movi.  Skip it.  */
+                oi_next = op2->next;
                 break;
             }
             goto do_default;
@@ -1183,18 +1167,16 @@
                 uint32_t b = temps[args[3]].val;
                 uint64_t r = (uint64_t)a * b;
                 TCGArg rl, rh;
-
-                /* We emit the extra nop when we emit the mulu2.  */
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
+                TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
+                TCGArg *args2 = &s->gen_opparam_buf[op2->args];
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op_index, &gen_args[0],
-                                 op, rl, (uint32_t)r);
-                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
-                                 op, rh, (uint32_t)(r >> 32));
-                gen_args += 4;
-                args += 4;
+                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)r);
+                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(r >> 32));
+
+                /* We've done all we need to do with the movi.  Skip it.  */
+                oi_next = op2->next;
                 break;
             }
             goto do_default;
@@ -1205,12 +1187,11 @@
                 if (tmp) {
             do_brcond_true:
                     reset_all_temps(nb_temps);
-                    s->gen_opc_buf[op_index] = INDEX_op_br;
-                    gen_args[0] = args[5];
-                    gen_args += 1;
+                    op->opc = INDEX_op_br;
+                    args[0] = args[5];
                 } else {
             do_brcond_false:
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 }
             } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
                        && temps[args[2]].state == TCG_TEMP_CONST
@@ -1221,12 +1202,11 @@
                    vs the high word of the input.  */
             do_brcond_high:
                 reset_all_temps(nb_temps);
-                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
-                gen_args[0] = args[1];
-                gen_args[1] = args[3];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_brcond_i32;
+                args[0] = args[1];
+                args[1] = args[3];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
@@ -1246,12 +1226,10 @@
                 }
             do_brcond_low:
                 reset_all_temps(nb_temps);
-                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
-                gen_args[0] = args[0];
-                gen_args[1] = args[2];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_brcond_i32;
+                args[1] = args[2];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
@@ -1273,15 +1251,13 @@
             } else {
                 goto do_default;
             }
-            args += 6;
             break;
 
         case INDEX_op_setcond2_i32:
             tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
                        && temps[args[3]].state == TCG_TEMP_CONST
                        && temps[args[4]].state == TCG_TEMP_CONST
@@ -1290,14 +1266,12 @@
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
-                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
                 reset_temp(args[0]);
                 temps[args[0]].mask = 1;
-                gen_args[0] = args[0];
-                gen_args[1] = args[2];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_setcond_i32;
+                args[1] = args[2];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
@@ -1318,12 +1292,9 @@
             do_setcond_low:
                 reset_temp(args[0]);
                 temps[args[0]].mask = 1;
-                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args[2] = args[3];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_setcond_i32;
+                args[2] = args[3];
+                args[3] = args[5];
             } else if (args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
@@ -1345,7 +1316,6 @@
             } else {
                 goto do_default;
             }
-            args += 6;
             break;
 
         case INDEX_op_call:
@@ -1377,22 +1347,12 @@
                     }
                 }
             }
-            for (i = 0; i < nb_args; i++) {
-                gen_args[i] = args[i];
-            }
-            args += nb_args;
-            gen_args += nb_args;
             break;
         }
     }
-
-    return gen_args;
 }
 
-TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr,
-        TCGArg *args, TCGOpDef *tcg_op_defs)
+void tcg_optimize(TCGContext *s)
 {
-    TCGArg *res;
-    res = tcg_constant_folding(s, tcg_opc_ptr, args, tcg_op_defs);
-    return res;
+    tcg_constant_folding(s);
 }
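
The optimizer now walks ops through prev/next links instead of a flat
opcode buffer, and insert_op_before() splices the extra movi directly
into that list.  A minimal, self-contained sketch of the list
discipline (hypothetical names, not the QEMU API):

    #include <stdio.h>

    typedef struct { int opc, prev, next; } Op;

    static Op buf[8];
    static int n_ops;

    /* Splice a new op in front of buf[pos]; assumes pos has a
       predecessor, as the constant-folding cases above do. */
    static int insert_before(int pos, int opc)
    {
        int ni = n_ops++;
        buf[ni] = (Op){ .opc = opc, .prev = buf[pos].prev, .next = pos };
        buf[buf[pos].prev].next = ni;
        buf[pos].prev = ni;
        return ni;
    }

    int main(void)
    {
        for (n_ops = 0; n_ops < 3; n_ops++) {   /* chain 0 -> 1 -> 2 */
            buf[n_ops] = (Op){ .opc = n_ops, .prev = n_ops - 1,
                               .next = n_ops + 1 };
        }
        buf[2].next = -1;                       /* terminate the list */

        insert_before(1, 99);                   /* now 0 -> 3 -> 1 -> 2 */

        for (int i = 0; i >= 0; i = buf[i].next) {
            printf("op %d\n", buf[i].opc);
        }
        return 0;
    }

After the splice, the caller skips the new node with oi_next = op2->next,
exactly as the add2/sub2 and mulu2 cases above do with their second movi.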
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 203027e..9c6fa28 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -1363,7 +1363,7 @@
    in CR7, loads the addend of the TLB into R3, and returns the register
    containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
 
-static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
                                TCGReg addrlo, TCGReg addrhi,
                                int mem_index, bool is_read)
 {
@@ -1373,6 +1373,8 @@
            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     TCGReg base = TCG_AREG0;
+    TCGMemOp s_bits = opc & MO_SIZE;
+    int a_bits = get_alignment_bits(opc);
 
     /* Extract the page index, shifted into place for tlb index.  */
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1424,17 +1426,40 @@
        to minimize any load use delay.  */
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
 
-    /* Clear the non-page, non-alignment bits from the address.  */
+    /* Clear the non-page, non-alignment bits from the address */
     if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
+        /* We don't support unaligned accesses in 32-bit mode;
+         * preserve the bottom bits and thus trigger a comparison
+         * failure on unaligned accesses.
+         */
+        if (a_bits < 0) {
+            a_bits = s_bits;
+        }
         tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
-                    (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
-    } else if (!s_bits) {
-        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo,
-                    0, 63 - TARGET_PAGE_BITS);
+                    (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
+    } else if (a_bits) {
+        /* More than a byte access: we need to handle alignment */
+        if (a_bits > 0) {
+            /* Alignment required by the front end, same as the 32-bit case */
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
+                        64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - a_bits);
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        } else {
+            /* We support unaligned accesses; we need to make sure we
+             * fail if we cross a page boundary.  The trick is to add
+             * access_size-1 to the address before masking the low
+             * bits.  That will make the address overflow to the next
+             * page if we cross a page boundary, which will then force
+             * a mismatch of the TLB compare, since the next page
+             * cannot possibly be in the same TLB index.
+             */
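+            /* Worked example (illustrative, not from the patch): with
+             * 4K pages, an 8-byte access at 0x1fff9 becomes 0x20000
+             * after the add; that masks to a different page than the
+             * original 0x1f000, so the TLB compare fails and the slow
+             * path finishes the access.
+             */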
+            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
+            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
+                        0, 63 - TARGET_PAGE_BITS);
+        }
     } else {
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
-                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        /* Byte access: just chop off the bits below the page index */
+        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
     }
 
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
@@ -1453,28 +1478,27 @@
 /* Record the context of a call to the out of line helper code for the slow
    path for a load or store, so that we can later generate the correct
    helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo_reg, TCGReg datahi_reg,
                                 TCGReg addrlo_reg, TCGReg addrhi_reg,
-                                int mem_index, tcg_insn_unit *raddr,
-                                tcg_insn_unit *lptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *lptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo_reg;
     label->datahi_reg = datahi_reg;
     label->addrlo_reg = addrlo_reg;
     label->addrhi_reg = addrhi_reg;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = lptr;
 }
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg hi, lo, arg = TCG_REG_R3;
 
     reloc_pc14(lb->label_ptr[0], s->code_ptr);
@@ -1495,10 +1519,10 @@
         tcg_out_mov(s, TCG_TYPE_TL, arg++, lo);
     }
 
-    tcg_out_movi(s, TCG_TYPE_I32, arg++, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     lo = lb->datalo_reg;
     hi = lb->datahi_reg;
@@ -1517,7 +1541,8 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     TCGReg hi, lo, arg = TCG_REG_R3;
 
@@ -1564,10 +1589,10 @@
         }
     }
 
-    tcg_out_movi(s, TCG_TYPE_I32, arg++, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tcg_out_b(s, 0, lb->raddr);
 }
@@ -1577,6 +1602,7 @@
 {
     TCGReg datalo, datahi, addrlo, rbase;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1587,12 +1613,13 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
     s_bits = opc & MO_SIZE;
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, true);
+    mem_index = get_mmuidx(oi);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1624,7 +1651,7 @@
             tcg_out32(s, LWZ | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_ldx_opc[opc];
+        uint32_t insn = qemu_ldx_opc[opc & (MO_BSWAP | MO_SSIZE)];
         if (!HAVE_ISA_2_06 && insn == LDBRX) {
             tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, 4));
             tcg_out32(s, LWBRX | TAB(datalo, rbase, addrlo));
@@ -1641,8 +1668,8 @@
     }
 
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #endif
 }
 
@@ -1650,6 +1677,7 @@
 {
     TCGReg datalo, datahi, addrlo, rbase;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1660,12 +1688,13 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
     s_bits = opc & MO_SIZE;
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, false);
+    mem_index = get_mmuidx(oi);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1694,7 +1723,7 @@
             tcg_out32(s, STW | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_stx_opc[opc];
+        uint32_t insn = qemu_stx_opc[opc & (MO_BSWAP | MO_SIZE)];
         if (!HAVE_ISA_2_06 && insn == STDBRX) {
             tcg_out32(s, STWBRX | SAB(datalo, rbase, addrlo));
             tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, addrlo, 4));
@@ -1706,8 +1735,8 @@
     }
 
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #endif
 }
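
Several hunks in this file and the ones below replace direct helper-table
indexing such as qemu_ld_helpers[opc] with a masked index.  The reason
(commit 2b7ec66) is that TCGMemOp now carries alignment bits on top of
size/sign/bswap, while the tables are only sized for the latter.  A
standalone demo; the bit layout here is an assumption for illustration,
the authoritative values live in tcg.h:

    #include <stdio.h>

    enum {                            /* illustrative TCGMemOp layout */
        MO_SIZE   = 3,                /* bits 0-1: access is 1 << size bytes */
        MO_SIGN   = 4,                /* sign-extend the loaded value */
        MO_BSWAP  = 8,                /* byte-swapped access */
        MO_SSIZE  = MO_SIZE | MO_SIGN,
        MO_ASHIFT = 4,                /* alignment field added by this series */
        MO_AMASK  = 7 << MO_ASHIFT,
    };

    int main(void)
    {
        /* 4-byte byte-swapped load that also requests 4-byte alignment. */
        unsigned opc = 2 | MO_BSWAP | (2 << MO_ASHIFT);

        /* Mask off the alignment bits before indexing a 16-entry
           helper table that only distinguishes size/sign/bswap. */
        printf("raw opc %#x -> table index %u\n",
               opc, opc & (MO_BSWAP | MO_SSIZE));
        return 0;
    }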
 
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 63e9c82..f78a0c1 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -1507,20 +1507,37 @@
 static TCGReg tcg_out_tlb_read(TCGContext* s, TCGReg addr_reg, TCGMemOp opc,
                                int mem_index, bool is_ld)
 {
-    TCGMemOp s_bits = opc & MO_SIZE;
-    uint64_t tlb_mask = TARGET_PAGE_MASK | ((1 << s_bits) - 1);
-    int ofs;
+    int a_bits = get_alignment_bits(opc);
+    int ofs, a_off;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned accesses, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        a_off = 0;
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+    } else {
+        a_off = (1 << (opc & MO_SIZE)) - 1;
+        tlb_mask = TARGET_PAGE_MASK;
+    }
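+    /* E.g. (illustrative): an unaligned-capable 4-byte access gets
+     * a_off = 3, so the compare uses the page of the access's last
+     * byte; a 4-byte access that must be aligned gets a_off = 0 and
+     * keeps the two low address bits in tlb_mask, so any misaligned
+     * address fails the TLB compare.  */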
 
     if (facilities & FACILITY_GEN_INST_EXT) {
         tcg_out_risbg(s, TCG_REG_R2, addr_reg,
                       64 - CPU_TLB_BITS - CPU_TLB_ENTRY_BITS,
                       63 - CPU_TLB_ENTRY_BITS,
                       64 + CPU_TLB_ENTRY_BITS - TARGET_PAGE_BITS, 1);
-        tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        if (a_off) {
+            tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
+            tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
+        } else {
+            tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        }
     } else {
         tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
                      TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-        tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_R3, addr_reg);
+        tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
         tgen_andi(s, TCG_TYPE_I64, TCG_REG_R2,
                   (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
         tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
@@ -1547,17 +1564,16 @@
     return addr_reg;
 }
 
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
-                                TCGReg data, TCGReg addr, int mem_index,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
+                                TCGReg data, TCGReg addr,
                                 tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = data;
     label->addrlo_reg = addr;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1566,7 +1582,8 @@
 {
     TCGReg addr_reg = lb->addrlo_reg;
     TCGReg data_reg = lb->datalo_reg;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     patch_reloc(lb->label_ptr[0], R_390_PC16DBL, (intptr_t)s->code_ptr, -2);
 
@@ -1574,9 +1591,9 @@
     if (TARGET_LONG_BITS == 64) {
         tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, addr_reg);
     }
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R5, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
     tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R2);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
@@ -1586,7 +1603,8 @@
 {
     TCGReg addr_reg = lb->addrlo_reg;
     TCGReg data_reg = lb->datalo_reg;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     patch_reloc(lb->label_ptr[0], R_390_PC16DBL, (intptr_t)s->code_ptr, -2);
 
@@ -1610,9 +1628,9 @@
     default:
         tcg_abort();
     }
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R6, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
 }
@@ -1635,9 +1653,11 @@
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp opc, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
     TCGReg base_reg;
 
@@ -1648,8 +1668,7 @@
 
     tcg_out_qemu_ld_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-    add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg, mem_index,
-                        s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 1, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1660,9 +1679,11 @@
 }
 
 static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp opc, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
     TCGReg base_reg;
 
@@ -1673,8 +1694,7 @@
 
     tcg_out_qemu_st_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-    add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg, mem_index,
-                        s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 0, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1923,11 +1943,11 @@
     case INDEX_op_qemu_ld_i32:
         /* ??? Technically we can use a non-extending instruction.  */
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3]);
+        tcg_out_qemu_ld(s, args[0], args[1], args[2]);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, args[0], args[1], args[2], args[3]);
+        tcg_out_qemu_st(s, args[0], args[1], args[2]);
         break;
 
     case INDEX_op_ld16s_i64:
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 0c4b028..e081828 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -917,7 +917,7 @@
             } else {
                 ra += 1;
             }
-            /* Skip the mem_index argument.  */
+            /* Skip the oi argument.  */
             ra += 1;
         }
                 
@@ -1072,15 +1072,16 @@
 };
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
-                            TCGMemOp memop, int memi, bool is_64)
+                            TCGMemOpIdx oi, bool is_64)
 {
+    TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_read));
 
     /* The fast path is exactly one insn.  Thus we can perform the
@@ -1092,7 +1093,8 @@
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_ld_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 
     /* TLB Miss.  */
 
@@ -1105,27 +1107,27 @@
 
     /* We use the helpers to extend SB and SW data, leaving the case
        of SL needing explicit extending below.  */
-    if ((memop & ~MO_BSWAP) == MO_SL) {
-        func = qemu_ld_trampoline[memop & ~MO_SIGN];
+    if ((memop & MO_SSIZE) == MO_SL) {
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_trampoline[memop];
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SSIZE)];
     }
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
     /* delay slot */
-    tcg_out_movi(s, TCG_TYPE_I32, param, memi);
+    tcg_out_movi(s, TCG_TYPE_I32, param, oi);
 
     /* Recall that all of the helpers return 64-bit results.
        Which complicates things for sparcv8plus.  */
     if (SPARC64) {
         /* We let the helper sign-extend SB and SW, but leave SL for here.  */
-        if (is_64 && (memop & ~MO_BSWAP) == MO_SL) {
+        if (is_64 && (memop & MO_SSIZE) == MO_SL) {
             tcg_out_arithi(s, data, TCG_REG_O0, 0, SHIFT_SRA);
         } else {
             tcg_out_mov(s, TCG_TYPE_REG, data, TCG_REG_O0);
         }
     } else {
-        if (s_bits == MO_64) {
+        if ((memop & MO_SIZE) == MO_64) {
             tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, 32, SHIFT_SLLX);
             tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O1, 0, SHIFT_SRL);
             tcg_out_arith(s, data, TCG_REG_O0, TCG_REG_O1, ARITH_OR);
@@ -1147,20 +1149,21 @@
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_ld_opc[memop]);
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
 static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
-                            TCGMemOp memop, int memi)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_write));
 
     /* The fast path is exactly one insn.  Thus we can perform the entire
@@ -1170,7 +1173,8 @@
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_st_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 
     /* TLB Miss.  */
 
@@ -1180,17 +1184,17 @@
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, addr);
-    if (!SPARC64 && s_bits == MO_64) {
+    if (!SPARC64 && (memop & MO_SIZE) == MO_64) {
         /* Skip the high-part; we'll perform the extract in the trampoline.  */
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, data);
 
-    func = qemu_st_trampoline[memop];
+    func = qemu_st_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
     /* delay slot */
-    tcg_out_movi(s, TCG_TYPE_REG, param, memi);
+    tcg_out_movi(s, TCG_TYPE_I32, param, oi);
 
     *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
@@ -1200,7 +1204,7 @@
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_st_opc[memop]);
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -1365,14 +1369,14 @@
         break;
 
     case INDEX_op_qemu_ld_i32:
-        tcg_out_qemu_ld(s, a0, a1, a2, args[3], false);
+        tcg_out_qemu_ld(s, a0, a1, a2, false);
         break;
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, a0, a1, a2, args[3], true);
+        tcg_out_qemu_ld(s, a0, a1, a2, true);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, a0, a1, a2, args[3]);
+        tcg_out_qemu_st(s, a0, a1, a2);
         break;
 
     case INDEX_op_ld32s_i64:
diff --git a/tcg/tcg-be-ldst.h b/tcg/tcg-be-ldst.h
index 429cba2..cafe8ed 100644
--- a/tcg/tcg-be-ldst.h
+++ b/tcg/tcg-be-ldst.h
@@ -25,13 +25,12 @@
 
 typedef struct TCGLabelQemuLdst {
     bool is_ld;             /* qemu_ld: true, qemu_st: false */
-    TCGMemOp opc;
+    TCGMemOpIdx oi;
     TCGType type;           /* result type of a load */
     TCGReg addrlo_reg;      /* reg index for low word of guest virtual addr */
     TCGReg addrhi_reg;      /* reg index for high word of guest virtual addr */
     TCGReg datalo_reg;      /* reg index for low word to be loaded or stored */
     TCGReg datahi_reg;      /* reg index for high word to be loaded or stored */
-    int mem_index;          /* soft MMU memory index */
     tcg_insn_unit *raddr;   /* gen code addr of the next IR of qemu_ld/st IR */
     tcg_insn_unit *label_ptr[2]; /* label pointers to be updated */
 } TCGLabelQemuLdst;
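
The TCGMemOpIdx stored in the label is a plain bit-pack of the memop and
the softmmu memory index.  A standalone sketch following the helpers this
series adds; see tcg.h for the authoritative definitions:

    #include <assert.h>
    #include <stdio.h>

    typedef unsigned TCGMemOp;
    typedef unsigned TCGMemOpIdx;

    static TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        assert(idx <= 15);            /* mmu_idx lives in the low 4 bits */
        return (op << 4) | idx;
    }

    static TCGMemOp get_memop(TCGMemOpIdx oi) { return oi >> 4; }
    static unsigned get_mmuidx(TCGMemOpIdx oi) { return oi & 15; }

    int main(void)
    {
        TCGMemOpIdx oi = make_memop_idx(0xa /* some memop */, 1);
        printf("memop %#x, mmu_idx %u\n", get_memop(oi), get_mmuidx(oi));
        return 0;
    }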
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
new file mode 100644
index 0000000..7d3de33
--- /dev/null
+++ b/tcg/tcg-op.c
@@ -0,0 +1,1935 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "tcg.h"
+#include "tcg-op.h"
+
+/* Reduce the number of ifdefs below.  This assumes that all uses of
+   TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+   the compiler can eliminate.  */
+#if TCG_TARGET_REG_BITS == 64
+extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
+extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
+#define TCGV_LOW  TCGV_LOW_link_error
+#define TCGV_HIGH TCGV_HIGH_link_error
+#endif
+
+/* Note that this is optimized for sequential allocation during translate,
+   up to and including filling in the forward link immediately.  We'll do
+   proper termination of the end of the list after we finish translation.  */
+
+static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
+{
+    int oi = ctx->gen_next_op_idx;
+    int ni = oi + 1;
+    int pi = oi - 1;
+
+    tcg_debug_assert(oi < OPC_BUF_SIZE);
+    ctx->gen_last_op_idx = oi;
+    ctx->gen_next_op_idx = ni;
+
+    ctx->gen_op_buf[oi] = (TCGOp){
+        .opc = opc,
+        .args = args,
+        .prev = pi,
+        .next = ni
+    };
+}
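+
+/* Illustration (not part of the original source): after three emits,
+   gen_op_buf[0..2] are chained 0 <-> 1 <-> 2, and gen_op_buf[2].next
+   still points one past the end; the list is properly terminated only
+   after translation finishes.  */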
+
+void tcg_gen_op1(TCGContext *ctx, TCGOpcode opc, TCGArg a1)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 1 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 1;
+    ctx->gen_opparam_buf[pi] = a1;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op2(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 2 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 2;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op3(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 3 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 3;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op4(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3, TCGArg a4)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 4 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 4;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op5(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3, TCGArg a4, TCGArg a5)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 5 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 5;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+    ctx->gen_opparam_buf[pi + 4] = a5;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2,
+                 TCGArg a3, TCGArg a4, TCGArg a5, TCGArg a6)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 6 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 6;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+    ctx->gen_opparam_buf[pi + 4] = a5;
+    ctx->gen_opparam_buf[pi + 5] = a6;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+/* 32 bit ops */
+
+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_add_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
+{
+    if (arg1 == 0 && TCG_TARGET_HAS_neg_i32) {
+        /* Don't recurse with tcg_gen_neg_i32.  */
+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg1);
+        tcg_gen_sub_i32(ret, t0, arg2);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
+{
+    TCGv_i32 t0;
+    /* Some cases can be optimized here.  */
+    switch (arg2) {
+    case 0:
+        tcg_gen_movi_i32(ret, 0);
+        return;
+    case 0xffffffffu:
+        tcg_gen_mov_i32(ret, arg1);
+        return;
+    case 0xffu:
+        /* Don't recurse with tcg_gen_ext8u_i32.  */
+        if (TCG_TARGET_HAS_ext8u_i32) {
+            tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffu:
+        if (TCG_TARGET_HAS_ext16u_i32) {
+            tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1);
+            return;
+        }
+        break;
+    }
+    t0 = tcg_const_i32(arg2);
+    tcg_gen_and_i32(ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* Some cases can be optimized here.  */
+    if (arg2 == -1) {
+        tcg_gen_movi_i32(ret, -1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_or_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i32) {
+        /* Don't recurse with tcg_gen_not_i32.  */
+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_xor_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_shl_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_shr_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_sar_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        tcg_gen_op4ii_i32(INDEX_op_brcond_i32, arg1, arg2, cond, label);
+    }
+}
+
+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, int label)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_brcond_i32(cond, arg1, t0, label);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
+                         TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_movi_i32(ret, 1);
+    } else if (cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i32(ret, 0);
+    } else {
+        tcg_gen_op4i_i32(INDEX_op_setcond_i32, ret, arg1, arg2, cond);
+    }
+}
+
+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
+                          TCGv_i32 arg1, int32_t arg2)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_setcond_i32(cond, ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_mul_i32(ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_div_i32) {
+        tcg_gen_op3_i32(INDEX_op_div_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_sari_i32(t0, arg1, 31);
+        tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_div_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i32) {
+        tcg_gen_op3_i32(INDEX_op_rem_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
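+        /* Use the identity a % b == a - (a / b) * b, which holds for
+         * truncated division; the unsigned variant below does the
+         * same.  */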
+        tcg_gen_op3_i32(INDEX_op_div_i32, t0, arg1, arg2);
+        tcg_gen_mul_i32(t0, t0, arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_sari_i32(t0, arg1, 31);
+        tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_rem_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_div_i32) {
+        tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_movi_i32(t0, 0);
+        tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_divu_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i32) {
+        tcg_gen_op3_i32(INDEX_op_remu_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_divu_i32, t0, arg1, arg2);
+        tcg_gen_mul_i32(t0, t0, arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_movi_i32(t0, 0);
+        tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_remu_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_andc_i32) {
+        tcg_gen_op3_i32(INDEX_op_andc_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_not_i32(t0, arg2);
+        tcg_gen_and_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_eqv_i32) {
+        tcg_gen_op3_i32(INDEX_op_eqv_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_xor_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_nand_i32) {
+        tcg_gen_op3_i32(INDEX_op_nand_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_and_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_nor_i32) {
+        tcg_gen_op3_i32(INDEX_op_nor_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_or_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_orc_i32) {
+        tcg_gen_op3_i32(INDEX_op_orc_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_not_i32(t0, arg2);
+        tcg_gen_or_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i32) {
+        tcg_gen_op3_i32(INDEX_op_rotl_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shl_i32(t0, arg1, arg2);
+        tcg_gen_subfi_i32(t1, 32, arg2);
+        tcg_gen_shr_i32(t1, arg1, t1);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else if (TCG_TARGET_HAS_rot_i32) {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_rotl_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else {
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shli_i32(t0, arg1, arg2);
+        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i32) {
+        tcg_gen_op3_i32(INDEX_op_rotr_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shr_i32(t0, arg1, arg2);
+        tcg_gen_subfi_i32(t1, 32, arg2);
+        tcg_gen_shl_i32(t1, arg1, t1);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        tcg_gen_rotli_i32(ret, arg1, 32 - arg2);
+    }
+}
+
+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
+                         unsigned int ofs, unsigned int len)
+{
+    uint32_t mask;
+    TCGv_i32 t1;
+
+    tcg_debug_assert(ofs < 32);
+    tcg_debug_assert(len <= 32);
+    tcg_debug_assert(ofs + len <= 32);
+
+    if (ofs == 0 && len == 32) {
+        tcg_gen_mov_i32(ret, arg2);
+        return;
+    }
+    if (TCG_TARGET_HAS_deposit_i32 && TCG_TARGET_deposit_i32_valid(ofs, len)) {
+        tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, arg1, arg2, ofs, len);
+        return;
+    }
+
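+    /* Fallback: e.g. ofs = 8, len = 8 computes
+     * ret = (arg1 & ~0x0000ff00) | ((arg2 & 0xff) << 8).  */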
+    mask = (1u << len) - 1;
+    t1 = tcg_temp_new_i32();
+
+    if (ofs + len < 32) {
+        tcg_gen_andi_i32(t1, arg2, mask);
+        tcg_gen_shli_i32(t1, t1, ofs);
+    } else {
+        tcg_gen_shli_i32(t1, arg2, ofs);
+    }
+    tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
+    tcg_gen_or_i32(ret, ret, t1);
+
+    tcg_temp_free_i32(t1);
+}
+
+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i32) {
+        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
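+        /* Branchless select: t0 = -(c1 cond c2) is all-ones or all
+         * zeros, so ret = (v1 & t0) | (v2 & ~t0).  */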
+        tcg_gen_setcond_i32(cond, t0, c1, c2);
+        tcg_gen_neg_i32(t0, t0);
+        tcg_gen_and_i32(t1, v1, t0);
+        tcg_gen_andc_i32(ret, v2, t0);
+        tcg_gen_or_i32(ret, ret, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_add2_i32) {
+        tcg_gen_op6_i32(INDEX_op_add2_i32, rl, rh, al, ah, bl, bh);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i32) {
+        tcg_gen_op6_i32(INDEX_op_sub2_i32, rl, rh, al, ah, bl, bh);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i32) {
+        tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_muluh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(t0, arg1);
+        tcg_gen_extu_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i32) {
+        tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_mulsh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
+    } else if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        TCGv_i32 t2 = tcg_temp_new_i32();
+        TCGv_i32 t3 = tcg_temp_new_i32();
+        tcg_gen_mulu2_i32(t0, t1, arg1, arg2);
+        /* Adjust for negative inputs.  */
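+        /* A negative input, read as unsigned, is the signed value
+         * plus 2^32, so the unsigned high word is too large by the
+         * other operand for each negative input; subtract that back
+         * out.  */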
+        tcg_gen_sari_i32(t2, arg1, 31);
+        tcg_gen_sari_i32(t3, arg2, 31);
+        tcg_gen_and_i32(t2, t2, arg2);
+        tcg_gen_and_i32(t3, t3, arg1);
+        tcg_gen_sub_i32(rh, t1, t2);
+        tcg_gen_sub_i32(rh, rh, t3);
+        tcg_gen_mov_i32(rl, t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+        tcg_temp_free_i32(t2);
+        tcg_temp_free_i32(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t0, arg1);
+        tcg_gen_ext_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext8s_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext8s_i32, ret, arg);
+    } else {
+        tcg_gen_shli_i32(ret, arg, 24);
+        tcg_gen_sari_i32(ret, ret, 24);
+    }
+}
+
+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext16s_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext16s_i32, ret, arg);
+    } else {
+        tcg_gen_shli_i32(ret, arg, 16);
+        tcg_gen_sari_i32(ret, ret, 16);
+    }
+}
+
+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext8u_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg);
+    } else {
+        tcg_gen_andi_i32(ret, arg, 0xffu);
+    }
+}
+
+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext16u_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg);
+    } else {
+        tcg_gen_andi_i32(ret, arg, 0xffffu);
+    }
+}
+
+/* Note: we assume the two high bytes are set to zero */
+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_bswap16_i32) {
+        tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+
+        tcg_gen_ext8u_i32(t0, arg);
+        tcg_gen_shli_i32(t0, t0, 8);
+        tcg_gen_shri_i32(ret, arg, 8);
+        tcg_gen_or_i32(ret, ret, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_bswap32_i32) {
+        tcg_gen_op2_i32(INDEX_op_bswap32_i32, ret, arg);
+    } else {
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+
+        tcg_gen_shli_i32(t0, arg, 24);
+
+        tcg_gen_andi_i32(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i32(t1, t1, 8);
+        tcg_gen_or_i32(t0, t0, t1);
+
+        tcg_gen_shri_i32(t1, arg, 8);
+        tcg_gen_andi_i32(t1, t1, 0x0000ff00);
+        tcg_gen_or_i32(t0, t0, t1);
+
+        tcg_gen_shri_i32(t1, arg, 24);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+/* 64-bit ops */
+
+#if TCG_TARGET_REG_BITS == 32
+/* These are all inline for TCG_TARGET_REG_BITS == 64.  */
+
+void tcg_gen_discard_i64(TCGv_i64 arg)
+{
+    tcg_gen_discard_i32(TCGV_LOW(arg));
+    tcg_gen_discard_i32(TCGV_HIGH(arg));
+}
+
+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+}
+
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
+{
+    tcg_gen_movi_i32(TCGV_LOW(ret), arg);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), arg >> 32);
+}
+
+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld8u_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld16u_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld16s_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    /* Since arg2 and ret have different types,
+       they cannot be the same temporary */
+#ifdef TCG_TARGET_WORDS_BIGENDIAN
+    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset);
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset + 4);
+#else
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset + 4);
+#endif
+}
+
+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
+{
+#ifdef TCG_TARGET_WORDS_BIGENDIAN
+    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset);
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset + 4);
+#else
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
+    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset + 4);
+#endif
+}
+
+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_and_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_xor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_xor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_shl_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_shr_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_sar_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    TCGv_i64 t0;
+    TCGv_i32 t1;
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i32();
+
+    tcg_gen_mulu2_i32(TCGV_LOW(t0), TCGV_HIGH(t0),
+                      TCGV_LOW(arg1), TCGV_LOW(arg2));
+
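+    /* The cross products arg1_lo * arg2_hi and arg1_hi * arg2_lo can
+     * only affect the high word of the 64-bit result, so add them
+     * (mod 2^32) into the high half of the low product.  */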
+    tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
+    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
+    tcg_gen_mul_i32(t1, TCGV_HIGH(arg1), TCGV_LOW(arg2));
+    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
+
+    tcg_gen_mov_i64(ret, t0);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i32(t1);
+}
+#endif /* TCG_TARGET_REG_BITS == 32 */
+
+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_add_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
+{
+    if (arg1 == 0 && TCG_TARGET_HAS_neg_i64) {
+        /* Don't recurse with tcg_gen_neg_i64.  */
+        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg1);
+        tcg_gen_sub_i64(ret, t0, arg2);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
+{
+    TCGv_i64 t0;
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+
+    /* Some cases can be optimized here.  */
+    switch (arg2) {
+    case 0:
+        tcg_gen_movi_i64(ret, 0);
+        return;
+    case 0xffffffffffffffffull:
+        tcg_gen_mov_i64(ret, arg1);
+        return;
+    case 0xffull:
+        /* Don't recurse with tcg_gen_ext8u_i64.  */
+        if (TCG_TARGET_HAS_ext8u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffu:
+        if (TCG_TARGET_HAS_ext16u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffffffull:
+        if (TCG_TARGET_HAS_ext32u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1);
+            return;
+        }
+        break;
+    }
+    t0 = tcg_const_i64(arg2);
+    tcg_gen_and_i64(ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_ori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+    /* Some cases can be optimized here.  */
+    if (arg2 == -1) {
+        tcg_gen_movi_i64(ret, -1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_or_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_xori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_xori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i64) {
+        /* Don't recurse with tcg_gen_not_i64.  */
+        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_xor_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+static inline void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
+                                      unsigned c, bool right, bool arith)
+{
+    tcg_debug_assert(c < 64);
+    if (c == 0) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
+        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
+    } else if (c >= 32) {
+        c -= 32;
+        if (right) {
+            if (arith) {
+                tcg_gen_sari_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
+                tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), 31);
+            } else {
+                tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
+                tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+            }
+        } else {
+            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_LOW(arg1), c);
+            tcg_gen_movi_i32(TCGV_LOW(ret), 0);
+        }
+    } else {
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        if (right) {
+            tcg_gen_shli_i32(t0, TCGV_HIGH(arg1), 32 - c);
+            if (arith) {
+                tcg_gen_sari_i32(t1, TCGV_HIGH(arg1), c);
+            } else {
+                tcg_gen_shri_i32(t1, TCGV_HIGH(arg1), c);
+            }
+            tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_LOW(arg1), c);
+            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t0);
+            tcg_gen_mov_i32(TCGV_HIGH(ret), t1);
+        } else {
+            tcg_gen_shri_i32(t0, TCGV_LOW(arg1), 32 - c);
+            /* Note: ret can be the same as arg1, so we use t1 */
+            tcg_gen_shli_i32(t1, TCGV_LOW(arg1), c);
+            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), c);
+            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t0);
+            tcg_gen_mov_i32(TCGV_LOW(ret), t1);
+        }
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_shl_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_shr_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_sar_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_gen_op6ii_i32(INDEX_op_brcond2_i32, TCGV_LOW(arg1),
+                              TCGV_HIGH(arg1), TCGV_LOW(arg2),
+                              TCGV_HIGH(arg2), cond, label);
+        } else {
+            tcg_gen_op4ii_i64(INDEX_op_brcond_i64, arg1, arg2, cond, label);
+        }
+    }
+}
+
+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_brcond_i64(cond, arg1, t0, label);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
+                         TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_movi_i64(ret, 1);
+    } else if (cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i64(ret, 0);
+    } else {
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
+                             TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                             TCGV_LOW(arg2), TCGV_HIGH(arg2), cond);
+            tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+        } else {
+            tcg_gen_op4i_i64(INDEX_op_setcond_i64, ret, arg1, arg2, cond);
+        }
+    }
+}
+
+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
+                          TCGv_i64 arg1, int64_t arg2)
+{
+    TCGv_i64 t0 = tcg_const_i64(arg2);
+    tcg_gen_setcond_i64(cond, ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    TCGv_i64 t0 = tcg_const_i64(arg2);
+    tcg_gen_mul_i64(ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_div_i64) {
+        tcg_gen_op3_i64(INDEX_op_div_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
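+        /* Sign-extend arg1 into t0 to form the high half of the
+           double-word dividend that div2 expects.  */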
+        tcg_gen_sari_i64(t0, arg1, 63);
+        tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_div_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i64) {
+        tcg_gen_op3_i64(INDEX_op_rem_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i64) {
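+        /* Synthesize the remainder: arg1 - (arg1 / arg2) * arg2.  */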
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_div_i64, t0, arg1, arg2);
+        tcg_gen_mul_i64(t0, t0, arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_sari_i64(t0, arg1, 63);
+        tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_rem_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_div_i64) {
+        tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_movi_i64(t0, 0);
+        tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_divu_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i64) {
+        tcg_gen_op3_i64(INDEX_op_remu_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_divu_i64, t0, arg1, arg2);
+        tcg_gen_mul_i64(t0, t0, arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_movi_i64(t0, 0);
+        tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_remu_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext8s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext8s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext8s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 56);
+        tcg_gen_sari_i64(ret, ret, 56);
+    }
+}
+
+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext16s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext16s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext16s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 48);
+        tcg_gen_sari_i64(ret, ret, 48);
+    }
+}
+
+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext32s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext32s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 32);
+        tcg_gen_sari_i64(ret, ret, 32);
+    }
+}
+
+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext8u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext8u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffu);
+    }
+}
+
+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext16u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext16u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffffu);
+    }
+}
+
+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext32u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffffffffu);
+    }
+}
+
+/* Note: we assume the six high bytes are set to zero */
+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_bswap16_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_bswap16_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap16_i64, ret, arg);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+
+        tcg_gen_ext8u_i64(t0, arg);
+        tcg_gen_shli_i64(t0, t0, 8);
+        tcg_gen_shri_i64(ret, arg, 8);
+        tcg_gen_or_i64(ret, ret, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+/* Note: we assume the four high bytes are set to zero */
+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_bswap32_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_bswap32_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap32_i64, ret, arg);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
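+        /* Move byte 0 up to byte 3 (masking back to 32 bits), byte 1 up
+           to byte 2, byte 2 down to byte 1 and byte 3 down to byte 0.  */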
+
+        tcg_gen_shli_i64(t0, arg, 24);
+        tcg_gen_ext32u_i64(t0, t0);
+
+        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i64(t1, t1, 8);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 8);
+        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 24);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+
+        tcg_gen_bswap32_i32(t0, TCGV_LOW(arg));
+        tcg_gen_bswap32_i32(t1, TCGV_HIGH(arg));
+        tcg_gen_mov_i32(TCGV_LOW(ret), t1);
+        tcg_gen_mov_i32(TCGV_HIGH(ret), t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    } else if (TCG_TARGET_HAS_bswap64_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+
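+        /* Assemble the result one byte at a time: bytes 0-3 shift up
+           into bytes 7-4, and bytes 4-7 shift down into bytes 3-0.  */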
+        tcg_gen_shli_i64(t0, arg, 56);
+
+        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i64(t1, t1, 40);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_andi_i64(t1, arg, 0x00ff0000);
+        tcg_gen_shli_i64(t1, t1, 24);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_andi_i64(t1, arg, 0xff000000);
+        tcg_gen_shli_i64(t1, t1, 8);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 8);
+        tcg_gen_andi_i64(t1, t1, 0xff000000);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 24);
+        tcg_gen_andi_i64(t1, t1, 0x00ff0000);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 40);
+        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 56);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_not_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_not_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+    } else if (TCG_TARGET_HAS_not_i64) {
+        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg);
+    } else {
+        tcg_gen_xori_i64(ret, arg, -1);
+    }
+}
+
+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_andc_i64) {
+        tcg_gen_op3_i64(INDEX_op_andc_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_not_i64(t0, arg2);
+        tcg_gen_and_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_eqv_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_eqv_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_eqv_i64) {
+        tcg_gen_op3_i64(INDEX_op_eqv_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_xor_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_nand_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_nand_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_nand_i64) {
+        tcg_gen_op3_i64(INDEX_op_nand_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_and_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_nor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_nor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_nor_i64) {
+        tcg_gen_op3_i64(INDEX_op_nor_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_or_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_orc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_orc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_orc_i64) {
+        tcg_gen_op3_i64(INDEX_op_orc_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_not_i64(t0, arg2);
+        tcg_gen_or_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i64) {
+        tcg_gen_op3_i64(INDEX_op_rotl_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
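+        /* ret = (arg1 << arg2) | (arg1 >> (64 - arg2)) */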
+        tcg_gen_shl_i64(t0, arg1, arg2);
+        tcg_gen_subfi_i64(t1, 64, arg2);
+        tcg_gen_shr_i64(t1, arg1, t1);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else if (TCG_TARGET_HAS_rot_i64) {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_rotl_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_shli_i64(t0, arg1, arg2);
+        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i64) {
+        tcg_gen_op3_i64(INDEX_op_rotr_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_shr_i64(t0, arg1, arg2);
+        tcg_gen_subfi_i64(t1, 64, arg2);
+        tcg_gen_shl_i64(t1, arg1, t1);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        tcg_gen_rotli_i64(ret, arg1, 64 - arg2);
+    }
+}
+
+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
+                         unsigned int ofs, unsigned int len)
+{
+    uint64_t mask;
+    TCGv_i64 t1;
+
+    tcg_debug_assert(ofs < 64);
+    tcg_debug_assert(len <= 64);
+    tcg_debug_assert(ofs + len <= 64);
+
+    if (ofs == 0 && len == 64) {
+        tcg_gen_mov_i64(ret, arg2);
+        return;
+    }
+    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(ofs, len)) {
+        tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, arg1, arg2, ofs, len);
+        return;
+    }
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        if (ofs >= 32) {
+            tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
+                                TCGV_LOW(arg2), ofs - 32, len);
+            tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
+            return;
+        }
+        if (ofs + len <= 32) {
+            tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
+                                TCGV_LOW(arg2), ofs, len);
+            tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
+            return;
+        }
+    }
+
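+    /* Generic fallback: clear the field in arg1, then OR in the
+       masked and shifted arg2.  */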
+    mask = (1ull << len) - 1;
+    t1 = tcg_temp_new_i64();
+
+    if (ofs + len < 64) {
+        tcg_gen_andi_i64(t1, arg2, mask);
+        tcg_gen_shli_i64(t1, t1, ofs);
+    } else {
+        tcg_gen_shli_i64(t1, arg2, ofs);
+    }
+    tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
+    tcg_gen_or_i64(ret, ret, t1);
+
+    tcg_temp_free_i64(t1);
+}
+
+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
+                         TCGV_LOW(c1), TCGV_HIGH(c1),
+                         TCGV_LOW(c2), TCGV_HIGH(c2), cond);
+
+        if (TCG_TARGET_HAS_movcond_i32) {
+            tcg_gen_movi_i32(t1, 0);
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
+                                TCGV_LOW(v1), TCGV_LOW(v2));
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
+                                TCGV_HIGH(v1), TCGV_HIGH(v2));
+        } else {
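+            /* Turn the 0/1 setcond result into an all-zeroes/all-ones
+               mask and select: ret = (v1 & mask) | (v2 & ~mask).  */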
+            tcg_gen_neg_i32(t0, t0);
+
+            tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
+            tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
+            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
+
+            tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
+            tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
+            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
+        }
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    } else if (TCG_TARGET_HAS_movcond_i64) {
+        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_setcond_i64(cond, t0, c1, c2);
+        tcg_gen_neg_i64(t0, t0);
+        tcg_gen_and_i64(t1, v1, t0);
+        tcg_gen_andc_i64(ret, v2, t0);
+        tcg_gen_or_i64(ret, ret, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_add2_i64) {
+        tcg_gen_op6_i64(INDEX_op_add2_i64, rl, rh, al, ah, bl, bh);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_add_i64(t0, al, bl);
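+        /* The low half carried out iff the result wrapped below al.  */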
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, al);
+        tcg_gen_add_i64(rh, ah, bh);
+        tcg_gen_add_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i64) {
+        tcg_gen_op6_i64(INDEX_op_sub2_i64, rl, rh, al, ah, bl, bh);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_sub_i64(t0, al, bl);
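+        /* The low half borrowed iff al < bl (unsigned).  */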
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, al, bl);
+        tcg_gen_sub_i64(rh, ah, bh);
+        tcg_gen_sub_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i64) {
+        tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_muluh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        gen_helper_muluh_i64(rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i64) {
+        tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_mulsh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
+    } else if (TCG_TARGET_HAS_mulu2_i64 || TCG_TARGET_HAS_muluh_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+        TCGv_i64 t3 = tcg_temp_new_i64();
+        tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
+        /* Adjust for negative inputs.  */
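+        /* The signed high part is the unsigned high part minus arg2
+           when arg1 < 0 and minus arg1 when arg2 < 0.  */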
+        tcg_gen_sari_i64(t2, arg1, 63);
+        tcg_gen_sari_i64(t3, arg2, 63);
+        tcg_gen_and_i64(t2, t2, arg2);
+        tcg_gen_and_i64(t3, t3, arg1);
+        tcg_gen_sub_i64(rh, t1, t2);
+        tcg_gen_sub_i64(rh, rh, t3);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_i64(t2);
+        tcg_temp_free_i64(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        gen_helper_mulsh_i64(rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+/* Size changing operations.  */
+
+void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned count)
+{
+    tcg_debug_assert(count < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        if (count >= 32) {
+            tcg_gen_shri_i32(ret, TCGV_HIGH(arg), count - 32);
+        } else if (count == 0) {
+            tcg_gen_mov_i32(ret, TCGV_LOW(arg));
+        } else {
+            TCGv_i64 t = tcg_temp_new_i64();
+            tcg_gen_shri_i64(t, arg, count);
+            tcg_gen_mov_i32(ret, TCGV_LOW(t));
+            tcg_temp_free_i64(t);
+        }
+    } else if (TCG_TARGET_HAS_trunc_shr_i32) {
+        tcg_gen_op3i_i32(INDEX_op_trunc_shr_i32, ret,
+                         MAKE_TCGV_I32(GET_TCGV_I64(arg)), count);
+    } else if (count == 0) {
+        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
+    } else {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t, arg, count);
+        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
+        tcg_temp_free_i64(t);
+    }
+}
+
+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), arg);
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else {
+        /* Note: we assume the target supports move between
+           32 and 64 bit registers.  */
+        tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+    }
+}
+
+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), arg);
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else {
+        /* Note: we assume the target supports move between
+           32 and 64 bit registers.  */
+        tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+    }
+}
+
+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high)
+{
+    TCGv_i64 tmp;
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(dest), low);
+        tcg_gen_mov_i32(TCGV_HIGH(dest), high);
+        return;
+    }
+
+    tmp = tcg_temp_new_i64();
+    /* These extensions are only needed for type correctness.
+       We may be able to do better given target specific information.  */
+    tcg_gen_extu_i32_i64(tmp, high);
+    tcg_gen_extu_i32_i64(dest, low);
+    /* If deposit is available, use it.  Otherwise use the extra
+       knowledge that we have of the zero-extensions above.  */
+    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(32, 32)) {
+        tcg_gen_deposit_i64(dest, dest, tmp, 32, 32);
+    } else {
+        tcg_gen_shli_i64(tmp, tmp, 32);
+        tcg_gen_or_i64(dest, dest, tmp);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(lo, TCGV_LOW(arg));
+        tcg_gen_mov_i32(hi, TCGV_HIGH(arg));
+    } else {
+        tcg_gen_trunc_shr_i64_i32(lo, arg, 0);
+        tcg_gen_trunc_shr_i64_i32(hi, arg, 32);
+    }
+}
+
+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg)
+{
+    tcg_gen_ext32u_i64(lo, arg);
+    tcg_gen_shri_i64(hi, arg, 32);
+}
+
+/* QEMU specific operations.  */
+
+void tcg_gen_goto_tb(unsigned idx)
+{
+    /* We only support two chained exits.  */
+    tcg_debug_assert(idx <= 1);
+#ifdef CONFIG_DEBUG_TCG
+    /* Verify that we haven't seen this numbered exit before.  */
+    tcg_debug_assert((tcg_ctx.goto_tb_issue_mask & (1 << idx)) == 0);
+    tcg_ctx.goto_tb_issue_mask |= 1 << idx;
+#endif
+    tcg_gen_op1i(INDEX_op_goto_tb, idx);
+}
+
+static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
+{
+    /* Trigger the asserts within as early as possible.  */
+    (void)get_alignment_bits(op);
+
+    switch (op & MO_SIZE) {
+    case MO_8:
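+        /* Byte accesses have no endianness to swap.  */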
+        op &= ~MO_BSWAP;
+        break;
+    case MO_16:
+        break;
+    case MO_32:
+        if (!is64) {
+            op &= ~MO_SIGN;
+        }
+        break;
+    case MO_64:
+        if (!is64) {
+            tcg_abort();
+        }
+        break;
+    }
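+    /* The sign flag is meaningless for stores.  */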
+    if (st) {
+        op &= ~MO_SIGN;
+    }
+    return op;
+}
+
+static void gen_ldst_i32(TCGOpcode opc, TCGv_i32 val, TCGv addr,
+                         TCGMemOp memop, TCGArg idx)
+{
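+    /* The memop and the mmu index travel as a single merged operand.  */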
+    TCGMemOpIdx oi = make_memop_idx(memop, idx);
+#if TARGET_LONG_BITS == 32
+    tcg_gen_op3i_i32(opc, val, addr, oi);
+#else
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op4i_i32(opc, val, TCGV_LOW(addr), TCGV_HIGH(addr), oi);
+    } else {
+        tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(val), GET_TCGV_I64(addr), oi);
+    }
+#endif
+}
+
+static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
+                         TCGMemOp memop, TCGArg idx)
+{
+    TCGMemOpIdx oi = make_memop_idx(memop, idx);
+#if TARGET_LONG_BITS == 32
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op4i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val), addr, oi);
+    } else {
+        tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(val), GET_TCGV_I32(addr), oi);
+    }
+#else
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op5i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val),
+                         TCGV_LOW(addr), TCGV_HIGH(addr), oi);
+    } else {
+        tcg_gen_op3i_i64(opc, val, addr, oi);
+    }
+#endif
+}
+
+void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+    gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 1);
+    gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
+        tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
+        if (memop & MO_SIGN) {
+            tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31);
+        } else {
+            tcg_gen_movi_i32(TCGV_HIGH(val), 0);
+        }
+        return;
+    }
+
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+    gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
+        tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
+        return;
+    }
+
+    memop = tcg_canonicalize_memop(memop, 1, 1);
+    gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 019dd9b..96adf9a 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -21,359 +21,309 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+
 #include "tcg.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
 
-int gen_new_label(void);
+/* Basic output routines.  Not for general consumption.  */
 
-static inline void tcg_gen_op0(TCGOpcode opc)
+void tcg_gen_op1(TCGContext *, TCGOpcode, TCGArg);
+void tcg_gen_op2(TCGContext *, TCGOpcode, TCGArg, TCGArg);
+void tcg_gen_op3(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op4(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op5(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg,
+                 TCGArg, TCGArg);
+void tcg_gen_op6(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg,
+                 TCGArg, TCGArg, TCGArg);
+
+
+static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
+    tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_I32(a1));
 }
 
-static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 arg1)
+static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
+    tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_I64(a1));
 }
 
-static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 arg1)
+static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
+    tcg_gen_op1(&tcg_ctx, opc, a1);
 }
 
-static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg arg1)
+static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = arg1;
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2));
 }
 
-static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2)
+static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2));
 }
 
-static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I32(a1), a2);
 }
 
-static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGArg arg2)
+static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I64(a1), a2);
 }
 
-static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGArg arg2)
+static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op2(&tcg_ctx, opc, a1, a2);
 }
 
-static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg arg1, TCGArg arg2)
+static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
+                                   TCGv_i32 a2, TCGv_i32 a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = arg1;
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(a1),
+                GET_TCGV_I32(a2), GET_TCGV_I32(a3));
 }
 
-static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3)
+static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
+                                   TCGv_i64 a2, TCGv_i64 a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(a1),
+                GET_TCGV_I64(a2), GET_TCGV_I64(a3));
 }
 
-static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3)
+static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
+                                    TCGv_i32 a2, TCGArg a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2), a3);
 }
 
-static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                    TCGv_i32 arg2, TCGArg arg3)
+static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
+                                    TCGv_i64 a2, TCGArg a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-}
-
-static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                    TCGv_i64 arg2, TCGArg arg3)
-{
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2), a3);
 }
 
 static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
                                        TCGv_ptr base, TCGArg offset)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_PTR(base);
-    *tcg_ctx.gen_opparam_ptr++ = offset;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(val), GET_TCGV_PTR(base), offset);
 }
 
 static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
                                        TCGv_ptr base, TCGArg offset)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_PTR(base);
-    *tcg_ctx.gen_opparam_ptr++ = offset;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(val), GET_TCGV_PTR(base), offset);
 }
 
-static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4)
+static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4));
 }
 
-static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4)
+static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4));
 }
 
-static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGArg arg4)
+static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), a4);
 }
 
-static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGArg arg4)
+static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), a4);
 }
 
-static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                     TCGArg arg3, TCGArg arg4)
+static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGArg a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2), a3, a4);
 }
 
-static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                     TCGArg arg3, TCGArg arg4)
+static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGArg a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2), a3, a4);
 }
 
-static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5)
+static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5));
 }
 
-static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5)
+static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5));
 }
 
-static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGv_i32 arg4, TCGArg arg5)
+static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), a5);
 }
 
-static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGv_i64 arg4, TCGArg arg5)
+static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5);
 }
 
-static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                     TCGv_i32 arg2, TCGv_i32 arg3,
-                                     TCGArg arg4, TCGArg arg5)
+static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), a4, a5);
 }
 
-static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                     TCGv_i64 arg2, TCGv_i64 arg3,
-                                     TCGArg arg4, TCGArg arg5)
+static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), a4, a5);
 }
 
-static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5,
-                                   TCGv_i32 arg6)
+static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4,
+                                   TCGv_i32 a5, TCGv_i32 a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg6);
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5),
+                GET_TCGV_I32(a6));
 }
 
-static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5,
-                                   TCGv_i64 arg6)
+static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4,
+                                   TCGv_i64 a5, TCGv_i64 a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg6);
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5),
+                GET_TCGV_I64(a6));
 }
 
-static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGv_i32 arg4,
-                                    TCGv_i32 arg5, TCGArg arg6)
+static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4,
+                                    TCGv_i32 a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5), a6);
 }
 
-static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGv_i64 arg4,
-                                    TCGv_i64 arg5, TCGArg arg6)
+static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4,
+                                    TCGv_i64 a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5), a6);
 }
 
-static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                     TCGv_i32 arg2, TCGv_i32 arg3,
-                                     TCGv_i32 arg4, TCGArg arg5, TCGArg arg6)
+static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGv_i32 a4,
+                                     TCGArg a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), a5, a6);
 }
 
-static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                     TCGv_i64 arg2, TCGv_i64 arg3,
-                                     TCGv_i64 arg4, TCGArg arg5, TCGArg arg6)
+static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGv_i64 a4,
+                                     TCGArg a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5, a6);
 }
 
-static inline void tcg_add_param_i32(TCGv_i32 val)
-{
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
-}
 
-static inline void tcg_add_param_i64(TCGv_i64 val)
-{
-#if TCG_TARGET_REG_BITS == 32
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_LOW(val));
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_HIGH(val));
-#else
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-#endif
-}
+/* Generic ops.  */
+
+int gen_new_label(void);
 
 static inline void gen_set_label(int n)
 {
-    tcg_gen_op1i(INDEX_op_set_label, n);
+    tcg_gen_op1(&tcg_ctx, INDEX_op_set_label, n);
 }
 
 static inline void tcg_gen_br(int label)
 {
-    tcg_gen_op1i(INDEX_op_br, label);
+    tcg_gen_op1(&tcg_ctx, INDEX_op_br, label);
+}
+
+
+/* Helper calls. */
+
+/* 32 bit ops */
+
+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, int label);
+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, int label);
+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
+                         TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
+                          TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
+
+static inline void tcg_gen_discard_i32(TCGv_i32 arg)
+{
+    tcg_gen_op1_i32(INDEX_op_discard, arg);
 }
 
 static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    if (!TCGV_EQUAL_I32(ret, arg))
+    if (!TCGV_EQUAL_I32(ret, arg)) {
         tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
+    }
 }
 
 static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
@@ -381,44 +331,50 @@
     tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg);
 }
 
-/* 32 bit ops */
-
-static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
 }
@@ -428,126 +384,24 @@
     tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_add_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
-{
-    TCGv_i32 t0 = tcg_const_i32(arg1);
-    tcg_gen_sub_i32(ret, t0, arg2);
-    tcg_temp_free_i32(t0);
-}
-
-static inline void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
-{
-    TCGv_i32 t0;
-    /* Some cases can be optimized here.  */
-    switch (arg2) {
-    case 0:
-        tcg_gen_movi_i32(ret, 0);
-        return;
-    case 0xffffffffu:
-        tcg_gen_mov_i32(ret, arg1);
-        return;
-    case 0xffu:
-        /* Don't recurse with tcg_gen_ext8u_i32.  */
-        if (TCG_TARGET_HAS_ext8u_i32) {
-            tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffu:
-        if (TCG_TARGET_HAS_ext16u_i32) {
-            tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1);
-            return;
-        }
-        break;
-    }
-    t0 = tcg_const_i32(arg2);
-    tcg_gen_and_i32(ret, arg1, t0);
-    tcg_temp_free_i32(t0);
+    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == -1) {
-        tcg_gen_movi_i32(ret, -1);
-    } else if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_or_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
+    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i32) {
-        /* Don't recurse with tcg_gen_not_i32.  */
-        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_xor_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
+    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
@@ -555,449 +409,102 @@
     tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shr_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sar_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1,
-                                      TCGv_i32 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op4ii_i32(INDEX_op_brcond_i32, arg1, arg2, cond, label_index);
-    }
-}
-
-static inline void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1,
-                                       int32_t arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_brcond_i32(cond, arg1, t0, label_index);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
-                                       TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        tcg_gen_op4i_i32(INDEX_op_setcond_i32, ret, arg1, arg2, cond);
-    }
-}
-
-static inline void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
-                                        TCGv_i32 arg1, int32_t arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_setcond_i32(cond, ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    TCGv_i32 t0 = tcg_const_i32(arg2);
-    tcg_gen_mul_i32(ret, arg1, t0);
-    tcg_temp_free_i32(t0);
-}
-
-static inline void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_div_i32) {
-        tcg_gen_op3_i32(INDEX_op_div_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_sari_i32(t0, arg1, 31);
-        tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
+    if (TCG_TARGET_HAS_neg_i32) {
+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
     } else {
-        gen_helper_div_i32(ret, arg1, arg2);
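+        /* No neg op on this backend: compute 0 - arg instead. */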
+        tcg_gen_subfi_i32(ret, 0, arg);
     }
 }
 
-static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    if (TCG_TARGET_HAS_rem_i32) {
-        tcg_gen_op3_i32(INDEX_op_rem_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_div_i32, t0, arg1, arg2);
-        tcg_gen_mul_i32(t0, t0, arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_sari_i32(t0, arg1, 31);
-        tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
+    if (TCG_TARGET_HAS_not_i32) {
+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
     } else {
-        gen_helper_rem_i32(ret, arg1, arg2);
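+        /* No not op on this backend: XOR with all-ones flips every bit. */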
+        tcg_gen_xori_i32(ret, arg, -1);
     }
 }
 
-static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_div_i32) {
-        tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_movi_i32(t0, 0);
-        tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
-    } else {
-        gen_helper_divu_i32(ret, arg1, arg2);
-    }
-}
+/* 64 bit ops */
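+/* Most 64-bit generators below are now emitted out of line in tcg/tcg-op.c. */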
 
-static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rem_i32) {
-        tcg_gen_op3_i32(INDEX_op_remu_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_divu_i32, t0, arg1, arg2);
-        tcg_gen_mul_i32(t0, t0, arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_movi_i32(t0, 0);
-        tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
-    } else {
-        gen_helper_remu_i32(ret, arg1, arg2);
-    }
-}
+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, int label);
+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, int label);
+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
+                         TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
+                          TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
 
-#if TCG_TARGET_REG_BITS == 32
+#if TCG_TARGET_REG_BITS == 64
+static inline void tcg_gen_discard_i64(TCGv_i64 arg)
+{
+    tcg_gen_op1_i64(INDEX_op_discard, arg);
+}
 
 static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
     if (!TCGV_EQUAL_I64(ret, arg)) {
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    }
-}
-
-static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
-{
-    tcg_gen_movi_i32(TCGV_LOW(ret), arg);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), arg >> 32);
-}
-
-static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ld8u_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), 31);
-}
-
-static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld16u_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld16s_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    /* since arg2 and ret have different types, they cannot be the
-       same temporary */
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset);
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset + 4);
-#else
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset + 4);
-#endif
-}
-
-static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                   tcg_target_long offset)
-{
-    tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset);
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset + 4);
-#else
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
-    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset + 4);
-#endif
-}
-
-static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op6_i32(INDEX_op_add2_i32, TCGV_LOW(ret), TCGV_HIGH(ret),
-                    TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                    TCGV_HIGH(arg2));
-    /* Allow the optimizer room to replace add2 with two moves.  */
-    tcg_gen_op0(INDEX_op_nop);
-}
-
-static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op6_i32(INDEX_op_sub2_i32, TCGV_LOW(ret), TCGV_HIGH(ret),
-                    TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                    TCGV_HIGH(arg2));
-    /* Allow the optimizer room to replace sub2 with two moves.  */
-    tcg_gen_op0(INDEX_op_nop);
-}
-
-static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_and_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_ori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_ori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_xor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_xor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_xori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_xori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-/* XXX: use generic code when basic block handling is OK or CPU
-   specific code (x86) */
-static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_shl_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0);
-}
-
-static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_shr_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0);
-}
-
-static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_sar_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1);
-}
-
-static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1,
-                                      TCGv_i64 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op6ii_i32(INDEX_op_brcond2_i32,
-                          TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                          TCGV_HIGH(arg2), cond, label_index);
-    }
-}
-
-static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(TCGV_LOW(ret), 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(TCGV_LOW(ret), 0);
-    } else {
-        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
-                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
-                         TCGV_LOW(arg2), TCGV_HIGH(arg2), cond);
-    }
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    TCGv_i64 t0;
-    TCGv_i32 t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i32();
-
-    if (TCG_TARGET_HAS_mulu2_i32) {
-        tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        tcg_debug_assert(TCG_TARGET_HAS_muluh_i32);
-        tcg_gen_op3_i32(INDEX_op_mul_i32, TCGV_LOW(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-        tcg_gen_op3_i32(INDEX_op_muluh_i32, TCGV_HIGH(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-    }
-
-    tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
-    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
-    tcg_gen_mul_i32(t1, TCGV_HIGH(arg1), TCGV_LOW(arg2));
-    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
-
-    tcg_gen_mov_i64(ret, t0);
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i32(t1);
-}
-
-static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_div_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_rem_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_divu_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_remu_i64(ret, arg1, arg2);
-}
-
-#else
-
-static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (!TCGV_EQUAL_I64(ret, arg))
         tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
+    }
 }
 
 static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
@@ -1041,7 +548,8 @@
     tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
 }
@@ -1064,7 +572,8 @@
     tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
 }
@@ -1081,94 +590,17 @@
 
 static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
-{
-    TCGv_i64 t0;
-    /* Some cases can be optimized here.  */
-    switch (arg2) {
-    case 0:
-        tcg_gen_movi_i64(ret, 0);
-        return;
-    case 0xffffffffffffffffull:
-        tcg_gen_mov_i64(ret, arg1);
-        return;
-    case 0xffull:
-        /* Don't recurse with tcg_gen_ext8u_i32.  */
-        if (TCG_TARGET_HAS_ext8u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffu:
-        if (TCG_TARGET_HAS_ext16u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffffffull:
-        if (TCG_TARGET_HAS_ext32u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1);
-            return;
-        }
-        break;
-    }
-    t0 = tcg_const_i64(arg2);
-    tcg_gen_and_i64(ret, arg1, t0);
-    tcg_temp_free_i64(t0);
+    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == -1) {
-        tcg_gen_movi_i64(ret, -1);
-    } else if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_or_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
+    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_movi_i64(ret, 0);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i64) {
-        /* Don't recurse with tcg_gen_not_i64.  */
-        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_xor_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
+    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
@@ -1176,993 +608,92 @@
     tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
 static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shr_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
 static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sar_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1,
-                                      TCGv_i64 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op4ii_i64(INDEX_op_brcond_i64, arg1, arg2, cond, label_index);
-    }
-}
-
-static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i64(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i64(ret, 0);
-    } else {
-        tcg_gen_op4i_i64(INDEX_op_setcond_i64, ret, arg1, arg2, cond);
-    }
-}
-
 static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
 }
-
-static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+#else /* TCG_TARGET_REG_BITS == 32 */
+static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_div_i64) {
-        tcg_gen_op3_i64(INDEX_op_div_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_sari_i64(t0, arg1, 63);
-        tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_div_i64(ret, arg1, arg2);
-    }
+    tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_rem_i64) {
-        tcg_gen_op3_i64(INDEX_op_rem_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_div_i64, t0, arg1, arg2);
-        tcg_gen_mul_i64(t0, t0, arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_sari_i64(t0, arg1, 63);
-        tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_rem_i64(ret, arg1, arg2);
-    }
+    tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_div_i64) {
-        tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_movi_i64(t0, 0);
-        tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_divu_i64(ret, arg1, arg2);
-    }
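+    /* A 32-bit store of an i64 writes only the low half. */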
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCG_TARGET_HAS_rem_i64) {
-        tcg_gen_op3_i64(INDEX_op_remu_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_divu_i64, t0, arg1, arg2);
-        tcg_gen_mul_i64(t0, t0, arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_movi_i64(t0, 0);
-        tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_remu_i64(ret, arg1, arg2);
-    }
-}
-#endif /* TCG_TARGET_REG_BITS == 32 */
-
-static inline void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_add_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
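+    /* Build the 64-bit add from a 32-bit add pair with carry into the high half. */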
+    tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
+                     TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
 }
 
-static inline void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
+static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    TCGv_i64 t0 = tcg_const_i64(arg1);
-    tcg_gen_sub_i64(ret, t0, arg2);
-    tcg_temp_free_i64(t0);
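+    /* Likewise the 64-bit subtract, with borrow between the halves. */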
+    tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
+                     TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
 }
 
-static inline void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-static inline void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1,
-                                       int64_t arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_brcond_i64(cond, arg1, t0, label_index);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
-                                        TCGv_i64 arg1, int64_t arg2)
-{
-    TCGv_i64 t0 = tcg_const_i64(arg2);
-    tcg_gen_setcond_i64(cond, ret, arg1, t0);
-    tcg_temp_free_i64(t0);
-}
-
-static inline void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    TCGv_i64 t0 = tcg_const_i64(arg2);
-    tcg_gen_mul_i64(ret, arg1, t0);
-    tcg_temp_free_i64(t0);
-}
-
-
-/***************************************/
-/* optional operations */
-
-static inline void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext8s_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext8s_i32, ret, arg);
-    } else {
-        tcg_gen_shli_i32(ret, arg, 24);
-        tcg_gen_sari_i32(ret, ret, 24);
-    }
-}
-
-static inline void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext16s_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext16s_i32, ret, arg);
-    } else {
-        tcg_gen_shli_i32(ret, arg, 16);
-        tcg_gen_sari_i32(ret, ret, 16);
-    }
-}
-
-static inline void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext8u_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg);
-    } else {
-        tcg_gen_andi_i32(ret, arg, 0xffu);
-    }
-}
-
-static inline void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext16u_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg);
-    } else {
-        tcg_gen_andi_i32(ret, arg, 0xffffu);
-    }
-}
-
-/* Note: we assume the two high bytes are set to zero */
-static inline void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_bswap16_i32) {
-        tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-    
-        tcg_gen_ext8u_i32(t0, arg);
-        tcg_gen_shli_i32(t0, t0, 8);
-        tcg_gen_shri_i32(ret, arg, 8);
-        tcg_gen_or_i32(ret, ret, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_bswap32_i32) {
-        tcg_gen_op2_i32(INDEX_op_bswap32_i32, ret, arg);
-    } else {
-        TCGv_i32 t0, t1;
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-    
-        tcg_gen_shli_i32(t0, arg, 24);
-    
-        tcg_gen_andi_i32(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i32(t1, t1, 8);
-        tcg_gen_or_i32(t0, t0, t1);
-    
-        tcg_gen_shri_i32(t1, arg, 8);
-        tcg_gen_andi_i32(t1, t1, 0x0000ff00);
-        tcg_gen_or_i32(t0, t0, t1);
-    
-        tcg_gen_shri_i32(t1, arg, 24);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-#if TCG_TARGET_REG_BITS == 32
-static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext8s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext16s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext8u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext16u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg,
-                                             unsigned int count)
-{
-    tcg_debug_assert(count < 64);
-    if (count >= 32) {
-        tcg_gen_shri_i32(ret, TCGV_HIGH(arg), count - 32);
-    } else if (count == 0) {
-        tcg_gen_mov_i32(ret, TCGV_LOW(arg));
-    } else {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_shri_i64(t, arg, count);
-        tcg_gen_mov_i32(ret, TCGV_LOW(t));
-        tcg_temp_free_i64(t);
-    }
-}
-
-static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), arg);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), arg);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-/* Note: we assume the six high bytes are set to zero */
-static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    tcg_gen_bswap16_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-}
-
-/* Note: we assume the four high bytes are set to zero */
-static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    tcg_gen_bswap32_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-}
-
-static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    TCGv_i32 t0, t1;
-    t0 = tcg_temp_new_i32();
-    t1 = tcg_temp_new_i32();
-
-    tcg_gen_bswap32_i32(t0, TCGV_LOW(arg));
-    tcg_gen_bswap32_i32(t1, TCGV_HIGH(arg));
-    tcg_gen_mov_i32(TCGV_LOW(ret), t1);
-    tcg_gen_mov_i32(TCGV_HIGH(ret), t0);
-    tcg_temp_free_i32(t0);
-    tcg_temp_free_i32(t1);
-}
-#else
-
-static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext8s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext8s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 56);
-        tcg_gen_sari_i64(ret, ret, 56);
-    }
-}
-
-static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext16s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext16s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 48);
-        tcg_gen_sari_i64(ret, ret, 48);
-    }
-}
-
-static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext32s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext32s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 32);
-        tcg_gen_sari_i64(ret, ret, 32);
-    }
-}
-
-static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext8u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffu);
-    }
-}
-
-static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext16u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffffu);
-    }
-}
-
-static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext32u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffffffffu);
-    }
-}
-
-static inline void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg,
-                                             unsigned int count)
-{
-    tcg_debug_assert(count < 64);
-    if (TCG_TARGET_HAS_trunc_shr_i32) {
-        tcg_gen_op3i_i32(INDEX_op_trunc_shr_i32, ret,
-                         MAKE_TCGV_I32(GET_TCGV_I64(arg)), count);
-    } else if (count == 0) {
-        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
-    } else {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_shri_i64(t, arg, count);
-        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
-        tcg_temp_free_i64(t);
-    }
-}
-
-/* Note: we assume the target supports move between 32 and 64 bit
-   registers */
-static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
-}
-
-/* Note: we assume the target supports move between 32 and 64 bit
-   registers */
-static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
-}
-
-/* Note: we assume the six high bytes are set to zero */
-static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap16_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap16_i64, ret, arg);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-
-        tcg_gen_ext8u_i64(t0, arg);
-        tcg_gen_shli_i64(t0, t0, 8);
-        tcg_gen_shri_i64(ret, arg, 8);
-        tcg_gen_or_i64(ret, ret, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-/* Note: we assume the four high bytes are set to zero */
-static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap32_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap32_i64, ret, arg);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-
-        tcg_gen_shli_i64(t0, arg, 24);
-        tcg_gen_ext32u_i64(t0, t0);
-
-        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i64(t1, t1, 8);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 8);
-        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 24);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap64_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-    
-        tcg_gen_shli_i64(t0, arg, 56);
-    
-        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i64(t1, t1, 40);
-        tcg_gen_or_i64(t0, t0, t1);
-    
-        tcg_gen_andi_i64(t1, arg, 0x00ff0000);
-        tcg_gen_shli_i64(t1, t1, 24);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_andi_i64(t1, arg, 0xff000000);
-        tcg_gen_shli_i64(t1, t1, 8);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 8);
-        tcg_gen_andi_i64(t1, t1, 0xff000000);
-        tcg_gen_or_i64(t0, t0, t1);
-    
-        tcg_gen_shri_i64(t1, arg, 24);
-        tcg_gen_andi_i64(t1, t1, 0x00ff0000);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 40);
-        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 56);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-#endif
-
-static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_neg_i32) {
-        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(0);
-        tcg_gen_sub_i32(ret, t0, arg);
-        tcg_temp_free_i32(t0);
-    }
-}
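+/* 32-bit host: implemented out of line on the low/high halves. */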
+void tcg_gen_discard_i64(TCGv_i64 arg);
+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+#endif /* TCG_TARGET_REG_BITS */
 
 static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
     if (TCG_TARGET_HAS_neg_i64) {
         tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(0);
-        tcg_gen_sub_i64(ret, t0, arg);
-        tcg_temp_free_i64(t0);
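+        /* Fall back to 0 - arg when the backend lacks a 64-bit neg. */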
+        tcg_gen_subfi_i64(ret, 0, arg);
     }
 }
 
-static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
+/* Size changing operations.  */
+
+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
+void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned int c);
+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
+
+static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
 {
-    if (TCG_TARGET_HAS_not_i32) {
-        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
-    } else {
-        tcg_gen_xori_i32(ret, arg, -1);
-    }
-}
-
-static inline void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_not_i64) {
-        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg);
-    } else {
-        tcg_gen_xori_i64(ret, arg, -1);
-    }
-#else
-    tcg_gen_not_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_not_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-#endif
-}
-
-static inline void tcg_gen_discard_i32(TCGv_i32 arg)
-{
-    tcg_gen_op1_i32(INDEX_op_discard, arg);
-}
-
-static inline void tcg_gen_discard_i64(TCGv_i64 arg)
-{
-#if TCG_TARGET_REG_BITS == 32
-    tcg_gen_discard_i32(TCGV_LOW(arg));
-    tcg_gen_discard_i32(TCGV_HIGH(arg));
-#else
-    tcg_gen_op1_i64(INDEX_op_discard, arg);
-#endif
-}
-
-static inline void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_andc_i32) {
-        tcg_gen_op3_i32(INDEX_op_andc_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_not_i32(t0, arg2);
-        tcg_gen_and_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_andc_i64) {
-        tcg_gen_op3_i64(INDEX_op_andc_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_not_i64(t0, arg2);
-        tcg_gen_and_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-#else
-    tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_eqv_i32) {
-        tcg_gen_op3_i32(INDEX_op_eqv_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_xor_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_eqv_i64) {
-        tcg_gen_op3_i64(INDEX_op_eqv_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_xor_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_eqv_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_eqv_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_nand_i32) {
-        tcg_gen_op3_i32(INDEX_op_nand_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_and_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_nand_i64) {
-        tcg_gen_op3_i64(INDEX_op_nand_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_and_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_nand_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_nand_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_nor_i32) {
-        tcg_gen_op3_i32(INDEX_op_nor_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_or_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_nor_i64) {
-        tcg_gen_op3_i64(INDEX_op_nor_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_or_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_nor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_nor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_orc_i32) {
-        tcg_gen_op3_i32(INDEX_op_orc_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_not_i32(t0, arg2);
-        tcg_gen_or_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_orc_i64) {
-        tcg_gen_op3_i64(INDEX_op_orc_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_not_i64(t0, arg2);
-        tcg_gen_or_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-#else
-    tcg_gen_orc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_orc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i32) {
-        tcg_gen_op3_i32(INDEX_op_rotl_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shl_i32(t0, arg1, arg2);
-        tcg_gen_subfi_i32(t1, 32, arg2);
-        tcg_gen_shr_i32(t1, arg1, t1);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i64) {
-        tcg_gen_op3_i64(INDEX_op_rotl_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shl_i64(t0, arg1, arg2);
-        tcg_gen_subfi_i64(t1, 64, arg2);
-        tcg_gen_shr_i64(t1, arg1, t1);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else if (TCG_TARGET_HAS_rot_i32) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_rotl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else {
-        TCGv_i32 t0, t1;
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shli_i32(t0, arg1, arg2);
-        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else if (TCG_TARGET_HAS_rot_i64) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_rotl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shli_i64(t0, arg1, arg2);
-        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i32) {
-        tcg_gen_op3_i32(INDEX_op_rotr_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shr_i32(t0, arg1, arg2);
-        tcg_gen_subfi_i32(t1, 32, arg2);
-        tcg_gen_shl_i32(t1, arg1, t1);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i64) {
-        tcg_gen_op3_i64(INDEX_op_rotr_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shr_i64(t0, arg1, arg2);
-        tcg_gen_subfi_i64(t1, 64, arg2);
-        tcg_gen_shl_i64(t1, arg1, t1);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_rotli_i32(ret, arg1, 32 - arg2);
-    }
-}
-
-static inline void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_rotli_i64(ret, arg1, 64 - arg2);
-    }
-}
-
-static inline void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1,
-                                       TCGv_i32 arg2, unsigned int ofs,
-                                       unsigned int len)
-{
-    uint32_t mask;
-    TCGv_i32 t1;
-
-    tcg_debug_assert(ofs < 32);
-    tcg_debug_assert(len <= 32);
-    tcg_debug_assert(ofs + len <= 32);
-
-    if (ofs == 0 && len == 32) {
-        tcg_gen_mov_i32(ret, arg2);
-        return;
-    }
-    if (TCG_TARGET_HAS_deposit_i32 && TCG_TARGET_deposit_i32_valid(ofs, len)) {
-        tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-    mask = (1u << len) - 1;
-    t1 = tcg_temp_new_i32();
-
-    if (ofs + len < 32) {
-        tcg_gen_andi_i32(t1, arg2, mask);
-        tcg_gen_shli_i32(t1, t1, ofs);
-    } else {
-        tcg_gen_shli_i32(t1, arg2, ofs);
-    }
-    tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i32(ret, ret, t1);
-
-    tcg_temp_free_i32(t1);
-}
-
-static inline void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                                       TCGv_i64 arg2, unsigned int ofs,
-                                       unsigned int len)
-{
-    uint64_t mask;
-    TCGv_i64 t1;
-
-    tcg_debug_assert(ofs < 64);
-    tcg_debug_assert(len <= 64);
-    tcg_debug_assert(ofs + len <= 64);
-
-    if (ofs == 0 && len == 64) {
-        tcg_gen_mov_i64(ret, arg2);
-        return;
-    }
-    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(ofs, len)) {
-        tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-#if TCG_TARGET_REG_BITS == 32
-    if (ofs >= 32) {
-        tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
-                            TCGV_LOW(arg2), ofs - 32, len);
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
-        return;
-    }
-    if (ofs + len <= 32) {
-        tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
-                            TCGV_LOW(arg2), ofs, len);
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
-        return;
-    }
-#endif
-
-    mask = (1ull << len) - 1;
-    t1 = tcg_temp_new_i64();
-
-    if (ofs + len < 64) {
-        tcg_gen_andi_i64(t1, arg2, mask);
-        tcg_gen_shli_i64(t1, t1, ofs);
-    } else {
-        tcg_gen_shli_i64(t1, arg2, ofs);
-    }
-    tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i64(ret, ret, t1);
-
-    tcg_temp_free_i64(t1);
-}
-
-static inline void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low,
-                                          TCGv_i32 high)
-{
-#if TCG_TARGET_REG_BITS == 32
-    tcg_gen_mov_i32(TCGV_LOW(dest), low);
-    tcg_gen_mov_i32(TCGV_HIGH(dest), high);
-#else
-    TCGv_i64 tmp = tcg_temp_new_i64();
-    /* These extensions are only needed for type correctness.
-       We may be able to do better given target specific information.  */
-    tcg_gen_extu_i32_i64(tmp, high);
-    tcg_gen_extu_i32_i64(dest, low);
-    /* If deposit is available, use it.  Otherwise use the extra
-       knowledge that we have of the zero-extensions above.  */
-    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(32, 32)) {
-        tcg_gen_deposit_i64(dest, dest, tmp, 32, 32);
-    } else {
-        tcg_gen_shli_i64(tmp, tmp, 32);
-        tcg_gen_or_i64(dest, dest, tmp);
-    }
-    tcg_temp_free_i64(tmp);
-#endif
-}
-
-static inline void tcg_gen_concat32_i64(TCGv_i64 dest, TCGv_i64 low,
-                                        TCGv_i64 high)
-{
-    tcg_gen_deposit_i64(dest, low, high, 32, 32);
+    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
 }
 
 static inline void tcg_gen_trunc_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
@@ -2170,299 +701,31 @@
     tcg_gen_trunc_shr_i64_i32(ret, arg, 0);
 }
 
-static inline void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
-{
-    tcg_gen_trunc_shr_i64_i32(lo, arg, 0);
-    tcg_gen_trunc_shr_i64_i32(hi, arg, 32);
-}
+/* QEMU specific operations.  */
 
-static inline void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg)
-{
-    tcg_gen_ext32u_i64(lo, arg);
-    tcg_gen_shri_i64(hi, arg, 32);
-}
-
-static inline void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret,
-                                       TCGv_i32 c1, TCGv_i32 c2,
-                                       TCGv_i32 v1, TCGv_i32 v2)
-{
-    if (TCG_TARGET_HAS_movcond_i32) {
-        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        tcg_gen_setcond_i32(cond, t0, c1, c2);
-        tcg_gen_neg_i32(t0, t0);
-        tcg_gen_and_i32(t1, v1, t0);
-        tcg_gen_andc_i32(ret, v2, t0);
-        tcg_gen_or_i32(ret, ret, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 c1, TCGv_i64 c2,
-                                       TCGv_i64 v1, TCGv_i64 v2)
-{
-#if TCG_TARGET_REG_BITS == 32
-    TCGv_i32 t0 = tcg_temp_new_i32();
-    TCGv_i32 t1 = tcg_temp_new_i32();
-    tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
-                     TCGV_LOW(c1), TCGV_HIGH(c1),
-                     TCGV_LOW(c2), TCGV_HIGH(c2), cond);
-
-    if (TCG_TARGET_HAS_movcond_i32) {
-        tcg_gen_movi_i32(t1, 0);
-        tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
-                            TCGV_LOW(v1), TCGV_LOW(v2));
-        tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
-                            TCGV_HIGH(v1), TCGV_HIGH(v2));
-    } else {
-        tcg_gen_neg_i32(t0, t0);
-
-        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
-        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
-        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
-
-        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
-        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
-        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
-    }
-    tcg_temp_free_i32(t0);
-    tcg_temp_free_i32(t1);
-#else
-    if (TCG_TARGET_HAS_movcond_i64) {
-        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_setcond_i64(cond, t0, c1, c2);
-        tcg_gen_neg_i64(t0, t0);
-        tcg_gen_and_i64(t1, v1, t0);
-        tcg_gen_andc_i64(ret, v2, t0);
-        tcg_gen_or_i64(ret, ret, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-#endif
-}
-
-static inline void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
-{
-    if (TCG_TARGET_HAS_add2_i32) {
-        tcg_gen_op6_i32(INDEX_op_add2_i32, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace add2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_concat_i32_i64(t0, al, ah);
-        tcg_gen_concat_i32_i64(t1, bl, bh);
-        tcg_gen_add_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
-{
-    if (TCG_TARGET_HAS_sub2_i32) {
-        tcg_gen_op6_i32(INDEX_op_sub2_i32, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace sub2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_concat_i32_i64(t0, al, ah);
-        tcg_gen_concat_i32_i64(t1, bl, bh);
-        tcg_gen_sub_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh,
-                                     TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_mulu2_i32) {
-        tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_muluh_i32) {
-        TCGv_i32 t = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
-        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
-        tcg_gen_mov_i32(rl, t);
-        tcg_temp_free_i32(t);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_extu_i32_i64(t0, arg1);
-        tcg_gen_extu_i32_i64(t1, arg2);
-        tcg_gen_mul_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh,
-                                     TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_muls2_i32) {
-        tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace muls2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_mulsh_i32) {
-        TCGv_i32 t = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
-        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
-        tcg_gen_mov_i32(rl, t);
-        tcg_temp_free_i32(t);
-    } else if (TCG_TARGET_REG_BITS == 32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        TCGv_i32 t2 = tcg_temp_new_i32();
-        TCGv_i32 t3 = tcg_temp_new_i32();
-        tcg_gen_mulu2_i32(t0, t1, arg1, arg2);
-        /* Adjust for negative inputs.  */
-        tcg_gen_sari_i32(t2, arg1, 31);
-        tcg_gen_sari_i32(t3, arg2, 31);
-        tcg_gen_and_i32(t2, t2, arg2);
-        tcg_gen_and_i32(t3, t3, arg1);
-        tcg_gen_sub_i32(rh, t1, t2);
-        tcg_gen_sub_i32(rh, rh, t3);
-        tcg_gen_mov_i32(rl, t0);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-        tcg_temp_free_i32(t2);
-        tcg_temp_free_i32(t3);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_ext_i32_i64(t0, arg1);
-        tcg_gen_ext_i32_i64(t1, arg2);
-        tcg_gen_mul_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
-{
-    if (TCG_TARGET_HAS_add2_i64) {
-        tcg_gen_op6_i64(INDEX_op_add2_i64, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace add2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_add_i64(t0, al, bl);
-        tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, al);
-        tcg_gen_add_i64(rh, ah, bh);
-        tcg_gen_add_i64(rh, rh, t1);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
-{
-    if (TCG_TARGET_HAS_sub2_i64) {
-        tcg_gen_op6_i64(INDEX_op_sub2_i64, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace sub2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_sub_i64(t0, al, bl);
-        tcg_gen_setcond_i64(TCG_COND_LTU, t1, al, bl);
-        tcg_gen_sub_i64(rh, ah, bh);
-        tcg_gen_sub_i64(rh, rh, t1);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
-                                     TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_mulu2_i64) {
-        tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_muluh_i64) {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
-        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t);
-        tcg_temp_free_i64(t);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_mul_i64(t0, arg1, arg2);
-        gen_helper_muluh_i64(rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
-                                     TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_muls2_i64) {
-        tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace muls2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_mulsh_i64) {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
-        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t);
-        tcg_temp_free_i64(t);
-    } else if (TCG_TARGET_HAS_mulu2_i64 || TCG_TARGET_HAS_muluh_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        TCGv_i64 t2 = tcg_temp_new_i64();
-        TCGv_i64 t3 = tcg_temp_new_i64();
-        tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
-        /* Adjust for negative inputs.  */
-        tcg_gen_sari_i64(t2, arg1, 63);
-        tcg_gen_sari_i64(t3, arg2, 63);
-        tcg_gen_and_i64(t2, t2, arg2);
-        tcg_gen_and_i64(t3, t3, arg1);
-        tcg_gen_sub_i64(rh, t1, t2);
-        tcg_gen_sub_i64(rh, rh, t3);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-        tcg_temp_free_i64(t2);
-        tcg_temp_free_i64(t3);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_mul_i64(t0, arg1, arg2);
-        gen_helper_mulsh_i64(rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-/***************************************/
-/* QEMU specific operations. Their type depend on the QEMU CPU
-   type. */
 #ifndef TARGET_LONG_BITS
 #error must include QEMU headers
 #endif
 
+/* debug info: write the PC of the corresponding QEMU CPU instruction */
+static inline void tcg_gen_debug_insn_start(uint64_t pc)
+{
+    /* XXX: must really use a 32 bit size for TCGArg in all cases */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+    tcg_gen_op2ii(INDEX_op_debug_insn_start,
+                  (uint32_t)(pc), (uint32_t)(pc >> 32));
+#else
+    tcg_gen_op1i(INDEX_op_debug_insn_start, pc);
+#endif
+}
+
+static inline void tcg_gen_exit_tb(uintptr_t val)
+{
+    tcg_gen_op1i(INDEX_op_exit_tb, val);
+}
+
+void tcg_gen_goto_tb(unsigned idx);
+
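
tcg_gen_goto_tb() is now out of line (its body moves to tcg/tcg-op.c); callers keep using the usual TB-chaining idiom. A minimal sketch of that idiom, where gen_set_pc_im() and the DisasContext layout are illustrative, not part of this patch:

    static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
    {
        tcg_gen_goto_tb(n);                      /* emit patchable jump, slot n */
        gen_set_pc_im(dc, dest);                 /* illustrative: load dest PC */
        tcg_gen_exit_tb((uintptr_t)dc->tb + n);  /* return (tb | n) to the exec loop */
    }
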
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
 #define tcg_temp_new() tcg_temp_new_i32()
@@ -2473,7 +736,6 @@
 #define TCGV_UNUSED(x) TCGV_UNUSED_I32(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I32(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I32(a, b)
-#define tcg_add_param_tl tcg_add_param_i32
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
 #else
@@ -2486,41 +748,10 @@
 #define TCGV_UNUSED(x) TCGV_UNUSED_I64(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I64(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I64(a, b)
-#define tcg_add_param_tl tcg_add_param_i64
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
 #endif
 
-/* debug info: write the PC of the corresponding QEMU CPU instruction */
-static inline void tcg_gen_debug_insn_start(uint64_t pc)
-{
-    /* XXX: must really use a 32 bit size for TCGArg in all cases */
-#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
-    tcg_gen_op2ii(INDEX_op_debug_insn_start, 
-                  (uint32_t)(pc), (uint32_t)(pc >> 32));
-#else
-    tcg_gen_op1i(INDEX_op_debug_insn_start, pc);
-#endif
-}
-
-static inline void tcg_gen_exit_tb(uintptr_t val)
-{
-    tcg_gen_op1i(INDEX_op_exit_tb, val);
-}
-
-static inline void tcg_gen_goto_tb(unsigned idx)
-{
-    /* We only support two chained exits.  */
-    tcg_debug_assert(idx <= 1);
-#ifdef CONFIG_DEBUG_TCG
-    /* Verify that we havn't seen this numbered exit before.  */
-    tcg_debug_assert((tcg_ctx.goto_tb_issue_mask & (1 << idx)) == 0);
-    tcg_ctx.goto_tb_issue_mask |= 1 << idx;
-#endif
-    tcg_gen_op1i(INDEX_op_goto_tb, idx);
-}
-
-
 void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 042d442..13ccb60 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -27,15 +27,6 @@
  */
 
 /* predefined ops */
-DEF(end, 0, 0, 0, TCG_OPF_NOT_PRESENT) /* must be kept first */
-DEF(nop, 0, 0, 0, TCG_OPF_NOT_PRESENT)
-DEF(nop1, 0, 0, 1, TCG_OPF_NOT_PRESENT)
-DEF(nop2, 0, 0, 2, TCG_OPF_NOT_PRESENT)
-DEF(nop3, 0, 0, 3, TCG_OPF_NOT_PRESENT)
-
-/* variable number of parameters */
-DEF(nopn, 0, 0, 1, TCG_OPF_NOT_PRESENT)
-
 DEF(discard, 1, 0, 0, TCG_OPF_NOT_PRESENT)
 DEF(set_label, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_NOT_PRESENT)
 
@@ -188,13 +179,13 @@
 #define TLADDR_ARGS    (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2)
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
-DEF(qemu_ld_i32, 1, TLADDR_ARGS, 2,
+DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_st_i32, 0, TLADDR_ARGS + 1, 2,
+DEF(qemu_st_i32, 0, TLADDR_ARGS + 1, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 2,
+DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
-DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 2,
+DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
 #undef TLADDR_ARGS
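
The qemu_ld/st opcodes now carry a single constant argument: a TCGMemOpIdx packing the memop together with the mmu index (see make_memop_idx() in tcg/tcg.h below). A sketch of how the out-of-line emitters in tcg/tcg-op.c can build it, assuming TARGET_LONG_BITS == 32 and an illustrative helper name:

    static void gen_ldst_i32(TCGOpcode opc, TCGv_i32 val, TCGv addr,
                             TCGMemOp memop, TCGArg idx)
    {
        TCGMemOpIdx oi = make_memop_idx(memop, idx);  /* (memop << 4) | idx */
        tcg_gen_op3i_i32(opc, val, addr, oi);
    }
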
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 7a84b87..585ca6c 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -407,7 +407,6 @@
     /* No temps have been previously allocated for size or locality.  */
     memset(s->free_temps, 0, sizeof(s->free_temps));
 
-    s->labels = tcg_malloc(sizeof(TCGLabel) * TCG_MAX_LABELS);
     s->nb_labels = 0;
     s->current_frame_offset = s->frame_start;
 
@@ -415,8 +414,10 @@
     s->goto_tb_issue_mask = 0;
 #endif
 
-    s->gen_opc_ptr = s->gen_opc_buf;
-    s->gen_opparam_ptr = s->gen_opparam_buf;
+    s->gen_first_op_idx = 0;
+    s->gen_last_op_idx = -1;
+    s->gen_next_op_idx = 0;
+    s->gen_next_parm_idx = 0;
 
     s->be = tcg_malloc(sizeof(TCGBackendData));
 }
@@ -703,9 +704,8 @@
 void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
                    int nargs, TCGArg *args)
 {
-    int i, real_args, nb_rets;
+    int i, real_args, nb_rets, pi, pi_first;
     unsigned sizemask, flags;
-    TCGArg *nparam;
     TCGHelperInfo *info;
 
     info = g_hash_table_lookup(s->helpers, (gpointer)func);
@@ -758,8 +758,7 @@
     }
 #endif /* TCG_TARGET_EXTEND_ARGS */
 
-    *s->gen_opc_ptr++ = INDEX_op_call;
-    nparam = s->gen_opparam_ptr++;
+    pi_first = pi = s->gen_next_parm_idx;
     if (ret != TCG_CALL_DUMMY_ARG) {
 #if defined(__sparc__) && !defined(__arch64__) \
     && !defined(CONFIG_TCG_INTERPRETER)
@@ -769,25 +768,25 @@
                two return temporaries, and reassemble below.  */
             retl = tcg_temp_new_i64();
             reth = tcg_temp_new_i64();
-            *s->gen_opparam_ptr++ = GET_TCGV_I64(reth);
-            *s->gen_opparam_ptr++ = GET_TCGV_I64(retl);
+            s->gen_opparam_buf[pi++] = GET_TCGV_I64(reth);
+            s->gen_opparam_buf[pi++] = GET_TCGV_I64(retl);
             nb_rets = 2;
         } else {
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret;
             nb_rets = 1;
         }
 #else
         if (TCG_TARGET_REG_BITS < 64 && (sizemask & 1)) {
 #ifdef HOST_WORDS_BIGENDIAN
-            *s->gen_opparam_ptr++ = ret + 1;
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret + 1;
+            s->gen_opparam_buf[pi++] = ret;
 #else
-            *s->gen_opparam_ptr++ = ret;
-            *s->gen_opparam_ptr++ = ret + 1;
+            s->gen_opparam_buf[pi++] = ret;
+            s->gen_opparam_buf[pi++] = ret + 1;
 #endif
             nb_rets = 2;
         } else {
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret;
             nb_rets = 1;
         }
 #endif
@@ -801,7 +800,7 @@
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
             /* some targets want aligned 64 bit args */
             if (real_args & 1) {
-                *s->gen_opparam_ptr++ = TCG_CALL_DUMMY_ARG;
+                s->gen_opparam_buf[pi++] = TCG_CALL_DUMMY_ARG;
                 real_args++;
             }
 #endif
@@ -816,26 +815,42 @@
 	       have to get more complicated to differentiate between
 	       stack arguments and register arguments.  */
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
-            *s->gen_opparam_ptr++ = args[i] + 1;
-            *s->gen_opparam_ptr++ = args[i];
+            s->gen_opparam_buf[pi++] = args[i] + 1;
+            s->gen_opparam_buf[pi++] = args[i];
 #else
-            *s->gen_opparam_ptr++ = args[i];
-            *s->gen_opparam_ptr++ = args[i] + 1;
+            s->gen_opparam_buf[pi++] = args[i];
+            s->gen_opparam_buf[pi++] = args[i] + 1;
 #endif
             real_args += 2;
             continue;
         }
 
-        *s->gen_opparam_ptr++ = args[i];
+        s->gen_opparam_buf[pi++] = args[i];
         real_args++;
     }
-    *s->gen_opparam_ptr++ = (uintptr_t)func;
-    *s->gen_opparam_ptr++ = flags;
+    s->gen_opparam_buf[pi++] = (uintptr_t)func;
+    s->gen_opparam_buf[pi++] = flags;
 
-    *nparam = (nb_rets << 16) | real_args;
+    i = s->gen_next_op_idx;
+    tcg_debug_assert(i < OPC_BUF_SIZE);
+    tcg_debug_assert(pi <= OPPARAM_BUF_SIZE);
 
-    /* total parameters, needed to go backward in the instruction stream */
-    *s->gen_opparam_ptr++ = 1 + nb_rets + real_args + 3;
+    /* Set links for sequential allocation during translation.  */
+    s->gen_op_buf[i] = (TCGOp){
+        .opc = INDEX_op_call,
+        .callo = nb_rets,
+        .calli = real_args,
+        .args = pi_first,
+        .prev = i - 1,
+        .next = i + 1
+    };
+
+    /* Make sure the calli field didn't overflow.  */
+    tcg_debug_assert(s->gen_op_buf[i].calli == real_args);
+
+    s->gen_last_op_idx = i;
+    s->gen_next_op_idx = i + 1;
+    s->gen_next_parm_idx = pi;
 
 #if defined(__sparc__) && !defined(__arch64__) \
     && !defined(CONFIG_TCG_INTERPRETER)
@@ -870,143 +885,6 @@
 #endif /* TCG_TARGET_EXTEND_ARGS */
 }
 
-#if TCG_TARGET_REG_BITS == 32
-void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                        int c, int right, int arith)
-{
-    if (c == 0) {
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
-    } else if (c >= 32) {
-        c -= 32;
-        if (right) {
-            if (arith) {
-                tcg_gen_sari_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
-                tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), 31);
-            } else {
-                tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
-                tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-            }
-        } else {
-            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_LOW(arg1), c);
-            tcg_gen_movi_i32(TCGV_LOW(ret), 0);
-        }
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        if (right) {
-            tcg_gen_shli_i32(t0, TCGV_HIGH(arg1), 32 - c);
-            if (arith)
-                tcg_gen_sari_i32(t1, TCGV_HIGH(arg1), c);
-            else
-                tcg_gen_shri_i32(t1, TCGV_HIGH(arg1), c);
-            tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_LOW(arg1), c);
-            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t0);
-            tcg_gen_mov_i32(TCGV_HIGH(ret), t1);
-        } else {
-            tcg_gen_shri_i32(t0, TCGV_LOW(arg1), 32 - c);
-            /* Note: ret can be the same as arg1, so we use t1 */
-            tcg_gen_shli_i32(t1, TCGV_LOW(arg1), c);
-            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), c);
-            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t0);
-            tcg_gen_mov_i32(TCGV_LOW(ret), t1);
-        }
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-#endif
-
-static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
-{
-    switch (op & MO_SIZE) {
-    case MO_8:
-        op &= ~MO_BSWAP;
-        break;
-    case MO_16:
-        break;
-    case MO_32:
-        if (!is64) {
-            op &= ~MO_SIGN;
-        }
-        break;
-    case MO_64:
-        if (!is64) {
-            tcg_abort();
-        }
-        break;
-    }
-    if (st) {
-        op &= ~MO_SIGN;
-    }
-    return op;
-}
-
-void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 0, 0);
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i32;
-    tcg_add_param_i32(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 0, 1);
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i32;
-    tcg_add_param_i32(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 1, 0);
-
-#if TCG_TARGET_REG_BITS == 32
-    if ((memop & MO_SIZE) < MO_64) {
-        tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
-        if (memop & MO_SIGN) {
-            tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31);
-        } else {
-            tcg_gen_movi_i32(TCGV_HIGH(val), 0);
-        }
-        return;
-    }
-#endif
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i64;
-    tcg_add_param_i64(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 1, 1);
-
-#if TCG_TARGET_REG_BITS == 32
-    if ((memop & MO_SIZE) < MO_64) {
-        tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
-        return;
-    }
-#endif
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i64;
-    tcg_add_param_i64(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
 static void tcg_reg_alloc_start(TCGContext *s)
 {
     int i;
@@ -1107,22 +985,39 @@
     [MO_BEQ]  = "beq",
 };
 
+static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = {
+#ifdef ALIGNED_ONLY
+    [MO_UNALN >> MO_ASHIFT]    = "un+",
+    [MO_ALIGN >> MO_ASHIFT]    = "",
+#else
+    [MO_UNALN >> MO_ASHIFT]    = "",
+    [MO_ALIGN >> MO_ASHIFT]    = "al+",
+#endif
+    [MO_ALIGN_2 >> MO_ASHIFT]  = "al2+",
+    [MO_ALIGN_4 >> MO_ASHIFT]  = "al4+",
+    [MO_ALIGN_8 >> MO_ASHIFT]  = "al8+",
+    [MO_ALIGN_16 >> MO_ASHIFT] = "al16+",
+    [MO_ALIGN_32 >> MO_ASHIFT] = "al32+",
+    [MO_ALIGN_64 >> MO_ASHIFT] = "al64+",
+};
+
 void tcg_dump_ops(TCGContext *s)
 {
-    const uint16_t *opc_ptr;
-    const TCGArg *args;
-    TCGArg arg;
-    TCGOpcode c;
-    int i, k, nb_oargs, nb_iargs, nb_cargs, first_insn;
-    const TCGOpDef *def;
     char buf[128];
+    TCGOp *op;
+    int oi;
 
-    first_insn = 1;
-    opc_ptr = s->gen_opc_buf;
-    args = s->gen_opparam_buf;
-    while (opc_ptr < s->gen_opc_ptr) {
-        c = *opc_ptr++;
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
+        int i, k, nb_oargs, nb_iargs, nb_cargs;
+        const TCGOpDef *def;
+        const TCGArg *args;
+        TCGOpcode c;
+
+        op = &s->gen_op_buf[oi];
+        c = op->opc;
         def = &tcg_op_defs[c];
+        args = &s->gen_opparam_buf[op->args];
+
         if (c == INDEX_op_debug_insn_start) {
             uint64_t pc;
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
@@ -1130,21 +1025,14 @@
 #else
             pc = args[0];
 #endif
-            if (!first_insn) {
+            if (oi != s->gen_first_op_idx) {
                 qemu_log("\n");
             }
             qemu_log(" ---- 0x%" PRIx64, pc);
-            first_insn = 0;
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
-            nb_cargs = def->nb_cargs;
         } else if (c == INDEX_op_call) {
-            TCGArg arg;
-
             /* variable number of arguments */
-            arg = *args++;
-            nb_oargs = arg >> 16;
-            nb_iargs = arg & 0xffff;
+            nb_oargs = op->callo;
+            nb_iargs = op->calli;
             nb_cargs = def->nb_cargs;
 
             /* function name, flags, out args */
@@ -1165,26 +1053,20 @@
             }
         } else {
             qemu_log(" %s ", def->name);
-            if (c == INDEX_op_nopn) {
-                /* variable number of arguments */
-                nb_cargs = *args;
-                nb_oargs = 0;
-                nb_iargs = 0;
-            } else {
-                nb_oargs = def->nb_oargs;
-                nb_iargs = def->nb_iargs;
-                nb_cargs = def->nb_cargs;
-            }
-            
+
+            nb_oargs = def->nb_oargs;
+            nb_iargs = def->nb_iargs;
+            nb_cargs = def->nb_cargs;
+
             k = 0;
-            for(i = 0; i < nb_oargs; i++) {
+            for (i = 0; i < nb_oargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
                 qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
                                                    args[k++]));
             }
-            for(i = 0; i < nb_iargs; i++) {
+            for (i = 0; i < nb_iargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
@@ -1211,27 +1093,34 @@
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_ld_i64:
             case INDEX_op_qemu_st_i64:
-                if (args[k] < ARRAY_SIZE(ldst_name) && ldst_name[args[k]]) {
-                    qemu_log(",%s", ldst_name[args[k++]]);
-                } else {
-                    qemu_log(",$0x%" TCG_PRIlx, args[k++]);
+                {
+                    TCGMemOpIdx oi = args[k++];
+                    TCGMemOp op = get_memop(oi);
+                    unsigned ix = get_mmuidx(oi);
+
+                    if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) {
+                        qemu_log(",$0x%x,%u", op, ix);
+                    } else {
+                        const char *s_al, *s_op;
+                        s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT];
+                        s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)];
+                        qemu_log(",%s%s,%u", s_al, s_op, ix);
+                    }
+                    i = 1;
                 }
-                i = 1;
                 break;
             default:
                 i = 0;
                 break;
             }
-            for(; i < nb_cargs; i++) {
+            for (; i < nb_cargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
-                arg = args[k++];
-                qemu_log("$0x%" TCG_PRIlx, arg);
+                qemu_log("$0x%" TCG_PRIlx, args[k++]);
             }
         }
         qemu_log("\n");
-        args += nb_iargs + nb_oargs + nb_cargs;
     }
 }
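
With the TCGMemOpIdx split back into memop and mmu index, a qemu_ld/st line in the op dump now shows an alignment prefix, the access name, and the mmu index. Assuming MO_LEUL prints as "leul" in ldst_name, an aligned little-endian 32-bit load in mmu context 1 would dump roughly as (temp names illustrative):

    qemu_ld_i32 tmp0,tmp2,al+leul,1
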
 
@@ -1380,21 +1269,30 @@
 #endif
 }
 
-#ifdef USE_LIVENESS_ANALYSIS
-
-/* set a nop for an operation using 'nb_args' */
-static inline void tcg_set_nop(TCGContext *s, uint16_t *opc_ptr, 
-                               TCGArg *args, int nb_args)
+void tcg_op_remove(TCGContext *s, TCGOp *op)
 {
-    if (nb_args == 0) {
-        *opc_ptr = INDEX_op_nop;
+    int next = op->next;
+    int prev = op->prev;
+
+    if (next >= 0) {
+        s->gen_op_buf[next].prev = prev;
     } else {
-        *opc_ptr = INDEX_op_nopn;
-        args[0] = nb_args;
-        args[nb_args - 1] = nb_args;
+        s->gen_last_op_idx = prev;
     }
+    if (prev >= 0) {
+        s->gen_op_buf[prev].next = next;
+    } else {
+        s->gen_first_op_idx = next;
+    }
+
+    memset(op, -1, sizeof(*op));
+
+#ifdef CONFIG_PROFILER
+    s->del_op_count++;
+#endif
 }
 
+#ifdef USE_LIVENESS_ANALYSIS
 /* liveness analysis: end of function: all temps are dead, and globals
    should be in memory. */
 static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
@@ -1424,19 +1322,10 @@
    temporaries are removed. */
 static void tcg_liveness_analysis(TCGContext *s)
 {
-    int i, op_index, nb_args, nb_iargs, nb_oargs, nb_ops;
-    TCGOpcode op, op_new, op_new2;
-    TCGArg *args, arg;
-    const TCGOpDef *def;
     uint8_t *dead_temps, *mem_temps;
-    uint16_t dead_args;
-    uint8_t sync_args;
-    bool have_op_new2;
-    
-    s->gen_opc_ptr++; /* skip end */
+    int oi, oi_prev, nb_ops;
 
-    nb_ops = s->gen_opc_ptr - s->gen_opc_buf;
-
+    nb_ops = s->gen_next_op_idx;
     s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
     s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
     
@@ -1444,25 +1333,31 @@
     mem_temps = tcg_malloc(s->nb_temps);
     tcg_la_func_end(s, dead_temps, mem_temps);
 
-    args = s->gen_opparam_ptr;
-    op_index = nb_ops - 1;
-    while (op_index >= 0) {
-        op = s->gen_opc_buf[op_index];
-        def = &tcg_op_defs[op];
-        switch(op) {
+    for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) {
+        int i, nb_iargs, nb_oargs;
+        TCGOpcode opc_new, opc_new2;
+        bool have_opc_new2;
+        uint16_t dead_args;
+        uint8_t sync_args;
+        TCGArg arg;
+
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+
+        oi_prev = op->prev;
+
+        switch (opc) {
         case INDEX_op_call:
             {
                 int call_flags;
 
-                nb_args = args[-1];
-                args -= nb_args;
-                arg = *args++;
-                nb_iargs = arg & 0xffff;
-                nb_oargs = arg >> 16;
+                nb_oargs = op->callo;
+                nb_iargs = op->calli;
                 call_flags = args[nb_oargs + nb_iargs + 1];
 
-                /* pure functions can be removed if their result is not
-                   used */
+                /* pure functions can be removed if their result is unused */
                 if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
                     for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
@@ -1470,8 +1365,7 @@
                             goto do_not_remove_call;
                         }
                     }
-                    tcg_set_nop(s, s->gen_opc_buf + op_index,
-                                args - 1, nb_args);
+                    goto do_remove;
                 } else {
                 do_not_remove_call:
 
@@ -1500,51 +1394,45 @@
                         memset(dead_temps, 1, s->nb_globals);
                     }
 
-                    /* input args are live */
+                    /* record arguments that die in this helper */
                     for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
                             if (dead_temps[arg]) {
                                 dead_args |= (1 << i);
                             }
-                            dead_temps[arg] = 0;
                         }
                     }
-                    s->op_dead_args[op_index] = dead_args;
-                    s->op_sync_args[op_index] = sync_args;
+                    /* input arguments are live for preceding opcodes */
+                    for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                        arg = args[i];
+                        dead_temps[arg] = 0;
+                    }
+                    s->op_dead_args[oi] = dead_args;
+                    s->op_sync_args[oi] = sync_args;
                 }
-                args--;
             }
             break;
         case INDEX_op_debug_insn_start:
-            args -= def->nb_args;
-            break;
-        case INDEX_op_nopn:
-            nb_args = args[-1];
-            args -= nb_args;
             break;
         case INDEX_op_discard:
-            args--;
             /* mark the temporary as dead */
             dead_temps[args[0]] = 1;
             mem_temps[args[0]] = 0;
             break;
-        case INDEX_op_end:
-            break;
 
         case INDEX_op_add2_i32:
-            op_new = INDEX_op_add_i32;
+            opc_new = INDEX_op_add_i32;
             goto do_addsub2;
         case INDEX_op_sub2_i32:
-            op_new = INDEX_op_sub_i32;
+            opc_new = INDEX_op_sub_i32;
             goto do_addsub2;
         case INDEX_op_add2_i64:
-            op_new = INDEX_op_add_i64;
+            opc_new = INDEX_op_add_i64;
             goto do_addsub2;
         case INDEX_op_sub2_i64:
-            op_new = INDEX_op_sub_i64;
+            opc_new = INDEX_op_sub_i64;
         do_addsub2:
-            args -= 6;
             nb_iargs = 4;
             nb_oargs = 2;
             /* Test if the high part of the operation is dead, but not
@@ -1555,12 +1443,11 @@
                 if (dead_temps[args[0]] && !mem_temps[args[0]]) {
                     goto do_remove;
                 }
-                /* Create the single operation plus nop.  */
-                s->gen_opc_buf[op_index] = op = op_new;
+                /* Replace the opcode and adjust the args in place,
+                   leaving 3 unused args at the end.  */
+                op->opc = opc = opc_new;
                 args[1] = args[2];
                 args[2] = args[4];
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-                tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 3);
                 /* Fall through and mark the single-word operation live.  */
                 nb_iargs = 2;
                 nb_oargs = 1;
@@ -1568,27 +1455,26 @@
             goto do_not_remove;
 
         case INDEX_op_mulu2_i32:
-            op_new = INDEX_op_mul_i32;
-            op_new2 = INDEX_op_muluh_i32;
-            have_op_new2 = TCG_TARGET_HAS_muluh_i32;
+            opc_new = INDEX_op_mul_i32;
+            opc_new2 = INDEX_op_muluh_i32;
+            have_opc_new2 = TCG_TARGET_HAS_muluh_i32;
             goto do_mul2;
         case INDEX_op_muls2_i32:
-            op_new = INDEX_op_mul_i32;
-            op_new2 = INDEX_op_mulsh_i32;
-            have_op_new2 = TCG_TARGET_HAS_mulsh_i32;
+            opc_new = INDEX_op_mul_i32;
+            opc_new2 = INDEX_op_mulsh_i32;
+            have_opc_new2 = TCG_TARGET_HAS_mulsh_i32;
             goto do_mul2;
         case INDEX_op_mulu2_i64:
-            op_new = INDEX_op_mul_i64;
-            op_new2 = INDEX_op_muluh_i64;
-            have_op_new2 = TCG_TARGET_HAS_muluh_i64;
+            opc_new = INDEX_op_mul_i64;
+            opc_new2 = INDEX_op_muluh_i64;
+            have_opc_new2 = TCG_TARGET_HAS_muluh_i64;
             goto do_mul2;
         case INDEX_op_muls2_i64:
-            op_new = INDEX_op_mul_i64;
-            op_new2 = INDEX_op_mulsh_i64;
-            have_op_new2 = TCG_TARGET_HAS_mulsh_i64;
+            opc_new = INDEX_op_mul_i64;
+            opc_new2 = INDEX_op_mulsh_i64;
+            have_opc_new2 = TCG_TARGET_HAS_mulsh_i64;
             goto do_mul2;
         do_mul2:
-            args -= 4;
             nb_iargs = 2;
             nb_oargs = 2;
             if (dead_temps[args[1]] && !mem_temps[args[1]]) {
@@ -1597,28 +1483,25 @@
                     goto do_remove;
                 }
                 /* The high part of the operation is dead; generate the low. */
-                s->gen_opc_buf[op_index] = op = op_new;
+                op->opc = opc = opc_new;
                 args[1] = args[2];
                 args[2] = args[3];
-            } else if (have_op_new2 && dead_temps[args[0]]
+            } else if (have_opc_new2 && dead_temps[args[0]]
                        && !mem_temps[args[0]]) {
-                /* The low part of the operation is dead; generate the high.  */
-                s->gen_opc_buf[op_index] = op = op_new2;
+                /* The low part of the operation is dead; generate the high. */
+                op->opc = opc = opc_new2;
                 args[0] = args[1];
                 args[1] = args[2];
                 args[2] = args[3];
             } else {
                 goto do_not_remove;
             }
-            assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-            tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
             /* Mark the single-word operation live.  */
             nb_oargs = 1;
             goto do_not_remove;
 
         default:
             /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */
-            args -= def->nb_args;
             nb_iargs = def->nb_iargs;
             nb_oargs = def->nb_oargs;
 
@@ -1626,24 +1509,20 @@
                its outputs are dead. We assume that nb_oargs == 0
                implies side effects */
             if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
-                for(i = 0; i < nb_oargs; i++) {
+                for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
                     if (!dead_temps[arg] || mem_temps[arg]) {
                         goto do_not_remove;
                     }
                 }
             do_remove:
-                tcg_set_nop(s, s->gen_opc_buf + op_index, args, def->nb_args);
-#ifdef CONFIG_PROFILER
-                s->del_op_count++;
-#endif
+                tcg_op_remove(s, op);
             } else {
             do_not_remove:
-
                 /* output args are dead */
                 dead_args = 0;
                 sync_args = 0;
-                for(i = 0; i < nb_oargs; i++) {
+                for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
                         dead_args |= (1 << i);
@@ -1663,24 +1542,23 @@
                     memset(mem_temps, 1, s->nb_globals);
                 }
 
-                /* input args are live */
-                for(i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                /* record arguments that die in this opcode */
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
                         dead_args |= (1 << i);
                     }
+                }
+                /* input arguments are live for preceding opcodes */
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                    arg = args[i];
                     dead_temps[arg] = 0;
                 }
-                s->op_dead_args[op_index] = dead_args;
-                s->op_sync_args[op_index] = sync_args;
+                s->op_dead_args[oi] = dead_args;
+                s->op_sync_args[oi] = sync_args;
             }
             break;
         }
-        op_index--;
-    }
-
-    if (args != s->gen_opparam_buf) {
-        tcg_abort();
     }
 }
 #else
@@ -2144,6 +2022,16 @@
                 if (!IS_DEAD_ARG(i)) {
                     goto allocate_in_reg;
                 }
+                /* check if the current register has already been allocated
+                   for another input aliased to an output */
+                int k2, i2;
+                for (k2 = 0 ; k2 < k ; k2++) {
+                    i2 = def->sorted_args[nb_oargs + k2];
+                    if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
+                        (new_args[i2] == ts->reg)) {
+                        goto allocate_in_reg;
+                    }
+                }
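
The added scan fixes allocation when two inputs aliased to outputs are both dead (commit 7e1df26 in the squash list): the first aliased input may already have been allocated in place, so blindly reusing the current temp's register for a second aliased input could hand out the same register twice, leaving both outputs in one register. An illustrative shape of the problem, where the dead temp t0 feeds both aliased inputs:

    add2_i32 d_lo,d_hi,t0,t0,b_lo,b_hi   /* without the check, t0's register
                                            would be picked for both outputs */
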
             }
         }
         reg = ts->reg;
@@ -2247,11 +2135,11 @@
 #define STACK_DIR(x) (x)
 #endif
 
-static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
-                              TCGOpcode opc, const TCGArg *args,
-                              uint16_t dead_args, uint8_t sync_args)
+static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
+                               const TCGArg * const args, uint16_t dead_args,
+                               uint8_t sync_args)
 {
-    int nb_iargs, nb_oargs, flags, nb_regs, i, reg, nb_params;
+    int flags, nb_regs, i, reg;
     TCGArg arg;
     TCGTemp *ts;
     intptr_t stack_offset;
@@ -2260,22 +2148,16 @@
     int allocate_args;
     TCGRegSet allocated_regs;
 
-    arg = *args++;
-
-    nb_oargs = arg >> 16;
-    nb_iargs = arg & 0xffff;
-    nb_params = nb_iargs;
-
     func_addr = (tcg_insn_unit *)(intptr_t)args[nb_oargs + nb_iargs];
     flags = args[nb_oargs + nb_iargs + 1];
 
     nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
-    if (nb_regs > nb_params) {
-        nb_regs = nb_params;
+    if (nb_regs > nb_iargs) {
+        nb_regs = nb_iargs;
     }
 
     /* assign stack slots first */
-    call_stack_size = (nb_params - nb_regs) * sizeof(tcg_target_long);
+    call_stack_size = (nb_iargs - nb_regs) * sizeof(tcg_target_long);
     call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) & 
         ~(TCG_TARGET_STACK_ALIGN - 1);
     allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE);
@@ -2286,7 +2168,7 @@
     }
 
     stack_offset = TCG_TARGET_CALL_STACK_OFFSET;
-    for(i = nb_regs; i < nb_params; i++) {
+    for (i = nb_regs; i < nb_iargs; i++) {
         arg = args[nb_oargs + i];
 #ifdef TCG_TARGET_STACK_GROWSUP
         stack_offset -= sizeof(tcg_target_long);
@@ -2393,8 +2275,6 @@
             }
         }
     }
-    
-    return nb_iargs + nb_oargs + def->nb_cargs + 1;
 }
 
 #ifdef CONFIG_PROFILER
@@ -2405,7 +2285,7 @@
 {
     int i;
 
-    for(i = INDEX_op_end; i < NB_OPS; i++) {
+    for (i = 0; i < NB_OPS; i++) {
         qemu_log("%s %" PRId64 "\n", tcg_op_defs[i].name, tcg_table_op_count[i]);
     }
 }
@@ -2416,10 +2296,7 @@
                                       tcg_insn_unit *gen_code_buf,
                                       long search_pc)
 {
-    TCGOpcode opc;
-    int op_index;
-    const TCGOpDef *def;
-    const TCGArg *args;
+    int oi, oi_next;
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
@@ -2434,8 +2311,7 @@
 #endif
 
 #ifdef USE_TCG_OPTIMIZATIONS
-    s->gen_opparam_ptr =
-        tcg_optimize(s, s->gen_opc_ptr, s->gen_opparam_buf, tcg_op_defs);
+    tcg_optimize(s);
 #endif
 
 #ifdef CONFIG_PROFILER
@@ -2464,42 +2340,30 @@
 
     tcg_out_tb_init(s);
 
-    args = s->gen_opparam_buf;
-    op_index = 0;
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+        uint16_t dead_args = s->op_dead_args[oi];
+        uint8_t sync_args = s->op_sync_args[oi];
 
-    for(;;) {
-        opc = s->gen_opc_buf[op_index];
+        oi_next = op->next;
 #ifdef CONFIG_PROFILER
         tcg_table_op_count[opc]++;
 #endif
-        def = &tcg_op_defs[opc];
-#if 0
-        printf("%s: %d %d %d\n", def->name,
-               def->nb_oargs, def->nb_iargs, def->nb_cargs);
-        //        dump_regs(s);
-#endif
-        switch(opc) {
+
+        switch (opc) {
         case INDEX_op_mov_i32:
         case INDEX_op_mov_i64:
-            tcg_reg_alloc_mov(s, def, args, s->op_dead_args[op_index],
-                              s->op_sync_args[op_index]);
+            tcg_reg_alloc_mov(s, def, args, dead_args, sync_args);
             break;
         case INDEX_op_movi_i32:
         case INDEX_op_movi_i64:
-            tcg_reg_alloc_movi(s, args, s->op_dead_args[op_index],
-                               s->op_sync_args[op_index]);
+            tcg_reg_alloc_movi(s, args, dead_args, sync_args);
             break;
         case INDEX_op_debug_insn_start:
-            /* debug instruction */
             break;
-        case INDEX_op_nop:
-        case INDEX_op_nop1:
-        case INDEX_op_nop2:
-        case INDEX_op_nop3:
-            break;
-        case INDEX_op_nopn:
-            args += args[0];
-            goto next;
         case INDEX_op_discard:
             temp_dead(s, args[0]);
             break;
@@ -2508,12 +2372,9 @@
             tcg_out_label(s, args[0], s->code_ptr);
             break;
         case INDEX_op_call:
-            args += tcg_reg_alloc_call(s, def, opc, args,
-                                       s->op_dead_args[op_index],
-                                       s->op_sync_args[op_index]);
-            goto next;
-        case INDEX_op_end:
-            goto the_end;
+            tcg_reg_alloc_call(s, op->callo, op->calli, args,
+                               dead_args, sync_args);
+            break;
         default:
             /* Sanity check that we've not introduced any unhandled opcodes. */
             if (def->flags & TCG_OPF_NOT_PRESENT) {
@@ -2522,21 +2383,17 @@
             /* Note: in order to speed up the code, it would be much
                faster to have specialized register allocator functions for
                some common argument patterns */
-            tcg_reg_alloc_op(s, def, opc, args, s->op_dead_args[op_index],
-                             s->op_sync_args[op_index]);
+            tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args);
             break;
         }
-        args += def->nb_args;
-    next:
         if (search_pc >= 0 && search_pc < tcg_current_code_size(s)) {
-            return op_index;
+            return oi;
         }
-        op_index++;
 #ifndef NDEBUG
         check_regs(s);
 #endif
     }
- the_end:
+
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
     return -1;
@@ -2547,14 +2404,18 @@
 #ifdef CONFIG_PROFILER
     {
         int n;
-        n = (s->gen_opc_ptr - s->gen_opc_buf);
-        s->op_count += n;
-        if (n > s->op_count_max)
-            s->op_count_max = n;
 
-        s->temp_count += s->nb_temps;
-        if (s->nb_temps > s->temp_count_max)
-            s->temp_count_max = s->nb_temps;
+        n = s->gen_last_op_idx + 1;
+        s->op_count += n;
+        if (n > s->op_count_max) {
+            s->op_count_max = n;
+        }
+
+        n = s->nb_temps;
+        s->temp_count += n;
+        if (n > s->temp_count_max) {
+            s->temp_count_max = n;
+        }
     }
 #endif
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7285f71..b27ee14 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -159,6 +159,15 @@
 #endif
 
 
+#ifdef CONFIG_DEBUG_TCG
+# define tcg_debug_assert(X) do { assert(X); } while (0)
+#elif QEMU_GNUC_PREREQ(4, 5)
+# define tcg_debug_assert(X) \
+    do { if (!(X)) { __builtin_unreachable(); } } while (0)
+#else
+# define tcg_debug_assert(X) do { (void)(X); } while (0)
+#endif
+
 typedef struct TCGRelocation {
     struct TCGRelocation *next;
     int type;
@@ -242,6 +251,41 @@
     MO_TE    = MO_LE,
 #endif
 
+    /* MO_UNALN accesses are never checked for alignment.
+     * MO_ALIGN accesses will result in a call to the CPU's
+     * do_unaligned_access hook if the guest address is not aligned.
+     * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     * Some architectures (e.g. ARMv8) require an address to be aligned
+     * to a size larger than the access itself.  QEMU's existing cheap
+     * alignment check is sufficient to support this; all that is added
+     * here is a way to specify the alignment size explicitly.
+     * MO_ALIGN requests natural alignment
+     * (i.e. the alignment size equals the access size).
+     * Note that the alignment size must be equal to or greater
+     * than the access size.
+     * There are three options:
+     * - alignment to the size of the access (MO_ALIGN);
+     * - alignment to a specified size that is equal to or greater than
+     *   the access size (MO_ALIGN_x, where 'x' is a size in bytes);
+     * - unaligned access permitted (MO_UNALN).
+     */
+    MO_ASHIFT = 4,
+    MO_AMASK = 7 << MO_ASHIFT,
+#ifdef ALIGNED_ONLY
+    MO_ALIGN = 0,
+    MO_UNALN = MO_AMASK,
+#else
+    MO_ALIGN = MO_AMASK,
+    MO_UNALN = 0,
+#endif
+    MO_ALIGN_2  = 1 << MO_ASHIFT,
+    MO_ALIGN_4  = 2 << MO_ASHIFT,
+    MO_ALIGN_8  = 3 << MO_ASHIFT,
+    MO_ALIGN_16 = 4 << MO_ASHIFT,
+    MO_ALIGN_32 = 5 << MO_ASHIFT,
+    MO_ALIGN_64 = 6 << MO_ASHIFT,
+
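
For example, a little-endian 32-bit guest load that must be 8-byte aligned can be requested as (illustrative use of the interface declared later in this patch):

    tcg_gen_qemu_ld_i32(val, addr, mmu_idx, MO_LEUL | MO_ALIGN_8);
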
     /* Combinations of the above, for ease of use.  */
     MO_UB    = MO_8,
     MO_UW    = MO_16,
@@ -272,6 +316,45 @@
     MO_SSIZE = MO_SIZE | MO_SIGN,
 } TCGMemOp;
 
+/**
+ * get_alignment_bits
+ * @memop: TCGMemOp value
+ *
+ * Extract the alignment size from the memop.
+ *
+ * Returns: 0 for a byte access (which is always aligned);
+ *          a positive value, the number of alignment bits;
+ *          a negative value if unaligned access is enabled
+ *          and this is not a byte access.
+ */
+static inline int get_alignment_bits(TCGMemOp memop)
+{
+    int a = memop & MO_AMASK;
+    int s = memop & MO_SIZE;
+    int r;
+
+    if (a == MO_UNALN) {
+        /* Negative if unaligned access is enabled,
+         * or zero for a byte access.
+         */
+        return -s;
+    } else if (a == MO_ALIGN) {
+        /* Natural alignment: return the number of access-size bits. */
+        r = s;
+    } else {
+        /* Specific alignment size, which must be equal to or greater
+         * than the access size.
+         */
+        r = a >> MO_ASHIFT;
+        tcg_debug_assert(r >= s);
+    }
+#if defined(CONFIG_SOFTMMU)
+    /* The requested alignment cannot overlap the TLB flags.  */
+    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << r) - 1)) == 0);
+#endif
+    return r;
+}
+
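
A few concrete values, assuming a target without ALIGNED_ONLY:

    get_alignment_bits(MO_8);               /* 0: byte access, always aligned */
    get_alignment_bits(MO_32 | MO_ALIGN);   /* 2: natural 4-byte alignment */
    get_alignment_bits(MO_16 | MO_ALIGN_8); /* 3: explicit 8-byte alignment */
    get_alignment_bits(MO_64 | MO_UNALN);   /* -3: unaligned access allowed */
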
 typedef tcg_target_ulong TCGArg;
 
 /* Define a type and accessor macros for variables.  Using pointer types
@@ -448,10 +531,28 @@
     unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
 } TCGTempSet;
 
+typedef struct TCGOp {
+    TCGOpcode opc   : 8;
+
+    /* The number of out and in parameters for a call.  */
+    unsigned callo  : 2;
+    unsigned calli  : 6;
+
+    /* Index of the arguments for this op, or -1 for zero-operand ops.  */
+    signed args     : 16;
+
+    /* Index of the prev/next op, or -1 for the end of the list.  */
+    signed prev     : 16;
+    signed next     : 16;
+} TCGOp;
+
+QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+
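
The prev/next fields are buffer indices rather than pointers, so the op list can be walked in either direction without fixups; a sketch of the forward-iteration idiom used by tcg_dump_ops and the liveness pass above, given a TCGContext *s:

    int oi;
    for (oi = s->gen_first_op_idx; oi >= 0; oi = s->gen_op_buf[oi].next) {
        TCGOp *op = &s->gen_op_buf[oi];
        TCGArg *args = &s->gen_opparam_buf[op->args];
        /* inspect op->opc, op->callo/op->calli, args[0..] here */
    }
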
 struct TCGContext {
     uint8_t *pool_cur, *pool_end;
     TCGPool *pool_first, *pool_current, *pool_first_large;
-    TCGLabel *labels;
     int nb_labels;
     int nb_globals;
     int nb_temps;
@@ -469,9 +570,6 @@
                                corresponding output argument needs to be
                                sync to memory. */
     
-    /* tells in which temporary a given register is. It does not take
-       into account fixed registers */
-    int reg_to_temp[TCG_TARGET_NB_REGS];
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
     intptr_t frame_start;
@@ -479,8 +577,6 @@
     int frame_reg;
 
     tcg_insn_unit *code_ptr;
-    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
-    TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
 
     GHashTable *helpers;
 
@@ -508,14 +604,10 @@
     int goto_tb_issue_mask;
 #endif
 
-    uint16_t gen_opc_buf[OPC_BUF_SIZE];
-    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
-
-    uint16_t *gen_opc_ptr;
-    TCGArg *gen_opparam_ptr;
-    target_ulong gen_opc_pc[OPC_BUF_SIZE];
-    uint16_t gen_opc_icount[OPC_BUF_SIZE];
-    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
+    int gen_first_op_idx;
+    int gen_last_op_idx;
+    int gen_next_op_idx;
+    int gen_next_parm_idx;
 
     /* Code generation.  Note that we specifically do not use tcg_insn_unit
        here, because there's too much arithmetic throughout that relies
@@ -533,10 +625,38 @@
 
     /* The TCGBackendData structure is private to tcg-target.c.  */
     struct TCGBackendData *be;
+
+    TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
+    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
+
+    /* tells in which temporary a given register is. It does not take
+       into account fixed registers */
+    int reg_to_temp[TCG_TARGET_NB_REGS];
+
+    TCGOp gen_op_buf[OPC_BUF_SIZE];
+    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
+
+    target_ulong gen_opc_pc[OPC_BUF_SIZE];
+    uint16_t gen_opc_icount[OPC_BUF_SIZE];
+    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
+
+    TCGLabel labels[TCG_MAX_LABELS];
 };
 
 extern TCGContext tcg_ctx;
 
+/* The number of opcodes emitted so far.  */
+static inline int tcg_op_buf_count(void)
+{
+    return tcg_ctx.gen_next_op_idx;
+}
+
+/* Test for whether to terminate the TB for using too many opcodes.  */
+static inline bool tcg_op_buf_full(void)
+{
+    return tcg_op_buf_count() >= OPC_MAX_SIZE;
+}
+
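
Targets use this pair in place of the old gen_opc_ptr arithmetic when deciding whether to end a TB. A rough sketch of a target translate loop, in which disas_insn(), dc, and next_page_start are illustrative names:

    do {
        pc = disas_insn(env, dc, pc);    /* illustrative target decoder */
    } while (!dc->is_jmp && !tcg_op_buf_full() && pc < next_page_start);
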
 /* pool based memory allocation */
 
 void *tcg_malloc_internal(TCGContext *s, int size);
@@ -667,15 +787,6 @@
     abort();\
 } while (0)
 
-#ifdef CONFIG_DEBUG_TCG
-# define tcg_debug_assert(X) do { assert(X); } while (0)
-#elif QEMU_GNUC_PREREQ(4, 5)
-# define tcg_debug_assert(X) \
-    do { if (!(X)) { __builtin_unreachable(); } } while (0)
-#else
-# define tcg_debug_assert(X) do { (void)(X); } while (0)
-#endif
-
 void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs);
 
 #if UINTPTR_MAX == UINT32_MAX
@@ -705,11 +816,8 @@
 void tcg_gen_callN(TCGContext *s, void *func,
                    TCGArg ret, int nargs, TCGArg *args);
 
-void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                        int c, int right, int arith);
-
-TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr, TCGArg *args,
-                     TCGOpDef *tcg_op_def);
+void tcg_op_remove(TCGContext *s, TCGOp *op);
+void tcg_optimize(TCGContext *s);
 
 /* only used for debugging purposes */
 void tcg_dump_ops(TCGContext *s);
@@ -765,6 +873,44 @@
     return tcg_ptr_byte_diff(s->code_ptr, s->code_buf);
 }
 
+/* Combine the TCGMemOp and mmu_idx parameters into a single value.  */
+typedef uint32_t TCGMemOpIdx;
+
+/**
+ * make_memop_idx
+ * @op: memory operation
+ * @idx: mmu index
+ *
+ * Encode these values into a single parameter.
+ */
+static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
+{
+    tcg_debug_assert(idx <= 15);
+    return (op << 4) | idx;
+}
+
+/**
+ * get_memop
+ * @oi: combined op/idx parameter
+ *
+ * Extract the memory operation from the combined value.
+ */
+static inline TCGMemOp get_memop(TCGMemOpIdx oi)
+{
+    return oi >> 4;
+}
+
+/**
+ * get_mmuidx
+ * @oi: combined op/idx parameter
+ *
+ * Extract the mmu index from the combined value.
+ */
+static inline unsigned get_mmuidx(TCGMemOpIdx oi)
+{
+    return oi & 15;
+}
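+/* Illustrative round trip (the values are examples only):
+ *
+ *     TCGMemOpIdx oi  = make_memop_idx(MO_LEUL, 1);
+ *     TCGMemOp    op  = get_memop(oi);     -> MO_LEUL
+ *     unsigned    idx = get_mmuidx(oi);    -> 1
+ *
+ * The mmu index occupies the low 4 bits (hence the idx <= 15 assert
+ * above); the TCGMemOp occupies the remaining bits.
+ */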
+
 /**
  * tcg_qemu_tb_exec:
  * @env: CPUArchState * for the CPU
@@ -826,46 +972,46 @@
 #ifdef CONFIG_SOFTMMU
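+/* The softmmu helpers below take the merged TCGMemOpIdx produced by
+ * make_memop_idx() in place of a bare mmu index, so the slow path
+ * can recover the full memory operation via get_memop(). */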
 /* Value zero-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_lduw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldul_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 uint64_t helper_le_ldq_mmu(CPUArchState *env, target_ulong addr,
-                           int mmu_idx, uintptr_t retaddr);
+                           TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_lduw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldul_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr,
-                           int mmu_idx, uintptr_t retaddr);
+                           TCGMemOpIdx oi, uintptr_t retaddr);
 
 /* Value sign-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldsb_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldsw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldsl_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 
 void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        int mmu_idx, uintptr_t retaddr);
+                        TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 
 /* Temporary aliases until backends are converted.  */
 #ifdef TARGET_WORDS_BIGENDIAN
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 03a7b46..def4311 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -764,9 +764,6 @@
             tcg_out_r(s, *args++);
         }
         tcg_out_i(s, *args++);
-#ifdef CONFIG_SOFTMMU
-        tcg_out_i(s, *args);
-#endif
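+        /* The mmu index is no longer emitted as a separate operand;
+         * it is encoded in the TCGMemOpIdx written just above. */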
         break;
     case INDEX_op_qemu_ld_i64:
         tcg_out_r(s, *args++);
@@ -778,9 +775,6 @@
             tcg_out_r(s, *args++);
         }
         tcg_out_i(s, *args++);
-#ifdef CONFIG_SOFTMMU
-        tcg_out_i(s, *args);
-#endif
         break;
     case INDEX_op_qemu_st_i32:
         tcg_out_r(s, *args++);
diff --git a/tci.c b/tci.c
index 4711ee4..8444948 100644
--- a/tci.c
+++ b/tci.c
@@ -420,35 +420,34 @@
 }
 
 #ifdef CONFIG_SOFTMMU
-# define mmuidx          tci_read_i(&tb_ptr)
 # define qemu_ld_ub \
-    helper_ret_ldub_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_ret_ldub_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leuw \
-    helper_le_lduw_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leul \
-    helper_le_ldul_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leq \
-    helper_le_ldq_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beuw \
-    helper_be_lduw_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beul \
-    helper_be_ldul_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beq \
-    helper_be_ldq_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_st_b(X) \
-    helper_ret_stb_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_ret_stb_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_lew(X) \
-    helper_le_stw_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_lel(X) \
-    helper_le_stl_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_leq(X) \
-    helper_le_stq_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_bew(X) \
-    helper_be_stw_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_bel(X) \
-    helper_be_stl_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_beq(X) \
-    helper_be_stq_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
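+/* Note: each helper above now receives the merged TCGMemOpIdx 'oi'
+ * (read from the bytecode stream by the interpreter) in place of the
+ * old bare mmu_idx operand. */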
 #else
 # define qemu_ld_ub      ldub_p(g2h(taddr))
 # define qemu_ld_leuw    lduw_le_p(g2h(taddr))
@@ -496,7 +495,7 @@
 #if TCG_TARGET_REG_BITS == 32
         uint64_t v64;
 #endif
-        TCGMemOp memop;
+        TCGMemOpIdx oi;
 
 #if defined(GETPC)
         tci_tb_ptr = (uintptr_t)tb_ptr;
@@ -506,19 +505,6 @@
         tb_ptr += 2;
 
         switch (opc) {
-        case INDEX_op_end:
-        case INDEX_op_nop:
-            break;
-        case INDEX_op_nop1:
-        case INDEX_op_nop2:
-        case INDEX_op_nop3:
-        case INDEX_op_nopn:
-        case INDEX_op_discard:
-            TODO();
-            break;
-        case INDEX_op_set_label:
-            TODO();
-            break;
         case INDEX_op_call:
             t0 = tci_read_ri(&tb_ptr);
 #if TCG_TARGET_REG_BITS == 32
@@ -1120,8 +1106,8 @@
         case INDEX_op_qemu_ld_i32:
             t0 = *tb_ptr++;
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
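+            /* Dispatch on byte order, size and signedness only; the
+             * alignment bits (MO_AMASK) are masked out, as TCI does
+             * not check alignment. */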
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp32 = qemu_ld_ub;
                 break;
@@ -1157,8 +1143,8 @@
                 t1 = *tb_ptr++;
             }
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp64 = qemu_ld_ub;
                 break;
@@ -1206,8 +1192,8 @@
         case INDEX_op_qemu_st_i32:
             t0 = tci_read_r(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
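+            /* Stores carry no sign distinction, so dispatch on
+             * MO_SIZE rather than MO_SSIZE. */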
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(t0);
                 break;
@@ -1230,8 +1216,8 @@
         case INDEX_op_qemu_st_i64:
             tmp64 = tci_read_r64(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(tmp64);
                 break;