Merge changes from topic 'unaligned-access-optimization' into emu-master-dev

* changes:
  target-mips & softmmu: Misaligned Memory Accesses for R6/MSA
  tcg: Use softmmu fast path for unaligned accesses
  target-mips & softmmu: Revert: PRPL: Misaligned Memory Accesses for R6/MSA
diff --git a/Makefile.target b/Makefile.target
index 36ac49c..523602b 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -83,7 +83,7 @@
 #########################################################
 # cpu emulator library
 obj-y = exec.o translate-all.o cpu-exec.o
-obj-y += tcg/tcg.o tcg/optimize.o
+obj-y += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
diff --git a/android-qemu2-glue/build/Makefile.qemu2-sources.mk b/android-qemu2-glue/build/Makefile.qemu2-sources.mk
index 08b3395..09bd237 100644
--- a/android-qemu2-glue/build/Makefile.qemu2-sources.mk
+++ b/android-qemu2-glue/build/Makefile.qemu2-sources.mk
@@ -341,6 +341,7 @@
     savevm.c \
     tcg/optimize.c \
     tcg/tcg.c \
+    tcg/tcg-op.c \
     translate-all.c \
     vl.c \
     xen-common-stub.c \
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index c085804..24aa095 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -325,14 +325,22 @@
 extern RAMList ram_list;
 
 /* Flags stored in the low bits of the TLB virtual address.  These are
-   defined so that fast path ram access is all zeros.  */
+ * defined so that fast path ram access is all zeros.
+ * The flags all must be between TARGET_PAGE_BITS and
+ * maximum address alignment bit.
+ */
 /* Zero if TLB entry is valid.  */
-#define TLB_INVALID_MASK   (1 << 3)
+#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS - 1))
 /* Set if TLB entry references a clean RAM page.  The iotlb entry will
    contain the page physical address.  */
-#define TLB_NOTDIRTY    (1 << 4)
+#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS - 2))
 /* Set if TLB entry is an IO callback.  */
-#define TLB_MMIO        (1 << 5)
+#define TLB_MMIO            (1 << (TARGET_PAGE_BITS - 3))
+
+/* Use this mask to check interception with an alignment mask
+ * in a TCG backend.
+ */
+#define TLB_FLAGS_MASK  (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO)
 
 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 ram_addr_t last_ram_offset(void);
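
Moving the three flags from fixed low bits (3..5) up to the bits just below TARGET_PAGE_BITS, and grouping them as TLB_FLAGS_MASK, is what lets a TCG backend fold the alignment test into the page-match compare it already performs. A minimal sketch of that combined check, assuming a_bits holds the required alignment taken from the memop and tlb_addr is the stored TLB entry:

    target_ulong a_mask = (1 << a_bits) - 1;
    if ((addr & (TARGET_PAGE_MASK | a_mask)) == tlb_addr) {
        /* fast path: page hit, access aligned, no flag bits set */
    } else {
        /* slow path: TLB miss, notdirty/MMIO flag, or misaligned */
    }

Because every flag now sits above any legal alignment bit, a set flag bit or a nonzero low address bit both break the equality, so a single compare covers all three cases.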
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 0844885..9954e2b 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -104,6 +104,8 @@
                   hwaddr paddr, int prot,
                   int mmu_idx, target_ulong size);
 void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr);
+void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+                 uintptr_t retaddr);
 #else
 static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
 {
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index da53395..79af464 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -11,8 +11,8 @@
 
 static inline void gen_tb_start(void)
 {
-    TCGv_i32 count;
-    TCGv_i32 flag;
+    TCGv_i32 count, flag, imm;
+    int i;
 
     exitreq_label = gen_new_label();
     flag = tcg_temp_new_i32();
@@ -28,9 +28,17 @@
     count = tcg_temp_local_new_i32();
     tcg_gen_ld_i32(count, cpu_env,
                    -ENV_OFFSET + offsetof(CPUState, icount_decr.u32));
+
+    imm = tcg_temp_new_i32();
+    tcg_gen_movi_i32(imm, 0xdeadbeef);
+
     /* This is a horrid hack to allow fixing up the value later.  */
-    icount_arg = tcg_ctx.gen_opparam_ptr + 1;
-    tcg_gen_subi_i32(count, count, 0xdeadbeef);
+    i = tcg_ctx.gen_last_op_idx;
+    i = tcg_ctx.gen_op_buf[i].args;
+    icount_arg = &tcg_ctx.gen_opparam_buf[i + 1];
+
+    tcg_gen_sub_i32(count, count, imm);
+    tcg_temp_free_i32(imm);
 
     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, icount_label);
     tcg_gen_st16_i32(count, cpu_env,
@@ -48,6 +56,9 @@
         gen_set_label(icount_label);
         tcg_gen_exit_tb((uintptr_t)tb + TB_EXIT_ICOUNT_EXPIRED);
     }
+
+    /* Terminate the linked list.  */
+    tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
 }
 
 static inline void gen_io_start(void)
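
With the opcode stream now kept as an indexed op buffer rather than a raw opcode/parameter pointer, the slot holding the 0xdeadbeef placeholder is located through gen_last_op_idx. Note the ordering: the movi of 0xdeadbeef is emitted first, so gen_last_op_idx refers to that movi and icount_arg ends up pointing at its constant operand (args[1]); the following sub then consumes the patched value at execution time. The placeholder itself is presumably still overwritten when the block is finished, along the lines of:

    /* Sketch, assuming the patching still happens in gen_tb_end(): */
    if (use_icount) {
        *icount_arg = num_insns;   /* replace the 0xdeadbeef marker */
    }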
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index b8a2431..859d4c9 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -123,8 +123,7 @@
     void (*do_interrupt)(CPUState *cpu);
     CPUUnassignedAccess do_unassigned_access;
     void (*do_unaligned_access)(CPUState *cpu, vaddr addr,
-                                int is_write, int is_user, uintptr_t retaddr,
-                                unsigned size);
+                                int is_write, int is_user, uintptr_t retaddr);
     bool (*virtio_is_big_endian)(CPUState *cpu);
     int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
                            uint8_t *buf, int len, bool is_write);
@@ -589,12 +588,11 @@
 
 static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
                                         int is_write, int is_user,
-                                        uintptr_t retaddr, unsigned size)
+                                        uintptr_t retaddr)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
 
-    return cc->do_unaligned_access(cpu, addr, is_write, is_user, retaddr,
-            size);
+    return cc->do_unaligned_access(cpu, addr, is_write, is_user, retaddr);
 }
 #endif
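
The dropped size argument is not lost: the softmmu helpers now take a TCGMemOpIdx, which packs the memory operation (size, endianness, required alignment) together with the MMU index into one integer. A hedged sketch of how a helper unpacks it, using the accessors this series relies on:

    TCGMemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx);
    unsigned idx = get_mmuidx(oi);          /* MMU index portion */
    TCGMemOp mop = get_memop(oi);           /* size/endian/align */
    int a_bits = get_alignment_bits(mop);   /* <= 0 when no alignment
                                               is required (assumption) */

This is what allows softmmu_template.h below to hoist one generic check, a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0, in place of the old per-target ALIGNED_ONLY blocks.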
 
diff --git a/softmmu_template.h b/softmmu_template.h
index 18548af..df3e171 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -166,26 +166,27 @@
 #ifdef SOFTMMU_CODE_ACCESS
 static __attribute__((unused))
 #endif
-WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
-                            uintptr_t retaddr)
+WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
+                            TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
          != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                                 mmu_idx, retaddr, DATA_SIZE);
-        }
-#endif
         if (!VICTIM_TLB_HIT(ADDR_READ)) {
             tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                      mmu_idx, retaddr);
@@ -216,16 +217,12 @@
         DATA_TYPE res1, res2;
         unsigned shift;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr, DATA_SIZE);
-#endif
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
         /* Note the adjustment at the beginning of the function.
            Undo that for the recursion.  */
-        res1 = helper_le_ld_name(env, addr1, mmu_idx, retaddr + GETPC_ADJ);
-        res2 = helper_le_ld_name(env, addr2, mmu_idx, retaddr + GETPC_ADJ);
+        res1 = helper_le_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_le_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Little-endian combine.  */
@@ -233,14 +230,6 @@
         return res;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr, DATA_SIZE);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
 #if DATA_SIZE == 1
     res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
@@ -254,26 +243,27 @@
 #ifdef SOFTMMU_CODE_ACCESS
 static __attribute__((unused))
 #endif
-WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
-                            uintptr_t retaddr)
+WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
+                            TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
          != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                                 mmu_idx, retaddr, DATA_SIZE);
-        }
-#endif
         if (!VICTIM_TLB_HIT(ADDR_READ)) {
             tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                      mmu_idx, retaddr);
@@ -304,16 +294,12 @@
         DATA_TYPE res1, res2;
         unsigned shift;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr, DATA_SIZE);
-#endif
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
         /* Note the adjustment at the beginning of the function.
            Undo that for the recursion.  */
-        res1 = helper_be_ld_name(env, addr1, mmu_idx, retaddr + GETPC_ADJ);
-        res2 = helper_be_ld_name(env, addr2, mmu_idx, retaddr + GETPC_ADJ);
+        res1 = helper_be_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
+        res2 = helper_be_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Big-endian combine.  */
@@ -321,14 +307,6 @@
         return res;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
-                             mmu_idx, retaddr, DATA_SIZE);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
     res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
     return res;
@@ -339,7 +317,8 @@
 glue(glue(helper_ld, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
                                          int mmu_idx)
 {
-    return helper_te_ld_name (env, addr, mmu_idx, GETRA());
+    TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
+    return helper_te_ld_name (env, addr, oi, GETRA());
 }
 
 #ifndef SOFTMMU_CODE_ACCESS
@@ -348,16 +327,16 @@
    avoid this for 64-bit data, or for 32-bit data on 32-bit host.  */
 #if DATA_SIZE * 8 < TCG_TARGET_REG_BITS
 WORD_TYPE helper_le_lds_name(CPUArchState *env, target_ulong addr,
-                             int mmu_idx, uintptr_t retaddr)
+                             TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return (SDATA_TYPE)helper_le_ld_name(env, addr, mmu_idx, retaddr);
+    return (SDATA_TYPE)helper_le_ld_name(env, addr, oi, retaddr);
 }
 
 # if DATA_SIZE > 1
 WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr,
-                             int mmu_idx, uintptr_t retaddr)
+                             TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    return (SDATA_TYPE)helper_be_ld_name(env, addr, mmu_idx, retaddr);
+    return (SDATA_TYPE)helper_be_ld_name(env, addr, oi, retaddr);
 }
 # endif
 #endif
@@ -382,24 +361,25 @@
 }
 
 void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
-                       int mmu_idx, uintptr_t retaddr)
+                       TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr, DATA_SIZE);
-        }
-#endif
         if (!VICTIM_TLB_HIT(addr_write)) {
             tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
         }
@@ -427,10 +407,6 @@
                      >= TARGET_PAGE_SIZE)) {
         int i;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr, DATA_SIZE);
-#endif
         /* XXX: not efficient, but simple */
         /* Note: relies on the fact that tlb_fill() does not remove the
          * previous page from the TLB cache.  */
@@ -440,19 +416,11 @@
             /* Note the adjustment at the beginning of the function.
                Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            mmu_idx, retaddr + GETPC_ADJ);
+                                            oi, retaddr + GETPC_ADJ);
         }
         return;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr, DATA_SIZE);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
 #if DATA_SIZE == 1
     glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
@@ -463,24 +431,25 @@
 
 #if DATA_SIZE > 1
 void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
-                       int mmu_idx, uintptr_t retaddr)
+                       TCGMemOpIdx oi, uintptr_t retaddr)
 {
+    unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    int a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
     /* Adjust the given return address.  */
     retaddr -= GETPC_ADJ;
 
+    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+
     /* If the TLB entry is for a different page, reload and try again.  */
     if ((addr & TARGET_PAGE_MASK)
         != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-#ifdef ALIGNED_ONLY
-        if ((addr & (DATA_SIZE - 1)) != 0) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr, DATA_SIZE);
-        }
-#endif
         if (!VICTIM_TLB_HIT(addr_write)) {
             tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
         }
@@ -508,10 +477,6 @@
                      >= TARGET_PAGE_SIZE)) {
         int i;
     do_unaligned_access:
-#ifdef ALIGNED_ONLY
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr, DATA_SIZE);
-#endif
         /* XXX: not efficient, but simple */
         /* Note: relies on the fact that tlb_fill() does not remove the
          * previous page from the TLB cache.  */
@@ -521,19 +486,11 @@
             /* Note the adjustment at the beginning of the function.
                Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            mmu_idx, retaddr + GETPC_ADJ);
+                                            oi, retaddr + GETPC_ADJ);
         }
         return;
     }
 
-    /* Handle aligned access or unaligned access in the same page.  */
-#ifdef ALIGNED_ONLY
-    if ((addr & (DATA_SIZE - 1)) != 0) {
-        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                             mmu_idx, retaddr, DATA_SIZE);
-    }
-#endif
-
     haddr = addr + env->tlb_table[mmu_idx][index].addend;
     glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
 }
@@ -543,9 +500,32 @@
 glue(glue(helper_st, SUFFIX), MMUSUFFIX)(CPUArchState *env, target_ulong addr,
                                          DATA_TYPE val, int mmu_idx)
 {
-    helper_te_st_name(env, addr, val, mmu_idx, GETRA());
+    TCGMemOpIdx oi = make_memop_idx(SHIFT, mmu_idx);
+    helper_te_st_name(env, addr, val, oi, GETRA());
 }
 
+#if DATA_SIZE == 1
+/* Probe for whether the specified guest write access is permitted.
+ * If it is not permitted then an exception will be taken in the same
+ * way as if this were a real write access (and we will not return).
+ * Otherwise the function will return, and there will be a valid
+ * entry in the TLB for this access.
+ */
+void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+                 uintptr_t retaddr)
+{
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+
+    if ((addr & TARGET_PAGE_MASK)
+        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+        /* TLB entry is for a different page */
+        if (!VICTIM_TLB_HIT(addr_write)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+    }
+}
+#endif
 #endif /* !defined(SOFTMMU_CODE_ACCESS) */
 
 #undef READ_ACCESS_TYPE
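
probe_write gives store helpers a way to pull both pages of a spanning access into the TLB, taking any fault up front, before a single byte is written. A hedged usage sketch in the style of the MSA helpers later in this series (size is a hypothetical byte count; mmu_idx and retaddr as in the helpers above):

    if (unlikely(((addr & ~TARGET_PAGE_MASK) + size - 1)
                 >= TARGET_PAGE_SIZE)) {
        probe_write(env, addr, mmu_idx, retaddr);           /* first page */
        probe_write(env, (addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE,
                    mmu_idx, retaddr);                      /* second page */
    }

If either probe faults, the exception is raised before any partial store becomes architecturally visible.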
diff --git a/target-alpha/cpu-qom.h b/target-alpha/cpu-qom.h
index 273a8ed..b01c6c8 100644
--- a/target-alpha/cpu-qom.h
+++ b/target-alpha/cpu-qom.h
@@ -86,7 +86,6 @@
 int alpha_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int alpha_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 void alpha_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
-                                   int is_write, int is_user, uintptr_t retaddr,
-                                   unsigned size);
+                                   int is_write, int is_user, uintptr_t retaddr);
 
 #endif
diff --git a/target-alpha/mem_helper.c b/target-alpha/mem_helper.c
index 1d4666a..fc4f57a 100644
--- a/target-alpha/mem_helper.c
+++ b/target-alpha/mem_helper.c
@@ -97,8 +97,7 @@
 }
 
 void alpha_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
-                                   int is_write, int is_user, uintptr_t retaddr,
-                                   unsigned size)
+                                   int is_write, int is_user, uintptr_t retaddr)
 {
     AlphaCPU *cpu = ALPHA_CPU(cs);
     CPUAlphaState *env = &cpu->env;
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 76658a0..5bec586 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -2790,7 +2790,6 @@
     target_ulong pc_start;
     target_ulong pc_mask;
     uint32_t insn;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     ExitStatus ret;
@@ -2798,7 +2797,6 @@
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
     ctx.tb = tb;
     ctx.pc = pc_start;
@@ -2839,11 +2837,12 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
-                while (lj < j)
+                while (lj < j) {
                     tcg_ctx.gen_opc_instr_start[lj++] = 0;
+                }
             }
             tcg_ctx.gen_opc_pc[lj] = ctx.pc;
             tcg_ctx.gen_opc_instr_start[lj] = 1;
@@ -2881,7 +2880,7 @@
            or exhaust instruction count, stop generation.  */
         if (ret == NO_EXIT
             && ((ctx.pc & pc_mask) == 0
-                || tcg_ctx.gen_opc_ptr >= gen_opc_end
+                || tcg_op_buf_full()
                 || num_insns >= max_insns
                 || singlestep
                 || ctx.singlestep_enabled)) {
@@ -2912,12 +2911,13 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
-        while (lj <= j)
+        while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
+        }
     } else {
         tb->size = ctx.pc - pc_start;
         tb->icount = num_insns;
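
This is the first of many targets converted from raw gen_opc_ptr arithmetic to two accessors; the same mechanical change repeats in every translate.c below. Their assumed shape, in terms of the indexed op buffer introduced by this series:

    /* Sketch of the accessors these call sites rely on (assumption): */
    static inline int tcg_op_buf_count(void)
    {
        return tcg_ctx.gen_next_op_idx;     /* ops emitted so far */
    }

    static inline bool tcg_op_buf_full(void)
    {
        return tcg_op_buf_count() >= OPC_MAX_SIZE;
    }

The explicit "*tcg_ctx.gen_opc_ptr = INDEX_op_end;" terminator also disappears everywhere: the op stream is now terminated by the ".next = -1" store in gen_tb_end (see the gen-icount.h hunk above) rather than by a sentinel opcode.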
diff --git a/target-arm/helper.c b/target-arm/helper.c
index bf8d4b3..d436e34 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4993,13 +4993,15 @@
         int maxidx = DIV_ROUND_UP(blocklen, TARGET_PAGE_SIZE);
         void *hostaddr[maxidx];
         int try, i;
+        unsigned mmu_idx = cpu_mmu_index(env);
+        TCGMemOpIdx oi = make_memop_idx(MO_UB, mmu_idx);
 
         for (try = 0; try < 2; try++) {
 
             for (i = 0; i < maxidx; i++) {
                 hostaddr[i] = tlb_vaddr_to_host(env,
                                                 vaddr + TARGET_PAGE_SIZE * i,
-                                                1, cpu_mmu_index(env));
+                                                1, mmu_idx);
                 if (!hostaddr[i]) {
                     break;
                 }
@@ -5020,12 +5022,12 @@
              * this purpose use the actual register value passed to us
              * so that we get the fault address right.
              */
-            helper_ret_stb_mmu(env, vaddr_in, 0, cpu_mmu_index(env), GETRA());
+            helper_ret_stb_mmu(env, vaddr_in, 0, oi, GETRA());
             /* Now we can populate the other TLB entries, if any */
             for (i = 0; i < maxidx; i++) {
                 uint64_t va = vaddr + TARGET_PAGE_SIZE * i;
                 if (va != (vaddr_in & TARGET_PAGE_MASK)) {
-                    helper_ret_stb_mmu(env, va, 0, cpu_mmu_index(env), GETRA());
+                    helper_ret_stb_mmu(env, va, 0, oi, GETRA());
                 }
             }
         }
@@ -5042,7 +5044,7 @@
          *    bounce buffer was in use
          */
         for (i = 0; i < blocklen; i++) {
-            helper_ret_stb_mmu(env, vaddr + i, 0, cpu_mmu_index(env), GETRA());
+            helper_ret_stb_mmu(env, vaddr + i, 0, oi, GETRA());
         }
     }
 #else
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 80d2c07..1e4e4ef 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -10899,7 +10899,6 @@
     CPUARMState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     target_ulong next_page_start;
@@ -10910,8 +10909,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -10980,7 +10977,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -11030,7 +11027,7 @@
          * ensures prefetch aborts occur at the right place.
          */
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              !dc->ss_active &&
@@ -11090,7 +11087,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -11102,7 +11098,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index af51568..32d8505 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -10995,7 +10995,6 @@
     CPUARMState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     target_ulong next_page_start;
@@ -11016,8 +11015,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -11150,7 +11147,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -11216,7 +11213,7 @@
          * Also stop translation when a page boundary is reached.  This
          * ensures prefetch aborts occur at the right place.  */
         num_insns ++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              !dc->ss_active &&
@@ -11325,7 +11322,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -11337,7 +11333,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-cris/translate.c b/target-cris/translate.c
index 76406af..b4c4ca3 100644
--- a/target-cris/translate.c
+++ b/target-cris/translate.c
@@ -3116,7 +3116,6 @@
 {
     CPUState *cs = CPU(cpu);
     CPUCRISState *env = &cpu->env;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     unsigned int insn_len;
     int j, lj;
@@ -3142,8 +3141,6 @@
     dc->cpu = cpu;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->ppc = pc_start;
     dc->pc = pc_start;
@@ -3207,7 +3204,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -3291,7 +3288,7 @@
             break;
         }
     } while (!dc->is_jmp && !dc->cpustate_changed
-            && tcg_ctx.gen_opc_ptr < gen_opc_end
+            && !tcg_op_buf_full()
             && !singlestep
             && (dc->pc < next_page_start)
             && num_insns < max_insns);
@@ -3344,9 +3341,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -3361,8 +3358,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         log_target_disas(env, pc_start, dc->pc - pc_start,
                          env->pregs[PR_VR]);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 #endif
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 8af4ee9..899ca51 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7901,7 +7901,6 @@
     CPUX86State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     target_ulong pc_ptr;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj;
     uint64_t flags;
@@ -7970,8 +7969,6 @@
     cpu_ptr1 = tcg_temp_new_ptr();
     cpu_cc_srcT = tcg_temp_local_new();
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     pc_ptr = pc_start;
     lj = -1;
@@ -7992,7 +7989,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -8030,7 +8027,7 @@
             break;
         }
         /* if too long translation, stop generation too */
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end ||
+        if (tcg_op_buf_full() ||
             (pc_ptr - pc_start) >= (TARGET_PAGE_SIZE - 32) ||
             num_insns >= max_insns) {
             gen_jmp_im(pc_ptr - dc->cs_base);
@@ -8047,10 +8044,10 @@
         gen_io_end();
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     /* we don't forget to fill the last values */
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 8454e8b..79b67d1 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -1062,7 +1062,6 @@
     CPUState *cs = CPU(cpu);
     CPULM32State *env = &cpu->env;
     struct DisasContext ctx, *dc = &ctx;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, lj;
     uint32_t next_page_start;
@@ -1075,8 +1074,6 @@
     dc->num_watchpoints = cpu->num_watchpoints;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -1100,7 +1097,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -1124,7 +1121,7 @@
         num_insns++;
 
     } while (!dc->is_jmp
-         && tcg_ctx.gen_opc_ptr < gen_opc_end
+         && !tcg_op_buf_full()
          && !cs->singlestep_enabled
          && !singlestep
          && (dc->pc < next_page_start)
@@ -1158,9 +1155,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -1174,9 +1171,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         qemu_log("\n");
         log_target_disas(env, pc_start, dc->pc - pc_start, 0);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 }
diff --git a/target-m68k/translate.c b/target-m68k/translate.c
index efd4cfc..5c2f4d0 100644
--- a/target-m68k/translate.c
+++ b/target-m68k/translate.c
@@ -2980,7 +2980,6 @@
     CPUState *cs = CPU(cpu);
     CPUM68KState *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj;
     target_ulong pc_start;
@@ -2993,8 +2992,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->env = env;
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
@@ -3026,7 +3023,7 @@
                 break;
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -3041,7 +3038,7 @@
         dc->insn_pc = dc->pc;
 	disas_m68k_insn(env, dc);
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              (pc_offset) < (TARGET_PAGE_SIZE - 32) &&
@@ -3075,7 +3072,6 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -3086,7 +3082,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-microblaze/translate.c b/target-microblaze/translate.c
index fd2b771..d37f235 100644
--- a/target-microblaze/translate.c
+++ b/target-microblaze/translate.c
@@ -1673,7 +1673,6 @@
 {
     CPUState *cs = CPU(cpu);
     CPUMBState *env = &cpu->env;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, lj;
     struct DisasContext ctx;
@@ -1688,8 +1687,6 @@
     dc->tb = tb;
     org_flags = dc->synced_flags = dc->tb_flags = tb->flags;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->jmp = 0;
     dc->delayed_branch = !!(dc->tb_flags & D_FLAG);
@@ -1732,7 +1729,7 @@
         check_breakpoint(env, dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -1795,10 +1792,10 @@
             break;
         }
     } while (!dc->is_jmp && !dc->cpustate_changed
-         && tcg_ctx.gen_opc_ptr < gen_opc_end
-                 && !singlestep
-         && (dc->pc < next_page_start)
-                 && num_insns < max_insns);
+             && !tcg_op_buf_full()
+             && !singlestep
+             && (dc->pc < next_page_start)
+             && num_insns < max_insns);
 
     npc = dc->pc;
     if (dc->jmp == JMP_DIRECT || dc->jmp == JMP_DIRECT_CC) {
@@ -1846,9 +1843,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
@@ -1864,9 +1861,8 @@
 #if DISAS_GNU
         log_target_disas(env, pc_start, dc->pc - pc_start, 0);
 #endif
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 #endif
diff --git a/target-mips/cpu-qom.h b/target-mips/cpu-qom.h
index cbd626b..2ffc1bf 100644
--- a/target-mips/cpu-qom.h
+++ b/target-mips/cpu-qom.h
@@ -82,7 +82,6 @@
 int mips_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int mips_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
-                                  int is_write, int is_user, uintptr_t retaddr,
-                                  unsigned size);
+                                  int is_write, int is_user, uintptr_t retaddr);
 
 #endif
diff --git a/target-mips/cpu.h b/target-mips/cpu.h
index 21c8cfc..8ab5df2 100644
--- a/target-mips/cpu.h
+++ b/target-mips/cpu.h
@@ -804,8 +804,6 @@
 void r4k_invalidate_tlb (CPUMIPSState *env, int idx, int use_extra);
 hwaddr cpu_mips_translate_address (CPUMIPSState *env, target_ulong address,
 		                               int rw);
-bool cpu_mips_validate_access(CPUMIPSState *env, target_ulong address,
-                            target_ulong badvaddr, unsigned data_size, int rw);
 #endif
 target_ulong exception_resume_pc (CPUMIPSState *env);
 
diff --git a/target-mips/helper.c b/target-mips/helper.c
index bfcffa2..ba198c5 100644
--- a/target-mips/helper.c
+++ b/target-mips/helper.c
@@ -618,42 +618,6 @@
         return physical;
     }
 }
-
-bool cpu_mips_validate_access(CPUMIPSState *env, target_ulong address,
-                            target_ulong badvaddr, unsigned data_size, int rw)
-{
-    hwaddr physical;
-    int prot;
-    int access_type = ACCESS_INT;
-    int ret;
-    target_ulong addr;
-
-    addr = address & ~((target_ulong) data_size - 1);
-    ret = get_physical_address(env, &physical, &prot,
-            addr, rw, access_type);
-    if (ret != TLBRET_MATCH) {
-        if (ret != TLBRET_BADADDR && addr > badvaddr) {
-            badvaddr = addr;
-        }
-        raise_mmu_exception(env, badvaddr, rw, ret);
-        return false;
-    }
-    if (data_size > 1
-            && unlikely((address & ~TARGET_PAGE_MASK) + data_size - 1
-                        >= TARGET_PAGE_SIZE)) {
-        addr += data_size;
-        ret = get_physical_address(env, &physical, &prot,
-                addr, rw, access_type);
-        if (ret != TLBRET_MATCH) {
-            if (ret != TLBRET_BADADDR) {
-                badvaddr = addr;
-            }
-            raise_mmu_exception(env, badvaddr, rw, ret);
-            return false;
-        }
-    }
-    return true;
-}
 #endif
 
 static const char * const excp_names[EXCP_LAST + 1] = {
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 3687df4..b8716de 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -931,5 +931,11 @@
 DEF_HELPER_4(msa_ffint_s_df, void, env, i32, i32, i32)
 DEF_HELPER_4(msa_ffint_u_df, void, env, i32, i32, i32)
 
-DEF_HELPER_5(msa_ld_df, void, env, i32, i32, i32, s32)
-DEF_HELPER_5(msa_st_df, void, env, i32, i32, i32, s32)
+#define MSALDST_PROTO(type)                         \
+DEF_HELPER_3(msa_ld_ ## type, void, env, i32, tl)   \
+DEF_HELPER_3(msa_st_ ## type, void, env, i32, tl)
+MSALDST_PROTO(b)
+MSALDST_PROTO(h)
+MSALDST_PROTO(w)
+MSALDST_PROTO(d)
+#undef MSALDST_PROTO
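
For reference, MSALDST_PROTO(b) expands to the pair of three-argument prototypes that replace the old five-argument, df-dispatching helpers:

    DEF_HELPER_3(msa_ld_b, void, env, i32, tl)
    DEF_HELPER_3(msa_st_b, void, env, i32, tl)

The data format is now baked into the helper name, and a precomputed target_ulong address replaces the (rs, s10) register/offset pair.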
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index c5edf4d..26cabac 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -90,10 +90,10 @@
     }                                                                   \
 }
 #endif
-HELPER_LD(lbu, ldub, uint8_t)
-HELPER_LD(lhu, lduw, uint16_t)
 HELPER_LD(lw, ldl, int32_t)
+#if defined(TARGET_MIPS64)
 HELPER_LD(ld, ldq, int64_t)
+#endif
 #undef HELPER_LD
 
 #if defined(CONFIG_USER_ONLY)
@@ -118,9 +118,10 @@
 }
 #endif
 HELPER_ST(sb, stb, uint8_t)
-HELPER_ST(sh, stw, uint16_t)
 HELPER_ST(sw, stl, uint32_t)
+#if defined(TARGET_MIPS64)
 HELPER_ST(sd, stq, uint64_t)
+#endif
 #undef HELPER_ST
 
 target_ulong helper_clo (target_ulong arg1)
@@ -2287,26 +2288,13 @@
 
 void mips_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                   int access_type, int is_user,
-                                  uintptr_t retaddr, unsigned size)
+                                  uintptr_t retaddr)
 {
     MIPSCPU *cpu = MIPS_CPU(cs);
     CPUMIPSState *env = &cpu->env;
     int error_code = 0;
     int excp;
 
-    if (env->insn_flags & ISA_MIPS32R6) {
-        /* Release 6 provides support for misaligned memory access for
-         * all ordinary memory reference instructions
-         * */
-        if (!cpu_mips_validate_access(env, addr, addr, size, access_type)) {
-            CPUState *cs = CPU(mips_env_get_cpu(env));
-            do_raise_exception_err(env, cs->exception_index,
-                                   env->error_code, retaddr);
-            return;
-        }
-        return;
-    }
-
     env->CP0_BadVAddr = addr;
 
     if (access_type == MMU_DATA_STORE) {
@@ -3698,101 +3686,81 @@
 #define DF_ELEMENTS(df) (MSA_WRLEN / DF_BITS(df))
 
 #if !defined(CONFIG_USER_ONLY)
-static bool cpu_mips_validate_msa_block_access(CPUMIPSState *env,
-        target_ulong address, int df, int rw)
-{
-    int i;
-    for (i = 0; i < DF_ELEMENTS(df); i++) {
-        if (!cpu_mips_validate_access(env, address + (i << df),
-                address, (1 << df), rw)) {
-            CPUState *cs = CPU(mips_env_get_cpu(env));
-            do_raise_exception_err(env, cs->exception_index,
-                                   env->error_code, GETRA());
-            return false;
-        }
-    }
-    return true;
-}
+#define MEMOP_IDX(DF)                                           \
+        TCGMemOpIdx oi = make_memop_idx(MO_TE | DF | MO_UNALN,  \
+                                        cpu_mmu_index(env));
+#else
+#define MEMOP_IDX(DF)
 #endif
 
-void helper_msa_ld_df(CPUMIPSState *env, uint32_t df, uint32_t wd, uint32_t rs,
-                     int32_t s10)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    target_ulong addr = env->active_tc.gpr[rs] + (s10 << df);
-    int i;
+#define MSA_LD_DF(DF, TYPE, LD_INSN, ...)                               \
+void helper_msa_ld_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
+                            target_ulong addr)                          \
+{                                                                       \
+    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
+    wr_t wx;                                                            \
+    int i;                                                              \
+    MEMOP_IDX(DF)                                                       \
+    for (i = 0; i < DF_ELEMENTS(DF); i++) {                             \
+        wx.TYPE[i] = LD_INSN(env, addr + (i << DF), ##__VA_ARGS__);     \
+    }                                                                   \
+    memcpy(pwd, &wx, sizeof(wr_t));                                     \
+}
 
 #if !defined(CONFIG_USER_ONLY)
-    if (!cpu_mips_validate_msa_block_access(env, addr, df, MMU_DATA_LOAD)) {
-        return;
-    }
+MSA_LD_DF(DF_BYTE,   b, helper_ret_ldub_mmu, oi, GETRA())
+MSA_LD_DF(DF_HALF,   h, helper_ret_lduw_mmu, oi, GETRA())
+MSA_LD_DF(DF_WORD,   w, helper_ret_ldul_mmu, oi, GETRA())
+MSA_LD_DF(DF_DOUBLE, d, helper_ret_ldq_mmu,  oi, GETRA())
+#else
+MSA_LD_DF(DF_BYTE,   b, cpu_ldub_data)
+MSA_LD_DF(DF_HALF,   h, cpu_lduw_data)
+MSA_LD_DF(DF_WORD,   w, cpu_ldl_data)
+MSA_LD_DF(DF_DOUBLE, d, cpu_ldq_data)
 #endif
 
-    switch (df) {
-    case DF_BYTE:
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
-            pwd->b[i] = do_lbu(env, addr + (i << DF_BYTE),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_HALF:
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {
-            pwd->h[i] = do_lhu(env, addr + (i << DF_HALF),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_WORD:
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {
-            pwd->w[i] = do_lw(env, addr + (i << DF_WORD),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_DOUBLE:
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {
-            pwd->d[i] = do_ld(env, addr + (i << DF_DOUBLE),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
+#define MSA_PAGESPAN(x) \
+        ((((x) & ~TARGET_PAGE_MASK) + MSA_WRLEN/8 - 1) >= TARGET_PAGE_SIZE)
+
+static inline void ensure_writable_pages(CPUMIPSState *env,
+                                         target_ulong addr,
+                                         int mmu_idx,
+                                         uintptr_t retaddr)
+{
+#if !defined(CONFIG_USER_ONLY)
+    target_ulong page_addr;
+    if (unlikely(MSA_PAGESPAN(addr))) {
+        /* first page */
+        probe_write(env, addr, mmu_idx, retaddr);
+        /* second page */
+        page_addr = (addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+        probe_write(env, page_addr, mmu_idx, retaddr);
     }
+#endif
 }
 
-void helper_msa_st_df(CPUMIPSState *env, uint32_t df, uint32_t wd, uint32_t rs,
-                     int32_t s10)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    target_ulong addr = env->active_tc.gpr[rs] + (s10 << df);
-    int i;
+#define MSA_ST_DF(DF, TYPE, ST_INSN, ...)                               \
+void helper_msa_st_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
+                            target_ulong addr)                          \
+{                                                                       \
+    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
+    int mmu_idx = cpu_mmu_index(env);                                   \
+    int i;                                                              \
+    MEMOP_IDX(DF)                                                       \
+    ensure_writable_pages(env, addr, mmu_idx, GETRA());                 \
+    for (i = 0; i < DF_ELEMENTS(DF); i++) {                             \
+        ST_INSN(env, addr + (i << DF), pwd->TYPE[i], ##__VA_ARGS__);    \
+    }                                                                   \
+}
 
 #if !defined(CONFIG_USER_ONLY)
-    if (!cpu_mips_validate_msa_block_access(env, addr, df, MMU_DATA_STORE)) {
-        return;
-    }
+MSA_ST_DF(DF_BYTE,   b, helper_ret_stb_mmu, oi, GETRA())
+MSA_ST_DF(DF_HALF,   h, helper_ret_stw_mmu, oi, GETRA())
+MSA_ST_DF(DF_WORD,   w, helper_ret_stl_mmu, oi, GETRA())
+MSA_ST_DF(DF_DOUBLE, d, helper_ret_stq_mmu, oi, GETRA())
+#else
+MSA_ST_DF(DF_BYTE,   b, cpu_stb_data)
+MSA_ST_DF(DF_HALF,   h, cpu_stw_data)
+MSA_ST_DF(DF_WORD,   w, cpu_stl_data)
+MSA_ST_DF(DF_DOUBLE, d, cpu_stq_data)
 #endif
-
-    switch (df) {
-    case DF_BYTE:
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
-            do_sb(env, addr + (i << DF_BYTE), pwd->b[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_HALF:
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {
-            do_sh(env, addr + (i << DF_HALF), pwd->h[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_WORD:
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {
-            do_sw(env, addr + (i << DF_WORD), pwd->w[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_DOUBLE:
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {
-            do_sd(env, addr + (i << DF_DOUBLE), pwd->d[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    }
-}
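
The new load macro stages every element into a local wr_t and only memcpys the result into the architectural register at the end, so a TLB fault on any element leaves wd unmodified. Expanded by hand for the byte case in a softmmu build (a sketch; whitespace differs from the real preprocessor output):

    void helper_msa_ld_b(CPUMIPSState *env, uint32_t wd, target_ulong addr)
    {
        wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
        wr_t wx;        /* staging copy, committed only on full success */
        int i;
        TCGMemOpIdx oi = make_memop_idx(MO_TE | DF_BYTE | MO_UNALN,
                                        cpu_mmu_index(env));
        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
            wx.b[i] = helper_ret_ldub_mmu(env, addr + (i << DF_BYTE),
                                          oi, GETRA());
        }
        memcpy(pwd, &wx, sizeof(wr_t));
    }

Stores get the symmetric treatment, with ensure_writable_pages probing a page-spanning access before the first element is written.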
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 02e9495..44b5fcc 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -2171,6 +2171,7 @@
     int32_t CP0_Config1;
     /* Routine used to access memory */
     int mem_idx;
+    TCGMemOp default_tcg_memop_mask;
     uint32_t hflags, saved_hflags;
     int bstate;
     target_ulong btarget;
@@ -2848,12 +2849,14 @@
     switch (opc) {
 #if defined(TARGET_MIPS64)
     case OPC_LWU:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lwu";
         break;
     case OPC_LD:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "ld";
         break;
@@ -2924,17 +2927,20 @@
         opn = "lwpc";
         break;
     case OPC_LW:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESL);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESL |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lw";
         break;
     case OPC_LH:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESW);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESW |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lh";
         break;
     case OPC_LHU:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUW);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUW |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lhu";
         break;
@@ -3018,7 +3024,8 @@
     switch (opc) {
 #if defined(TARGET_MIPS64)
     case OPC_SD:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEQ);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEQ |
+                           ctx->default_tcg_memop_mask);
         opn = "sd";
         break;
     case OPC_SDL:
@@ -3033,11 +3040,13 @@
         break;
 #endif
     case OPC_SW:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUL);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUL |
+                           ctx->default_tcg_memop_mask);
         opn = "sw";
         break;
     case OPC_SH:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUW);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUW |
+                           ctx->default_tcg_memop_mask);
         opn = "sh";
         break;
     case OPC_SB:
@@ -3114,7 +3123,8 @@
     case OPC_LWC1:
         {
             TCGv_i32 fp0 = tcg_temp_new_i32();
-            tcg_gen_qemu_ld_i32(fp0, t0, ctx->mem_idx, MO_TESL);
+            tcg_gen_qemu_ld_i32(fp0, t0, ctx->mem_idx, MO_TESL |
+                                ctx->default_tcg_memop_mask);
             gen_store_fpr32(ctx, fp0, ft);
             tcg_temp_free_i32(fp0);
         }
@@ -3124,7 +3134,8 @@
         {
             TCGv_i32 fp0 = tcg_temp_new_i32();
             gen_load_fpr32(ctx, fp0, ft);
-            tcg_gen_qemu_st_i32(fp0, t0, ctx->mem_idx, MO_TEUL);
+            tcg_gen_qemu_st_i32(fp0, t0, ctx->mem_idx, MO_TEUL |
+                                ctx->default_tcg_memop_mask);
             tcg_temp_free_i32(fp0);
         }
         opn = "swc1";
@@ -3132,7 +3143,8 @@
     case OPC_LDC1:
         {
             TCGv_i64 fp0 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(fp0, t0, ctx->mem_idx, MO_TEQ);
+            tcg_gen_qemu_ld_i64(fp0, t0, ctx->mem_idx, MO_TEQ |
+                                ctx->default_tcg_memop_mask);
             gen_store_fpr64(ctx, fp0, ft);
             tcg_temp_free_i64(fp0);
         }
@@ -3142,7 +3154,8 @@
         {
             TCGv_i64 fp0 = tcg_temp_new_i64();
             gen_load_fpr64(ctx, fp0, ft);
-            tcg_gen_qemu_st_i64(fp0, t0, ctx->mem_idx, MO_TEQ);
+            tcg_gen_qemu_st_i64(fp0, t0, ctx->mem_idx, MO_TEQ |
+                                ctx->default_tcg_memop_mask);
             tcg_temp_free_i64(fp0);
         }
         opn = "sdc1";
@@ -19086,32 +19099,39 @@
             uint8_t wd = (ctx->opcode >> 6) & 0x1f;
             uint8_t df = (ctx->opcode >> 0) & 0x3;
 
-            TCGv_i32 tdf = tcg_const_i32(df);
             TCGv_i32 twd = tcg_const_i32(wd);
-            TCGv_i32 trs = tcg_const_i32(rs);
-            TCGv_i32 ts10 = tcg_const_i32(s10);
+            TCGv taddr = tcg_temp_new();
+            gen_base_offset_addr(ctx, taddr, rs, s10 << df);
 
             switch (MASK_MSA_MINOR(opcode)) {
             case OPC_LD_B:
+                gen_helper_msa_ld_b(cpu_env, twd, taddr);
+                break;
             case OPC_LD_H:
+                gen_helper_msa_ld_h(cpu_env, twd, taddr);
+                break;
             case OPC_LD_W:
+                gen_helper_msa_ld_w(cpu_env, twd, taddr);
+                break;
             case OPC_LD_D:
-                save_cpu_state(ctx, 1);
-                gen_helper_msa_ld_df(cpu_env, tdf, twd, trs, ts10);
+                gen_helper_msa_ld_d(cpu_env, twd, taddr);
                 break;
             case OPC_ST_B:
+                gen_helper_msa_st_b(cpu_env, twd, taddr);
+                break;
             case OPC_ST_H:
+                gen_helper_msa_st_h(cpu_env, twd, taddr);
+                break;
             case OPC_ST_W:
+                gen_helper_msa_st_w(cpu_env, twd, taddr);
+                break;
             case OPC_ST_D:
-                save_cpu_state(ctx, 1);
-                gen_helper_msa_st_df(cpu_env, tdf, twd, trs, ts10);
+                gen_helper_msa_st_d(cpu_env, twd, taddr);
                 break;
             }
 
             tcg_temp_free_i32(twd);
-            tcg_temp_free_i32(tdf);
-            tcg_temp_free_i32(trs);
-            tcg_temp_free_i32(ts10);
+            tcg_temp_free(taddr);
         }
         break;
     default:
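
The guest address rs + (s10 << df) is now computed at translate time and handed to the helper, and save_cpu_state is no longer needed because the helpers restart the instruction through GETRA(). gen_base_offset_addr already exists elsewhere in this file; roughly, it does:

    /* Sketch: addr = (base ? gpr[base] : 0) + offset */
    if (base == 0) {
        tcg_gen_movi_tl(addr, offset);
    } else if (offset == 0) {
        gen_load_gpr(addr, base);
    } else {
        tcg_gen_movi_tl(addr, offset);
        gen_op_addr_add(ctx, addr, cpu_gpr[base], addr);
    }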
@@ -19766,7 +19786,6 @@
     CPUMIPSState *env = &cpu->env;
     DisasContext ctx;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     int num_insns;
@@ -19778,7 +19797,6 @@
         qemu_log("search pc %d\n", search_pc);
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.singlestep_enabled = cs->singlestep_enabled;
@@ -19800,6 +19818,8 @@
 #else
         ctx.mem_idx = ctx.hflags & MIPS_HFLAG_KSU;
 #endif
+    ctx.default_tcg_memop_mask = (ctx.insn_flags & ISA_MIPS32R6) ?
+                                 MO_UNALN : MO_ALIGN;
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
     if (max_insns == 0)
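
default_tcg_memop_mask is OR-ed into every ordinary load/store in the hunks above, so pre-R6 cores keep their architectural alignment traps through the generic softmmu path, while R6 cores take the unaligned slow path instead of faulting. A worked example, under the assumption that MO_ALIGN requests natural alignment for the access size:

    TCGMemOp op = MO_TEUW | MO_ALIGN;     /* 16-bit load, must be 2-aligned */
    int a_bits = get_alignment_bits(op);  /* 1 for a 2-byte access */
    /* In the softmmu helpers, any odd address then satisfies
     *   a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0
     * and is routed to mips_cpu_do_unaligned_access (AdEL/AdES).
     */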
@@ -19822,7 +19842,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -19880,7 +19900,7 @@
         if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0)
             break;
 
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end) {
+        if (tcg_op_buf_full()) {
             break;
         }
 
@@ -19915,9 +19935,9 @@
     }
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-moxie/translate.c b/target-moxie/translate.c
index 4541b9b..cc9af66 100644
--- a/target-moxie/translate.c
+++ b/target-moxie/translate.c
@@ -827,14 +827,12 @@
     CPUState *cs = CPU(cpu);
     DisasContext ctx;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     CPUMoxieState *env = &cpu->env;
     int num_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.tb = tb;
@@ -857,7 +855,7 @@
         }
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -879,7 +877,7 @@
         if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0) {
             break;
         }
-    } while (ctx.bstate == BS_NONE && tcg_ctx.gen_opc_ptr < gen_opc_end);
+    } while (ctx.bstate == BS_NONE && !tcg_op_buf_full());
 
     if (cs->singlestep_enabled) {
         tcg_gen_movi_tl(cpu_pc, ctx.pc);
@@ -900,9 +898,9 @@
     }
  done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-openrisc/translate.c b/target-openrisc/translate.c
index 407bd97..fdc1a16 100644
--- a/target-openrisc/translate.c
+++ b/target-openrisc/translate.c
@@ -1642,7 +1642,6 @@
 {
     CPUState *cs = CPU(cpu);
     struct DisasContext ctx, *dc = &ctx;
-    uint16_t *gen_opc_end;
     uint32_t pc_start;
     int j, k;
     uint32_t next_page_start;
@@ -1652,7 +1651,6 @@
     pc_start = tb->pc;
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     dc->is_jmp = DISAS_NEXT;
     dc->ppc = pc_start;
     dc->pc = pc_start;
@@ -1680,7 +1678,7 @@
     do {
         check_breakpoint(cpu, dc);
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (k < j) {
                 k++;
                 while (k < j) {
@@ -1721,7 +1719,7 @@
             }
         }
     } while (!dc->is_jmp
-             && tcg_ctx.gen_opc_ptr < gen_opc_end
+             && !tcg_op_buf_full()
              && !cs->singlestep_enabled
              && !singlestep
              && (dc->pc < next_page_start)
@@ -1759,9 +1757,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         k++;
         while (k <= j) {
             tcg_ctx.gen_opc_instr_start[k++] = 0;
@@ -1775,9 +1773,8 @@
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
         qemu_log("\n");
         log_target_disas(&cpu->env, pc_start, dc->pc - pc_start, 0);
-        qemu_log("\nisize=%d osize=%td\n",
-            dc->pc - pc_start, tcg_ctx.gen_opc_ptr -
-            tcg_ctx.gen_opc_buf);
+        qemu_log("\nisize=%d osize=%d\n",
+                 dc->pc - pc_start, tcg_op_buf_count());
     }
 #endif
 }
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index d381632..344cb5d 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -11273,14 +11273,12 @@
     DisasContext ctx, *ctxp = &ctx;
     opc_handler_t **table, *handler;
     target_ulong pc_start;
-    uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int j, lj = -1;
     int num_insns;
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.nip = pc_start;
     ctx.tb = tb;
     ctx.exception = POWERPC_EXCP_NONE;
@@ -11332,8 +11330,7 @@
     gen_tb_start();
     tcg_clear_temp_count();
     /* Set env in case of segfault during code fetch */
-    while (ctx.exception == POWERPC_EXCP_NONE
-            && tcg_ctx.gen_opc_ptr < gen_opc_end) {
+    while (ctx.exception == POWERPC_EXCP_NONE && !tcg_op_buf_full()) {
         if (unlikely(!QTAILQ_EMPTY(&cs->breakpoints))) {
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (bp->pc == ctx.nip) {
@@ -11343,7 +11340,7 @@
             }
         }
         if (unlikely(search_pc)) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -11449,9 +11446,9 @@
         tcg_gen_exit_tb(0);
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (unlikely(search_pc)) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index dbf1993..215ef2e 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -4750,7 +4750,6 @@
     DisasContext dc;
     target_ulong pc_start;
     uint64_t next_page_start;
-    uint16_t *gen_opc_end;
     int j, lj = -1;
     int num_insns, max_insns;
     CPUBreakpoint *bp;
@@ -4769,8 +4768,6 @@
     dc.cc_op = CC_OP_DYNAMIC;
     do_debug = dc.singlestep_enabled = cs->singlestep_enabled;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
 
     num_insns = 0;
@@ -4783,7 +4780,7 @@
 
     do {
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -4821,7 +4818,7 @@
            or exhaust instruction count, stop generation.  */
         if (status == NO_EXIT
             && (dc.pc >= next_page_start
-                || tcg_ctx.gen_opc_ptr >= gen_opc_end
+                || tcg_op_buf_full()
                 || num_insns >= max_insns
                 || singlestep
                 || cs->singlestep_enabled)) {
@@ -4856,9 +4853,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 3088edc..bc82044 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -1865,14 +1865,12 @@
     CPUSH4State *env = &cpu->env;
     DisasContext ctx;
     target_ulong pc_start;
-    static uint16_t *gen_opc_end;
     CPUBreakpoint *bp;
     int i, ii;
     int num_insns;
     int max_insns;
 
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.flags = (uint32_t)tb->flags;
     ctx.bstate = BS_NONE;
@@ -1891,7 +1889,7 @@
     if (max_insns == 0)
         max_insns = CF_COUNT_MASK;
     gen_tb_start();
-    while (ctx.bstate == BS_NONE && tcg_ctx.gen_opc_ptr < gen_opc_end) {
+    while (ctx.bstate == BS_NONE && !tcg_op_buf_full()) {
         if (unlikely(!QTAILQ_EMPTY(&cs->breakpoints))) {
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (ctx.pc == bp->pc) {
@@ -1904,7 +1902,7 @@
 	    }
 	}
         if (search_pc) {
-            i = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            i = tcg_op_buf_count();
             if (ii < i) {
                 ii++;
                 while (ii < i)
@@ -1962,9 +1960,9 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (search_pc) {
-        i = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        i = tcg_op_buf_count();
         ii++;
         while (ii <= i)
             tcg_ctx.gen_opc_instr_start[ii++] = 0;
diff --git a/target-sparc/cpu-qom.h b/target-sparc/cpu-qom.h
index 4bfbb84..477c4d5 100644
--- a/target-sparc/cpu-qom.h
+++ b/target-sparc/cpu-qom.h
@@ -83,7 +83,6 @@
 int sparc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 void QEMU_NORETURN sparc_cpu_do_unaligned_access(CPUState *cpu,
                                                  vaddr addr, int is_write,
-                                                 int is_user, uintptr_t retaddr,
-                                                 unsigned size);
+                                                 int is_user, uintptr_t retaddr);
 
 #endif
diff --git a/target-sparc/ldst_helper.c b/target-sparc/ldst_helper.c
index b032608..1a62e19 100644
--- a/target-sparc/ldst_helper.c
+++ b/target-sparc/ldst_helper.c
@@ -2420,8 +2420,7 @@
 #if !defined(CONFIG_USER_ONLY)
 void QEMU_NORETURN sparc_cpu_do_unaligned_access(CPUState *cs,
                                                  vaddr addr, int is_write,
-                                                 int is_user, uintptr_t retaddr,
-                                                 unsigned size)
+                                                 int is_user, uintptr_t retaddr)
 {
     SPARCCPU *cpu = SPARC_CPU(cs);
     CPUSPARCState *env = &cpu->env;
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index 78c4e21..99b4f53 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -5245,7 +5245,6 @@
     CPUState *cs = CPU(cpu);
     CPUSPARCState *env = &cpu->env;
     target_ulong pc_start, last_pc;
-    uint16_t *gen_opc_end;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
     int j, lj = -1;
@@ -5265,7 +5264,6 @@
     dc->fpu_enabled = tb_fpu_enabled(tb->flags);
     dc->address_mask_32bit = tb_am_enabled(tb->flags);
     dc->singlestep = (cs->singlestep_enabled || singlestep);
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
@@ -5287,7 +5285,7 @@
         }
         if (spc) {
             qemu_log("Search PC...\n");
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j)
@@ -5320,7 +5318,7 @@
         if (dc->singlestep) {
             break;
         }
-    } while ((tcg_ctx.gen_opc_ptr < gen_opc_end) &&
+    } while (!tcg_op_buf_full() &&
              (dc->pc - pc_start) < (TARGET_PAGE_SIZE - 32) &&
              num_insns < max_insns);
 
@@ -5342,9 +5340,9 @@
         }
     }
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
     if (spc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j)
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-tricore/translate.c b/target-tricore/translate.c
index d5a9596..68e3c53 100644
--- a/target-tricore/translate.c
+++ b/target-tricore/translate.c
@@ -2430,7 +2430,6 @@
     DisasContext ctx;
     target_ulong pc_start;
     int num_insns;
-    uint16_t *gen_opc_end;
 
     if (search_pc) {
         qemu_log("search pc %d\n", search_pc);
@@ -2438,7 +2437,6 @@
 
     num_insns = 0;
     pc_start = tb->pc;
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     ctx.pc = pc_start;
     ctx.saved_pc = -1;
     ctx.tb = tb;
@@ -2454,7 +2452,7 @@
 
         num_insns++;
 
-        if (tcg_ctx.gen_opc_ptr >= gen_opc_end) {
+        if (tcg_op_buf_full()) {
             gen_save_pc(ctx.next_pc);
             tcg_gen_exit_tb(0);
             break;
@@ -2468,7 +2466,6 @@
     }
 
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
     if (search_pc) {
         printf("done_generating search pc\n");
     } else {
diff --git a/target-unicore32/translate.c b/target-unicore32/translate.c
index 653c225..e95c1e2 100644
--- a/target-unicore32/translate.c
+++ b/target-unicore32/translate.c
@@ -1877,7 +1877,6 @@
     CPUUniCore32State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     CPUBreakpoint *bp;
-    uint16_t *gen_opc_end;
     int j, lj;
     target_ulong pc_start;
     uint32_t next_page_start;
@@ -1891,8 +1890,6 @@
 
     dc->tb = tb;
 
-    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
-
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -1933,7 +1930,7 @@
             }
         }
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -1965,7 +1962,7 @@
          * Also stop translation when a page boundary is reached.  This
          * ensures prefetch aborts occur at the right place.  */
         num_insns++;
-    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+    } while (!dc->is_jmp && !tcg_op_buf_full() &&
              !cs->singlestep_enabled &&
              !singlestep &&
              dc->pc < next_page_start &&
@@ -2037,7 +2034,6 @@
 
 done_generating:
     gen_tb_end(tb, num_insns);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -2048,7 +2044,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         lj++;
         while (lj <= j) {
             tcg_ctx.gen_opc_instr_start[lj++] = 0;
diff --git a/target-xtensa/cpu-qom.h b/target-xtensa/cpu-qom.h
index e232bbf..9de5c6e 100644
--- a/target-xtensa/cpu-qom.h
+++ b/target-xtensa/cpu-qom.h
@@ -91,7 +91,6 @@
 int xtensa_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int xtensa_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 void xtensa_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
-                                    int is_write, int is_user,
-                                    uintptr_t retaddr, unsigned size);
+                                    int is_write, int is_user, uintptr_t retaddr);
 
 #endif
diff --git a/target-xtensa/op_helper.c b/target-xtensa/op_helper.c
index 3d431de..872e5a8 100644
--- a/target-xtensa/op_helper.c
+++ b/target-xtensa/op_helper.c
@@ -33,7 +33,7 @@
 #include "qemu/timer.h"
 
 void xtensa_cpu_do_unaligned_access(CPUState *cs,
-        vaddr addr, int is_write, int is_user, uintptr_t retaddr, unsigned size)
+        vaddr addr, int is_write, int is_user, uintptr_t retaddr)
 {
     XtensaCPU *cpu = XTENSA_CPU(cs);
     CPUXtensaState *env = &cpu->env;
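
The sparc and xtensa hunks drop the trailing "unsigned size" parameter from the do_unaligned_access hooks, restoring the prototype that existed before the out-of-tree misaligned-access patch this merge reverts: now that the common softmmu path decides when an access is misaligned, the per-target hook only has to raise the guest exception and never needs the access size. A hedged sketch of the kind of call site this implies; the guard, the 4-byte width and the surrounding variable names are illustrative assumptions.

    /* Sketch: reporting a misaligned 32-bit access from a softmmu
     * helper; is_write, is_user and retaddr are assumed in scope.  */
    if (addr & 3) {                       /* not 4-byte aligned */
        xtensa_cpu_do_unaligned_access(cs, addr, is_write, is_user, retaddr);
    }
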
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index badca19..7ee565c 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -2987,7 +2987,6 @@
     DisasContext dc;
     int insn_count = 0;
     int j, lj = -1;
-    uint16_t *gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
     int max_insns = tb->cflags & CF_COUNT_MASK;
     uint32_t pc_start = tb->pc;
     uint32_t next_page_start =
@@ -3030,7 +3029,7 @@
         check_breakpoint(env, &dc);
 
         if (search_pc) {
-            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            j = tcg_op_buf_count();
             if (lj < j) {
                 lj++;
                 while (lj < j) {
@@ -3081,7 +3080,7 @@
     } while (dc.is_jmp == DISAS_NEXT &&
             insn_count < max_insns &&
             dc.pc < next_page_start &&
-            tcg_ctx.gen_opc_ptr < gen_opc_end);
+            !tcg_op_buf_full());
 
     reset_litbase(&dc);
     reset_sar_tracker(&dc);
@@ -3097,7 +3096,6 @@
         gen_jumpi(&dc, dc.pc, 0);
     }
     gen_tb_end(tb, insn_count);
-    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
@@ -3108,7 +3106,7 @@
     }
 #endif
     if (search_pc) {
-        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        j = tcg_op_buf_count();
         memset(tcg_ctx.gen_opc_instr_start + lj + 1, 0,
                 (j - lj) * sizeof(tcg_ctx.gen_opc_instr_start[0]));
     } else {
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 987c0bd..35920e1 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -280,7 +280,7 @@
     I3312_LDRSHX    = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
     I3312_LDRSWX    = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
 
-    I3312_TO_I3310  = 0x00206800,
+    I3312_TO_I3310  = 0x00200800,
     I3312_TO_I3313  = 0x01000000,
 
     /* Load/store register pair instructions.  */
@@ -496,13 +496,14 @@
 }
 
 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
-                              TCGReg rd, TCGReg base, TCGReg regoff)
+                              TCGReg rd, TCGReg base, TCGType ext,
+                              TCGReg regoff)
 {
     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
-    tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 | base << 5 | rd);
+    tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
+              0x4000 | ext << 13 | base << 5 | rd);
 }
 
-
 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
                               TCGReg rd, TCGReg rn, intptr_t offset)
 {
@@ -677,7 +678,7 @@
 
     /* Worst-case scenario, move offset to temp register, use reg offset.  */
     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
-    tcg_out_ldst_r(s, insn, rd, rn, TCG_REG_TMP);
+    tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
 }
 
 static inline void tcg_out_mov(TCGContext *s,
@@ -962,7 +963,7 @@
 
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
- *                                     int mmu_idx, uintptr_t ra)
+ *                                     TCGMemOpIdx oi, uintptr_t ra)
  */
 static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
@@ -975,7 +976,8 @@
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
- *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ *                                     uintxx_t val, TCGMemOpIdx oi,
+ *                                     uintptr_t ra)
  */
 static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
@@ -996,16 +998,17 @@
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp size = opc & MO_SIZE;
 
     reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
-    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
     tcg_out_adr(s, TCG_REG_X3, lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     if (opc & MO_SIGN) {
         tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
     } else {
@@ -1017,33 +1020,32 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp size = opc & MO_SIZE;
 
     reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
-    tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
     tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
     tcg_out_adr(s, TCG_REG_X4, lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
     tcg_out_goto(s, lb->raddr);
 }
 
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGType ext, TCGReg data_reg, TCGReg addr_reg,
-                                int mem_index, tcg_insn_unit *raddr,
-                                tcg_insn_unit *label_ptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->type = ext;
     label->datalo_reg = data_reg;
     label->addrlo_reg = addr_reg;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1052,14 +1054,30 @@
    slow path for the failure case, which will be patched later when finalizing
    the slow path. Generated code returns the host addend in X1,
    clobbers X0,X2,X3,TMP. */
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp s_bits,
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
                              tcg_insn_unit **label_ptr, int mem_index,
                              bool is_read)
 {
-    TCGReg base = TCG_AREG0;
     int tlb_offset = is_read ?
         offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
+    int a_bits = get_alignment_bits(opc);
+    TCGReg base = TCG_AREG0, x3;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned access, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+        x3 = addr_reg;
+    } else {
+        tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, addr_reg, (1 << (opc & MO_SIZE)) - 1);
+        tlb_mask = TARGET_PAGE_MASK;
+        x3 = TCG_REG_X3;
+    }
 
     /* Extract the TLB index from the address into X0.
        X0<CPU_TLB_BITS:0> =
@@ -1067,11 +1085,9 @@
     tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
                  TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
 
-    /* Store the page mask part of the address and the low s_bits into X3.
-       Later this allows checking for equality and alignment at the same time.
-       X3 = addr_reg & (PAGE_MASK | ((1 << s_bits) - 1)) */
-    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64, TCG_REG_X3,
-                     addr_reg, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+    /* Store the page mask part of the address into X3.  */
+    tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
+                     TCG_REG_X3, x3, tlb_mask);
 
     /* Add any "high bits" from the tlb offset to the env address into X2,
        to take advantage of the LSL12 form of the ADDI instruction.
@@ -1110,51 +1126,52 @@
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGMemOp memop, TCGType ext,
-                                   TCGReg data_r, TCGReg addr_r, TCGReg off_r)
+                                   TCGReg data_r, TCGReg addr_r,
+                                   TCGType otype, TCGReg off_r)
 {
     const TCGMemOp bswap = memop & MO_BSWAP;
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
-        tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
         break;
     case MO_SB:
         tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
-                       data_r, addr_r, off_r);
+                       data_r, addr_r, otype, off_r);
         break;
     case MO_UW:
-        tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev16(s, data_r, data_r);
         }
         break;
     case MO_SW:
         if (bswap) {
-            tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
             tcg_out_rev16(s, data_r, data_r);
             tcg_out_sxt(s, ext, MO_16, data_r, data_r);
         } else {
-            tcg_out_ldst_r(s, ext ? I3312_LDRSHX : I3312_LDRSHW,
-                           data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
+                           data_r, addr_r, otype, off_r);
         }
         break;
     case MO_UL:
-        tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev32(s, data_r, data_r);
         }
         break;
     case MO_SL:
         if (bswap) {
-            tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
             tcg_out_rev32(s, data_r, data_r);
             tcg_out_sxt(s, TCG_TYPE_I64, MO_32, data_r, data_r);
         } else {
-            tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, off_r);
+            tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, otype, off_r);
         }
         break;
     case MO_Q:
-        tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, otype, off_r);
         if (bswap) {
             tcg_out_rev64(s, data_r, data_r);
         }
@@ -1165,34 +1182,35 @@
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, TCGMemOp memop,
-                                   TCGReg data_r, TCGReg addr_r, TCGReg off_r)
+                                   TCGReg data_r, TCGReg addr_r,
+                                   TCGType otype, TCGReg off_r)
 {
     const TCGMemOp bswap = memop & MO_BSWAP;
 
     switch (memop & MO_SIZE) {
     case MO_8:
-        tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
         break;
     case MO_16:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev16(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, otype, off_r);
         break;
     case MO_32:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev32(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, otype, off_r);
         break;
     case MO_64:
         if (bswap && data_r != TCG_REG_XZR) {
             tcg_out_rev64(s, TCG_REG_TMP, data_r);
             data_r = TCG_REG_TMP;
         }
-        tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, off_r);
+        tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, otype, off_r);
         break;
     default:
         tcg_abort();
@@ -1200,36 +1218,44 @@
 }
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp memop, TCGType ext, int mem_index)
+                            TCGMemOpIdx oi, TCGType ext)
 {
+    TCGMemOp memop = get_memop(oi);
+    const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
-    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, true, memop, ext, data_reg, addr_reg,
-                        mem_index, s->code_ptr, label_ptr);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 1);
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
+                           TCG_REG_X1, otype, addr_reg);
+    add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    tcg_out_qemu_ld_direct(s, memop, ext, data_reg, addr_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
+    tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
+                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
 
 static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp memop, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp memop = get_memop(oi);
+    const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
 
-    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
-    tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
-    add_qemu_ldst_label(s, false, memop, s_bits == MO_64, data_reg, addr_reg,
-                        mem_index, s->code_ptr, label_ptr);
+    tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 0);
+    tcg_out_qemu_st_direct(s, memop, data_reg,
+                           TCG_REG_X1, otype, addr_reg);
+    add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE) == MO_64,
+                        data_reg, addr_reg, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-    tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg,
-                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
+    tcg_out_qemu_st_direct(s, memop, data_reg,
+                           GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR,
+                           otype, addr_reg);
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -1518,11 +1544,11 @@
 
     case INDEX_op_qemu_ld_i32:
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, a0, a1, a2, ext, args[3]);
+        tcg_out_qemu_ld(s, a0, a1, a2, ext);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, REG0(0), a1, a2, args[3]);
+        tcg_out_qemu_st(s, REG0(0), a1, a2);
         break;
 
     case INDEX_op_bswap64_i64:
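
The rewritten tcg_out_tlb_read() above carries the central trick of the fast path: an aligned access folds its alignment bits into the page compare, so a single AND+CMP both matches the TLB tag and rejects misaligned addresses (the tag's low bits are zero, so any set alignment bit forces a miss), while an unaligned access is compared using the address of its last byte, so only a page-crossing access is sent to the slow path. Restated as plain C for 4 KiB pages; this is a sketch, with the a_bits < 0 convention for "no alignment required" taken from the hunk above.

    /* Sketch: what the emitted ADDI/AND/CMP sequence computes.
     * Assumes TARGET_PAGE_BITS == 12, i.e. TARGET_PAGE_MASK == ~0xfffull. */
    uint64_t cmp_addr, tlb_mask;
    int a_bits = get_alignment_bits(opc);     /* < 0: no alignment check */

    if (a_bits >= 0) {
        /* e.g. an aligned 4-byte load: tlb_mask == ~0xfffull | 0x3 */
        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
        cmp_addr = addr;
    } else {
        /* unaligned: test the page of the last byte of the access */
        cmp_addr = addr + (1 << (opc & MO_SIZE)) - 1;
        tlb_mask = TARGET_PAGE_MASK;
    }
    /* hit iff (cmp_addr & tlb_mask) == tlb_entry.addr_read / addr_write */
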
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index e40301c..a078a22 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1223,20 +1223,19 @@
 /* Record the context of a call to the out of line helper code for the slow
    path for a load or store, so that we can later generate the correct
    helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi, TCGReg addrlo,
-                                TCGReg addrhi, int mem_index,
-                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
+                                TCGReg addrhi, tcg_insn_unit *raddr,
+                                tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1244,7 +1243,8 @@
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
     TCGReg argreg, datalo, datahi;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     void *func;
 
     reloc_pc24(lb->label_ptr[0], s->code_ptr);
@@ -1255,16 +1255,16 @@
     } else {
         argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
     }
-    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* For armv6 we can use the canonical unsigned helpers and minimize
        icache usage.  For pre-armv6, use the signed helpers since we do
        not have a single insn sign-extend.  */
     if (use_armv6_instructions) {
-        func = qemu_ld_helpers[opc & ~MO_SIGN];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_helpers[opc];
+        func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
         if (opc & MO_SIGN) {
             opc = MO_UL;
         }
@@ -1304,7 +1304,8 @@
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
     TCGReg argreg, datalo, datahi;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     reloc_pc24(lb->label_ptr[0], s->code_ptr);
 
@@ -1334,11 +1335,11 @@
         break;
     }
 
-    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    argreg = tcg_out_arg_imm32(s, argreg, oi);
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* Tail-call to the helper, which will return to the fast path.  */
-    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc]);
+    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #endif /* SOFTMMU */
 
@@ -1465,6 +1466,7 @@
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1476,10 +1478,11 @@
     datahi = (is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
 
     /* This is a conditional BL only to load a pointer within this opcode into LR
@@ -1489,8 +1492,8 @@
 
     tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
 
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
@@ -1594,6 +1597,7 @@
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 {
     TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1605,10 +1609,11 @@
     datahi = (is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
 
     tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
@@ -1617,8 +1622,8 @@
     label_ptr = s->code_ptr;
     tcg_out_bl_noaddr(s, COND_NE);
 
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE);
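
In both backends above, the ldst label now records a single TCGMemOpIdx, oi, and that same value is what lands in the helper's argument register in place of the old mem_index; the helper recovers the pieces with get_memop() and get_mmuidx(). The packing is presumably a plain shift-and-or, roughly as below; the exact field widths are an assumption consistent with these call sites, not a quote of tcg.h.

    typedef uint32_t TCGMemOpIdx;

    /* Sketch: combine a memory op and an mmu index into one operand. */
    static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        return (op << 4) | idx;
    }

    static inline TCGMemOp get_memop(TCGMemOpIdx oi)
    {
        return oi >> 4;
    }

    static inline unsigned get_mmuidx(TCGMemOpIdx oi)
    {
        return oi & 15;
    }

Note also the new explicit masks on the helper-table lookups: the memop now carries alignment bits besides sign, size and byte order, and masking with (MO_BSWAP | MO_SIZE) (or MO_SSIZE for the signed pre-armv6 helpers) keeps the index inside the 16-entry qemu_ld_helpers/qemu_st_helpers arrays while letting one helper serve both alignment flavours of an access.
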
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4133dcf..bde8a62 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1172,14 +1172,16 @@
    First argument register is clobbered.  */
 
 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                                    int mem_index, TCGMemOp s_bits,
+                                    int mem_index, TCGMemOp opc,
                                     tcg_insn_unit **label_ptr, int which)
 {
     const TCGReg r0 = TCG_REG_L0;
     const TCGReg r1 = TCG_REG_L1;
     TCGType ttype = TCG_TYPE_I32;
-    TCGType htype = TCG_TYPE_I32;
-    int trexw = 0, hrexw = 0;
+    TCGType tlbtype = TCG_TYPE_I32;
+    int trexw = 0, hrexw = 0, tlbrexw = 0;
+    int a_bits = get_alignment_bits(opc);
+    target_ulong tlb_mask;
 
     if (TCG_TARGET_REG_BITS == 64) {
         if (TARGET_LONG_BITS == 64) {
@@ -1187,20 +1189,32 @@
             trexw = P_REXW;
         }
         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
-            htype = TCG_TYPE_I64;
             hrexw = P_REXW;
+            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
+                tlbtype = TCG_TYPE_I64;
+                tlbrexw = P_REXW;
+            }
         }
     }
 
-    tcg_out_mov(s, htype, r0, addrlo);
-    tcg_out_mov(s, ttype, r1, addrlo);
+    tcg_out_mov(s, tlbtype, r0, addrlo);
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        tcg_out_mov(s, ttype, r1, addrlo);
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+    } else {
+        /* For unaligned access check that we don't cross pages using
+           the page address of the last byte.  */
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo,
+                             (1 << (opc & MO_SIZE)) - 1);
+        tlb_mask = TARGET_PAGE_MASK;
+    }
 
-    tcg_out_shifti(s, SHIFT_SHR + hrexw, r0,
+    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
-    tgen_arithi(s, ARITH_AND + trexw, r1,
-                TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
-    tgen_arithi(s, ARITH_AND + hrexw, r0,
+    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
+    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
                 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
     tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
@@ -1244,21 +1258,20 @@
  * Record the context of a call to the out of line helper code for the slow path
  * for a load or store, so that we can later generate the correct helper code
  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi,
                                 TCGReg addrlo, TCGReg addrhi,
-                                int mem_index, tcg_insn_unit *raddr,
+                                tcg_insn_unit *raddr,
                                 tcg_insn_unit **label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
@@ -1271,7 +1284,8 @@
  */
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg data_reg;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
@@ -1295,20 +1309,19 @@
             ofs += 4;
         }
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, oi);
         ofs += 4;
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
+        tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
     } else {
         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
-                     l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
                      (uintptr_t)l->raddr);
     }
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     data_reg = l->datalo_reg;
     switch (opc & MO_SSIZE) {
@@ -1354,7 +1367,8 @@
  */
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     tcg_insn_unit **label_ptr = &l->label_ptr[0];
     TCGReg retaddr;
@@ -1387,19 +1401,18 @@
             ofs += 4;
         }
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, oi);
         ofs += 4;
 
         retaddr = TCG_REG_EAX;
-        tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
-        tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
+        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
+        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
     } else {
         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                     tcg_target_call_iarg_regs[2], l->datalo_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
-                     l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
 
         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
             retaddr = tcg_target_call_iarg_regs[4];
@@ -1414,7 +1427,7 @@
 
     /* "Tail call" to the helper, with the return address back inline.  */
     tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, qemu_st_helpers[opc]);
+    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
@@ -1435,8 +1448,8 @@
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
-                                   TCGReg base, intptr_t ofs, int seg,
-                                   TCGMemOp memop)
+                                   TCGReg base, int index, intptr_t ofs,
+                                   int seg, TCGMemOp memop)
 {
     const TCGMemOp real_bswap = memop & MO_BSWAP;
     TCGMemOp bswap = real_bswap;
@@ -1449,13 +1462,16 @@
 
     switch (memop & MO_SSIZE) {
     case MO_UB:
-        tcg_out_modrm_offset(s, OPC_MOVZBL + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_SB:
-        tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
+                                 base, index, 0, ofs);
         if (real_bswap) {
             tcg_out_rolw_8(s, datalo);
         }
@@ -1463,20 +1479,21 @@
     case MO_SW:
         if (real_bswap) {
             if (have_movbe) {
-                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                         datalo, base, index, 0, ofs);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
+                                         base, index, 0, ofs);
                 tcg_out_rolw_8(s, datalo);
             }
             tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
+                                     datalo, base, index, 0, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
@@ -1484,19 +1501,22 @@
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
         if (real_bswap) {
-            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                     base, index, 0, ofs);
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
             }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
+                                     base, index, 0, ofs);
         }
         break;
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
+                                     base, index, 0, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
@@ -1507,11 +1527,15 @@
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
-                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                         base, index, 0, ofs);
+                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+                                         base, index, 0, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+                                         base, index, 0, ofs + 4);
+                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                         base, index, 0, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1531,10 +1555,10 @@
 {
     TCGReg datalo, datahi, addrlo;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1542,43 +1566,52 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args++;
-    s_bits = opc & MO_SIZE;
+    mem_index = get_mmuidx(oi);
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_read));
 
     /* TLB Hit.  */
-    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
+    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
 
     /* Record the current context of a load into ldst label */
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
         TCGReg base = addrlo;
+        int index = -1;
         int seg = 0;
 
-        /* ??? We assume all operations have left us with register contents
-           that are zero extended.  So far this appears to be true.  If we
-           want to enforce this, we can either do an explicit zero-extension
-           here, or (if GUEST_BASE == 0, or a segment register is in use)
-           use the ADDR32 prefix.  For now, do nothing.  */
-        if (GUEST_BASE && guest_base_flags) {
+        /* For a 32-bit guest, the high 32 bits may contain garbage.
+           We can do this with the ADDR32 prefix if we're not using
+           a guest base, or when using segmentation.  Otherwise we
+           need to zero-extend manually.  */
+        if (GUEST_BASE == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
-        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
-            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
-            base = TCG_REG_L1;
-            offset = 0;
+            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+                seg |= P_ADDR32;
+            }
+        } else if (TCG_TARGET_REG_BITS == 64) {
+            if (TARGET_LONG_BITS == 32) {
+                tcg_out_ext32u(s, TCG_REG_L0, base);
+                base = TCG_REG_L0;
+            }
+            if (offset != GUEST_BASE) {
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+                index = TCG_REG_L1;
+                offset = 0;
+            }
         }
 
-        tcg_out_qemu_ld_direct(s, datalo, datahi, base, offset, seg, opc);
+        tcg_out_qemu_ld_direct(s, datalo, datahi,
+                               base, index, offset, seg, opc);
     }
 #endif
 }
@@ -1662,10 +1695,10 @@
 {
     TCGReg datalo, datahi, addrlo;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    TCGMemOp s_bits;
     tcg_insn_unit *label_ptr[2];
 #endif
 
@@ -1673,40 +1706,50 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args++;
-    s_bits = opc & MO_SIZE;
+    mem_index = get_mmuidx(oi);
 
-    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, s_bits,
+    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
 
     /* Record the current context of a store into ldst label */
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
         TCGReg base = addrlo;
         int seg = 0;
 
-        /* ??? We assume all operations have left us with register contents
-           that are zero extended.  So far this appears to be true.  If we
-           want to enforce this, we can either do an explicit zero-extension
-           here, or (if GUEST_BASE == 0, or a segment register is in use)
-           use the ADDR32 prefix.  For now, do nothing.  */
-        if (GUEST_BASE && guest_base_flags) {
+        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
+        if (GUEST_BASE == 0 || guest_base_flags) {
             seg = guest_base_flags;
             offset = 0;
-        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
-            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
-            base = TCG_REG_L1;
-            offset = 0;
+            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+                seg |= P_ADDR32;
+            }
+        } else if (TCG_TARGET_REG_BITS == 64) {
+            /* ??? Note that we can't use the same SIB addressing scheme
+               as for loads, since we require L0 free for bswap.  */
+            if (offset != GUEST_BASE) {
+                if (TARGET_LONG_BITS == 32) {
+                    tcg_out_ext32u(s, TCG_REG_L0, base);
+                    base = TCG_REG_L0;
+                }
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
+                base = TCG_REG_L1;
+                offset = 0;
+            } else if (TARGET_LONG_BITS == 32) {
+                tcg_out_ext32u(s, TCG_REG_L1, base);
+                base = TCG_REG_L1;
+            }
         }
 
         tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
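
The user-only paths above replace the old "assume the high bits are already zero" escape hatch with an explicit strategy, and the store side deliberately differs from the load side because L0 must stay free for the byte-swap temporary. Condensed into plain C, using only identifiers that appear in the hunks above (a sketch of the decision, not additional code in the patch):

    /* Sketch: forming the host address for a 32-bit guest on a
     * 64-bit host, as the rewritten code above now does. */
    if (GUEST_BASE == 0 || guest_base_flags) {
        seg = guest_base_flags;
        if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
            seg |= P_ADDR32;   /* the prefix truncates the address for free */
        }
    } else if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 32) {
        tcg_out_ext32u(s, TCG_REG_L0, base);   /* explicit zero-extension */
        base = TCG_REG_L0;
    }
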
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 6bc9924..5984406 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -1639,14 +1639,16 @@
         OPC_LD1_M1, OPC_LD2_M1, OPC_LD4_M1, OPC_LD8_M1
     };
     int addr_reg, data_reg, mem_index;
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
     uint64_t fin1, fin2;
     tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
-    opc = args[2];
-    mem_index = args[3];
+    oi = args[2];
+    opc = get_memop(oi);
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     /* Read the TLB entry */
@@ -1674,7 +1676,7 @@
                    tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R56, TCG_AREG0),
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
-                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R58, mem_index));
+                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R58, oi));
     label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m1 (TCG_REG_P6, opc_ld_m1[s_bits],
@@ -1701,13 +1703,15 @@
     TCGReg addr_reg, data_reg;
     int mem_index;
     uint64_t pre1, pre2;
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
     tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
-    opc = args[2];
-    mem_index = args[3];
+    oi = args[2];
+    opc = get_memop(oi);
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     /* Note that we always use LE helper functions, so the bswap insns
@@ -1736,7 +1740,7 @@
                    tcg_opc_mov_a(TCG_REG_P7, TCG_REG_R56, TCG_AREG0),
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
-                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, mem_index));
+                   tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, oi));
     label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m4 (TCG_REG_P6, opc_st_m4[s_bits],
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index b7f4d67..09fefe3 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -992,21 +992,19 @@
     tcg_out_opc_reg(s, OPC_ADDU, base, TCG_REG_A0, addrl);
 }
 
-static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo, TCGReg datahi,
                                 TCGReg addrlo, TCGReg addrhi,
-                                int mem_index, void *raddr,
-                                tcg_insn_unit *label_ptr[2])
+                                void *raddr, tcg_insn_unit *label_ptr[2])
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo;
     label->datahi_reg = datahi;
     label->addrlo_reg = addrlo;
     label->addrhi_reg = addrhi;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS == 64) {
@@ -1016,7 +1014,8 @@
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg v0;
     int i;
 
@@ -1032,9 +1031,9 @@
     } else {
         i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
     }
-    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+    i = tcg_out_call_iarg_imm(s, i, oi);
     i = tcg_out_call_iarg_imm(s, i, (intptr_t)l->raddr);
-    tcg_out_call_int(s, qemu_ld_helpers[opc], false);
+    tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)], false);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 
@@ -1058,7 +1057,8 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-    TCGMemOp opc = l->opc;
+    TCGMemOpIdx oi = l->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     int i;
 
@@ -1090,13 +1090,13 @@
     default:
         tcg_abort();
     }
-    i = tcg_out_call_iarg_imm(s, i, l->mem_index);
+    i = tcg_out_call_iarg_imm(s, i, oi);
 
     /* Tail call to the store helper.  Thus force the return address
        computation to take place in the return address register.  */
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)l->raddr);
     i = tcg_out_call_iarg_reg(s, i, TCG_REG_RA);
-    tcg_out_call_int(s, qemu_st_helpers[opc], true);
+    tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)], true);
     /* delay slot */
     tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 }
@@ -1152,6 +1152,7 @@
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
     TCGReg data_regl, data_regh;
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
@@ -1166,17 +1167,18 @@
     data_regh = (is_64 ? *args++ : 0);
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     s_bits = opc & MO_SIZE;
 
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
                      s_bits, label_ptr, 1);
     tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc);
-    add_qemu_ldst_label(s, 1, opc, data_regl, data_regh, addr_regl, addr_regh,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 1, oi, data_regl, data_regh, addr_regl, addr_regh,
+                        s->code_ptr, label_ptr);
 #else
     if (GUEST_BASE == 0 && data_regl != addr_regl) {
         base = addr_regl;
@@ -1281,6 +1283,7 @@
 {
     TCGReg addr_regl, addr_regh __attribute__((unused));
     TCGReg data_regl, data_regh, base;
+    TCGMemOpIdx oi;
     TCGMemOp opc;
 #if defined(CONFIG_SOFTMMU)
     tcg_insn_unit *label_ptr[2];
@@ -1292,10 +1295,11 @@
     data_regh = (is_64 ? *args++ : 0);
     addr_regl = *args++;
     addr_regh = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    mem_index = *args;
+    mem_index = get_mmuidx(oi);
     s_bits = opc & 3;
 
     /* Note that we eliminated the helper's address argument,
@@ -1304,8 +1308,8 @@
     tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index,
                      s_bits, label_ptr, 0);
     tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
-    add_qemu_ldst_label(s, 0, opc, data_regl, data_regh, addr_regl, addr_regh,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh,
+                        s->code_ptr, label_ptr);
 #else
     if (GUEST_BASE == 0) {
         base = addr_regl;
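
Throughout these backends, the new TCGMemOpIdx operand replaces the separate
memop and mem_index arguments by packing both into a single word, which is
what get_memop() and get_mmuidx() unpack above. A minimal sketch of the
accessors, assuming the tcg/tcg.h definitions introduced by this series (the
4-bit mmu-index field is the assumed layout):

    typedef uint32_t TCGMemOpIdx;

    /* Pack a memory operation and an MMU index into a single operand. */
    static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        return (op << 4) | idx;    /* assumed: mmu_idx in the low 4 bits */
    }

    static inline TCGMemOp get_memop(TCGMemOpIdx oi)
    {
        return oi >> 4;
    }

    static inline unsigned get_mmuidx(TCGMemOpIdx oi)
    {
        return oi & 15;
    }
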
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 34ae3c2..bbc86d0 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -67,6 +67,37 @@
     temps[temp].mask = -1;
 }
 
+static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
+                                TCGOpcode opc, int nargs)
+{
+    int oi = s->gen_next_op_idx;
+    int pi = s->gen_next_parm_idx;
+    int prev = old_op->prev;
+    int next = old_op - s->gen_op_buf;
+    TCGOp *new_op;
+
+    tcg_debug_assert(oi < OPC_BUF_SIZE);
+    tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE);
+    s->gen_next_op_idx = oi + 1;
+    s->gen_next_parm_idx = pi + nargs;
+
+    new_op = &s->gen_op_buf[oi];
+    *new_op = (TCGOp){
+        .opc = opc,
+        .args = pi,
+        .prev = prev,
+        .next = next
+    };
+    if (prev >= 0) {
+        s->gen_op_buf[prev].next = oi;
+    } else {
+        s->gen_first_op_idx = oi;
+    }
+    old_op->prev = oi;
+
+    return new_op;
+}
+
 /* Reset all temporaries, given that there are NB_TEMPS of them.  */
 static void reset_all_temps(int nb_temps)
 {
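
insert_op_before() splices a new op into the doubly linked list that now
threads gen_op_buf by integer indices, with -1 acting as the terminator.
Roughly, for adjacent ops A and B (indices a and b), inserting N (index n)
before B rewrites the links as follows:

    /* before:  A.next == b              B.prev == a
     * after:   A.next == n   N.prev == a   N.next == b   B.prev == n
     * If B was the list head (B.prev == -1), gen_first_op_idx becomes n.
     */
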
@@ -162,13 +193,13 @@
     return false;
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
+static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
                             TCGOpcode old_op, TCGArg dst, TCGArg src)
 {
     TCGOpcode new_op = op_to_mov(old_op);
     tcg_target_ulong mask;
 
-    s->gen_opc_buf[op_index] = new_op;
+    op->opc = new_op;
 
     reset_temp(dst);
     mask = temps[src].mask;
@@ -193,30 +224,30 @@
         temps[src].next_copy = dst;
     }
 
-    gen_args[0] = dst;
-    gen_args[1] = src;
+    args[0] = dst;
+    args[1] = src;
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
+static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
                              TCGOpcode old_op, TCGArg dst, TCGArg val)
 {
     TCGOpcode new_op = op_to_movi(old_op);
     tcg_target_ulong mask;
 
-    s->gen_opc_buf[op_index] = new_op;
+    op->opc = new_op;
 
     reset_temp(dst);
     temps[dst].state = TCG_TEMP_CONST;
     temps[dst].val = val;
     mask = val;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
         /* High bits of the destination are now garbage.  */
         mask |= ~0xffffffffull;
     }
     temps[dst].mask = mask;
 
-    gen_args[0] = dst;
-    gen_args[1] = val;
+    args[0] = dst;
+    args[1] = val;
 }
 
 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
@@ -533,11 +564,9 @@
 }
 
 /* Propagate constants and copies, fold constant expressions. */
-static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
-                                    TCGArg *args, TCGOpDef *tcg_op_defs)
+static void tcg_constant_folding(TCGContext *s)
 {
-    int nb_ops, op_index, nb_temps, nb_globals;
-    TCGArg *gen_args;
+    int oi, oi_next, nb_temps, nb_globals;
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -548,24 +577,23 @@
     nb_globals = s->nb_globals;
     reset_all_temps(nb_temps);
 
-    nb_ops = tcg_opc_ptr - s->gen_opc_buf;
-    gen_args = args;
-    for (op_index = 0; op_index < nb_ops; op_index++) {
-        TCGOpcode op = s->gen_opc_buf[op_index];
-        const TCGOpDef *def = &tcg_op_defs[op];
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
         tcg_target_ulong mask, partmask, affected;
-        int nb_oargs, nb_iargs, nb_args, i;
+        int nb_oargs, nb_iargs, i;
         TCGArg tmp;
 
-        if (op == INDEX_op_call) {
-            *gen_args++ = tmp = *args++;
-            nb_oargs = tmp >> 16;
-            nb_iargs = tmp & 0xffff;
-            nb_args = nb_oargs + nb_iargs + def->nb_cargs;
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+
+        oi_next = op->next;
+        if (opc == INDEX_op_call) {
+            nb_oargs = op->callo;
+            nb_iargs = op->calli;
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            nb_args = def->nb_args;
         }
 
         /* Do copy propagation */
@@ -576,7 +604,7 @@
         }
 
         /* For commutative operations make constant second argument */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(mul):
         CASE_OP_32_64(and):
@@ -634,7 +662,7 @@
 
         /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
            and "sub r, 0, a => neg r, a" case.  */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(shl):
         CASE_OP_32_64(shr):
         CASE_OP_32_64(sar):
@@ -642,9 +670,7 @@
         CASE_OP_32_64(rotr):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[1]].val == 0) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                args += 3;
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -657,7 +683,7 @@
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (op == INDEX_op_sub_i32) {
+                if (opc == INDEX_op_sub_i32) {
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
                 } else {
@@ -669,12 +695,9 @@
                 }
                 if (temps[args[1]].state == TCG_TEMP_CONST
                     && temps[args[1]].val == 0) {
-                    s->gen_opc_buf[op_index] = neg_op;
+                    op->opc = neg_op;
                     reset_temp(args[0]);
-                    gen_args[0] = args[0];
-                    gen_args[1] = args[2];
-                    args += 3;
-                    gen_args += 2;
+                    args[1] = args[2];
                     continue;
                 }
             }
@@ -728,12 +751,9 @@
                 if (!have_not) {
                     break;
                 }
-                s->gen_opc_buf[op_index] = not_op;
+                op->opc = not_op;
                 reset_temp(args[0]);
-                gen_args[0] = args[0];
-                gen_args[1] = args[i];
-                args += 3;
-                gen_args += 2;
+                args[1] = args[i];
                 continue;
             }
         default:
@@ -741,7 +761,7 @@
         }
 
         /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(shl):
@@ -769,12 +789,10 @@
             break;
         do_mov3:
             if (temps_are_copies(args[0], args[1])) {
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
             } else {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
             }
-            args += 3;
             continue;
         default:
             break;
@@ -784,7 +802,7 @@
            output argument is supported. */
         mask = -1;
         affected = -1;
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(ext8s):
             if ((temps[args[1]].mask & 0x80) != 0) {
                 break;
@@ -900,7 +918,8 @@
 
         CASE_OP_32_64(qemu_ld):
             {
-                TCGMemOp mop = args[nb_oargs + nb_iargs];
+                TCGMemOpIdx oi = args[nb_oargs + nb_iargs];
+                TCGMemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
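
The mask computed above is just the zero-extension bound for an unsigned
load: 8 << (mop & MO_SIZE) is the access width in bits, and writing it as
(2ULL << (width - 1)) - 1 avoids an out-of-range 64-bit shift for MO_Q.
Worked out for each size:

    /* MO_UB: size 0, width  8 -> (2ULL <<  7) - 1 = 0xff
     * MO_UW: size 1, width 16 -> (2ULL << 15) - 1 = 0xffff
     * MO_UL: size 2, width 32 -> (2ULL << 31) - 1 = 0xffffffff
     * MO_Q:  size 3, width 64 -> (2ULL << 63) - 1 = all ones
     */
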
@@ -923,38 +942,31 @@
 
         if (partmask == 0) {
             assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-            args += nb_args;
-            gen_args += 2;
+            tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
             continue;
         }
         if (affected == 0) {
             assert(nb_oargs == 1);
             if (temps_are_copies(args[0], args[1])) {
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
             } else if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
             } else {
-                tcg_opt_gen_movi(s, op_index, gen_args, op,
+                tcg_opt_gen_movi(s, op, args, opc,
                                  args[0], temps[args[1]].val);
-                gen_args += 2;
             }
-            args += nb_args;
             continue;
         }
 
         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(and):
         CASE_OP_32_64(mul):
         CASE_OP_32_64(muluh):
         CASE_OP_32_64(mulsh):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                args += 3;
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -963,18 +975,15 @@
         }
 
         /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(or):
         CASE_OP_32_64(and):
             if (temps_are_copies(args[1], args[2])) {
                 if (temps_are_copies(args[0], args[1])) {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 } else {
-                    tcg_opt_gen_mov(s, op_index, gen_args, op,
-                                    args[0], args[1]);
-                    gen_args += 2;
+                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
                 }
-                args += 3;
                 continue;
             }
             break;
@@ -983,14 +992,12 @@
         }
 
         /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(andc):
         CASE_OP_32_64(sub):
         CASE_OP_32_64(xor):
             if (temps_are_copies(args[1], args[2])) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
-                gen_args += 2;
-                args += 3;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], 0);
                 continue;
             }
             break;
@@ -1001,17 +1008,14 @@
         /* Propagate constants through copy operations and do constant
            folding.  Constants will be substituted to arguments by register
            allocator where needed and possible.  Also detect copies. */
-        switch (op) {
+        switch (opc) {
         CASE_OP_32_64(mov):
             if (temps_are_copies(args[0], args[1])) {
-                args += 2;
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
+                tcg_op_remove(s, op);
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(s, op_index, gen_args, op, args[0], args[1]);
-                gen_args += 2;
-                args += 2;
+                tcg_opt_gen_mov(s, op, args, opc, args[0], args[1]);
                 break;
             }
             /* Source argument is constant.  Rewrite the operation and
@@ -1019,9 +1023,7 @@
             args[1] = temps[args[1]].val;
             /* fallthrough */
         CASE_OP_32_64(movi):
-            tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], args[1]);
-            gen_args += 2;
-            args += 2;
+            tcg_opt_gen_movi(s, op, args, opc, args[0], args[1]);
             break;
 
         CASE_OP_32_64(not):
@@ -1033,20 +1035,16 @@
         case INDEX_op_ext32s_i64:
         case INDEX_op_ext32u_i64:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val, 0);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 2;
+                tmp = do_constant_folding(opc, temps[args[1]].val, 0);
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         case INDEX_op_trunc_shr_i32:
             if (temps[args[1]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val, args[2]);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 3;
+                tmp = do_constant_folding(opc, temps[args[1]].val, args[2]);
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1075,11 +1073,9 @@
         CASE_OP_32_64(remu):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
-                tmp = do_constant_folding(op, temps[args[1]].val,
+                tmp = do_constant_folding(opc, temps[args[1]].val,
                                           temps[args[2]].val);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 3;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
@@ -1089,54 +1085,44 @@
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
                                 temps[args[2]].val);
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 5;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
+            tmp = do_constant_folding_cond(opc, args[1], args[2], args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
-                args += 4;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
+            tmp = do_constant_folding_cond(opc, args[0], args[1], args[2]);
             if (tmp != 2) {
                 if (tmp) {
                     reset_all_temps(nb_temps);
-                    s->gen_opc_buf[op_index] = INDEX_op_br;
-                    gen_args[0] = args[3];
-                    gen_args += 1;
+                    op->opc = INDEX_op_br;
+                    args[0] = args[3];
                 } else {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 }
-                args += 4;
                 break;
             }
             goto do_default;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
+            tmp = do_constant_folding_cond(opc, args[1], args[2], args[5]);
             if (tmp != 2) {
                 if (temps_are_copies(args[0], args[4-tmp])) {
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
-                    tcg_opt_gen_movi(s, op_index, gen_args, op,
+                    tcg_opt_gen_movi(s, op, args, opc,
                                      args[0], temps[args[4-tmp]].val);
-                    gen_args += 2;
                 } else {
-                    tcg_opt_gen_mov(s, op_index, gen_args, op,
-                                    args[0], args[4-tmp]);
-                    gen_args += 2;
+                    tcg_opt_gen_mov(s, op, args, opc, args[0], args[4-tmp]);
                 }
-                args += 6;
                 break;
             }
             goto do_default;
@@ -1154,24 +1140,22 @@
                 uint64_t a = ((uint64_t)ah << 32) | al;
                 uint64_t b = ((uint64_t)bh << 32) | bl;
                 TCGArg rl, rh;
+                TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
+                TCGArg *args2 = &s->gen_opparam_buf[op2->args];
 
-                if (op == INDEX_op_add2_i32) {
+                if (opc == INDEX_op_add2_i32) {
                     a += b;
                 } else {
                     a -= b;
                 }
 
-                /* We emit the extra nop when we emit the add2/sub2.  */
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op_index, &gen_args[0],
-                                 op, rl, (uint32_t)a);
-                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
-                                 op, rh, (uint32_t)(a >> 32));
-                gen_args += 4;
-                args += 6;
+                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)a);
+                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(a >> 32));
+
+                /* We've done all we need to do with the movi.  Skip it.  */
+                oi_next = op2->next;
                 break;
             }
             goto do_default;
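
With both input pairs constant, the 64-bit sum or difference is computed at
translation time and split back into two 32-bit movi ops, with the high
word's movi placed in the slot created by insert_op_before(). A concrete
case:

    /* add2_i32 rl, rh, al=0xffffffff, ah=0, bl=1, bh=0
     *   a = 0x00000000ffffffff;  a += 1;  ->  0x0000000100000000
     * folds to:
     *   movi_i32 rl, 0x00000000    (low word)
     *   movi_i32 rh, 0x00000001    (carry propagated into the high word)
     */
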
@@ -1183,18 +1167,16 @@
                 uint32_t b = temps[args[3]].val;
                 uint64_t r = (uint64_t)a * b;
                 TCGArg rl, rh;
-
-                /* We emit the extra nop when we emit the mulu2.  */
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
+                TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
+                TCGArg *args2 = &s->gen_opparam_buf[op2->args];
 
                 rl = args[0];
                 rh = args[1];
-                tcg_opt_gen_movi(s, op_index, &gen_args[0],
-                                 op, rl, (uint32_t)r);
-                tcg_opt_gen_movi(s, ++op_index, &gen_args[2],
-                                 op, rh, (uint32_t)(r >> 32));
-                gen_args += 4;
-                args += 4;
+                tcg_opt_gen_movi(s, op, args, opc, rl, (uint32_t)r);
+                tcg_opt_gen_movi(s, op2, args2, opc, rh, (uint32_t)(r >> 32));
+
+                /* We've done all we need to do with the movi.  Skip it.  */
+                oi_next = op2->next;
                 break;
             }
             goto do_default;
@@ -1205,12 +1187,11 @@
                 if (tmp) {
             do_brcond_true:
                     reset_all_temps(nb_temps);
-                    s->gen_opc_buf[op_index] = INDEX_op_br;
-                    gen_args[0] = args[5];
-                    gen_args += 1;
+                    op->opc = INDEX_op_br;
+                    args[0] = args[5];
                 } else {
             do_brcond_false:
-                    s->gen_opc_buf[op_index] = INDEX_op_nop;
+                    tcg_op_remove(s, op);
                 }
             } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
                        && temps[args[2]].state == TCG_TEMP_CONST
@@ -1221,12 +1202,11 @@
                    vs the high word of the input.  */
             do_brcond_high:
                 reset_all_temps(nb_temps);
-                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
-                gen_args[0] = args[1];
-                gen_args[1] = args[3];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_brcond_i32;
+                args[0] = args[1];
+                args[1] = args[3];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
@@ -1246,12 +1226,10 @@
                 }
             do_brcond_low:
                 reset_all_temps(nb_temps);
-                s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
-                gen_args[0] = args[0];
-                gen_args[1] = args[2];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_brcond_i32;
+                args[1] = args[2];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
@@ -1273,15 +1251,13 @@
             } else {
                 goto do_default;
             }
-            args += 6;
             break;
 
         case INDEX_op_setcond2_i32:
             tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], tmp);
-                gen_args += 2;
+                tcg_opt_gen_movi(s, op, args, opc, args[0], tmp);
             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
                        && temps[args[3]].state == TCG_TEMP_CONST
                        && temps[args[4]].state == TCG_TEMP_CONST
@@ -1290,14 +1266,12 @@
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
-                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
                 reset_temp(args[0]);
                 temps[args[0]].mask = 1;
-                gen_args[0] = args[0];
-                gen_args[1] = args[2];
-                gen_args[2] = args[4];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_setcond_i32;
+                args[1] = args[2];
+                args[2] = args[4];
+                args[3] = args[5];
             } else if (args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
@@ -1318,12 +1292,9 @@
             do_setcond_low:
                 reset_temp(args[0]);
                 temps[args[0]].mask = 1;
-                s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
-                gen_args[0] = args[0];
-                gen_args[1] = args[1];
-                gen_args[2] = args[3];
-                gen_args[3] = args[5];
-                gen_args += 4;
+                op->opc = INDEX_op_setcond_i32;
+                args[2] = args[3];
+                args[3] = args[5];
             } else if (args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
@@ -1345,7 +1316,6 @@
             } else {
                 goto do_default;
             }
-            args += 6;
             break;
 
         case INDEX_op_call:
@@ -1377,22 +1347,12 @@
                     }
                 }
             }
-            for (i = 0; i < nb_args; i++) {
-                gen_args[i] = args[i];
-            }
-            args += nb_args;
-            gen_args += nb_args;
             break;
         }
     }
-
-    return gen_args;
 }
 
-TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr,
-        TCGArg *args, TCGOpDef *tcg_op_defs)
+void tcg_optimize(TCGContext *s)
 {
-    TCGArg *res;
-    res = tcg_constant_folding(s, tcg_opc_ptr, args, tcg_op_defs);
-    return res;
+    tcg_constant_folding(s);
 }
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 203027e..9c6fa28 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -1363,7 +1363,7 @@
    in CR7, loads the addend of the TLB into R3, and returns the register
    containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
 
-static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
                                TCGReg addrlo, TCGReg addrhi,
                                int mem_index, bool is_read)
 {
@@ -1373,6 +1373,8 @@
            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     TCGReg base = TCG_AREG0;
+    TCGMemOp s_bits = opc & MO_SIZE;
+    int a_bits = get_alignment_bits(opc);
 
     /* Extract the page index, shifted into place for tlb index.  */
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1424,17 +1426,40 @@
        to minimize any load use delay.  */
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
 
-    /* Clear the non-page, non-alignment bits from the address.  */
+    /* Clear the non-page, non-alignment bits from the address */
     if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
+        /* We don't support unaligned accesses on 32-bit targets, so
+         * preserve the bottom bits and thus trigger a comparison
+         * failure on unaligned accesses.
+         */
+        if (a_bits < 0) {
+            a_bits = s_bits;
+        }
         tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
-                    (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
-    } else if (!s_bits) {
-        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo,
-                    0, 63 - TARGET_PAGE_BITS);
+                    (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
+    } else if (a_bits) {
+        /* More than a byte access; we need to handle alignment */
+        if (a_bits > 0) {
+            /* Alignment required by the front end, same as the 32-bit case */
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
+                        64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - a_bits);
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        } else {
+            /* We support unaligned accesses; we need to make sure we fail
+             * if we cross a page boundary.  The trick is to add the
+             * access_size-1 to the address before masking the low bits.
+             * That will make the address overflow to the next page if we
+             * cross a page boundary, which will then force a mismatch of
+             * the TLB compare, since the next page cannot possibly be in
+             * the same TLB index.
+             */
+            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
+            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
+                        0, 63 - TARGET_PAGE_BITS);
+        }
     } else {
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
-                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
-        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
+        /* Byte access, just chop off the bits below the page index */
+        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
     }
 
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
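
The add-then-mask trick can be checked with concrete numbers: an access that
fits within the page leaves the page index intact after the bias is added,
while one that crosses a boundary carries into the page index and misses the
TLB compare. For an 8-byte access with 4 KiB pages (illustrative values):

    /* bias = (1 << 3) - 1 = 7, TARGET_PAGE_BITS = 12
     * addr = 0x1ff8:  0x1ff8 + 7 = 0x1fff  -> page 0x1, compare can hit
     * addr = 0x1ffa:  0x1ffa + 7 = 0x2001  -> page 0x2, compare fails,
     *                                         slow path handles the access
     */
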
@@ -1453,28 +1478,27 @@
 /* Record the context of a call to the out of line helper code for the slow
    path for a load or store, so that we can later generate the correct
    helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                 TCGReg datalo_reg, TCGReg datahi_reg,
                                 TCGReg addrlo_reg, TCGReg addrhi_reg,
-                                int mem_index, tcg_insn_unit *raddr,
-                                tcg_insn_unit *lptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *lptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = datalo_reg;
     label->datahi_reg = datahi_reg;
     label->addrlo_reg = addrlo_reg;
     label->addrhi_reg = addrhi_reg;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = lptr;
 }
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGReg hi, lo, arg = TCG_REG_R3;
 
     reloc_pc14(lb->label_ptr[0], s->code_ptr);
@@ -1495,10 +1519,10 @@
         tcg_out_mov(s, TCG_TYPE_TL, arg++, lo);
     }
 
-    tcg_out_movi(s, TCG_TYPE_I32, arg++, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     lo = lb->datalo_reg;
     hi = lb->datahi_reg;
@@ -1517,7 +1541,8 @@
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
     TCGMemOp s_bits = opc & MO_SIZE;
     TCGReg hi, lo, arg = TCG_REG_R3;
 
@@ -1564,10 +1589,10 @@
         }
     }
 
-    tcg_out_movi(s, TCG_TYPE_I32, arg++, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
     tcg_out32(s, MFSPR | RT(arg) | LR);
 
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tcg_out_b(s, 0, lb->raddr);
 }
@@ -1577,6 +1602,7 @@
 {
     TCGReg datalo, datahi, addrlo, rbase;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1587,12 +1613,13 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
     s_bits = opc & MO_SIZE;
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, true);
+    mem_index = get_mmuidx(oi);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1624,7 +1651,7 @@
             tcg_out32(s, LWZ | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_ldx_opc[opc];
+        uint32_t insn = qemu_ldx_opc[opc & (MO_BSWAP | MO_SSIZE)];
         if (!HAVE_ISA_2_06 && insn == LDBRX) {
             tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, 4));
             tcg_out32(s, LWBRX | TAB(datalo, rbase, addrlo));
@@ -1641,8 +1668,8 @@
     }
 
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label(s, true, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #endif
 }
 
@@ -1650,6 +1677,7 @@
 {
     TCGReg datalo, datahi, addrlo, rbase;
     TCGReg addrhi __attribute__((unused));
+    TCGMemOpIdx oi;
     TCGMemOp opc, s_bits;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
@@ -1660,12 +1688,13 @@
     datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
     addrlo = *args++;
     addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-    opc = *args++;
+    oi = *args++;
+    opc = get_memop(oi);
     s_bits = opc & MO_SIZE;
 
 #ifdef CONFIG_SOFTMMU
-    mem_index = *args;
-    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, false);
+    mem_index = get_mmuidx(oi);
+    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
     label_ptr = s->code_ptr;
@@ -1694,7 +1723,7 @@
             tcg_out32(s, STW | TAI(datalo, addrlo, 4));
         }
     } else {
-        uint32_t insn = qemu_stx_opc[opc];
+        uint32_t insn = qemu_stx_opc[opc & (MO_BSWAP | MO_SIZE)];
         if (!HAVE_ISA_2_06 && insn == STDBRX) {
             tcg_out32(s, STWBRX | SAB(datalo, rbase, addrlo));
             tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, addrlo, 4));
@@ -1706,8 +1735,8 @@
     }
 
 #ifdef CONFIG_SOFTMMU
-    add_qemu_ldst_label(s, false, opc, datalo, datahi, addrlo, addrhi,
-                        mem_index, s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
+                        s->code_ptr, label_ptr);
 #endif
 }
 
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 63e9c82..f78a0c1 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -1507,20 +1507,37 @@
 static TCGReg tcg_out_tlb_read(TCGContext* s, TCGReg addr_reg, TCGMemOp opc,
                                int mem_index, bool is_ld)
 {
-    TCGMemOp s_bits = opc & MO_SIZE;
-    uint64_t tlb_mask = TARGET_PAGE_MASK | ((1 << s_bits) - 1);
-    int ofs;
+    int a_bits = get_alignment_bits(opc);
+    int ofs, a_off;
+    uint64_t tlb_mask;
+
+    /* For aligned accesses, we check the first byte and include the alignment
+       bits within the address.  For unaligned access, we check that we don't
+       cross pages using the address of the last byte of the access.  */
+    if (a_bits >= 0) {
+        /* A byte access or an alignment check required */
+        a_off = 0;
+        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+    } else {
+        a_off = (1 << (opc & MO_SIZE)) - 1;
+        tlb_mask = TARGET_PAGE_MASK;
+    }
 
     if (facilities & FACILITY_GEN_INST_EXT) {
         tcg_out_risbg(s, TCG_REG_R2, addr_reg,
                       64 - CPU_TLB_BITS - CPU_TLB_ENTRY_BITS,
                       63 - CPU_TLB_ENTRY_BITS,
                       64 + CPU_TLB_ENTRY_BITS - TARGET_PAGE_BITS, 1);
-        tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        if (a_off) {
+            tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
+            tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
+        } else {
+            tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+        }
     } else {
         tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
                      TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-    tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, addr_reg);
+        tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
         tgen_andi(s, TCG_TYPE_I64, TCG_REG_R2,
                   (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
         tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
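
The s390 backend thus picks between the same two strategies as ppc, keyed
off get_alignment_bits(): fold the alignment bits into the compare mask when
alignment is enforced, or bias the address by access_size - 1 when unaligned
accesses are allowed. For a 4-byte access (illustrative):

    /* enforced alignment (a_bits == 2):
     *   a_off = 0,  tlb_mask = TARGET_PAGE_MASK | 3
     *   -> a misaligned address keeps low bits set and fails the compare
     * unaligned allowed (a_bits < 0):
     *   a_off = 3,  tlb_mask = TARGET_PAGE_MASK
     *   -> addr + 3 bumps the page index only if the access crosses a page
     */
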
@@ -1547,17 +1564,16 @@
     return addr_reg;
 }
 
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
-                                TCGReg data, TCGReg addr, int mem_index,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
+                                TCGReg data, TCGReg addr,
                                 tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
     label->is_ld = is_ld;
-    label->opc = opc;
+    label->oi = oi;
     label->datalo_reg = data;
     label->addrlo_reg = addr;
-    label->mem_index = mem_index;
     label->raddr = raddr;
     label->label_ptr[0] = label_ptr;
 }
@@ -1566,7 +1582,8 @@
 {
     TCGReg addr_reg = lb->addrlo_reg;
     TCGReg data_reg = lb->datalo_reg;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     patch_reloc(lb->label_ptr[0], R_390_PC16DBL, (intptr_t)s->code_ptr, -2);
 
@@ -1574,9 +1591,9 @@
     if (TARGET_LONG_BITS == 64) {
         tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, addr_reg);
     }
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R5, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_ld_helpers[opc]);
+    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
     tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R2);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
@@ -1586,7 +1603,8 @@
 {
     TCGReg addr_reg = lb->addrlo_reg;
     TCGReg data_reg = lb->datalo_reg;
-    TCGMemOp opc = lb->opc;
+    TCGMemOpIdx oi = lb->oi;
+    TCGMemOp opc = get_memop(oi);
 
     patch_reloc(lb->label_ptr[0], R_390_PC16DBL, (intptr_t)s->code_ptr, -2);
 
@@ -1610,9 +1628,9 @@
     default:
         tcg_abort();
     }
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, oi);
     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R6, (uintptr_t)lb->raddr);
-    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
     tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
 }
@@ -1635,9 +1653,11 @@
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp opc, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
     TCGReg base_reg;
 
@@ -1648,8 +1668,7 @@
 
     tcg_out_qemu_ld_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-    add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg, mem_index,
-                        s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 1, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1660,9 +1679,11 @@
 }
 
 static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-                            TCGMemOp opc, int mem_index)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
+    unsigned mem_index = get_mmuidx(oi);
     tcg_insn_unit *label_ptr;
     TCGReg base_reg;
 
@@ -1673,8 +1694,7 @@
 
     tcg_out_qemu_st_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-    add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg, mem_index,
-                        s->code_ptr, label_ptr);
+    add_qemu_ldst_label(s, 0, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1923,11 +1943,11 @@
     case INDEX_op_qemu_ld_i32:
         /* ??? Technically we can use a non-extending instruction.  */
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, args[0], args[1], args[2], args[3]);
+        tcg_out_qemu_ld(s, args[0], args[1], args[2]);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, args[0], args[1], args[2], args[3]);
+        tcg_out_qemu_st(s, args[0], args[1], args[2]);
         break;
 
     case INDEX_op_ld16s_i64:
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 0c4b028..e081828 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -917,7 +917,7 @@
             } else {
                 ra += 1;
             }
-            /* Skip the mem_index argument.  */
+            /* Skip the oi argument.  */
             ra += 1;
         }
                 
@@ -1072,15 +1072,16 @@
 };
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
-                            TCGMemOp memop, int memi, bool is_64)
+                            TCGMemOpIdx oi, bool is_64)
 {
+    TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_read));
 
     /* The fast path is exactly one insn.  Thus we can perform the
@@ -1092,7 +1093,8 @@
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_ld_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 
     /* TLB Miss.  */
 
@@ -1105,27 +1107,27 @@
 
     /* We use the helpers to extend SB and SW data, leaving the case
        of SL needing explicit extending below.  */
-    if ((memop & ~MO_BSWAP) == MO_SL) {
-        func = qemu_ld_trampoline[memop & ~MO_SIGN];
+    if ((memop & MO_SSIZE) == MO_SL) {
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     } else {
-        func = qemu_ld_trampoline[memop];
+        func = qemu_ld_trampoline[memop & (MO_BSWAP | MO_SSIZE)];
     }
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
     /* delay slot */
-    tcg_out_movi(s, TCG_TYPE_I32, param, memi);
+    tcg_out_movi(s, TCG_TYPE_I32, param, oi);
 
     /* Recall that all of the helpers return 64-bit results.
        Which complicates things for sparcv8plus.  */
     if (SPARC64) {
         /* We let the helper sign-extend SB and SW, but leave SL for here.  */
-        if (is_64 && (memop & ~MO_BSWAP) == MO_SL) {
+        if (is_64 && (memop & MO_SSIZE) == MO_SL) {
             tcg_out_arithi(s, data, TCG_REG_O0, 0, SHIFT_SRA);
         } else {
             tcg_out_mov(s, TCG_TYPE_REG, data, TCG_REG_O0);
         }
     } else {
-        if (s_bits == MO_64) {
+        if ((memop & MO_SIZE) == MO_64) {
             tcg_out_arithi(s, TCG_REG_O0, TCG_REG_O0, 32, SHIFT_SLLX);
             tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O1, 0, SHIFT_SRL);
             tcg_out_arith(s, data, TCG_REG_O0, TCG_REG_O1, ARITH_OR);
@@ -1147,20 +1149,21 @@
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_ld_opc[memop]);
+                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
 static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
-                            TCGMemOp memop, int memi)
+                            TCGMemOpIdx oi)
 {
+    TCGMemOp memop = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
-    TCGMemOp s_bits = memop & MO_SIZE;
+    unsigned memi = get_mmuidx(oi);
     TCGReg addrz, param;
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
                              offsetof(CPUTLBEntry, addr_write));
 
     /* The fast path is exactly one insn.  Thus we can perform the entire
@@ -1170,7 +1173,8 @@
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
-    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1, qemu_st_opc[memop]);
+    tcg_out_ldst_rr(s, data, addrz, TCG_REG_O1,
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 
     /* TLB Miss.  */
 
@@ -1180,17 +1184,17 @@
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, addr);
-    if (!SPARC64 && s_bits == MO_64) {
+    if (!SPARC64 && (memop & MO_SIZE) == MO_64) {
         /* Skip the high-part; we'll perform the extract in the trampoline.  */
         param++;
     }
     tcg_out_mov(s, TCG_TYPE_REG, param++, data);
 
-    func = qemu_st_trampoline[memop];
+    func = qemu_st_trampoline[memop & (MO_BSWAP | MO_SIZE)];
     assert(func != NULL);
     tcg_out_call_nodelay(s, func);
     /* delay slot */
-    tcg_out_movi(s, TCG_TYPE_REG, param, memi);
+    tcg_out_movi(s, TCG_TYPE_I32, param, oi);
 
     *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
@@ -1200,7 +1204,7 @@
     }
     tcg_out_ldst_rr(s, data, addr,
                     (GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_G0),
-                    qemu_st_opc[memop]);
+                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
 #endif /* CONFIG_SOFTMMU */
 }
 
@@ -1365,14 +1369,14 @@
         break;
 
     case INDEX_op_qemu_ld_i32:
-        tcg_out_qemu_ld(s, a0, a1, a2, args[3], false);
+        tcg_out_qemu_ld(s, a0, a1, a2, false);
         break;
     case INDEX_op_qemu_ld_i64:
-        tcg_out_qemu_ld(s, a0, a1, a2, args[3], true);
+        tcg_out_qemu_ld(s, a0, a1, a2, true);
         break;
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        tcg_out_qemu_st(s, a0, a1, a2, args[3]);
+        tcg_out_qemu_st(s, a0, a1, a2);
         break;
 
     case INDEX_op_ld32s_i64:
diff --git a/tcg/tcg-be-ldst.h b/tcg/tcg-be-ldst.h
index 429cba2..cafe8ed 100644
--- a/tcg/tcg-be-ldst.h
+++ b/tcg/tcg-be-ldst.h
@@ -25,13 +25,12 @@
 
 typedef struct TCGLabelQemuLdst {
     bool is_ld;             /* qemu_ld: true, qemu_st: false */
-    TCGMemOp opc;
+    TCGMemOpIdx oi;
     TCGType type;           /* result type of a load */
     TCGReg addrlo_reg;      /* reg index for low word of guest virtual addr */
     TCGReg addrhi_reg;      /* reg index for high word of guest virtual addr */
     TCGReg datalo_reg;      /* reg index for low word to be loaded or stored */
     TCGReg datahi_reg;      /* reg index for high word to be loaded or stored */
-    int mem_index;          /* soft MMU memory index */
     tcg_insn_unit *raddr;   /* gen code addr of the next IR of qemu_ld/st IR */
     tcg_insn_unit *label_ptr[2]; /* label pointers to be updated */
 } TCGLabelQemuLdst;
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
new file mode 100644
index 0000000..7d3de33
--- /dev/null
+++ b/tcg/tcg-op.c
@@ -0,0 +1,1935 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "tcg.h"
+#include "tcg-op.h"
+
+/* Reduce the number of ifdefs below.  This assumes that all uses of
+   TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+   the compiler can eliminate.  */
+#if TCG_TARGET_REG_BITS == 64
+extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
+extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
+#define TCGV_LOW  TCGV_LOW_link_error
+#define TCGV_HIGH TCGV_HIGH_link_error
+#endif
+
+/* Note that this is optimized for sequential allocation during translation,
+   up to and including filling in the forward link immediately.  We'll do
+   proper termination of the end of the list after we finish translation.  */
+
+static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
+{
+    int oi = ctx->gen_next_op_idx;
+    int ni = oi + 1;
+    int pi = oi - 1;
+
+    tcg_debug_assert(oi < OPC_BUF_SIZE);
+    ctx->gen_last_op_idx = oi;
+    ctx->gen_next_op_idx = ni;
+
+    ctx->gen_op_buf[oi] = (TCGOp){
+        .opc = opc,
+        .args = args,
+        .prev = pi,
+        .next = ni
+    };
+}
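+
+/* For example, emitting three ops in sequence leaves
+ *   op[0] = { .prev = -1, .next = 1 }
+ *   op[1] = { .prev =  0, .next = 2 }
+ *   op[2] = { .prev =  1, .next = 3 }
+ * where the tail's forward link already points one past the end; the list
+ * is only properly terminated (via gen_last_op_idx) once translation ends.
+ */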
+
+void tcg_gen_op1(TCGContext *ctx, TCGOpcode opc, TCGArg a1)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 1 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 1;
+    ctx->gen_opparam_buf[pi] = a1;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op2(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 2 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 2;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op3(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 3 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 3;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op4(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3, TCGArg a4)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 4 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 4;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op5(TCGContext *ctx, TCGOpcode opc, TCGArg a1,
+                 TCGArg a2, TCGArg a3, TCGArg a4, TCGArg a5)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 5 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 5;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+    ctx->gen_opparam_buf[pi + 4] = a5;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2,
+                 TCGArg a3, TCGArg a4, TCGArg a5, TCGArg a6)
+{
+    int pi = ctx->gen_next_parm_idx;
+
+    tcg_debug_assert(pi + 6 <= OPPARAM_BUF_SIZE);
+    ctx->gen_next_parm_idx = pi + 6;
+    ctx->gen_opparam_buf[pi + 0] = a1;
+    ctx->gen_opparam_buf[pi + 1] = a2;
+    ctx->gen_opparam_buf[pi + 2] = a3;
+    ctx->gen_opparam_buf[pi + 3] = a4;
+    ctx->gen_opparam_buf[pi + 4] = a5;
+    ctx->gen_opparam_buf[pi + 5] = a6;
+
+    tcg_emit_op(ctx, opc, pi);
+}
+
+/* 32 bit ops */
+
+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_add_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
+{
+    if (arg1 == 0 && TCG_TARGET_HAS_neg_i32) {
+        /* Don't recurse with tcg_gen_neg_i32.  */
+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg1);
+        tcg_gen_sub_i32(ret, t0, arg2);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
+{
+    TCGv_i32 t0;
+    /* Some cases can be optimized here.  */
+    switch (arg2) {
+    case 0:
+        tcg_gen_movi_i32(ret, 0);
+        return;
+    case 0xffffffffu:
+        tcg_gen_mov_i32(ret, arg1);
+        return;
+    case 0xffu:
+        /* Don't recurse with tcg_gen_ext8u_i32.  */
+        if (TCG_TARGET_HAS_ext8u_i32) {
+            tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffu:
+        if (TCG_TARGET_HAS_ext16u_i32) {
+            tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1);
+            return;
+        }
+        break;
+    }
+    t0 = tcg_const_i32(arg2);
+    tcg_gen_and_i32(ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* Some cases can be optimized here.  */
+    if (arg2 == -1) {
+        tcg_gen_movi_i32(ret, -1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_or_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i32) {
+        /* Don't recurse with tcg_gen_not_i32.  */
+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_xor_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_shl_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_shr_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_sar_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        tcg_gen_op4ii_i32(INDEX_op_brcond_i32, arg1, arg2, cond, label);
+    }
+}
+
+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, int label)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_brcond_i32(cond, arg1, t0, label);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
+                         TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_movi_i32(ret, 1);
+    } else if (cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i32(ret, 0);
+    } else {
+        tcg_gen_op4i_i32(INDEX_op_setcond_i32, ret, arg1, arg2, cond);
+    }
+}
+
+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
+                          TCGv_i32 arg1, int32_t arg2)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_setcond_i32(cond, ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+{
+    TCGv_i32 t0 = tcg_const_i32(arg2);
+    tcg_gen_mul_i32(ret, arg1, t0);
+    tcg_temp_free_i32(t0);
+}
+
+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_div_i32) {
+        tcg_gen_op3_i32(INDEX_op_div_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_sari_i32(t0, arg1, 31);
+        tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_div_i32(ret, arg1, arg2);
+    }
+}
+
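+/* Without a rem opcode, the remainder is recovered from the quotient
+ * via the identity a % b == a - (a / b) * b; e.g. 7 % 3 == 7 - 2 * 3
+ * == 1, and the identity holds for truncating signed division too.
+ */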
+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i32) {
+        tcg_gen_op3_i32(INDEX_op_rem_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_div_i32, t0, arg1, arg2);
+        tcg_gen_mul_i32(t0, t0, arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_sari_i32(t0, arg1, 31);
+        tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_rem_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_div_i32) {
+        tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_movi_i32(t0, 0);
+        tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_divu_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i32) {
+        tcg_gen_op3_i32(INDEX_op_remu_i32, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_divu_i32, t0, arg1, arg2);
+        tcg_gen_mul_i32(t0, t0, arg2);
+        tcg_gen_sub_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else if (TCG_TARGET_HAS_div2_i32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_movi_i32(t0, 0);
+        tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i32(t0);
+    } else {
+        gen_helper_remu_i32(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_andc_i32) {
+        tcg_gen_op3_i32(INDEX_op_andc_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_not_i32(t0, arg2);
+        tcg_gen_and_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_eqv_i32) {
+        tcg_gen_op3_i32(INDEX_op_eqv_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_xor_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_nand_i32) {
+        tcg_gen_op3_i32(INDEX_op_nand_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_and_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_nor_i32) {
+        tcg_gen_op3_i32(INDEX_op_nor_i32, ret, arg1, arg2);
+    } else {
+        tcg_gen_or_i32(ret, arg1, arg2);
+        tcg_gen_not_i32(ret, ret);
+    }
+}
+
+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_orc_i32) {
+        tcg_gen_op3_i32(INDEX_op_orc_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        tcg_gen_not_i32(t0, arg2);
+        tcg_gen_or_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i32) {
+        tcg_gen_op3_i32(INDEX_op_rotl_i32, ret, arg1, arg2);
+    } else {
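+        /* Rotate left by n bits == (x << n) | (x >> (32 - n)), with
+         * the right-shift count 32 - n computed at runtime.
+         */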
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shl_i32(t0, arg1, arg2);
+        tcg_gen_subfi_i32(t1, 32, arg2);
+        tcg_gen_shr_i32(t1, arg1, t1);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else if (TCG_TARGET_HAS_rot_i32) {
+        TCGv_i32 t0 = tcg_const_i32(arg2);
+        tcg_gen_rotl_i32(ret, arg1, t0);
+        tcg_temp_free_i32(t0);
+    } else {
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shli_i32(t0, arg1, arg2);
+        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i32) {
+        tcg_gen_op3_i32(INDEX_op_rotr_i32, ret, arg1, arg2);
+    } else {
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        tcg_gen_shr_i32(t0, arg1, arg2);
+        tcg_gen_subfi_i32(t1, 32, arg2);
+        tcg_gen_shl_i32(t1, arg1, t1);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 32);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i32(ret, arg1);
+    } else {
+        tcg_gen_rotli_i32(ret, arg1, 32 - arg2);
+    }
+}
+
+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
+                         unsigned int ofs, unsigned int len)
+{
+    uint32_t mask;
+    TCGv_i32 t1;
+
+    tcg_debug_assert(ofs < 32);
+    tcg_debug_assert(len <= 32);
+    tcg_debug_assert(ofs + len <= 32);
+
+    if (ofs == 0 && len == 32) {
+        tcg_gen_mov_i32(ret, arg2);
+        return;
+    }
+    if (TCG_TARGET_HAS_deposit_i32 && TCG_TARGET_deposit_i32_valid(ofs, len)) {
+        tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, arg1, arg2, ofs, len);
+        return;
+    }
+
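+    /* Generic expansion: clear the len-bit field at ofs in arg1, then
+     * OR in arg2 masked and shifted into position.  E.g. for ofs == 8,
+     * len == 8 the field mask is 0xff and mask << ofs is 0xff00.  When
+     * the field reaches bit 31, the pre-masking of arg2 is skipped,
+     * since the left shift already discards the excess high bits.
+     */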
+    mask = (1u << len) - 1;
+    t1 = tcg_temp_new_i32();
+
+    if (ofs + len < 32) {
+        tcg_gen_andi_i32(t1, arg2, mask);
+        tcg_gen_shli_i32(t1, t1, ofs);
+    } else {
+        tcg_gen_shli_i32(t1, arg2, ofs);
+    }
+    tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
+    tcg_gen_or_i32(ret, ret, t1);
+
+    tcg_temp_free_i32(t1);
+}
+
+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i32) {
+        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
+    } else {
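+        /* Branchless select: setcond leaves 0 or 1 in t0, negation
+         * turns that into an all-zeros or all-ones mask, and then
+         * ret = (v1 & mask) | (v2 & ~mask).
+         */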
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_setcond_i32(cond, t0, c1, c2);
+        tcg_gen_neg_i32(t0, t0);
+        tcg_gen_and_i32(t1, v1, t0);
+        tcg_gen_andc_i32(ret, v2, t0);
+        tcg_gen_or_i32(ret, ret, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_add2_i32) {
+        tcg_gen_op6_i32(INDEX_op_add2_i32, rl, rh, al, ah, bl, bh);
+    } else {
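+        /* Fall back on 64-bit arithmetic: concatenate the 32-bit
+         * halves, add once, and split the sum back into low and
+         * high words.
+         */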
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i32) {
+        tcg_gen_op6_i32(INDEX_op_sub2_i32, rl, rh, al, ah, bl, bh);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_concat_i32_i64(t0, al, ah);
+        tcg_gen_concat_i32_i64(t1, bl, bh);
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i32) {
+        tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_muluh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_extu_i32_i64(t0, arg1);
+        tcg_gen_extu_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i32) {
+        tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_mulsh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
+    } else if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        TCGv_i32 t2 = tcg_temp_new_i32();
+        TCGv_i32 t3 = tcg_temp_new_i32();
+        tcg_gen_mulu2_i32(t0, t1, arg1, arg2);
+        /* Adjust for negative inputs.  */
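+        /* Treating a negative 32-bit input as unsigned adds 2^32 to
+         * its value, so the unsigned high word overstates the signed
+         * one by (arg1 < 0 ? arg2 : 0) + (arg2 < 0 ? arg1 : 0); the
+         * sign masks below select exactly those correction terms.
+         */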
+        tcg_gen_sari_i32(t2, arg1, 31);
+        tcg_gen_sari_i32(t3, arg2, 31);
+        tcg_gen_and_i32(t2, t2, arg2);
+        tcg_gen_and_i32(t3, t3, arg1);
+        tcg_gen_sub_i32(rh, t1, t2);
+        tcg_gen_sub_i32(rh, rh, t3);
+        tcg_gen_mov_i32(rl, t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+        tcg_temp_free_i32(t2);
+        tcg_temp_free_i32(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t0, arg1);
+        tcg_gen_ext_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext8s_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext8s_i32, ret, arg);
+    } else {
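+        /* Sign-extend with shifts: bit 7 is moved up into the sign
+         * position and an arithmetic shift drags it back down across
+         * the upper 24 bits, e.g. 0xff << 24 >> 24 (signed) == -1.
+         */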
+        tcg_gen_shli_i32(ret, arg, 24);
+        tcg_gen_sari_i32(ret, ret, 24);
+    }
+}
+
+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext16s_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext16s_i32, ret, arg);
+    } else {
+        tcg_gen_shli_i32(ret, arg, 16);
+        tcg_gen_sari_i32(ret, ret, 16);
+    }
+}
+
+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext8u_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg);
+    } else {
+        tcg_gen_andi_i32(ret, arg, 0xffu);
+    }
+}
+
+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_ext16u_i32) {
+        tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg);
+    } else {
+        tcg_gen_andi_i32(ret, arg, 0xffffu);
+    }
+}
+
+/* Note: we assume the two high bytes are set to zero */
+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_bswap16_i32) {
+        tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+
+        tcg_gen_ext8u_i32(t0, arg);
+        tcg_gen_shli_i32(t0, t0, 8);
+        tcg_gen_shri_i32(ret, arg, 8);
+        tcg_gen_or_i32(ret, ret, t0);
+        tcg_temp_free_i32(t0);
+    }
+}
+
+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_bswap32_i32) {
+        tcg_gen_op2_i32(INDEX_op_bswap32_i32, ret, arg);
+    } else {
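+        /* Shift-and-mask expansion: for input bytes ABCD (A most
+         * significant), t0 accumulates D000, then 0C00, then 00B0,
+         * and the final right shift contributes 000A, yielding DCBA.
+         */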
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+
+        tcg_gen_shli_i32(t0, arg, 24);
+
+        tcg_gen_andi_i32(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i32(t1, t1, 8);
+        tcg_gen_or_i32(t0, t0, t1);
+
+        tcg_gen_shri_i32(t1, arg, 8);
+        tcg_gen_andi_i32(t1, t1, 0x0000ff00);
+        tcg_gen_or_i32(t0, t0, t1);
+
+        tcg_gen_shri_i32(t1, arg, 24);
+        tcg_gen_or_i32(ret, t0, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+/* 64-bit ops */
+
+#if TCG_TARGET_REG_BITS == 32
+/* These are all inline for TCG_TARGET_REG_BITS == 64.  */
+
+void tcg_gen_discard_i64(TCGv_i64 arg)
+{
+    tcg_gen_discard_i32(TCGV_LOW(arg));
+    tcg_gen_discard_i32(TCGV_HIGH(arg));
+}
+
+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+}
+
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
+{
+    tcg_gen_movi_i32(TCGV_LOW(ret), arg);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), arg >> 32);
+}
+
+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld8u_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld16u_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld16s_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+}
+
+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+}
+
+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+    /* Since arg2 and ret have different types,
+       they cannot be the same temporary */
+#ifdef TCG_TARGET_WORDS_BIGENDIAN
+    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset);
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset + 4);
+#else
+    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
+    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset + 4);
+#endif
+}
+
+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
+{
+#ifdef TCG_TARGET_WORDS_BIGENDIAN
+    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset);
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset + 4);
+#else
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
+    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset + 4);
+#endif
+}
+
+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_and_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_xor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+    tcg_gen_xor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+}
+
+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_shl_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_shr_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    gen_helper_sar_i64(ret, arg1, arg2);
+}
+
+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    TCGv_i64 t0;
+    TCGv_i32 t1;
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i32();
+
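+    /* Schoolbook multiply modulo 2^64: the product of the low words
+     * supplies the base, the two cross products affect only the high
+     * word, and the high x high term would land above bit 63 and is
+     * dropped.
+     */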
+    tcg_gen_mulu2_i32(TCGV_LOW(t0), TCGV_HIGH(t0),
+                      TCGV_LOW(arg1), TCGV_LOW(arg2));
+
+    tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
+    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
+    tcg_gen_mul_i32(t1, TCGV_HIGH(arg1), TCGV_LOW(arg2));
+    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
+
+    tcg_gen_mov_i64(ret, t0);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i32(t1);
+}
+#endif /* TCG_TARGET_REG_BITS == 32 */
+
+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_add_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
+{
+    if (arg1 == 0 && TCG_TARGET_HAS_neg_i64) {
+        /* Don't recurse with tcg_gen_neg_i64.  */
+        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg1);
+        tcg_gen_sub_i64(ret, t0, arg2);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
+{
+    TCGv_i64 t0;
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+
+    /* Some cases can be optimized here.  */
+    switch (arg2) {
+    case 0:
+        tcg_gen_movi_i64(ret, 0);
+        return;
+    case 0xffffffffffffffffull:
+        tcg_gen_mov_i64(ret, arg1);
+        return;
+    case 0xffull:
+        /* Don't recurse with tcg_gen_ext8u_i64.  */
+        if (TCG_TARGET_HAS_ext8u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffu:
+        if (TCG_TARGET_HAS_ext16u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1);
+            return;
+        }
+        break;
+    case 0xffffffffull:
+        if (TCG_TARGET_HAS_ext32u_i64) {
+            tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1);
+            return;
+        }
+        break;
+    }
+    t0 = tcg_const_i64(arg2);
+    tcg_gen_and_i64(ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_ori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+    /* Some cases can be optimized here.  */
+    if (arg2 == -1) {
+        tcg_gen_movi_i64(ret, -1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_or_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_xori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
+        tcg_gen_xori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
+        return;
+    }
+    /* Some cases can be optimized here.  */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i64) {
+        /* Don't recurse with tcg_gen_not_i64.  */
+        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_xor_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
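+/* On a 32-bit host, a 64-bit shift by a constant is open-coded from
+ * 32-bit parts: a count of 32 or more moves one half into the other,
+ * while a shorter count shifts each half and ORs in the bits that
+ * cross the word boundary.
+ */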
+static inline void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
+                                      unsigned c, bool right, bool arith)
+{
+    tcg_debug_assert(c < 64);
+    if (c == 0) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
+        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
+    } else if (c >= 32) {
+        c -= 32;
+        if (right) {
+            if (arith) {
+                tcg_gen_sari_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
+                tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), 31);
+            } else {
+                tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
+                tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+            }
+        } else {
+            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_LOW(arg1), c);
+            tcg_gen_movi_i32(TCGV_LOW(ret), 0);
+        }
+    } else {
+        TCGv_i32 t0, t1;
+
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+        if (right) {
+            tcg_gen_shli_i32(t0, TCGV_HIGH(arg1), 32 - c);
+            if (arith) {
+                tcg_gen_sari_i32(t1, TCGV_HIGH(arg1), c);
+            } else {
+                tcg_gen_shri_i32(t1, TCGV_HIGH(arg1), c);
+            }
+            tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_LOW(arg1), c);
+            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t0);
+            tcg_gen_mov_i32(TCGV_HIGH(ret), t1);
+        } else {
+            tcg_gen_shri_i32(t0, TCGV_LOW(arg1), 32 - c);
+            /* Note: ret can be the same as arg1, so we use t1 */
+            tcg_gen_shli_i32(t1, TCGV_LOW(arg1), c);
+            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), c);
+            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t0);
+            tcg_gen_mov_i32(TCGV_LOW(ret), t1);
+        }
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_shl_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_shr_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1);
+    } else if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_sar_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_gen_op6ii_i32(INDEX_op_brcond2_i32, TCGV_LOW(arg1),
+                              TCGV_HIGH(arg1), TCGV_LOW(arg2),
+                              TCGV_HIGH(arg2), cond, label);
+        } else {
+            tcg_gen_op4ii_i64(INDEX_op_brcond_i64, arg1, arg2, cond, label);
+        }
+    }
+}
+
+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, int label)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_br(label);
+    } else if (cond != TCG_COND_NEVER) {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_brcond_i64(cond, arg1, t0, label);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
+                         TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_movi_i64(ret, 1);
+    } else if (cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i64(ret, 0);
+    } else {
+        if (TCG_TARGET_REG_BITS == 32) {
+            tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
+                             TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                             TCGV_LOW(arg2), TCGV_HIGH(arg2), cond);
+            tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+        } else {
+            tcg_gen_op4i_i64(INDEX_op_setcond_i64, ret, arg1, arg2, cond);
+        }
+    }
+}
+
+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
+                          TCGv_i64 arg1, int64_t arg2)
+{
+    TCGv_i64 t0 = tcg_const_i64(arg2);
+    tcg_gen_setcond_i64(cond, ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+{
+    TCGv_i64 t0 = tcg_const_i64(arg2);
+    tcg_gen_mul_i64(ret, arg1, t0);
+    tcg_temp_free_i64(t0);
+}
+
+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_div_i64) {
+        tcg_gen_op3_i64(INDEX_op_div_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_sari_i64(t0, arg1, 63);
+        tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_div_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i64) {
+        tcg_gen_op3_i64(INDEX_op_rem_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_div_i64, t0, arg1, arg2);
+        tcg_gen_mul_i64(t0, t0, arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_sari_i64(t0, arg1, 63);
+        tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_rem_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_div_i64) {
+        tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_movi_i64(t0, 0);
+        tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_divu_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rem_i64) {
+        tcg_gen_op3_i64(INDEX_op_remu_i64, ret, arg1, arg2);
+    } else if (TCG_TARGET_HAS_div_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_divu_i64, t0, arg1, arg2);
+        tcg_gen_mul_i64(t0, t0, arg2);
+        tcg_gen_sub_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else if (TCG_TARGET_HAS_div2_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_movi_i64(t0, 0);
+        tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2);
+        tcg_temp_free_i64(t0);
+    } else {
+        gen_helper_remu_i64(ret, arg1, arg2);
+    }
+}
+
+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext8s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext8s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext8s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 56);
+        tcg_gen_sari_i64(ret, ret, 56);
+    }
+}
+
+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext16s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext16s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext16s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 48);
+        tcg_gen_sari_i64(ret, ret, 48);
+    }
+}
+
+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else if (TCG_TARGET_HAS_ext32s_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext32s_i64, ret, arg);
+    } else {
+        tcg_gen_shli_i64(ret, arg, 32);
+        tcg_gen_sari_i64(ret, ret, 32);
+    }
+}
+
+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext8u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext8u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffu);
+    }
+}
+
+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_ext16u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext16u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffffu);
+    }
+}
+
+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_ext32u_i64) {
+        tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg);
+    } else {
+        tcg_gen_andi_i64(ret, arg, 0xffffffffu);
+    }
+}
+
+/* Note: we assume the six high bytes are set to zero */
+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_bswap16_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_bswap16_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap16_i64, ret, arg);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+
+        tcg_gen_ext8u_i64(t0, arg);
+        tcg_gen_shli_i64(t0, t0, 8);
+        tcg_gen_shri_i64(ret, arg, 8);
+        tcg_gen_or_i64(ret, ret, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+/* Note: we assume the four high bytes are set to zero */
+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_bswap32_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else if (TCG_TARGET_HAS_bswap32_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap32_i64, ret, arg);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+
+        tcg_gen_shli_i64(t0, arg, 24);
+        tcg_gen_ext32u_i64(t0, t0);
+
+        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i64(t1, t1, 8);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 8);
+        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 24);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0, t1;
+        t0 = tcg_temp_new_i32();
+        t1 = tcg_temp_new_i32();
+
+        tcg_gen_bswap32_i32(t0, TCGV_LOW(arg));
+        tcg_gen_bswap32_i32(t1, TCGV_HIGH(arg));
+        tcg_gen_mov_i32(TCGV_LOW(ret), t1);
+        tcg_gen_mov_i32(TCGV_HIGH(ret), t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    } else if (TCG_TARGET_HAS_bswap64_i64) {
+        tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+
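+        /* Same shift-and-mask scheme as bswap32, extended to eight
+         * bytes: each input byte is isolated, moved to its mirrored
+         * position, and the eight partial results are ORed together.
+         */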
+        tcg_gen_shli_i64(t0, arg, 56);
+
+        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
+        tcg_gen_shli_i64(t1, t1, 40);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_andi_i64(t1, arg, 0x00ff0000);
+        tcg_gen_shli_i64(t1, t1, 24);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_andi_i64(t1, arg, 0xff000000);
+        tcg_gen_shli_i64(t1, t1, 8);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 8);
+        tcg_gen_andi_i64(t1, t1, 0xff000000);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 24);
+        tcg_gen_andi_i64(t1, t1, 0x00ff0000);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 40);
+        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
+        tcg_gen_or_i64(t0, t0, t1);
+
+        tcg_gen_shri_i64(t1, arg, 56);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_not_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_not_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+    } else if (TCG_TARGET_HAS_not_i64) {
+        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg);
+    } else {
+        tcg_gen_xori_i64(ret, arg, -1);
+    }
+}
+
+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_andc_i64) {
+        tcg_gen_op3_i64(INDEX_op_andc_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_not_i64(t0, arg2);
+        tcg_gen_and_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_eqv_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_eqv_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_eqv_i64) {
+        tcg_gen_op3_i64(INDEX_op_eqv_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_xor_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_nand_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_nand_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_nand_i64) {
+        tcg_gen_op3_i64(INDEX_op_nand_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_and_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_nor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_nor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_nor_i64) {
+        tcg_gen_op3_i64(INDEX_op_nor_i64, ret, arg1, arg2);
+    } else {
+        tcg_gen_or_i64(ret, arg1, arg2);
+        tcg_gen_not_i64(ret, ret);
+    }
+}
+
+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_orc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_orc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
+    } else if (TCG_TARGET_HAS_orc_i64) {
+        tcg_gen_op3_i64(INDEX_op_orc_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_not_i64(t0, arg2);
+        tcg_gen_or_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i64) {
+        tcg_gen_op3_i64(INDEX_op_rotl_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_shl_i64(t0, arg1, arg2);
+        tcg_gen_subfi_i64(t1, 64, arg2);
+        tcg_gen_shr_i64(t1, arg1, t1);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else if (TCG_TARGET_HAS_rot_i64) {
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_rotl_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_shli_i64(t0, arg1, arg2);
+        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_rot_i64) {
+        tcg_gen_op3_i64(INDEX_op_rotr_i64, ret, arg1, arg2);
+    } else {
+        TCGv_i64 t0, t1;
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_shr_i64(t0, arg1, arg2);
+        tcg_gen_subfi_i64(t1, 64, arg2);
+        tcg_gen_shl_i64(t1, arg1, t1);
+        tcg_gen_or_i64(ret, t0, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+{
+    tcg_debug_assert(arg2 < 64);
+    /* some cases can be optimized here */
+    if (arg2 == 0) {
+        tcg_gen_mov_i64(ret, arg1);
+    } else {
+        tcg_gen_rotli_i64(ret, arg1, 64 - arg2);
+    }
+}
+
+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
+                         unsigned int ofs, unsigned int len)
+{
+    uint64_t mask;
+    TCGv_i64 t1;
+
+    tcg_debug_assert(ofs < 64);
+    tcg_debug_assert(len <= 64);
+    tcg_debug_assert(ofs + len <= 64);
+
+    if (ofs == 0 && len == 64) {
+        tcg_gen_mov_i64(ret, arg2);
+        return;
+    }
+    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(ofs, len)) {
+        tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, arg1, arg2, ofs, len);
+        return;
+    }
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        if (ofs >= 32) {
+            tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
+                                TCGV_LOW(arg2), ofs - 32, len);
+            tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
+            return;
+        }
+        if (ofs + len <= 32) {
+            tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
+                                TCGV_LOW(arg2), ofs, len);
+            tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
+            return;
+        }
+    }
+
+    mask = (1ull << len) - 1;
+    t1 = tcg_temp_new_i64();
+
+    if (ofs + len < 64) {
+        tcg_gen_andi_i64(t1, arg2, mask);
+        tcg_gen_shli_i64(t1, t1, ofs);
+    } else {
+        tcg_gen_shli_i64(t1, arg2, ofs);
+    }
+    tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
+    tcg_gen_or_i64(ret, ret, t1);
+
+    tcg_temp_free_i64(t1);
+}
+
+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
+                         TCGV_LOW(c1), TCGV_HIGH(c1),
+                         TCGV_LOW(c2), TCGV_HIGH(c2), cond);
+
+        if (TCG_TARGET_HAS_movcond_i32) {
+            tcg_gen_movi_i32(t1, 0);
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
+                                TCGV_LOW(v1), TCGV_LOW(v2));
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
+                                TCGV_HIGH(v1), TCGV_HIGH(v2));
+        } else {
+            tcg_gen_neg_i32(t0, t0);
+
+            tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
+            tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
+            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
+
+            tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
+            tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
+            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
+        }
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    } else if (TCG_TARGET_HAS_movcond_i64) {
+        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_setcond_i64(cond, t0, c1, c2);
+        tcg_gen_neg_i64(t0, t0);
+        tcg_gen_and_i64(t1, v1, t0);
+        tcg_gen_andc_i64(ret, v2, t0);
+        tcg_gen_or_i64(ret, ret, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_add2_i64) {
+        tcg_gen_op6_i64(INDEX_op_add2_i64, rl, rh, al, ah, bl, bh);
+    } else {
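+        /* Generic double-word add: unsigned overflow of the low half
+         * is detected as (al + bl) < al, which is 1 exactly when the
+         * addition wrapped, and that carry feeds the high half.
+         */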
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_add_i64(t0, al, bl);
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, al);
+        tcg_gen_add_i64(rh, ah, bh);
+        tcg_gen_add_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
+{
+    if (TCG_TARGET_HAS_sub2_i64) {
+        tcg_gen_op6_i64(INDEX_op_sub2_i64, rl, rh, al, ah, bl, bh);
+    } else {
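+        /* The borrow out of the low half is simply al < bl, which is
+         * subtracted from the high-half difference.
+         */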
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_sub_i64(t0, al, bl);
+        tcg_gen_setcond_i64(TCG_COND_LTU, t1, al, bl);
+        tcg_gen_sub_i64(rh, ah, bh);
+        tcg_gen_sub_i64(rh, rh, t1);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_mulu2_i64) {
+        tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_muluh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        gen_helper_muluh_i64(rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    if (TCG_TARGET_HAS_muls2_i64) {
+        tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
+    } else if (TCG_TARGET_HAS_mulsh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
+    } else if (TCG_TARGET_HAS_mulu2_i64 || TCG_TARGET_HAS_muluh_i64) {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+        TCGv_i64 t3 = tcg_temp_new_i64();
+        tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
+        /* Adjust for negative inputs.  */
+        tcg_gen_sari_i64(t2, arg1, 63);
+        tcg_gen_sari_i64(t3, arg2, 63);
+        tcg_gen_and_i64(t2, t2, arg2);
+        tcg_gen_and_i64(t3, t3, arg1);
+        tcg_gen_sub_i64(rh, t1, t2);
+        tcg_gen_sub_i64(rh, rh, t3);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        tcg_temp_free_i64(t2);
+        tcg_temp_free_i64(t3);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        tcg_gen_mul_i64(t0, arg1, arg2);
+        gen_helper_mulsh_i64(rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t0);
+        tcg_temp_free_i64(t0);
+    }
+}
+
+/* Size changing operations.  */
+
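+/* Truncation with a built-in right shift: the 32-bit result holds the
+ * bits of the 64-bit source starting at bit count, zero-filled at the
+ * top.  On a 32-bit host a count of 32 or more reads only from the
+ * high word.
+ */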
+void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned count)
+{
+    tcg_debug_assert(count < 64);
+    if (TCG_TARGET_REG_BITS == 32) {
+        if (count >= 32) {
+            tcg_gen_shri_i32(ret, TCGV_HIGH(arg), count - 32);
+        } else if (count == 0) {
+            tcg_gen_mov_i32(ret, TCGV_LOW(arg));
+        } else {
+            TCGv_i64 t = tcg_temp_new_i64();
+            tcg_gen_shri_i64(t, arg, count);
+            tcg_gen_mov_i32(ret, TCGV_LOW(t));
+            tcg_temp_free_i64(t);
+        }
+    } else if (TCG_TARGET_HAS_trunc_shr_i32) {
+        tcg_gen_op3i_i32(INDEX_op_trunc_shr_i32, ret,
+                         MAKE_TCGV_I32(GET_TCGV_I64(arg)), count);
+    } else if (count == 0) {
+        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
+    } else {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_shri_i64(t, arg, count);
+        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
+        tcg_temp_free_i64(t);
+    }
+}
+
+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), arg);
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    } else {
+        /* Note: we assume the target supports move between
+           32 and 64 bit registers.  */
+        tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+    }
+}
+
+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(ret), arg);
+        tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
+    } else {
+        /* Note: we assume the target supports move between
+           32 and 64 bit registers.  */
+        tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
+    }
+}
+
+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high)
+{
+    TCGv_i64 tmp;
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(TCGV_LOW(dest), low);
+        tcg_gen_mov_i32(TCGV_HIGH(dest), high);
+        return;
+    }
+
+    tmp = tcg_temp_new_i64();
+    /* These extensions are only needed for type correctness.
+       We may be able to do better given target specific information.  */
+    tcg_gen_extu_i32_i64(tmp, high);
+    tcg_gen_extu_i32_i64(dest, low);
+    /* If deposit is available, use it.  Otherwise use the extra
+       knowledge that we have of the zero-extensions above.  */
+    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(32, 32)) {
+        tcg_gen_deposit_i64(dest, dest, tmp, 32, 32);
+    } else {
+        tcg_gen_shli_i64(tmp, tmp, 32);
+        tcg_gen_or_i64(dest, dest, tmp);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_mov_i32(lo, TCGV_LOW(arg));
+        tcg_gen_mov_i32(hi, TCGV_HIGH(arg));
+    } else {
+        tcg_gen_trunc_shr_i64_i32(lo, arg, 0);
+        tcg_gen_trunc_shr_i64_i32(hi, arg, 32);
+    }
+}
+
+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg)
+{
+    tcg_gen_ext32u_i64(lo, arg);
+    tcg_gen_shri_i64(hi, arg, 32);
+}
+
+/* QEMU specific operations.  */
+
+void tcg_gen_goto_tb(unsigned idx)
+{
+    /* We only support two chained exits.  */
+    tcg_debug_assert(idx <= 1);
+#ifdef CONFIG_DEBUG_TCG
+    /* Verify that we haven't seen this numbered exit before.  */
+    tcg_debug_assert((tcg_ctx.goto_tb_issue_mask & (1 << idx)) == 0);
+    tcg_ctx.goto_tb_issue_mask |= 1 << idx;
+#endif
+    tcg_gen_op1i(INDEX_op_goto_tb, idx);
+}
+
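+/* Normalize a memory op before emission: a byte access has no byte
+ * order, a 32-bit load into a 32-bit destination needs no sign flag,
+ * and a store never carries one.  Canonical forms spare the backends
+ * from matching redundant encodings.
+ */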
+static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
+{
+    /* Trigger the asserts within as early as possible.  */
+    (void)get_alignment_bits(op);
+
+    switch (op & MO_SIZE) {
+    case MO_8:
+        op &= ~MO_BSWAP;
+        break;
+    case MO_16:
+        break;
+    case MO_32:
+        if (!is64) {
+            op &= ~MO_SIGN;
+        }
+        break;
+    case MO_64:
+        if (!is64) {
+            tcg_abort();
+        }
+        break;
+    }
+    if (st) {
+        op &= ~MO_SIGN;
+    }
+    return op;
+}
+
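+/* The memory op and the mmu index travel together as one TCGMemOpIdx
+ * constant argument; how many operand words the address occupies
+ * depends on the guest and host word sizes, hence the combinations
+ * below.
+ */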
+static void gen_ldst_i32(TCGOpcode opc, TCGv_i32 val, TCGv addr,
+                         TCGMemOp memop, TCGArg idx)
+{
+    TCGMemOpIdx oi = make_memop_idx(memop, idx);
+#if TARGET_LONG_BITS == 32
+    tcg_gen_op3i_i32(opc, val, addr, oi);
+#else
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op4i_i32(opc, val, TCGV_LOW(addr), TCGV_HIGH(addr), oi);
+    } else {
+        tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(val), GET_TCGV_I64(addr), oi);
+    }
+#endif
+}
+
+static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
+                         TCGMemOp memop, TCGArg idx)
+{
+    TCGMemOpIdx oi = make_memop_idx(memop, idx);
+#if TARGET_LONG_BITS == 32
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op4i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val), addr, oi);
+    } else {
+        tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(val), GET_TCGV_I32(addr), oi);
+    }
+#else
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_gen_op5i_i32(opc, TCGV_LOW(val), TCGV_HIGH(val),
+                         TCGV_LOW(addr), TCGV_HIGH(addr), oi);
+    } else {
+        tcg_gen_op3i_i64(opc, val, addr, oi);
+    }
+#endif
+}
+
+void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 0);
+    gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    memop = tcg_canonicalize_memop(memop, 0, 1);
+    gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
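+        /* Sub-64-bit loads need only the 32-bit path; the high half of
+         * the destination pair is then filled from the sign bit or
+         * with zero.
+         */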
+        tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
+        if (memop & MO_SIGN) {
+            tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31);
+        } else {
+            tcg_gen_movi_i32(TCGV_HIGH(val), 0);
+        }
+        return;
+    }
+
+    memop = tcg_canonicalize_memop(memop, 1, 0);
+    gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx);
+}
+
+void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
+{
+    if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
+        tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
+        return;
+    }
+
+    memop = tcg_canonicalize_memop(memop, 1, 1);
+    gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 019dd9b..96adf9a 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -21,359 +21,309 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+
 #include "tcg.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
 
-int gen_new_label(void);
+/* Basic output routines.  Not for general consumption.  */
 
-static inline void tcg_gen_op0(TCGOpcode opc)
+void tcg_gen_op1(TCGContext *, TCGOpcode, TCGArg);
+void tcg_gen_op2(TCGContext *, TCGOpcode, TCGArg, TCGArg);
+void tcg_gen_op3(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op4(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op5(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg,
+                 TCGArg, TCGArg);
+void tcg_gen_op6(TCGContext *, TCGOpcode, TCGArg, TCGArg, TCGArg,
+                 TCGArg, TCGArg, TCGArg);
+
+
+static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
+    tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_I32(a1));
 }
 
-static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 arg1)
+static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
+    tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_I64(a1));
 }
 
-static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 arg1)
+static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
+    tcg_gen_op1(&tcg_ctx, opc, a1);
 }
 
-static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg arg1)
+static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = arg1;
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2));
 }
 
-static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2)
+static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2));
 }
 
-static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I32(a1), a2);
 }
 
-static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGArg arg2)
+static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_I64(a1), a2);
 }
 
-static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGArg arg2)
+static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op2(&tcg_ctx, opc, a1, a2);
 }
 
-static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg arg1, TCGArg arg2)
+static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
+                                   TCGv_i32 a2, TCGv_i32 a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = arg1;
-    *tcg_ctx.gen_opparam_ptr++ = arg2;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(a1),
+                GET_TCGV_I32(a2), GET_TCGV_I32(a3));
 }
 
-static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3)
+static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
+                                   TCGv_i64 a2, TCGv_i64 a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(a1),
+                GET_TCGV_I64(a2), GET_TCGV_I64(a3));
 }
 
-static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3)
+static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
+                                    TCGv_i32 a2, TCGArg a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2), a3);
 }
 
-static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                    TCGv_i32 arg2, TCGArg arg3)
+static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
+                                    TCGv_i64 a2, TCGArg a3)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-}
-
-static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                    TCGv_i64 arg2, TCGArg arg3)
-{
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2), a3);
 }
 
 static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
                                        TCGv_ptr base, TCGArg offset)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_PTR(base);
-    *tcg_ctx.gen_opparam_ptr++ = offset;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I32(val), GET_TCGV_PTR(base), offset);
 }
 
 static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
                                        TCGv_ptr base, TCGArg offset)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_PTR(base);
-    *tcg_ctx.gen_opparam_ptr++ = offset;
+    tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_I64(val), GET_TCGV_PTR(base), offset);
 }
 
-static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4)
+static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4));
 }
 
-static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4)
+static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4));
 }
 
-static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGArg arg4)
+static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), a4);
 }
 
-static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGArg arg4)
+static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), a4);
 }
 
-static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                     TCGArg arg3, TCGArg arg4)
+static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGArg a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2), a3, a4);
 }
 
-static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                     TCGArg arg3, TCGArg arg4)
+static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGArg a3, TCGArg a4)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = arg3;
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
+    tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2), a3, a4);
 }
 
-static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5)
+static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5));
 }
 
-static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5)
+static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5));
 }
 
-static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGv_i32 arg4, TCGArg arg5)
+static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), a5);
 }
 
-static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGv_i64 arg4, TCGArg arg5)
+static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5);
 }
 
-static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                     TCGv_i32 arg2, TCGv_i32 arg3,
-                                     TCGArg arg4, TCGArg arg5)
+static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), a4, a5);
 }
 
-static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                     TCGv_i64 arg2, TCGv_i64 arg3,
-                                     TCGArg arg4, TCGArg arg5)
+static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = arg4;
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
+    tcg_gen_op5(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), a4, a5);
 }
 
-static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                   TCGv_i32 arg3, TCGv_i32 arg4, TCGv_i32 arg5,
-                                   TCGv_i32 arg6)
+static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4,
+                                   TCGv_i32 a5, TCGv_i32 a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg6);
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5),
+                GET_TCGV_I32(a6));
 }
 
-static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                   TCGv_i64 arg3, TCGv_i64 arg4, TCGv_i64 arg5,
-                                   TCGv_i64 arg6)
+static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4,
+                                   TCGv_i64 a5, TCGv_i64 a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg6);
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5),
+                GET_TCGV_I64(a6));
 }
 
-static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
-                                    TCGv_i32 arg3, TCGv_i32 arg4,
-                                    TCGv_i32 arg5, TCGArg arg6)
+static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4,
+                                    TCGv_i32 a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), GET_TCGV_I32(a5), a6);
 }
 
-static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 arg1, TCGv_i64 arg2,
-                                    TCGv_i64 arg3, TCGv_i64 arg4,
-                                    TCGv_i64 arg5, TCGArg arg6)
+static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4,
+                                    TCGv_i64 a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg5);
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), GET_TCGV_I64(a5), a6);
 }
 
-static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 arg1,
-                                     TCGv_i32 arg2, TCGv_i32 arg3,
-                                     TCGv_i32 arg4, TCGArg arg5, TCGArg arg6)
+static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGv_i32 a4,
+                                     TCGArg a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I32(a1), GET_TCGV_I32(a2),
+                GET_TCGV_I32(a3), GET_TCGV_I32(a4), a5, a6);
 }
 
-static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 arg1,
-                                     TCGv_i64 arg2, TCGv_i64 arg3,
-                                     TCGv_i64 arg4, TCGArg arg5, TCGArg arg6)
+static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGv_i64 a4,
+                                     TCGArg a5, TCGArg a6)
 {
-    *tcg_ctx.gen_opc_ptr++ = opc;
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg1);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg2);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg3);
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(arg4);
-    *tcg_ctx.gen_opparam_ptr++ = arg5;
-    *tcg_ctx.gen_opparam_ptr++ = arg6;
+    tcg_gen_op6(&tcg_ctx, opc, GET_TCGV_I64(a1), GET_TCGV_I64(a2),
+                GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5, a6);
 }
 
-static inline void tcg_add_param_i32(TCGv_i32 val)
-{
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
-}
 
-static inline void tcg_add_param_i64(TCGv_i64 val)
-{
-#if TCG_TARGET_REG_BITS == 32
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_LOW(val));
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(TCGV_HIGH(val));
-#else
-    *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I64(val);
-#endif
-}
+/* Generic ops.  */
+
+int gen_new_label(void);
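+/* gen_new_label() returns a fresh label id; gen_set_label() below binds
+ * it at the current insertion point and tcg_gen_br() jumps to it. */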
 
 static inline void gen_set_label(int n)
 {
-    tcg_gen_op1i(INDEX_op_set_label, n);
+    tcg_gen_op1(&tcg_ctx, INDEX_op_set_label, n);
 }
 
 static inline void tcg_gen_br(int label)
 {
-    tcg_gen_op1i(INDEX_op_br, label);
+    tcg_gen_op1(&tcg_ctx, INDEX_op_br, label);
+}
+
+
+/* Helper calls. */
+
+/* 32 bit ops */
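+/* These expanders are now declared here and defined out of line in
+ * tcg/tcg-op.c; only trivial single-opcode wrappers remain inline below. */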
+
+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, int label);
+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, int label);
+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
+                         TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
+                          TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
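+/* add2/sub2 form a 64-bit result from the {ah:al} and {bh:bl} pairs into
+ * {rh:rl}; mulu2/muls2 likewise return the full 64-bit product as rl/rh. */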
+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
+
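+/* Discarding a temp declares its current value dead, so the optimizer
+ * and register allocator can drop any pending writeback. */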
+static inline void tcg_gen_discard_i32(TCGv_i32 arg)
+{
+    tcg_gen_op1_i32(INDEX_op_discard, arg);
 }
 
 static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    if (!TCGV_EQUAL_I32(ret, arg))
+    if (!TCGV_EQUAL_I32(ret, arg)) {
         tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
+    }
 }
 
 static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
@@ -381,44 +331,50 @@
     tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg);
 }
 
-/* 32 bit ops */
-
-static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
 }
 
-static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
 }
@@ -428,126 +384,24 @@
     tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_add_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
-{
-    TCGv_i32 t0 = tcg_const_i32(arg1);
-    tcg_gen_sub_i32(ret, t0, arg2);
-    tcg_temp_free_i32(t0);
-}
-
-static inline void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
-{
-    TCGv_i32 t0;
-    /* Some cases can be optimized here.  */
-    switch (arg2) {
-    case 0:
-        tcg_gen_movi_i32(ret, 0);
-        return;
-    case 0xffffffffu:
-        tcg_gen_mov_i32(ret, arg1);
-        return;
-    case 0xffu:
-        /* Don't recurse with tcg_gen_ext8u_i32.  */
-        if (TCG_TARGET_HAS_ext8u_i32) {
-            tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffu:
-        if (TCG_TARGET_HAS_ext16u_i32) {
-            tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1);
-            return;
-        }
-        break;
-    }
-    t0 = tcg_const_i32(arg2);
-    tcg_gen_and_i32(ret, arg1, t0);
-    tcg_temp_free_i32(t0);
+    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == -1) {
-        tcg_gen_movi_i32(ret, -1);
-    } else if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_or_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
+    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
-    if (TCGV_EQUAL_I32(arg1, arg2)) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i32) {
-        /* Don't recurse with tcg_gen_not_i32.  */
-        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_xor_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
+    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
@@ -555,449 +409,102 @@
     tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shr_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sar_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1,
-                                      TCGv_i32 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op4ii_i32(INDEX_op_brcond_i32, arg1, arg2, cond, label_index);
-    }
-}
-
-static inline void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1,
-                                       int32_t arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_brcond_i32(cond, arg1, t0, label_index);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
-                                       TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        tcg_gen_op4i_i32(INDEX_op_setcond_i32, ret, arg1, arg2, cond);
-    }
-}
-
-static inline void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
-                                        TCGv_i32 arg1, int32_t arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(ret, 0);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_setcond_i32(cond, ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
 static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
     tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    TCGv_i32 t0 = tcg_const_i32(arg2);
-    tcg_gen_mul_i32(ret, arg1, t0);
-    tcg_temp_free_i32(t0);
-}
-
-static inline void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_div_i32) {
-        tcg_gen_op3_i32(INDEX_op_div_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_sari_i32(t0, arg1, 31);
-        tcg_gen_op5_i32(INDEX_op_div2_i32, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
+    if (TCG_TARGET_HAS_neg_i32) {
+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
     } else {
-        gen_helper_div_i32(ret, arg1, arg2);
+        tcg_gen_subfi_i32(ret, 0, arg);
     }
 }
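+/* For example, on a backend without a native neg op, tcg_gen_neg_i32(r, a)
+ * expands to tcg_gen_subfi_i32(r, 0, a), i.e. r = 0 - a. */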
 
-static inline void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
-    if (TCG_TARGET_HAS_rem_i32) {
-        tcg_gen_op3_i32(INDEX_op_rem_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_div_i32, t0, arg1, arg2);
-        tcg_gen_mul_i32(t0, t0, arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_sari_i32(t0, arg1, 31);
-        tcg_gen_op5_i32(INDEX_op_div2_i32, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
+    if (TCG_TARGET_HAS_not_i32) {
+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
     } else {
-        gen_helper_rem_i32(ret, arg1, arg2);
+        tcg_gen_xori_i32(ret, arg, -1);
     }
 }
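+/* Likewise, not is synthesized as xor with -1 (all bits set) when the
+ * backend lacks a native not op. */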
 
-static inline void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_div_i32) {
-        tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_movi_i32(t0, 0);
-        tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
-    } else {
-        gen_helper_divu_i32(ret, arg1, arg2);
-    }
-}
+/* 64 bit ops */
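+/* As above, these are defined out of line in tcg/tcg-op.c; on 32-bit
+ * hosts they operate on the TCGV_LOW()/TCGV_HIGH() register halves, as
+ * the inline add/sub expansions further below illustrate. */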
 
-static inline void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rem_i32) {
-        tcg_gen_op3_i32(INDEX_op_remu_i32, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_divu_i32, t0, arg1, arg2);
-        tcg_gen_mul_i32(t0, t0, arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else if (TCG_TARGET_HAS_div2_i32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_movi_i32(t0, 0);
-        tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i32(t0);
-    } else {
-        gen_helper_remu_i32(ret, arg1, arg2);
-    }
-}
+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, int label);
+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, int label);
+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
+                         TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
+                          TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
 
-#if TCG_TARGET_REG_BITS == 32
+#if TCG_TARGET_REG_BITS == 64
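+/* On 64-bit hosts these remain trivial single-op expansions and so can
+ * stay inline; 32-bit hosts need the split-register forms instead. */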
+static inline void tcg_gen_discard_i64(TCGv_i64 arg)
+{
+    tcg_gen_op1_i64(INDEX_op_discard, arg);
+}
 
 static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
     if (!TCGV_EQUAL_I64(ret, arg)) {
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    }
-}
-
-static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
-{
-    tcg_gen_movi_i32(TCGV_LOW(ret), arg);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), arg >> 32);
-}
-
-static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ld8u_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), 31);
-}
-
-static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld16u_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld16s_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    /* since arg2 and ret have different types, they cannot be the
-       same temporary */
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset);
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset + 4);
-#else
-    tcg_gen_ld_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_ld_i32(TCGV_HIGH(ret), arg2, offset + 4);
-#endif
-}
-
-static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                   tcg_target_long offset)
-{
-    tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
-}
-
-static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset);
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset + 4);
-#else
-    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
-    tcg_gen_st_i32(TCGV_HIGH(arg1), arg2, offset + 4);
-#endif
-}
-
-static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op6_i32(INDEX_op_add2_i32, TCGV_LOW(ret), TCGV_HIGH(ret),
-                    TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                    TCGV_HIGH(arg2));
-    /* Allow the optimizer room to replace add2 with two moves.  */
-    tcg_gen_op0(INDEX_op_nop);
-}
-
-static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op6_i32(INDEX_op_sub2_i32, TCGV_LOW(ret), TCGV_HIGH(ret),
-                    TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                    TCGV_HIGH(arg2));
-    /* Allow the optimizer room to replace sub2 with two moves.  */
-    tcg_gen_op0(INDEX_op_nop);
-}
-
-static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_and_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_and_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_ori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_ori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_xor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_xor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-}
-
-static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_xori_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
-    tcg_gen_xori_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
-}
-
-/* XXX: use generic code when basic block handling is OK or CPU
-   specific code (x86) */
-static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_shl_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0);
-}
-
-static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_shr_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0);
-}
-
-static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_sar_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1);
-}
-
-static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1,
-                                      TCGv_i64 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op6ii_i32(INDEX_op_brcond2_i32,
-                          TCGV_LOW(arg1), TCGV_HIGH(arg1), TCGV_LOW(arg2),
-                          TCGV_HIGH(arg2), cond, label_index);
-    }
-}
-
-static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i32(TCGV_LOW(ret), 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i32(TCGV_LOW(ret), 0);
-    } else {
-        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
-                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
-                         TCGV_LOW(arg2), TCGV_HIGH(arg2), cond);
-    }
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    TCGv_i64 t0;
-    TCGv_i32 t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i32();
-
-    if (TCG_TARGET_HAS_mulu2_i32) {
-        tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        tcg_debug_assert(TCG_TARGET_HAS_muluh_i32);
-        tcg_gen_op3_i32(INDEX_op_mul_i32, TCGV_LOW(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-        tcg_gen_op3_i32(INDEX_op_muluh_i32, TCGV_HIGH(t0),
-                        TCGV_LOW(arg1), TCGV_LOW(arg2));
-    }
-
-    tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
-    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
-    tcg_gen_mul_i32(t1, TCGV_HIGH(arg1), TCGV_LOW(arg2));
-    tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
-
-    tcg_gen_mov_i64(ret, t0);
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i32(t1);
-}
-
-static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_div_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_rem_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_divu_i64(ret, arg1, arg2);
-}
-
-static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    gen_helper_remu_i64(ret, arg1, arg2);
-}
-
-#else
-
-static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (!TCGV_EQUAL_I64(ret, arg))
         tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
+    }
 }
 
 static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
@@ -1041,7 +548,8 @@
     tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
 }
 
-static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
 }
@@ -1064,7 +572,8 @@
     tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
 }
 
-static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset)
+static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
 {
     tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
 }
@@ -1081,94 +590,17 @@
 
 static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
-{
-    TCGv_i64 t0;
-    /* Some cases can be optimized here.  */
-    switch (arg2) {
-    case 0:
-        tcg_gen_movi_i64(ret, 0);
-        return;
-    case 0xffffffffffffffffull:
-        tcg_gen_mov_i64(ret, arg1);
-        return;
-    case 0xffull:
-        /* Don't recurse with tcg_gen_ext8u_i32.  */
-        if (TCG_TARGET_HAS_ext8u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffu:
-        if (TCG_TARGET_HAS_ext16u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1);
-            return;
-        }
-        break;
-    case 0xffffffffull:
-        if (TCG_TARGET_HAS_ext32u_i64) {
-            tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1);
-            return;
-        }
-        break;
-    }
-    t0 = tcg_const_i64(arg2);
-    tcg_gen_and_i64(ret, arg1, t0);
-    tcg_temp_free_i64(t0);
+    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == -1) {
-        tcg_gen_movi_i64(ret, -1);
-    } else if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_or_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
+    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCGV_EQUAL_I64(arg1, arg2)) {
-        tcg_gen_movi_i64(ret, 0);
-    } else {
-        tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
-    }
-}
-
-static inline void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* Some cases can be optimized here.  */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else if (arg2 == -1 && TCG_TARGET_HAS_not_i64) {
-        /* Don't recurse with tcg_gen_not_i64.  */
-        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_xor_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
+    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
 }
 
 static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
@@ -1176,993 +608,92 @@
     tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
 static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shr_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
 static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
 }
 
-static inline void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sar_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1,
-                                      TCGv_i64 arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        tcg_gen_op4ii_i64(INDEX_op_brcond_i64, arg1, arg2, cond, label_index);
-    }
-}
-
-static inline void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_movi_i64(ret, 1);
-    } else if (cond == TCG_COND_NEVER) {
-        tcg_gen_movi_i64(ret, 0);
-    } else {
-        tcg_gen_op4i_i64(INDEX_op_setcond_i64, ret, arg1, arg2, cond);
-    }
-}
-
 static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
     tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
 }
-
-static inline void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+#else /* TCG_TARGET_REG_BITS == 32 */
+static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_div_i64) {
-        tcg_gen_op3_i64(INDEX_op_div_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_sari_i64(t0, arg1, 63);
-        tcg_gen_op5_i64(INDEX_op_div2_i64, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_div_i64(ret, arg1, arg2);
-    }
+    tcg_gen_st8_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_rem_i64) {
-        tcg_gen_op3_i64(INDEX_op_rem_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_div_i64, t0, arg1, arg2);
-        tcg_gen_mul_i64(t0, t0, arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_sari_i64(t0, arg1, 63);
-        tcg_gen_op5_i64(INDEX_op_div2_i64, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_rem_i64(ret, arg1, arg2);
-    }
+    tcg_gen_st16_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
 {
-    if (TCG_TARGET_HAS_div_i64) {
-        tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_movi_i64(t0, 0);
-        tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_divu_i64(ret, arg1, arg2);
-    }
+    tcg_gen_st_i32(TCGV_LOW(arg1), arg2, offset);
 }
 
-static inline void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
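+/* With only 32-bit host registers, a 64-bit add or sub works on the low
+ * and high halves as a pair, carrying between them via add2/sub2. */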
+static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    if (TCG_TARGET_HAS_rem_i64) {
-        tcg_gen_op3_i64(INDEX_op_remu_i64, ret, arg1, arg2);
-    } else if (TCG_TARGET_HAS_div_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_divu_i64, t0, arg1, arg2);
-        tcg_gen_mul_i64(t0, t0, arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else if (TCG_TARGET_HAS_div2_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_movi_i64(t0, 0);
-        tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2);
-        tcg_temp_free_i64(t0);
-    } else {
-        gen_helper_remu_i64(ret, arg1, arg2);
-    }
-}
-#endif /* TCG_TARGET_REG_BITS == 32 */
-
-static inline void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_add_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
+    tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
+                     TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
 }
 
-static inline void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
+static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
-    TCGv_i64 t0 = tcg_const_i64(arg1);
-    tcg_gen_sub_i64(ret, t0, arg2);
-    tcg_temp_free_i64(t0);
+    tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret), TCGV_LOW(arg1),
+                     TCGV_HIGH(arg1), TCGV_LOW(arg2), TCGV_HIGH(arg2));
 }
 
-static inline void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-static inline void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1,
-                                       int64_t arg2, int label_index)
-{
-    if (cond == TCG_COND_ALWAYS) {
-        tcg_gen_br(label_index);
-    } else if (cond != TCG_COND_NEVER) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_brcond_i64(cond, arg1, t0, label_index);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
-                                        TCGv_i64 arg1, int64_t arg2)
-{
-    TCGv_i64 t0 = tcg_const_i64(arg2);
-    tcg_gen_setcond_i64(cond, ret, arg1, t0);
-    tcg_temp_free_i64(t0);
-}
-
-static inline void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    TCGv_i64 t0 = tcg_const_i64(arg2);
-    tcg_gen_mul_i64(ret, arg1, t0);
-    tcg_temp_free_i64(t0);
-}
-
-
-/***************************************/
-/* optional operations */
-
-static inline void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext8s_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext8s_i32, ret, arg);
-    } else {
-        tcg_gen_shli_i32(ret, arg, 24);
-        tcg_gen_sari_i32(ret, ret, 24);
-    }
-}
-
-static inline void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext16s_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext16s_i32, ret, arg);
-    } else {
-        tcg_gen_shli_i32(ret, arg, 16);
-        tcg_gen_sari_i32(ret, ret, 16);
-    }
-}
-
-static inline void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext8u_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg);
-    } else {
-        tcg_gen_andi_i32(ret, arg, 0xffu);
-    }
-}
-
-static inline void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_ext16u_i32) {
-        tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg);
-    } else {
-        tcg_gen_andi_i32(ret, arg, 0xffffu);
-    }
-}
-
-/* Note: we assume the two high bytes are set to zero */
-static inline void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_bswap16_i32) {
-        tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-    
-        tcg_gen_ext8u_i32(t0, arg);
-        tcg_gen_shli_i32(t0, t0, 8);
-        tcg_gen_shri_i32(ret, arg, 8);
-        tcg_gen_or_i32(ret, ret, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_bswap32_i32) {
-        tcg_gen_op2_i32(INDEX_op_bswap32_i32, ret, arg);
-    } else {
-        TCGv_i32 t0, t1;
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-    
-        tcg_gen_shli_i32(t0, arg, 24);
-    
-        tcg_gen_andi_i32(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i32(t1, t1, 8);
-        tcg_gen_or_i32(t0, t0, t1);
-    
-        tcg_gen_shri_i32(t1, arg, 8);
-        tcg_gen_andi_i32(t1, t1, 0x0000ff00);
-        tcg_gen_or_i32(t0, t0, t1);
-    
-        tcg_gen_shri_i32(t1, arg, 24);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-#if TCG_TARGET_REG_BITS == 32
-static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext8s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext16s_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext8u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_ext16u_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg,
-                                             unsigned int count)
-{
-    tcg_debug_assert(count < 64);
-    if (count >= 32) {
-        tcg_gen_shri_i32(ret, TCGV_HIGH(arg), count - 32);
-    } else if (count == 0) {
-        tcg_gen_mov_i32(ret, TCGV_LOW(arg));
-    } else {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_shri_i64(t, arg, count);
-        tcg_gen_mov_i32(ret, TCGV_LOW(t));
-        tcg_temp_free_i64(t);
-    }
-}
-
-static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), arg);
-    tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-}
-
-static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_mov_i32(TCGV_LOW(ret), arg);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
-}
-
-/* Note: we assume the six high bytes are set to zero */
-static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    tcg_gen_bswap16_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-}
-
-/* Note: we assume the four high bytes are set to zero */
-static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-    tcg_gen_bswap32_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-}
-
-static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    TCGv_i32 t0, t1;
-    t0 = tcg_temp_new_i32();
-    t1 = tcg_temp_new_i32();
-
-    tcg_gen_bswap32_i32(t0, TCGV_LOW(arg));
-    tcg_gen_bswap32_i32(t1, TCGV_HIGH(arg));
-    tcg_gen_mov_i32(TCGV_LOW(ret), t1);
-    tcg_gen_mov_i32(TCGV_HIGH(ret), t0);
-    tcg_temp_free_i32(t0);
-    tcg_temp_free_i32(t1);
-}
-#else
-
-static inline void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext8s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext8s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 56);
-        tcg_gen_sari_i64(ret, ret, 56);
-    }
-}
-
-static inline void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext16s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext16s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 48);
-        tcg_gen_sari_i64(ret, ret, 48);
-    }
-}
-
-static inline void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext32s_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext32s_i64, ret, arg);
-    } else {
-        tcg_gen_shli_i64(ret, arg, 32);
-        tcg_gen_sari_i64(ret, ret, 32);
-    }
-}
-
-static inline void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext8u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffu);
-    }
-}
-
-static inline void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext16u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffffu);
-    }
-}
-
-static inline void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_ext32u_i64) {
-        tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg);
-    } else {
-        tcg_gen_andi_i64(ret, arg, 0xffffffffu);
-    }
-}
-
-static inline void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg,
-                                             unsigned int count)
-{
-    tcg_debug_assert(count < 64);
-    if (TCG_TARGET_HAS_trunc_shr_i32) {
-        tcg_gen_op3i_i32(INDEX_op_trunc_shr_i32, ret,
-                         MAKE_TCGV_I32(GET_TCGV_I64(arg)), count);
-    } else if (count == 0) {
-        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(arg)));
-    } else {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_shri_i64(t, arg, count);
-        tcg_gen_mov_i32(ret, MAKE_TCGV_I32(GET_TCGV_I64(t)));
-        tcg_temp_free_i64(t);
-    }
-}
-
-/* Note: we assume the target supports move between 32 and 64 bit
-   registers */
-static inline void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_ext32u_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
-}
-
-/* Note: we assume the target supports move between 32 and 64 bit
-   registers */
-static inline void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg)
-{
-    tcg_gen_ext32s_i64(ret, MAKE_TCGV_I64(GET_TCGV_I32(arg)));
-}
-
-/* Note: we assume the six high bytes are set to zero */
-static inline void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap16_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap16_i64, ret, arg);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-
-        tcg_gen_ext8u_i64(t0, arg);
-        tcg_gen_shli_i64(t0, t0, 8);
-        tcg_gen_shri_i64(ret, arg, 8);
-        tcg_gen_or_i64(ret, ret, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-/* Note: we assume the four high bytes are set to zero */
-static inline void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap32_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap32_i64, ret, arg);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-
-        tcg_gen_shli_i64(t0, arg, 24);
-        tcg_gen_ext32u_i64(t0, t0);
-
-        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i64(t1, t1, 8);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 8);
-        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 24);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_bswap64_i64) {
-        tcg_gen_op2_i64(INDEX_op_bswap64_i64, ret, arg);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-    
-        tcg_gen_shli_i64(t0, arg, 56);
-    
-        tcg_gen_andi_i64(t1, arg, 0x0000ff00);
-        tcg_gen_shli_i64(t1, t1, 40);
-        tcg_gen_or_i64(t0, t0, t1);
-    
-        tcg_gen_andi_i64(t1, arg, 0x00ff0000);
-        tcg_gen_shli_i64(t1, t1, 24);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_andi_i64(t1, arg, 0xff000000);
-        tcg_gen_shli_i64(t1, t1, 8);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 8);
-        tcg_gen_andi_i64(t1, t1, 0xff000000);
-        tcg_gen_or_i64(t0, t0, t1);
-    
-        tcg_gen_shri_i64(t1, arg, 24);
-        tcg_gen_andi_i64(t1, t1, 0x00ff0000);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 40);
-        tcg_gen_andi_i64(t1, t1, 0x0000ff00);
-        tcg_gen_or_i64(t0, t0, t1);
-
-        tcg_gen_shri_i64(t1, arg, 56);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-#endif
-
-static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_neg_i32) {
-        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
-    } else {
-        TCGv_i32 t0 = tcg_const_i32(0);
-        tcg_gen_sub_i32(ret, t0, arg);
-        tcg_temp_free_i32(t0);
-    }
-}
+void tcg_gen_discard_i64(TCGv_i64 arg);
+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+#endif /* TCG_TARGET_REG_BITS */
 
 static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
     if (TCG_TARGET_HAS_neg_i64) {
         tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(0);
-        tcg_gen_sub_i64(ret, t0, arg);
-        tcg_temp_free_i64(t0);
+        tcg_gen_subfi_i64(ret, 0, arg);
     }
 }
 
-static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
+/* Size changing operations.  */
+
+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
+void tcg_gen_trunc_shr_i64_i32(TCGv_i32 ret, TCGv_i64 arg, unsigned int c);
+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
+
+static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
 {
-    if (TCG_TARGET_HAS_not_i32) {
-        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
-    } else {
-        tcg_gen_xori_i32(ret, arg, -1);
-    }
-}
-
-static inline void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_not_i64) {
-        tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg);
-    } else {
-        tcg_gen_xori_i64(ret, arg, -1);
-    }
-#else
-    tcg_gen_not_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_not_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
-#endif
-}
-
-static inline void tcg_gen_discard_i32(TCGv_i32 arg)
-{
-    tcg_gen_op1_i32(INDEX_op_discard, arg);
-}
-
-static inline void tcg_gen_discard_i64(TCGv_i64 arg)
-{
-#if TCG_TARGET_REG_BITS == 32
-    tcg_gen_discard_i32(TCGV_LOW(arg));
-    tcg_gen_discard_i32(TCGV_HIGH(arg));
-#else
-    tcg_gen_op1_i64(INDEX_op_discard, arg);
-#endif
-}
-
-static inline void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_andc_i32) {
-        tcg_gen_op3_i32(INDEX_op_andc_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_not_i32(t0, arg2);
-        tcg_gen_and_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_andc_i64) {
-        tcg_gen_op3_i64(INDEX_op_andc_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_not_i64(t0, arg2);
-        tcg_gen_and_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-#else
-    tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_eqv_i32) {
-        tcg_gen_op3_i32(INDEX_op_eqv_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_xor_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_eqv_i64) {
-        tcg_gen_op3_i64(INDEX_op_eqv_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_xor_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_eqv_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_eqv_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_nand_i32) {
-        tcg_gen_op3_i32(INDEX_op_nand_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_and_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_nand_i64) {
-        tcg_gen_op3_i64(INDEX_op_nand_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_and_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_nand_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_nand_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_nor_i32) {
-        tcg_gen_op3_i32(INDEX_op_nor_i32, ret, arg1, arg2);
-    } else {
-        tcg_gen_or_i32(ret, arg1, arg2);
-        tcg_gen_not_i32(ret, ret);
-    }
-}
-
-static inline void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_nor_i64) {
-        tcg_gen_op3_i64(INDEX_op_nor_i64, ret, arg1, arg2);
-    } else {
-        tcg_gen_or_i64(ret, arg1, arg2);
-        tcg_gen_not_i64(ret, ret);
-    }
-#else
-    tcg_gen_nor_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_nor_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_orc_i32) {
-        tcg_gen_op3_i32(INDEX_op_orc_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_not_i32(t0, arg2);
-        tcg_gen_or_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    }
-}
-
-static inline void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-#if TCG_TARGET_REG_BITS == 64
-    if (TCG_TARGET_HAS_orc_i64) {
-        tcg_gen_op3_i64(INDEX_op_orc_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_not_i64(t0, arg2);
-        tcg_gen_or_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    }
-#else
-    tcg_gen_orc_i32(TCGV_LOW(ret), TCGV_LOW(arg1), TCGV_LOW(arg2));
-    tcg_gen_orc_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), TCGV_HIGH(arg2));
-#endif
-}
-
-static inline void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i32) {
-        tcg_gen_op3_i32(INDEX_op_rotl_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shl_i32(t0, arg1, arg2);
-        tcg_gen_subfi_i32(t1, 32, arg2);
-        tcg_gen_shr_i32(t1, arg1, t1);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i64) {
-        tcg_gen_op3_i64(INDEX_op_rotl_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shl_i64(t0, arg1, arg2);
-        tcg_gen_subfi_i64(t1, 64, arg2);
-        tcg_gen_shr_i64(t1, arg1, t1);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else if (TCG_TARGET_HAS_rot_i32) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_rotl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
-    } else {
-        TCGv_i32 t0, t1;
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shli_i32(t0, arg1, arg2);
-        tcg_gen_shri_i32(t1, arg1, 32 - arg2);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else if (TCG_TARGET_HAS_rot_i64) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_rotl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shli_i64(t0, arg1, arg2);
-        tcg_gen_shri_i64(t1, arg1, 64 - arg2);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i32) {
-        tcg_gen_op3_i32(INDEX_op_rotr_i32, ret, arg1, arg2);
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_shr_i32(t0, arg1, arg2);
-        tcg_gen_subfi_i32(t1, 32, arg2);
-        tcg_gen_shl_i32(t1, arg1, t1);
-        tcg_gen_or_i32(ret, t0, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_rot_i64) {
-        tcg_gen_op3_i64(INDEX_op_rotr_i64, ret, arg1, arg2);
-    } else {
-        TCGv_i64 t0, t1;
-        t0 = tcg_temp_new_i64();
-        t1 = tcg_temp_new_i64();
-        tcg_gen_shr_i64(t0, arg1, arg2);
-        tcg_gen_subfi_i64(t1, 64, arg2);
-        tcg_gen_shl_i64(t1, arg1, t1);
-        tcg_gen_or_i64(ret, t0, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i32(ret, arg1);
-    } else {
-        tcg_gen_rotli_i32(ret, arg1, 32 - arg2);
-    }
-}
-
-static inline void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
-{
-    /* some cases can be optimized here */
-    if (arg2 == 0) {
-        tcg_gen_mov_i64(ret, arg1);
-    } else {
-        tcg_gen_rotli_i64(ret, arg1, 64 - arg2);
-    }
-}
-
-static inline void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1,
-                                       TCGv_i32 arg2, unsigned int ofs,
-                                       unsigned int len)
-{
-    uint32_t mask;
-    TCGv_i32 t1;
-
-    tcg_debug_assert(ofs < 32);
-    tcg_debug_assert(len <= 32);
-    tcg_debug_assert(ofs + len <= 32);
-
-    if (ofs == 0 && len == 32) {
-        tcg_gen_mov_i32(ret, arg2);
-        return;
-    }
-    if (TCG_TARGET_HAS_deposit_i32 && TCG_TARGET_deposit_i32_valid(ofs, len)) {
-        tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-    mask = (1u << len) - 1;
-    t1 = tcg_temp_new_i32();
-
-    if (ofs + len < 32) {
-        tcg_gen_andi_i32(t1, arg2, mask);
-        tcg_gen_shli_i32(t1, t1, ofs);
-    } else {
-        tcg_gen_shli_i32(t1, arg2, ofs);
-    }
-    tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i32(ret, ret, t1);
-
-    tcg_temp_free_i32(t1);
-}
-
-static inline void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                                       TCGv_i64 arg2, unsigned int ofs,
-                                       unsigned int len)
-{
-    uint64_t mask;
-    TCGv_i64 t1;
-
-    tcg_debug_assert(ofs < 64);
-    tcg_debug_assert(len <= 64);
-    tcg_debug_assert(ofs + len <= 64);
-
-    if (ofs == 0 && len == 64) {
-        tcg_gen_mov_i64(ret, arg2);
-        return;
-    }
-    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(ofs, len)) {
-        tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-#if TCG_TARGET_REG_BITS == 32
-    if (ofs >= 32) {
-        tcg_gen_deposit_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1),
-                            TCGV_LOW(arg2), ofs - 32, len);
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
-        return;
-    }
-    if (ofs + len <= 32) {
-        tcg_gen_deposit_i32(TCGV_LOW(ret), TCGV_LOW(arg1),
-                            TCGV_LOW(arg2), ofs, len);
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
-        return;
-    }
-#endif
-
-    mask = (1ull << len) - 1;
-    t1 = tcg_temp_new_i64();
-
-    if (ofs + len < 64) {
-        tcg_gen_andi_i64(t1, arg2, mask);
-        tcg_gen_shli_i64(t1, t1, ofs);
-    } else {
-        tcg_gen_shli_i64(t1, arg2, ofs);
-    }
-    tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i64(ret, ret, t1);
-
-    tcg_temp_free_i64(t1);
-}
-
-static inline void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low,
-                                          TCGv_i32 high)
-{
-#if TCG_TARGET_REG_BITS == 32
-    tcg_gen_mov_i32(TCGV_LOW(dest), low);
-    tcg_gen_mov_i32(TCGV_HIGH(dest), high);
-#else
-    TCGv_i64 tmp = tcg_temp_new_i64();
-    /* These extensions are only needed for type correctness.
-       We may be able to do better given target specific information.  */
-    tcg_gen_extu_i32_i64(tmp, high);
-    tcg_gen_extu_i32_i64(dest, low);
-    /* If deposit is available, use it.  Otherwise use the extra
-       knowledge that we have of the zero-extensions above.  */
-    if (TCG_TARGET_HAS_deposit_i64 && TCG_TARGET_deposit_i64_valid(32, 32)) {
-        tcg_gen_deposit_i64(dest, dest, tmp, 32, 32);
-    } else {
-        tcg_gen_shli_i64(tmp, tmp, 32);
-        tcg_gen_or_i64(dest, dest, tmp);
-    }
-    tcg_temp_free_i64(tmp);
-#endif
-}
-
-static inline void tcg_gen_concat32_i64(TCGv_i64 dest, TCGv_i64 low,
-                                        TCGv_i64 high)
-{
-    tcg_gen_deposit_i64(dest, low, high, 32, 32);
+    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
 }
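
The renamed arguments aside, the helper keeps its deposit-based definition:
bits [31:0] of the result come from lo, bits [63:32] from hi. A host-side
model of the semantics (illustrative only, not TCG code):

    static uint64_t concat32(uint64_t lo, uint64_t hi)
    {
        return (hi << 32) | (uint32_t)lo;
    }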
 
 static inline void tcg_gen_trunc_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
@@ -2170,299 +701,31 @@
     tcg_gen_trunc_shr_i64_i32(ret, arg, 0);
 }
 
-static inline void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg)
-{
-    tcg_gen_trunc_shr_i64_i32(lo, arg, 0);
-    tcg_gen_trunc_shr_i64_i32(hi, arg, 32);
-}
+/* QEMU specific operations.  */
 
-static inline void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg)
-{
-    tcg_gen_ext32u_i64(lo, arg);
-    tcg_gen_shri_i64(hi, arg, 32);
-}
-
-static inline void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret,
-                                       TCGv_i32 c1, TCGv_i32 c2,
-                                       TCGv_i32 v1, TCGv_i32 v2)
-{
-    if (TCG_TARGET_HAS_movcond_i32) {
-        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
-    } else {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        tcg_gen_setcond_i32(cond, t0, c1, c2);
-        tcg_gen_neg_i32(t0, t0);
-        tcg_gen_and_i32(t1, v1, t0);
-        tcg_gen_andc_i32(ret, v2, t0);
-        tcg_gen_or_i32(ret, ret, t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-
-static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
-                                       TCGv_i64 c1, TCGv_i64 c2,
-                                       TCGv_i64 v1, TCGv_i64 v2)
-{
-#if TCG_TARGET_REG_BITS == 32
-    TCGv_i32 t0 = tcg_temp_new_i32();
-    TCGv_i32 t1 = tcg_temp_new_i32();
-    tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
-                     TCGV_LOW(c1), TCGV_HIGH(c1),
-                     TCGV_LOW(c2), TCGV_HIGH(c2), cond);
-
-    if (TCG_TARGET_HAS_movcond_i32) {
-        tcg_gen_movi_i32(t1, 0);
-        tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
-                            TCGV_LOW(v1), TCGV_LOW(v2));
-        tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
-                            TCGV_HIGH(v1), TCGV_HIGH(v2));
-    } else {
-        tcg_gen_neg_i32(t0, t0);
-
-        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
-        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
-        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
-
-        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
-        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
-        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
-    }
-    tcg_temp_free_i32(t0);
-    tcg_temp_free_i32(t1);
-#else
-    if (TCG_TARGET_HAS_movcond_i64) {
-        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_setcond_i64(cond, t0, c1, c2);
-        tcg_gen_neg_i64(t0, t0);
-        tcg_gen_and_i64(t1, v1, t0);
-        tcg_gen_andc_i64(ret, v2, t0);
-        tcg_gen_or_i64(ret, ret, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-#endif
-}
-
-static inline void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
-{
-    if (TCG_TARGET_HAS_add2_i32) {
-        tcg_gen_op6_i32(INDEX_op_add2_i32, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace add2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_concat_i32_i64(t0, al, ah);
-        tcg_gen_concat_i32_i64(t1, bl, bh);
-        tcg_gen_add_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                                    TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
-{
-    if (TCG_TARGET_HAS_sub2_i32) {
-        tcg_gen_op6_i32(INDEX_op_sub2_i32, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace sub2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_concat_i32_i64(t0, al, ah);
-        tcg_gen_concat_i32_i64(t1, bl, bh);
-        tcg_gen_sub_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh,
-                                     TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_mulu2_i32) {
-        tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_muluh_i32) {
-        TCGv_i32 t = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
-        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
-        tcg_gen_mov_i32(rl, t);
-        tcg_temp_free_i32(t);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_extu_i32_i64(t0, arg1);
-        tcg_gen_extu_i32_i64(t1, arg2);
-        tcg_gen_mul_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh,
-                                     TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    if (TCG_TARGET_HAS_muls2_i32) {
-        tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace muls2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_mulsh_i32) {
-        TCGv_i32 t = tcg_temp_new_i32();
-        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
-        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
-        tcg_gen_mov_i32(rl, t);
-        tcg_temp_free_i32(t);
-    } else if (TCG_TARGET_REG_BITS == 32) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        TCGv_i32 t1 = tcg_temp_new_i32();
-        TCGv_i32 t2 = tcg_temp_new_i32();
-        TCGv_i32 t3 = tcg_temp_new_i32();
-        tcg_gen_mulu2_i32(t0, t1, arg1, arg2);
-        /* Adjust for negative inputs.  */
-        tcg_gen_sari_i32(t2, arg1, 31);
-        tcg_gen_sari_i32(t3, arg2, 31);
-        tcg_gen_and_i32(t2, t2, arg2);
-        tcg_gen_and_i32(t3, t3, arg1);
-        tcg_gen_sub_i32(rh, t1, t2);
-        tcg_gen_sub_i32(rh, rh, t3);
-        tcg_gen_mov_i32(rl, t0);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-        tcg_temp_free_i32(t2);
-        tcg_temp_free_i32(t3);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_ext_i32_i64(t0, arg1);
-        tcg_gen_ext_i32_i64(t1, arg2);
-        tcg_gen_mul_i64(t0, t0, t1);
-        tcg_gen_extr_i64_i32(rl, rh, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
-{
-    if (TCG_TARGET_HAS_add2_i64) {
-        tcg_gen_op6_i64(INDEX_op_add2_i64, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace add2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_add_i64(t0, al, bl);
-        tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, al);
-        tcg_gen_add_i64(rh, ah, bh);
-        tcg_gen_add_i64(rh, rh, t1);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                                    TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
-{
-    if (TCG_TARGET_HAS_sub2_i64) {
-        tcg_gen_op6_i64(INDEX_op_sub2_i64, rl, rh, al, ah, bl, bh);
-        /* Allow the optimizer room to replace sub2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_sub_i64(t0, al, bl);
-        tcg_gen_setcond_i64(TCG_COND_LTU, t1, al, bl);
-        tcg_gen_sub_i64(rh, ah, bh);
-        tcg_gen_sub_i64(rh, rh, t1);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-    }
-}
-
-static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
-                                     TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_mulu2_i64) {
-        tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace mulu2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_muluh_i64) {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
-        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t);
-        tcg_temp_free_i64(t);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_mul_i64(t0, arg1, arg2);
-        gen_helper_muluh_i64(rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
-                                     TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    if (TCG_TARGET_HAS_muls2_i64) {
-        tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
-        /* Allow the optimizer room to replace muls2 with two moves.  */
-        tcg_gen_op0(INDEX_op_nop);
-    } else if (TCG_TARGET_HAS_mulsh_i64) {
-        TCGv_i64 t = tcg_temp_new_i64();
-        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
-        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t);
-        tcg_temp_free_i64(t);
-    } else if (TCG_TARGET_HAS_mulu2_i64 || TCG_TARGET_HAS_muluh_i64) {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        TCGv_i64 t2 = tcg_temp_new_i64();
-        TCGv_i64 t3 = tcg_temp_new_i64();
-        tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
-        /* Adjust for negative inputs.  */
-        tcg_gen_sari_i64(t2, arg1, 63);
-        tcg_gen_sari_i64(t3, arg2, 63);
-        tcg_gen_and_i64(t2, t2, arg2);
-        tcg_gen_and_i64(t3, t3, arg1);
-        tcg_gen_sub_i64(rh, t1, t2);
-        tcg_gen_sub_i64(rh, rh, t3);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
-        tcg_temp_free_i64(t2);
-        tcg_temp_free_i64(t3);
-    } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        tcg_gen_mul_i64(t0, arg1, arg2);
-        gen_helper_mulsh_i64(rh, arg1, arg2);
-        tcg_gen_mov_i64(rl, t0);
-        tcg_temp_free_i64(t0);
-    }
-}
-
-/***************************************/
-/* QEMU specific operations. Their type depend on the QEMU CPU
-   type. */
 #ifndef TARGET_LONG_BITS
 #error must include QEMU headers
 #endif
 
+/* debug info: write the PC of the corresponding QEMU CPU instruction */
+static inline void tcg_gen_debug_insn_start(uint64_t pc)
+{
+    /* XXX: must really use a 32 bit size for TCGArg in all cases */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+    tcg_gen_op2ii(INDEX_op_debug_insn_start,
+                  (uint32_t)(pc), (uint32_t)(pc >> 32));
+#else
+    tcg_gen_op1i(INDEX_op_debug_insn_start, pc);
+#endif
+}
+
+static inline void tcg_gen_exit_tb(uintptr_t val)
+{
+    tcg_gen_op1i(INDEX_op_exit_tb, val);
+}
+
+void tcg_gen_goto_tb(unsigned idx);
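
A typical translator use of the pair, sketched from common QEMU practice
(cpu_pc, dest and tb are illustrative names here, not part of this patch):

    tcg_gen_goto_tb(0);                 /* emit patchable direct jump 0 */
    tcg_gen_movi_tl(cpu_pc, dest);      /* update the guest PC global */
    tcg_gen_exit_tb((uintptr_t)tb + 0); /* low bits name the taken exit */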
+
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
 #define tcg_temp_new() tcg_temp_new_i32()
@@ -2473,7 +736,6 @@
 #define TCGV_UNUSED(x) TCGV_UNUSED_I32(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I32(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I32(a, b)
-#define tcg_add_param_tl tcg_add_param_i32
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
 #else
@@ -2486,41 +748,10 @@
 #define TCGV_UNUSED(x) TCGV_UNUSED_I64(x)
 #define TCGV_IS_UNUSED(x) TCGV_IS_UNUSED_I64(x)
 #define TCGV_EQUAL(a, b) TCGV_EQUAL_I64(a, b)
-#define tcg_add_param_tl tcg_add_param_i64
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
 #endif
 
-/* debug info: write the PC of the corresponding QEMU CPU instruction */
-static inline void tcg_gen_debug_insn_start(uint64_t pc)
-{
-    /* XXX: must really use a 32 bit size for TCGArg in all cases */
-#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
-    tcg_gen_op2ii(INDEX_op_debug_insn_start, 
-                  (uint32_t)(pc), (uint32_t)(pc >> 32));
-#else
-    tcg_gen_op1i(INDEX_op_debug_insn_start, pc);
-#endif
-}
-
-static inline void tcg_gen_exit_tb(uintptr_t val)
-{
-    tcg_gen_op1i(INDEX_op_exit_tb, val);
-}
-
-static inline void tcg_gen_goto_tb(unsigned idx)
-{
-    /* We only support two chained exits.  */
-    tcg_debug_assert(idx <= 1);
-#ifdef CONFIG_DEBUG_TCG
-    /* Verify that we havn't seen this numbered exit before.  */
-    tcg_debug_assert((tcg_ctx.goto_tb_issue_mask & (1 << idx)) == 0);
-    tcg_ctx.goto_tb_issue_mask |= 1 << idx;
-#endif
-    tcg_gen_op1i(INDEX_op_goto_tb, idx);
-}
-
-
 void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 042d442..13ccb60 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -27,15 +27,6 @@
  */
 
 /* predefined ops */
-DEF(end, 0, 0, 0, TCG_OPF_NOT_PRESENT) /* must be kept first */
-DEF(nop, 0, 0, 0, TCG_OPF_NOT_PRESENT)
-DEF(nop1, 0, 0, 1, TCG_OPF_NOT_PRESENT)
-DEF(nop2, 0, 0, 2, TCG_OPF_NOT_PRESENT)
-DEF(nop3, 0, 0, 3, TCG_OPF_NOT_PRESENT)
-
-/* variable number of parameters */
-DEF(nopn, 0, 0, 1, TCG_OPF_NOT_PRESENT)
-
 DEF(discard, 1, 0, 0, TCG_OPF_NOT_PRESENT)
 DEF(set_label, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_NOT_PRESENT)
 
@@ -188,13 +179,13 @@
 #define TLADDR_ARGS    (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2)
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
-DEF(qemu_ld_i32, 1, TLADDR_ARGS, 2,
+DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_st_i32, 0, TLADDR_ARGS + 1, 2,
+DEF(qemu_st_i32, 0, TLADDR_ARGS + 1, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
-DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 2,
+DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
-DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 2,
+DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
 #undef TLADDR_ARGS
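
The qemu_ld/st opcodes now take one constant argument instead of two: the
memory operation and the softmmu index travel together as a single packed
TCGMemOpIdx. A minimal sketch of the packing, assuming the make_memop_idx
helper introduced elsewhere in this series (only get_memop/get_mmuidx are
visible in this diff):

    typedef uint32_t TCGMemOpIdx;

    /* mmu index in the low 4 bits, the TCGMemOp above it */
    static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
    {
        return (op << 4) | idx;     /* assumes idx <= 15 */
    }

    static inline TCGMemOp get_memop(TCGMemOpIdx oi)
    {
        return oi >> 4;
    }

    static inline unsigned get_mmuidx(TCGMemOpIdx oi)
    {
        return oi & 15;
    }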
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 7a84b87..585ca6c 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -407,7 +407,6 @@
     /* No temps have been previously allocated for size or locality.  */
     memset(s->free_temps, 0, sizeof(s->free_temps));
 
-    s->labels = tcg_malloc(sizeof(TCGLabel) * TCG_MAX_LABELS);
     s->nb_labels = 0;
     s->current_frame_offset = s->frame_start;
 
@@ -415,8 +414,10 @@
     s->goto_tb_issue_mask = 0;
 #endif
 
-    s->gen_opc_ptr = s->gen_opc_buf;
-    s->gen_opparam_ptr = s->gen_opparam_buf;
+    s->gen_first_op_idx = 0;
+    s->gen_last_op_idx = -1;
+    s->gen_next_op_idx = 0;
+    s->gen_next_parm_idx = 0;
 
     s->be = tcg_malloc(sizeof(TCGBackendData));
 }
@@ -703,9 +704,8 @@
 void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
                    int nargs, TCGArg *args)
 {
-    int i, real_args, nb_rets;
+    int i, real_args, nb_rets, pi, pi_first;
     unsigned sizemask, flags;
-    TCGArg *nparam;
     TCGHelperInfo *info;
 
     info = g_hash_table_lookup(s->helpers, (gpointer)func);
@@ -758,8 +758,7 @@
     }
 #endif /* TCG_TARGET_EXTEND_ARGS */
 
-    *s->gen_opc_ptr++ = INDEX_op_call;
-    nparam = s->gen_opparam_ptr++;
+    pi_first = pi = s->gen_next_parm_idx;
     if (ret != TCG_CALL_DUMMY_ARG) {
 #if defined(__sparc__) && !defined(__arch64__) \
     && !defined(CONFIG_TCG_INTERPRETER)
@@ -769,25 +768,25 @@
                two return temporaries, and reassemble below.  */
             retl = tcg_temp_new_i64();
             reth = tcg_temp_new_i64();
-            *s->gen_opparam_ptr++ = GET_TCGV_I64(reth);
-            *s->gen_opparam_ptr++ = GET_TCGV_I64(retl);
+            s->gen_opparam_buf[pi++] = GET_TCGV_I64(reth);
+            s->gen_opparam_buf[pi++] = GET_TCGV_I64(retl);
             nb_rets = 2;
         } else {
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret;
             nb_rets = 1;
         }
 #else
         if (TCG_TARGET_REG_BITS < 64 && (sizemask & 1)) {
 #ifdef HOST_WORDS_BIGENDIAN
-            *s->gen_opparam_ptr++ = ret + 1;
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret + 1;
+            s->gen_opparam_buf[pi++] = ret;
 #else
-            *s->gen_opparam_ptr++ = ret;
-            *s->gen_opparam_ptr++ = ret + 1;
+            s->gen_opparam_buf[pi++] = ret;
+            s->gen_opparam_buf[pi++] = ret + 1;
 #endif
             nb_rets = 2;
         } else {
-            *s->gen_opparam_ptr++ = ret;
+            s->gen_opparam_buf[pi++] = ret;
             nb_rets = 1;
         }
 #endif
@@ -801,7 +800,7 @@
 #ifdef TCG_TARGET_CALL_ALIGN_ARGS
             /* some targets want aligned 64 bit args */
             if (real_args & 1) {
-                *s->gen_opparam_ptr++ = TCG_CALL_DUMMY_ARG;
+                s->gen_opparam_buf[pi++] = TCG_CALL_DUMMY_ARG;
                 real_args++;
             }
 #endif
@@ -816,26 +815,42 @@
 	       have to get more complicated to differentiate between
 	       stack arguments and register arguments.  */
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
-            *s->gen_opparam_ptr++ = args[i] + 1;
-            *s->gen_opparam_ptr++ = args[i];
+            s->gen_opparam_buf[pi++] = args[i] + 1;
+            s->gen_opparam_buf[pi++] = args[i];
 #else
-            *s->gen_opparam_ptr++ = args[i];
-            *s->gen_opparam_ptr++ = args[i] + 1;
+            s->gen_opparam_buf[pi++] = args[i];
+            s->gen_opparam_buf[pi++] = args[i] + 1;
 #endif
             real_args += 2;
             continue;
         }
 
-        *s->gen_opparam_ptr++ = args[i];
+        s->gen_opparam_buf[pi++] = args[i];
         real_args++;
     }
-    *s->gen_opparam_ptr++ = (uintptr_t)func;
-    *s->gen_opparam_ptr++ = flags;
+    s->gen_opparam_buf[pi++] = (uintptr_t)func;
+    s->gen_opparam_buf[pi++] = flags;
 
-    *nparam = (nb_rets << 16) | real_args;
+    i = s->gen_next_op_idx;
+    tcg_debug_assert(i < OPC_BUF_SIZE);
+    tcg_debug_assert(pi <= OPPARAM_BUF_SIZE);
 
-    /* total parameters, needed to go backward in the instruction stream */
-    *s->gen_opparam_ptr++ = 1 + nb_rets + real_args + 3;
+    /* Set links for sequential allocation during translation.  */
+    s->gen_op_buf[i] = (TCGOp){
+        .opc = INDEX_op_call,
+        .callo = nb_rets,
+        .calli = real_args,
+        .args = pi_first,
+        .prev = i - 1,
+        .next = i + 1
+    };
+
+    /* Make sure the calli field didn't overflow.  */
+    tcg_debug_assert(s->gen_op_buf[i].calli == real_args);
+
+    s->gen_last_op_idx = i;
+    s->gen_next_op_idx = i + 1;
+    s->gen_next_parm_idx = pi;
 
 #if defined(__sparc__) && !defined(__arch64__) \
     && !defined(CONFIG_TCG_INTERPRETER)
@@ -870,143 +885,6 @@
 #endif /* TCG_TARGET_EXTEND_ARGS */
 }
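
tcg_gen_callN now emits a TCGOp record instead of bumping the old
gen_opc_ptr/gen_opparam_ptr streams: each op stores its opcode, an index
into the parameter buffer, and prev/next indices, so the ops form a doubly
linked list that later passes can edit in place. A sketch of a forward
walk, matching the loop shape used by tcg_dump_ops below (-1 terminates
the list at both ends):

    TCGOp *op;
    int oi;

    for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
        op = &s->gen_op_buf[oi];
        TCGArg *args = &s->gen_opparam_buf[op->args];
        /* op->opc selects the operation; for INDEX_op_call,
           op->callo/op->calli give the output/input arg counts. */
    }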
 
-#if TCG_TARGET_REG_BITS == 32
-void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                        int c, int right, int arith)
-{
-    if (c == 0) {
-        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg1));
-        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1));
-    } else if (c >= 32) {
-        c -= 32;
-        if (right) {
-            if (arith) {
-                tcg_gen_sari_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
-                tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), 31);
-            } else {
-                tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), c);
-                tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
-            }
-        } else {
-            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_LOW(arg1), c);
-            tcg_gen_movi_i32(TCGV_LOW(ret), 0);
-        }
-    } else {
-        TCGv_i32 t0, t1;
-
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        if (right) {
-            tcg_gen_shli_i32(t0, TCGV_HIGH(arg1), 32 - c);
-            if (arith)
-                tcg_gen_sari_i32(t1, TCGV_HIGH(arg1), c);
-            else
-                tcg_gen_shri_i32(t1, TCGV_HIGH(arg1), c);
-            tcg_gen_shri_i32(TCGV_LOW(ret), TCGV_LOW(arg1), c);
-            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t0);
-            tcg_gen_mov_i32(TCGV_HIGH(ret), t1);
-        } else {
-            tcg_gen_shri_i32(t0, TCGV_LOW(arg1), 32 - c);
-            /* Note: ret can be the same as arg1, so we use t1 */
-            tcg_gen_shli_i32(t1, TCGV_LOW(arg1), c);
-            tcg_gen_shli_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), c);
-            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t0);
-            tcg_gen_mov_i32(TCGV_LOW(ret), t1);
-        }
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    }
-}
-#endif
-
-static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
-{
-    switch (op & MO_SIZE) {
-    case MO_8:
-        op &= ~MO_BSWAP;
-        break;
-    case MO_16:
-        break;
-    case MO_32:
-        if (!is64) {
-            op &= ~MO_SIGN;
-        }
-        break;
-    case MO_64:
-        if (!is64) {
-            tcg_abort();
-        }
-        break;
-    }
-    if (st) {
-        op &= ~MO_SIGN;
-    }
-    return op;
-}
-
-void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 0, 0);
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i32;
-    tcg_add_param_i32(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 0, 1);
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i32;
-    tcg_add_param_i32(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 1, 0);
-
-#if TCG_TARGET_REG_BITS == 32
-    if ((memop & MO_SIZE) < MO_64) {
-        tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
-        if (memop & MO_SIGN) {
-            tcg_gen_sari_i32(TCGV_HIGH(val), TCGV_LOW(val), 31);
-        } else {
-            tcg_gen_movi_i32(TCGV_HIGH(val), 0);
-        }
-        return;
-    }
-#endif
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_ld_i64;
-    tcg_add_param_i64(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
-void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, TCGMemOp memop)
-{
-    memop = tcg_canonicalize_memop(memop, 1, 1);
-
-#if TCG_TARGET_REG_BITS == 32
-    if ((memop & MO_SIZE) < MO_64) {
-        tcg_gen_qemu_st_i32(TCGV_LOW(val), addr, idx, memop);
-        return;
-    }
-#endif
-
-    *tcg_ctx.gen_opc_ptr++ = INDEX_op_qemu_st_i64;
-    tcg_add_param_i64(val);
-    tcg_add_param_tl(addr);
-    *tcg_ctx.gen_opparam_ptr++ = memop;
-    *tcg_ctx.gen_opparam_ptr++ = idx;
-}
-
 static void tcg_reg_alloc_start(TCGContext *s)
 {
     int i;
@@ -1107,22 +985,39 @@
     [MO_BEQ]  = "beq",
 };
 
+static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = {
+#ifdef ALIGNED_ONLY
+    [MO_UNALN >> MO_ASHIFT]    = "un+",
+    [MO_ALIGN >> MO_ASHIFT]    = "",
+#else
+    [MO_UNALN >> MO_ASHIFT]    = "",
+    [MO_ALIGN >> MO_ASHIFT]    = "al+",
+#endif
+    [MO_ALIGN_2 >> MO_ASHIFT]  = "al2+",
+    [MO_ALIGN_4 >> MO_ASHIFT]  = "al4+",
+    [MO_ALIGN_8 >> MO_ASHIFT]  = "al8+",
+    [MO_ALIGN_16 >> MO_ASHIFT] = "al16+",
+    [MO_ALIGN_32 >> MO_ASHIFT] = "al32+",
+    [MO_ALIGN_64 >> MO_ASHIFT] = "al64+",
+};
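
With the MO_AMASK field, each memory op can carry an explicit alignment
requirement. A sketch of decoding it back to a byte count, assuming
MO_ALIGN_2 .. MO_ALIGN_64 occupy field values 1..6 as the table order
above suggests:

    static unsigned memop_alignment_bytes(TCGMemOp op)
    {
        unsigned a = (op & MO_AMASK) >> MO_ASHIFT;

        if (a >= 1 && a <= 6) {
            return 1u << a;     /* MO_ALIGN_2 ... MO_ALIGN_64 */
        }
        /* MO_UNALN/MO_ALIGN request no explicit count or natural
           alignment, depending on ALIGNED_ONLY. */
        return 0;
    }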
+
 void tcg_dump_ops(TCGContext *s)
 {
-    const uint16_t *opc_ptr;
-    const TCGArg *args;
-    TCGArg arg;
-    TCGOpcode c;
-    int i, k, nb_oargs, nb_iargs, nb_cargs, first_insn;
-    const TCGOpDef *def;
     char buf[128];
+    TCGOp *op;
+    int oi;
 
-    first_insn = 1;
-    opc_ptr = s->gen_opc_buf;
-    args = s->gen_opparam_buf;
-    while (opc_ptr < s->gen_opc_ptr) {
-        c = *opc_ptr++;
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
+        int i, k, nb_oargs, nb_iargs, nb_cargs;
+        const TCGOpDef *def;
+        const TCGArg *args;
+        TCGOpcode c;
+
+        op = &s->gen_op_buf[oi];
+        c = op->opc;
         def = &tcg_op_defs[c];
+        args = &s->gen_opparam_buf[op->args];
+
         if (c == INDEX_op_debug_insn_start) {
             uint64_t pc;
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
@@ -1130,21 +1025,14 @@
 #else
             pc = args[0];
 #endif
-            if (!first_insn) {
+            if (oi != s->gen_first_op_idx) {
                 qemu_log("\n");
             }
             qemu_log(" ---- 0x%" PRIx64, pc);
-            first_insn = 0;
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
-            nb_cargs = def->nb_cargs;
         } else if (c == INDEX_op_call) {
-            TCGArg arg;
-
             /* variable number of arguments */
-            arg = *args++;
-            nb_oargs = arg >> 16;
-            nb_iargs = arg & 0xffff;
+            nb_oargs = op->callo;
+            nb_iargs = op->calli;
             nb_cargs = def->nb_cargs;
 
             /* function name, flags, out args */
@@ -1165,26 +1053,20 @@
             }
         } else {
             qemu_log(" %s ", def->name);
-            if (c == INDEX_op_nopn) {
-                /* variable number of arguments */
-                nb_cargs = *args;
-                nb_oargs = 0;
-                nb_iargs = 0;
-            } else {
-                nb_oargs = def->nb_oargs;
-                nb_iargs = def->nb_iargs;
-                nb_cargs = def->nb_cargs;
-            }
-            
+
+            nb_oargs = def->nb_oargs;
+            nb_iargs = def->nb_iargs;
+            nb_cargs = def->nb_cargs;
+
             k = 0;
-            for(i = 0; i < nb_oargs; i++) {
+            for (i = 0; i < nb_oargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
                 qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
                                                    args[k++]));
             }
-            for(i = 0; i < nb_iargs; i++) {
+            for (i = 0; i < nb_iargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
@@ -1211,27 +1093,34 @@
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_ld_i64:
             case INDEX_op_qemu_st_i64:
-                if (args[k] < ARRAY_SIZE(ldst_name) && ldst_name[args[k]]) {
-                    qemu_log(",%s", ldst_name[args[k++]]);
-                } else {
-                    qemu_log(",$0x%" TCG_PRIlx, args[k++]);
+                {
+                    TCGMemOpIdx oi = args[k++];
+                    TCGMemOp op = get_memop(oi);
+                    unsigned ix = get_mmuidx(oi);
+
+                    if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) {
+                        qemu_log(",$0x%x,%u", op, ix);
+                    } else {
+                        const char *s_al, *s_op;
+                        s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT];
+                        s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)];
+                        qemu_log(",%s%s,%u", s_al, s_op, ix);
+                    }
+                    i = 1;
                 }
-                i = 1;
                 break;
             default:
                 i = 0;
                 break;
             }
-            for(; i < nb_cargs; i++) {
+            for (; i < nb_cargs; i++) {
                 if (k != 0) {
                     qemu_log(",");
                 }
-                arg = args[k++];
-                qemu_log("$0x%" TCG_PRIlx, arg);
+                qemu_log("$0x%" TCG_PRIlx, args[k++]);
             }
         }
         qemu_log("\n");
-        args += nb_iargs + nb_oargs + nb_cargs;
     }
 }
 
@@ -1380,21 +1269,30 @@
 #endif
 }
 
-#ifdef USE_LIVENESS_ANALYSIS
-
-/* set a nop for an operation using 'nb_args' */
-static inline void tcg_set_nop(TCGContext *s, uint16_t *opc_ptr, 
-                               TCGArg *args, int nb_args)
+void tcg_op_remove(TCGContext *s, TCGOp *op)
 {
-    if (nb_args == 0) {
-        *opc_ptr = INDEX_op_nop;
+    int next = op->next;
+    int prev = op->prev;
+
+    if (next >= 0) {
+        s->gen_op_buf[next].prev = prev;
     } else {
-        *opc_ptr = INDEX_op_nopn;
-        args[0] = nb_args;
-        args[nb_args - 1] = nb_args;
+        s->gen_last_op_idx = prev;
     }
+    if (prev >= 0) {
+        s->gen_op_buf[prev].next = next;
+    } else {
+        s->gen_first_op_idx = next;
+    }
+
+    memset(op, -1, sizeof(*op));
+
+#ifdef CONFIG_PROFILER
+    s->del_op_count++;
+#endif
 }
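
tcg_op_remove unlinks an op from the list in O(1); the memset to -1
poisons the freed slot so any stale index faults loudly. One subtlety for
callers: read op->next before removing. An illustrative cleanup pass (the
op_is_dead predicate is hypothetical, not part of this patch):

    int oi, next;

    for (oi = s->gen_first_op_idx; oi >= 0; oi = next) {
        TCGOp *op = &s->gen_op_buf[oi];
        next = op->next;            /* save before the unlink poisons it */
        if (op_is_dead(s, op)) {
            tcg_op_remove(s, op);
        }
    }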
 
+#ifdef USE_LIVENESS_ANALYSIS
 /* liveness analysis: end of function: all temps are dead, and globals
    should be in memory. */
 static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
@@ -1424,19 +1322,10 @@
    temporaries are removed. */
 static void tcg_liveness_analysis(TCGContext *s)
 {
-    int i, op_index, nb_args, nb_iargs, nb_oargs, nb_ops;
-    TCGOpcode op, op_new, op_new2;
-    TCGArg *args, arg;
-    const TCGOpDef *def;
     uint8_t *dead_temps, *mem_temps;
-    uint16_t dead_args;
-    uint8_t sync_args;
-    bool have_op_new2;
-    
-    s->gen_opc_ptr++; /* skip end */
+    int oi, oi_prev, nb_ops;
 
-    nb_ops = s->gen_opc_ptr - s->gen_opc_buf;
-
+    nb_ops = s->gen_next_op_idx;
     s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
     s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
     
@@ -1444,25 +1333,31 @@
     mem_temps = tcg_malloc(s->nb_temps);
     tcg_la_func_end(s, dead_temps, mem_temps);
 
-    args = s->gen_opparam_ptr;
-    op_index = nb_ops - 1;
-    while (op_index >= 0) {
-        op = s->gen_opc_buf[op_index];
-        def = &tcg_op_defs[op];
-        switch(op) {
+    for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) {
+        int i, nb_iargs, nb_oargs;
+        TCGOpcode opc_new, opc_new2;
+        bool have_opc_new2;
+        uint16_t dead_args;
+        uint8_t sync_args;
+        TCGArg arg;
+
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+
+        oi_prev = op->prev;
+
+        switch (opc) {
         case INDEX_op_call:
             {
                 int call_flags;
 
-                nb_args = args[-1];
-                args -= nb_args;
-                arg = *args++;
-                nb_iargs = arg & 0xffff;
-                nb_oargs = arg >> 16;
+                nb_oargs = op->callo;
+                nb_iargs = op->calli;
                 call_flags = args[nb_oargs + nb_iargs + 1];
 
-                /* pure functions can be removed if their result is not
-                   used */
+                /* pure functions can be removed if their result is unused */
                 if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
                     for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
@@ -1470,8 +1365,7 @@
                             goto do_not_remove_call;
                         }
                     }
-                    tcg_set_nop(s, s->gen_opc_buf + op_index,
-                                args - 1, nb_args);
+                    goto do_remove;
                 } else {
                 do_not_remove_call:
 
@@ -1500,51 +1394,45 @@
                         memset(dead_temps, 1, s->nb_globals);
                     }
 
-                    /* input args are live */
+                    /* record arguments that die in this helper */
                     for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
                             if (dead_temps[arg]) {
                                 dead_args |= (1 << i);
                             }
-                            dead_temps[arg] = 0;
                         }
                     }
-                    s->op_dead_args[op_index] = dead_args;
-                    s->op_sync_args[op_index] = sync_args;
+                    /* input arguments are live for preceding opcodes */
+                    for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                        arg = args[i];
+                        dead_temps[arg] = 0;
+                    }
+                    s->op_dead_args[oi] = dead_args;
+                    s->op_sync_args[oi] = sync_args;
                 }
-                args--;
             }
             break;
         case INDEX_op_debug_insn_start:
-            args -= def->nb_args;
-            break;
-        case INDEX_op_nopn:
-            nb_args = args[-1];
-            args -= nb_args;
             break;
         case INDEX_op_discard:
-            args--;
             /* mark the temporary as dead */
             dead_temps[args[0]] = 1;
             mem_temps[args[0]] = 0;
             break;
-        case INDEX_op_end:
-            break;
 
         case INDEX_op_add2_i32:
-            op_new = INDEX_op_add_i32;
+            opc_new = INDEX_op_add_i32;
             goto do_addsub2;
         case INDEX_op_sub2_i32:
-            op_new = INDEX_op_sub_i32;
+            opc_new = INDEX_op_sub_i32;
             goto do_addsub2;
         case INDEX_op_add2_i64:
-            op_new = INDEX_op_add_i64;
+            opc_new = INDEX_op_add_i64;
             goto do_addsub2;
         case INDEX_op_sub2_i64:
-            op_new = INDEX_op_sub_i64;
+            opc_new = INDEX_op_sub_i64;
         do_addsub2:
-            args -= 6;
             nb_iargs = 4;
             nb_oargs = 2;
             /* Test if the high part of the operation is dead, but not
@@ -1555,12 +1443,11 @@
                 if (dead_temps[args[0]] && !mem_temps[args[0]]) {
                     goto do_remove;
                 }
-                /* Create the single operation plus nop.  */
-                s->gen_opc_buf[op_index] = op = op_new;
+                /* Replace the opcode and adjust the args in place,
+                   leaving 3 unused args at the end.  */
+                op->opc = opc = opc_new;
                 args[1] = args[2];
                 args[2] = args[4];
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-                tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 3);
                 /* Fall through and mark the single-word operation live.  */
                 nb_iargs = 2;
                 nb_oargs = 1;
@@ -1568,27 +1455,26 @@
             goto do_not_remove;
 
         case INDEX_op_mulu2_i32:
-            op_new = INDEX_op_mul_i32;
-            op_new2 = INDEX_op_muluh_i32;
-            have_op_new2 = TCG_TARGET_HAS_muluh_i32;
+            opc_new = INDEX_op_mul_i32;
+            opc_new2 = INDEX_op_muluh_i32;
+            have_opc_new2 = TCG_TARGET_HAS_muluh_i32;
             goto do_mul2;
         case INDEX_op_muls2_i32:
-            op_new = INDEX_op_mul_i32;
-            op_new2 = INDEX_op_mulsh_i32;
-            have_op_new2 = TCG_TARGET_HAS_mulsh_i32;
+            opc_new = INDEX_op_mul_i32;
+            opc_new2 = INDEX_op_mulsh_i32;
+            have_opc_new2 = TCG_TARGET_HAS_mulsh_i32;
             goto do_mul2;
         case INDEX_op_mulu2_i64:
-            op_new = INDEX_op_mul_i64;
-            op_new2 = INDEX_op_muluh_i64;
-            have_op_new2 = TCG_TARGET_HAS_muluh_i64;
+            opc_new = INDEX_op_mul_i64;
+            opc_new2 = INDEX_op_muluh_i64;
+            have_opc_new2 = TCG_TARGET_HAS_muluh_i64;
             goto do_mul2;
         case INDEX_op_muls2_i64:
-            op_new = INDEX_op_mul_i64;
-            op_new2 = INDEX_op_mulsh_i64;
-            have_op_new2 = TCG_TARGET_HAS_mulsh_i64;
+            opc_new = INDEX_op_mul_i64;
+            opc_new2 = INDEX_op_mulsh_i64;
+            have_opc_new2 = TCG_TARGET_HAS_mulsh_i64;
             goto do_mul2;
         do_mul2:
-            args -= 4;
             nb_iargs = 2;
             nb_oargs = 2;
             if (dead_temps[args[1]] && !mem_temps[args[1]]) {
@@ -1597,28 +1483,25 @@
                     goto do_remove;
                 }
                 /* The high part of the operation is dead; generate the low. */
-                s->gen_opc_buf[op_index] = op = op_new;
+                op->opc = opc = opc_new;
                 args[1] = args[2];
                 args[2] = args[3];
-            } else if (have_op_new2 && dead_temps[args[0]]
+            } else if (have_opc_new2 && dead_temps[args[0]]
                        && !mem_temps[args[0]]) {
-                /* The low part of the operation is dead; generate the high.  */
-                s->gen_opc_buf[op_index] = op = op_new2;
+                /* The low part of the operation is dead; generate the high. */
+                op->opc = opc = opc_new2;
                 args[0] = args[1];
                 args[1] = args[2];
                 args[2] = args[3];
             } else {
                 goto do_not_remove;
             }
-            assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-            tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
             /* Mark the single-word operation live.  */
             nb_oargs = 1;
             goto do_not_remove;
 
         default:
             /* XXX: optimize by hardcoding common cases (e.g. triadic ops) */
-            args -= def->nb_args;
             nb_iargs = def->nb_iargs;
             nb_oargs = def->nb_oargs;
 
@@ -1626,24 +1509,20 @@
                its outputs are dead. We assume that nb_oargs == 0
                implies side effects */
             if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
-                for(i = 0; i < nb_oargs; i++) {
+                for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
                     if (!dead_temps[arg] || mem_temps[arg]) {
                         goto do_not_remove;
                     }
                 }
             do_remove:
-                tcg_set_nop(s, s->gen_opc_buf + op_index, args, def->nb_args);
-#ifdef CONFIG_PROFILER
-                s->del_op_count++;
-#endif
+                tcg_op_remove(s, op);
             } else {
             do_not_remove:
-
                 /* output args are dead */
                 dead_args = 0;
                 sync_args = 0;
-                for(i = 0; i < nb_oargs; i++) {
+                for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
                         dead_args |= (1 << i);
@@ -1663,24 +1542,23 @@
                     memset(mem_temps, 1, s->nb_globals);
                 }
 
-                /* input args are live */
-                for(i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                /* record arguments that die in this opcode */
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
                         dead_args |= (1 << i);
                     }
+                }
+                /* input arguments are live for preceding opcodes */
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                    arg = args[i];
                     dead_temps[arg] = 0;
                 }
-                s->op_dead_args[op_index] = dead_args;
-                s->op_sync_args[op_index] = sync_args;
+                s->op_dead_args[oi] = dead_args;
+                s->op_sync_args[oi] = sync_args;
             }
             break;
         }
-        op_index--;
-    }
-
-    if (args != s->gen_opparam_buf) {
-        tcg_abort();
     }
 }
 #else
@@ -2144,6 +2022,16 @@
                 if (!IS_DEAD_ARG(i)) {
                     goto allocate_in_reg;
                 }
+                /* check if the current register has already been allocated
+                   for another input aliased to an output */
+                int k2, i2;
+                for (k2 = 0; k2 < k; k2++) {
+                    i2 = def->sorted_args[nb_oargs + k2];
+                    if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
+                        (new_args[i2] == ts->reg)) {
+                        goto allocate_in_reg;
+                    }
+                }
             }
         }
         reg = ts->reg;
@@ -2247,11 +2135,11 @@
 #define STACK_DIR(x) (x)
 #endif
 
-static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
-                              TCGOpcode opc, const TCGArg *args,
-                              uint16_t dead_args, uint8_t sync_args)
+static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
+                               const TCGArg * const args, uint16_t dead_args,
+                               uint8_t sync_args)
 {
-    int nb_iargs, nb_oargs, flags, nb_regs, i, reg, nb_params;
+    int flags, nb_regs, i, reg;
     TCGArg arg;
     TCGTemp *ts;
     intptr_t stack_offset;
@@ -2260,22 +2148,16 @@
     int allocate_args;
     TCGRegSet allocated_regs;
 
-    arg = *args++;
-
-    nb_oargs = arg >> 16;
-    nb_iargs = arg & 0xffff;
-    nb_params = nb_iargs;
-
     func_addr = (tcg_insn_unit *)(intptr_t)args[nb_oargs + nb_iargs];
     flags = args[nb_oargs + nb_iargs + 1];
 
     nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
-    if (nb_regs > nb_params) {
-        nb_regs = nb_params;
+    if (nb_regs > nb_iargs) {
+        nb_regs = nb_iargs;
     }
 
     /* assign stack slots first */
-    call_stack_size = (nb_params - nb_regs) * sizeof(tcg_target_long);
+    call_stack_size = (nb_iargs - nb_regs) * sizeof(tcg_target_long);
     call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) & 
         ~(TCG_TARGET_STACK_ALIGN - 1);
     allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE);
@@ -2286,7 +2168,7 @@
     }
 
     stack_offset = TCG_TARGET_CALL_STACK_OFFSET;
-    for(i = nb_regs; i < nb_params; i++) {
+    for (i = nb_regs; i < nb_iargs; i++) {
         arg = args[nb_oargs + i];
 #ifdef TCG_TARGET_STACK_GROWSUP
         stack_offset -= sizeof(tcg_target_long);
@@ -2393,8 +2275,6 @@
             }
         }
     }
-    
-    return nb_iargs + nb_oargs + def->nb_cargs + 1;
 }
 
 #ifdef CONFIG_PROFILER
@@ -2405,7 +2285,7 @@
 {
     int i;
 
-    for(i = INDEX_op_end; i < NB_OPS; i++) {
+    for (i = 0; i < NB_OPS; i++) {
         qemu_log("%s %" PRId64 "\n", tcg_op_defs[i].name, tcg_table_op_count[i]);
     }
 }
@@ -2416,10 +2296,7 @@
                                       tcg_insn_unit *gen_code_buf,
                                       long search_pc)
 {
-    TCGOpcode opc;
-    int op_index;
-    const TCGOpDef *def;
-    const TCGArg *args;
+    int oi, oi_next;
 
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
@@ -2434,8 +2311,7 @@
 #endif
 
 #ifdef USE_TCG_OPTIMIZATIONS
-    s->gen_opparam_ptr =
-        tcg_optimize(s, s->gen_opc_ptr, s->gen_opparam_buf, tcg_op_defs);
+    tcg_optimize(s);
 #endif
 
 #ifdef CONFIG_PROFILER
@@ -2464,42 +2340,30 @@
 
     tcg_out_tb_init(s);
 
-    args = s->gen_opparam_buf;
-    op_index = 0;
+    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+        TCGOp * const op = &s->gen_op_buf[oi];
+        TCGArg * const args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+        const TCGOpDef *def = &tcg_op_defs[opc];
+        uint16_t dead_args = s->op_dead_args[oi];
+        uint8_t sync_args = s->op_sync_args[oi];
 
-    for(;;) {
-        opc = s->gen_opc_buf[op_index];
+        oi_next = op->next;
 #ifdef CONFIG_PROFILER
         tcg_table_op_count[opc]++;
 #endif
-        def = &tcg_op_defs[opc];
-#if 0
-        printf("%s: %d %d %d\n", def->name,
-               def->nb_oargs, def->nb_iargs, def->nb_cargs);
-        //        dump_regs(s);
-#endif
-        switch(opc) {
+
+        switch (opc) {
         case INDEX_op_mov_i32:
         case INDEX_op_mov_i64:
-            tcg_reg_alloc_mov(s, def, args, s->op_dead_args[op_index],
-                              s->op_sync_args[op_index]);
+            tcg_reg_alloc_mov(s, def, args, dead_args, sync_args);
             break;
         case INDEX_op_movi_i32:
         case INDEX_op_movi_i64:
-            tcg_reg_alloc_movi(s, args, s->op_dead_args[op_index],
-                               s->op_sync_args[op_index]);
+            tcg_reg_alloc_movi(s, args, dead_args, sync_args);
             break;
         case INDEX_op_debug_insn_start:
-            /* debug instruction */
             break;
-        case INDEX_op_nop:
-        case INDEX_op_nop1:
-        case INDEX_op_nop2:
-        case INDEX_op_nop3:
-            break;
-        case INDEX_op_nopn:
-            args += args[0];
-            goto next;
         case INDEX_op_discard:
             temp_dead(s, args[0]);
             break;
@@ -2508,12 +2372,9 @@
             tcg_out_label(s, args[0], s->code_ptr);
             break;
         case INDEX_op_call:
-            args += tcg_reg_alloc_call(s, def, opc, args,
-                                       s->op_dead_args[op_index],
-                                       s->op_sync_args[op_index]);
-            goto next;
-        case INDEX_op_end:
-            goto the_end;
+            tcg_reg_alloc_call(s, op->callo, op->calli, args,
+                               dead_args, sync_args);
+            break;
         default:
             /* Sanity check that we've not introduced any unhandled opcodes. */
             if (def->flags & TCG_OPF_NOT_PRESENT) {
@@ -2522,21 +2383,17 @@
             /* Note: in order to speed up the code, it would be much
                faster to have specialized register allocator functions for
                some common argument patterns */
-            tcg_reg_alloc_op(s, def, opc, args, s->op_dead_args[op_index],
-                             s->op_sync_args[op_index]);
+            tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args);
             break;
         }
-        args += def->nb_args;
-    next:
         if (search_pc >= 0 && search_pc < tcg_current_code_size(s)) {
-            return op_index;
+            return oi;
         }
-        op_index++;
 #ifndef NDEBUG
         check_regs(s);
 #endif
     }
- the_end:
+
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
     return -1;
@@ -2547,14 +2404,18 @@
 #ifdef CONFIG_PROFILER
     {
         int n;
-        n = (s->gen_opc_ptr - s->gen_opc_buf);
-        s->op_count += n;
-        if (n > s->op_count_max)
-            s->op_count_max = n;
 
-        s->temp_count += s->nb_temps;
-        if (s->nb_temps > s->temp_count_max)
-            s->temp_count_max = s->nb_temps;
+        n = s->gen_last_op_idx + 1;
+        s->op_count += n;
+        if (n > s->op_count_max) {
+            s->op_count_max = n;
+        }
+
+        n = s->nb_temps;
+        s->temp_count += n;
+        if (n > s->temp_count_max) {
+            s->temp_count_max = n;
+        }
     }
 #endif
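
Reviewer note: both the liveness pass and the code generation loop above now
walk the ops as a doubly linked list of buffer indices instead of scanning a
flat opcode stream.  A minimal sketch of the traversal pattern they depend on
(visit_ops and cb are illustrative names; the fields are the ones introduced
by this patch):

    static void visit_ops(TCGContext *s, void (*cb)(TCGContext *, TCGOp *))
    {
        int oi, oi_next;

        for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
            TCGOp *op = &s->gen_op_buf[oi];

            /* Read the successor index before the callback runs, since
               cb() may call tcg_op_remove() and unlink op.  */
            oi_next = op->next;
            cb(s, op);
        }
    }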
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7285f71..b27ee14 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -159,6 +159,15 @@
 #endif
 
 
+#ifdef CONFIG_DEBUG_TCG
+# define tcg_debug_assert(X) do { assert(X); } while (0)
+#elif QEMU_GNUC_PREREQ(4, 5)
+# define tcg_debug_assert(X) \
+    do { if (!(X)) { __builtin_unreachable(); } } while (0)
+#else
+# define tcg_debug_assert(X) do { (void)(X); } while (0)
+#endif
+
 typedef struct TCGRelocation {
     struct TCGRelocation *next;
     int type;
@@ -242,6 +251,41 @@
     MO_TE    = MO_LE,
 #endif
 
+    /* MO_UNALN accesses are never checked for alignment.
+     * MO_ALIGN accesses will result in a call to the CPU's
+     * do_unaligned_access hook if the guest address is not aligned.
+     * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     * Some architectures (e.g. ARMv8) require addresses to be aligned
+     * to a size larger than that of the memory access itself.  QEMU's
+     * existing low-cost alignment check is sufficient for this, provided
+     * the alignment size can be specified explicitly in the memop.
+     * MO_ALIGN requests natural alignment
+     * (i.e. the alignment size equals the size of the memory access).
+     * Note that the alignment size must be greater than or equal to
+     * the access size.
+     * There are three options:
+     * - align to the access size (MO_ALIGN);
+     * - align to a specified size that is greater than or equal to
+     *   the access size (MO_ALIGN_x, where 'x' is a size in bytes);
+     * - permit unaligned access (MO_UNALN).
+     */
+    MO_ASHIFT = 4,
+    MO_AMASK = 7 << MO_ASHIFT,
+#ifdef ALIGNED_ONLY
+    MO_ALIGN = 0,
+    MO_UNALN = MO_AMASK,
+#else
+    MO_ALIGN = MO_AMASK,
+    MO_UNALN = 0,
+#endif
+    MO_ALIGN_2  = 1 << MO_ASHIFT,
+    MO_ALIGN_4  = 2 << MO_ASHIFT,
+    MO_ALIGN_8  = 3 << MO_ASHIFT,
+    MO_ALIGN_16 = 4 << MO_ASHIFT,
+    MO_ALIGN_32 = 5 << MO_ASHIFT,
+    MO_ALIGN_64 = 6 << MO_ASHIFT,
+
     /* Combinations of the above, for ease of use.  */
     MO_UB    = MO_8,
     MO_UW    = MO_16,
@@ -272,6 +316,45 @@
     MO_SSIZE = MO_SIZE | MO_SIGN,
 } TCGMemOp;
 
+/**
+ * get_alignment_bits
+ * @memop: TCGMemOp value
+ *
+ * Extract the alignment size from the memop.
+ *
+ * Returns: 0 for a byte access (which is always aligned);
+ *          a positive value giving the number of alignment bits;
+ *          a negative value if unaligned access is enabled
+ *          and this is not a byte access.
+ */
+static inline int get_alignment_bits(TCGMemOp memop)
+{
+    int a = memop & MO_AMASK;
+    int s = memop & MO_SIZE;
+    int r;
+
+    if (a == MO_UNALN) {
+        /* Negative value if unaligned access enabled,
+         * or zero value in case of byte access.
+         */
+        return -s;
+    } else if (a == MO_ALIGN) {
+        /* Natural alignment: the number of alignment bits equals the access size bits */
+        r = s;
+    } else {
+        /* A specific alignment size; it must be greater than
+         * or equal to the access size.
+         */
+        r = a >> MO_ASHIFT;
+        tcg_debug_assert(r >= s);
+    }
+#if defined(CONFIG_SOFTMMU)
+    /* The requested alignment cannot overlap the TLB flags.  */
+    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << r) - 1)) == 0);
+#endif
+    return r;
+}
+
 typedef tcg_target_ulong TCGArg;
 
 /* Define a type and accessor macros for variables.  Using pointer types
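
Reviewer note: a quick illustration of the new alignment encoding (a sketch,
not part of the patch; MO_LEQ is one of the combination values defined in
this enum):

    TCGMemOp memop = MO_LEQ | MO_ALIGN_16;   /* 64-bit LE load, 16-byte aligned */
    int a_bits = get_alignment_bits(memop);  /* -> 4, since 1 << 4 == 16 */

Because the assertion above guarantees the alignment bits never overlap the
TLB flags, a softmmu backend can fold the alignment check into its existing
TLB comparison by or-ing (1 << a_bits) - 1 into the page mask: an aligned
access costs nothing extra, while a misaligned one simply misses the TLB
compare and takes the slow path.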
@@ -448,10 +531,28 @@
     unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
 } TCGTempSet;
 
+typedef struct TCGOp {
+    TCGOpcode opc   : 8;
+
+    /* The number of out and in parameters for a call.  */
+    unsigned callo  : 2;
+    unsigned calli  : 6;
+
+    /* Index of the arguments for this op, or -1 for zero-operand ops.  */
+    signed args     : 16;
+
+    /* Index of the prev/next op, or -1 for the end of the list.  */
+    signed prev     : 16;
+    signed next     : 16;
+} TCGOp;
+
+QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+
 struct TCGContext {
     uint8_t *pool_cur, *pool_end;
     TCGPool *pool_first, *pool_current, *pool_first_large;
-    TCGLabel *labels;
     int nb_labels;
     int nb_globals;
     int nb_temps;
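
Reviewer note: for readers following the list manipulation, a sketch of how
an op is appended at the tail (append_op is an illustrative name; the index
fields are the ones added to TCGContext below):

    static TCGOp *append_op(TCGContext *s, TCGOpcode opc, int pi)
    {
        int oi = s->gen_next_op_idx++;
        int prev = s->gen_last_op_idx;
        TCGOp *op = &s->gen_op_buf[oi];

        *op = (TCGOp){
            .opc = opc,
            .args = pi,      /* index into gen_opparam_buf, or -1 */
            .prev = prev,
            .next = -1,      /* new tail: no successor yet */
        };
        if (prev >= 0) {
            s->gen_op_buf[prev].next = oi;
        } else {
            s->gen_first_op_idx = oi;   /* first op of the TB */
        }
        s->gen_last_op_idx = oi;
        return op;
    }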
@@ -469,9 +570,6 @@
                                corresponding output argument needs to be
                                sync to memory. */
     
-    /* tells in which temporary a given register is. It does not take
-       into account fixed registers */
-    int reg_to_temp[TCG_TARGET_NB_REGS];
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
     intptr_t frame_start;
@@ -479,8 +577,6 @@
     int frame_reg;
 
     tcg_insn_unit *code_ptr;
-    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
-    TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
 
     GHashTable *helpers;
 
@@ -508,14 +604,10 @@
     int goto_tb_issue_mask;
 #endif
 
-    uint16_t gen_opc_buf[OPC_BUF_SIZE];
-    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
-
-    uint16_t *gen_opc_ptr;
-    TCGArg *gen_opparam_ptr;
-    target_ulong gen_opc_pc[OPC_BUF_SIZE];
-    uint16_t gen_opc_icount[OPC_BUF_SIZE];
-    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
+    int gen_first_op_idx;
+    int gen_last_op_idx;
+    int gen_next_op_idx;
+    int gen_next_parm_idx;
 
     /* Code generation.  Note that we specifically do not use tcg_insn_unit
        here, because there's too much arithmetic throughout that relies
@@ -533,10 +625,38 @@
 
     /* The TCGBackendData structure is private to tcg-target.c.  */
     struct TCGBackendData *be;
+
+    TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
+    TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
+
+    /* tells in which temporary a given register is. It does not take
+       into account fixed registers */
+    int reg_to_temp[TCG_TARGET_NB_REGS];
+
+    TCGOp gen_op_buf[OPC_BUF_SIZE];
+    TCGArg gen_opparam_buf[OPPARAM_BUF_SIZE];
+
+    target_ulong gen_opc_pc[OPC_BUF_SIZE];
+    uint16_t gen_opc_icount[OPC_BUF_SIZE];
+    uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
+
+    TCGLabel labels[TCG_MAX_LABELS];
 };
 
 extern TCGContext tcg_ctx;
 
+/* The number of opcodes emitted so far.  */
+static inline int tcg_op_buf_count(void)
+{
+    return tcg_ctx.gen_next_op_idx;
+}
+
+/* Test whether the TB should be terminated because too many opcodes were emitted.  */
+static inline bool tcg_op_buf_full(void)
+{
+    return tcg_op_buf_count() >= OPC_MAX_SIZE;
+}
+
 /* pool based memory allocation */
 
 void *tcg_malloc_internal(TCGContext *s, int size);
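
Reviewer note: targets are expected to use these helpers in their translate
loops instead of comparing gen_opc_ptr against the end of the opcode buffer.
A hedged sketch of the intended shape (DisasContext, decode_insn and
gen_exit_tb are hypothetical per-target names, not from this patch):

    static void translate_block(DisasContext *dc)
    {
        do {
            decode_insn(dc);        /* emits TCG ops for one guest insn */
        } while (!dc->is_jmp && !tcg_op_buf_full());

        if (!dc->is_jmp) {
            gen_exit_tb(dc);        /* buffer nearly full: end the TB early */
        }
    }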
@@ -667,15 +787,6 @@
     abort();\
 } while (0)
 
-#ifdef CONFIG_DEBUG_TCG
-# define tcg_debug_assert(X) do { assert(X); } while (0)
-#elif QEMU_GNUC_PREREQ(4, 5)
-# define tcg_debug_assert(X) \
-    do { if (!(X)) { __builtin_unreachable(); } } while (0)
-#else
-# define tcg_debug_assert(X) do { (void)(X); } while (0)
-#endif
-
 void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs);
 
 #if UINTPTR_MAX == UINT32_MAX
@@ -705,11 +816,8 @@
 void tcg_gen_callN(TCGContext *s, void *func,
                    TCGArg ret, int nargs, TCGArg *args);
 
-void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
-                        int c, int right, int arith);
-
-TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr, TCGArg *args,
-                     TCGOpDef *tcg_op_def);
+void tcg_op_remove(TCGContext *s, TCGOp *op);
+void tcg_optimize(TCGContext *s);
 
 /* only used for debugging purposes */
 void tcg_dump_ops(TCGContext *s);
@@ -765,6 +873,44 @@
     return tcg_ptr_byte_diff(s->code_ptr, s->code_buf);
 }
 
+/* Combine the TCGMemOp and mmu_idx parameters into a single value.  */
+typedef uint32_t TCGMemOpIdx;
+
+/**
+ * make_memop_idx
+ * @op: memory operation
+ * @idx: mmu index
+ *
+ * Encode these values into a single parameter.
+ */
+static inline TCGMemOpIdx make_memop_idx(TCGMemOp op, unsigned idx)
+{
+    tcg_debug_assert(idx <= 15);
+    return (op << 4) | idx;
+}
+
+/**
+ * get_memop
+ * @oi: combined op/idx parameter
+ *
+ * Extract the memory operation from the combined value.
+ */
+static inline TCGMemOp get_memop(TCGMemOpIdx oi)
+{
+    return oi >> 4;
+}
+
+/**
+ * get_mmuidx
+ * @oi: combined op/idx parameter
+ *
+ * Extract the mmu index from the combined value.
+ */
+static inline unsigned get_mmuidx(TCGMemOpIdx oi)
+{
+    return oi & 15;
+}
+
 /**
  * tcg_qemu_tb_exec:
  * @env: CPUArchState * for the CPU
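
Reviewer note: the encoding round-trips, which is what the converted helpers
below rely on.  A short check (MO_TEUL is defined earlier in this header;
the mmu index value is arbitrary):

    TCGMemOpIdx oi = make_memop_idx(MO_TEUL, 1);

    assert(get_memop(oi) == MO_TEUL);   /* memory op recovered intact */
    assert(get_mmuidx(oi) == 1);        /* mmu index recovered intact */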
@@ -826,46 +972,46 @@
 #ifdef CONFIG_SOFTMMU
 /* Value zero-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_lduw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldul_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 uint64_t helper_le_ldq_mmu(CPUArchState *env, target_ulong addr,
-                           int mmu_idx, uintptr_t retaddr);
+                           TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_lduw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldul_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 uint64_t helper_be_ldq_mmu(CPUArchState *env, target_ulong addr,
-                           int mmu_idx, uintptr_t retaddr);
+                           TCGMemOpIdx oi, uintptr_t retaddr);
 
 /* Value sign-extended to tcg register size.  */
 tcg_target_ulong helper_ret_ldsb_mmu(CPUArchState *env, target_ulong addr,
-                                     int mmu_idx, uintptr_t retaddr);
+                                     TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldsw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_le_ldsl_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
-                                    int mmu_idx, uintptr_t retaddr);
+                                    TCGMemOpIdx oi, uintptr_t retaddr);
 
 void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        int mmu_idx, uintptr_t retaddr);
+                        TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
-                       int mmu_idx, uintptr_t retaddr);
+                       TCGMemOpIdx oi, uintptr_t retaddr);
 
 /* Temporary aliases until backends are converted.  */
 #ifdef TARGET_WORDS_BIGENDIAN
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 03a7b46..def4311 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -764,9 +764,6 @@
             tcg_out_r(s, *args++);
         }
         tcg_out_i(s, *args++);
-#ifdef CONFIG_SOFTMMU
-        tcg_out_i(s, *args);
-#endif
         break;
     case INDEX_op_qemu_ld_i64:
         tcg_out_r(s, *args++);
@@ -778,9 +775,6 @@
             tcg_out_r(s, *args++);
         }
         tcg_out_i(s, *args++);
-#ifdef CONFIG_SOFTMMU
-        tcg_out_i(s, *args);
-#endif
         break;
     case INDEX_op_qemu_st_i32:
         tcg_out_r(s, *args++);
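
Reviewer note: with the separate mmu_idx immediate gone, the TCI backend now
emits one combined value and the interpreter (below) reads the same value
back.  A sketch of the two sides (tcg_out_i and tci_read_i are the existing
emit/read primitives; the surrounding context is elided):

    /* Backend: one immediate instead of memop plus mmu_idx.  */
    tcg_out_i(s, make_memop_idx(memop, mmu_idx));

    /* Interpreter: recover both halves from that immediate.  */
    TCGMemOpIdx oi = tci_read_i(&tb_ptr);
    TCGMemOp mop = get_memop(oi);
    unsigned mmu_idx = get_mmuidx(oi);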
diff --git a/tci.c b/tci.c
index 4711ee4..8444948 100644
--- a/tci.c
+++ b/tci.c
@@ -420,35 +420,34 @@
 }
 
 #ifdef CONFIG_SOFTMMU
-# define mmuidx          tci_read_i(&tb_ptr)
 # define qemu_ld_ub \
-    helper_ret_ldub_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_ret_ldub_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leuw \
-    helper_le_lduw_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leul \
-    helper_le_ldul_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_leq \
-    helper_le_ldq_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beuw \
-    helper_be_lduw_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beul \
-    helper_be_ldul_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_ld_beq \
-    helper_be_ldq_mmu(env, taddr, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 # define qemu_st_b(X) \
-    helper_ret_stb_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_ret_stb_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_lew(X) \
-    helper_le_stw_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_lel(X) \
-    helper_le_stl_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_leq(X) \
-    helper_le_stq_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_le_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_bew(X) \
-    helper_be_stw_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_bel(X) \
-    helper_be_stl_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 # define qemu_st_beq(X) \
-    helper_be_stq_mmu(env, taddr, X, mmuidx, (uintptr_t)tb_ptr)
+    helper_be_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 #else
 # define qemu_ld_ub      ldub_p(g2h(taddr))
 # define qemu_ld_leuw    lduw_le_p(g2h(taddr))
@@ -496,7 +495,7 @@
 #if TCG_TARGET_REG_BITS == 32
         uint64_t v64;
 #endif
-        TCGMemOp memop;
+        TCGMemOpIdx oi;
 
 #if defined(GETPC)
         tci_tb_ptr = (uintptr_t)tb_ptr;
@@ -506,19 +505,6 @@
         tb_ptr += 2;
 
         switch (opc) {
-        case INDEX_op_end:
-        case INDEX_op_nop:
-            break;
-        case INDEX_op_nop1:
-        case INDEX_op_nop2:
-        case INDEX_op_nop3:
-        case INDEX_op_nopn:
-        case INDEX_op_discard:
-            TODO();
-            break;
-        case INDEX_op_set_label:
-            TODO();
-            break;
         case INDEX_op_call:
             t0 = tci_read_ri(&tb_ptr);
 #if TCG_TARGET_REG_BITS == 32
@@ -1120,8 +1106,8 @@
         case INDEX_op_qemu_ld_i32:
             t0 = *tb_ptr++;
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp32 = qemu_ld_ub;
                 break;
@@ -1157,8 +1143,8 @@
                 t1 = *tb_ptr++;
             }
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SSIZE)) {
             case MO_UB:
                 tmp64 = qemu_ld_ub;
                 break;
@@ -1206,8 +1192,8 @@
         case INDEX_op_qemu_st_i32:
             t0 = tci_read_r(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(t0);
                 break;
@@ -1230,8 +1216,8 @@
         case INDEX_op_qemu_st_i64:
             tmp64 = tci_read_r64(&tb_ptr);
             taddr = tci_read_ulong(&tb_ptr);
-            memop = tci_read_i(&tb_ptr);
-            switch (memop) {
+            oi = tci_read_i(&tb_ptr);
+            switch (get_memop(oi) & (MO_BSWAP | MO_SIZE)) {
             case MO_UB:
                 qemu_st_b(tmp64);
                 break;