target-arm: Mostly stylistic upstream integration.

This changes the sources to get much closer to the upstream 2.0.0
sources by changing a minimum amount of things, i.e.:

- Adding new macros and declarations that don't impact previous ones.

- Modify the gen_ldXX/gen_setXX functions in translate.c to better
  match upstream.

This considerably reduces the differences between upstream and our
branch for now, though target-arm/machine.c needs a lot of future
love to convert everything to VMStateDescription (which will likely
break snapshots, once again).

Change-Id: I35a0b39fcddfffc32ac212ab3c8056c2a7df5f15
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 61754e9..13e836c 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -70,6 +70,19 @@
 /* ARM-specific interrupt pending bits.  */
 #define CPU_INTERRUPT_FIQ   CPU_INTERRUPT_TGT_EXT_1
 
+/* The usual mapping for an AArch64 system register to its AArch32
+ * counterpart is for the 32 bit world to have access to the lower
+ * half only (with writes leaving the upper half untouched). It's
+ * therefore useful to be able to pass TCG the offset of the least
+ * significant half of a uint64_t struct member.
+ */
+#ifdef HOST_WORDS_BIGENDIAN
+#define offsetoflow32(S, M) (offsetof(S, M) + sizeof(uint32_t))
+#define offsetofhigh32(S, M) offsetof(S, M)
+#else
+#define offsetoflow32(S, M) offsetof(S, M)
+#define offsetofhigh32(S, M) (offsetof(S, M) + sizeof(uint32_t))
+#endif
 /* Meanings of the ARMCPU object's two inbound GPIO lines */
 #define ARM_CPU_IRQ 0
 #define ARM_CPU_FIQ 1
@@ -307,6 +320,58 @@
                               int mmu_idx);
 #define cpu_handle_mmu_fault cpu_arm_handle_mmu_fault
 
+/* SCTLR bit meanings. Several bits have been reused in newer
+ * versions of the architecture; in that case we define constants
+ * for both old and new bit meanings. Code which tests against those
+ * bits should probably check or otherwise arrange that the CPU
+ * is the architectural version it expects.
+ */
+#define SCTLR_M       (1U << 0)
+#define SCTLR_A       (1U << 1)
+#define SCTLR_C       (1U << 2)
+#define SCTLR_W       (1U << 3) /* up to v6; RAO in v7 */
+#define SCTLR_SA      (1U << 3)
+#define SCTLR_P       (1U << 4) /* up to v5; RAO in v6 and v7 */
+#define SCTLR_SA0     (1U << 4) /* v8 onward, AArch64 only */
+#define SCTLR_D       (1U << 5) /* up to v5; RAO in v6 */
+#define SCTLR_CP15BEN (1U << 5) /* v7 onward */
+#define SCTLR_L       (1U << 6) /* up to v5; RAO in v6 and v7; RAZ in v8 */
+#define SCTLR_B       (1U << 7) /* up to v6; RAZ in v7 */
+#define SCTLR_ITD     (1U << 7) /* v8 onward */
+#define SCTLR_S       (1U << 8) /* up to v6; RAZ in v7 */
+#define SCTLR_SED     (1U << 8) /* v8 onward */
+#define SCTLR_R       (1U << 9) /* up to v6; RAZ in v7 */
+#define SCTLR_UMA     (1U << 9) /* v8 onward, AArch64 only */
+#define SCTLR_F       (1U << 10) /* up to v6 */
+#define SCTLR_SW      (1U << 10) /* v7 onward */
+#define SCTLR_Z       (1U << 11)
+#define SCTLR_I       (1U << 12)
+#define SCTLR_V       (1U << 13)
+#define SCTLR_RR      (1U << 14) /* up to v7 */
+#define SCTLR_DZE     (1U << 14) /* v8 onward, AArch64 only */
+#define SCTLR_L4      (1U << 15) /* up to v6; RAZ in v7 */
+#define SCTLR_UCT     (1U << 15) /* v8 onward, AArch64 only */
+#define SCTLR_DT      (1U << 16) /* up to ??, RAO in v6 and v7 */
+#define SCTLR_nTWI    (1U << 16) /* v8 onward */
+#define SCTLR_HA      (1U << 17)
+#define SCTLR_IT      (1U << 18) /* up to ??, RAO in v6 and v7 */
+#define SCTLR_nTWE    (1U << 18) /* v8 onward */
+#define SCTLR_WXN     (1U << 19)
+#define SCTLR_ST      (1U << 20) /* up to ??, RAZ in v6 */
+#define SCTLR_UWXN    (1U << 20) /* v7 onward */
+#define SCTLR_FI      (1U << 21)
+#define SCTLR_U       (1U << 22)
+#define SCTLR_XP      (1U << 23) /* up to v6; v7 onward RAO */
+#define SCTLR_VE      (1U << 24) /* up to v7 */
+#define SCTLR_E0E     (1U << 24) /* v8 onward, AArch64 only */
+#define SCTLR_EE      (1U << 25)
+#define SCTLR_L2      (1U << 26) /* up to v6, RAZ in v7 */
+#define SCTLR_UCI     (1U << 26) /* v8 onward, AArch64 only */
+#define SCTLR_NMFI    (1U << 27)
+#define SCTLR_TRE     (1U << 28)
+#define SCTLR_AFE     (1U << 29)
+#define SCTLR_TE      (1U << 30)
+
 static inline void cpu_set_tls(CPUARMState *env, target_ulong newtls)
 {
   env->cp15.c13_tls2 = newtls;
@@ -329,6 +394,7 @@
 #define CPSR_Z (1U << 30)
 #define CPSR_N (1U << 31)
 #define CPSR_NZCV (CPSR_N | CPSR_Z | CPSR_C | CPSR_V)
+#define CPSR_AIF (CPSR_A | CPSR_I | CPSR_F)
 
 #define CPSR_IT (CPSR_IT_0_1 | CPSR_IT_2_7)
 #define CACHED_CPSR_BITS (CPSR_T | CPSR_GE | CPSR_IT | CPSR_Q | CPSR_NZCV)
@@ -337,6 +403,62 @@
 /* Execution state bits.  MRS read as zero, MSR writes ignored.  */
 #define CPSR_EXEC (CPSR_T | CPSR_IT | CPSR_J)
 
+/* Bit definitions for ARMv8 SPSR (PSTATE) format.
+ * Only these are valid when in AArch64 mode; in
+ * AArch32 mode SPSRs are basically CPSR-format.
+ */
+#define PSTATE_M (0xFU)
+#define PSTATE_nRW (1U << 4)
+#define PSTATE_F (1U << 6)
+#define PSTATE_I (1U << 7)
+#define PSTATE_A (1U << 8)
+#define PSTATE_D (1U << 9)
+#define PSTATE_IL (1U << 20)
+#define PSTATE_SS (1U << 21)
+#if 0
+#define PSTATE_V (1U << 28)
+#define PSTATE_C (1U << 29)
+#define PSTATE_Z (1U << 30)
+#define PSTATE_N (1U << 31)
+#endif
+#define PSTATE_NZCV (PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V)
+#define PSTATE_DAIF (PSTATE_D | PSTATE_A | PSTATE_I | PSTATE_F)
+#define CACHED_PSTATE_BITS (PSTATE_NZCV | PSTATE_DAIF)
+/* Mode values for AArch64 */
+#define PSTATE_MODE_EL3h 13
+#define PSTATE_MODE_EL3t 12
+#define PSTATE_MODE_EL2h 9
+#define PSTATE_MODE_EL2t 8
+#define PSTATE_MODE_EL1h 5
+#define PSTATE_MODE_EL1t 4
+#define PSTATE_MODE_EL0t 0
+
+#if 0 // TODO(digit): Enable this
+/* Return the current PSTATE value. For the moment we don't support 32<->64 bit
+ * interprocessing, so we don't attempt to sync with the cpsr state used by
+ * the 32 bit decoder.
+ */
+static inline uint32_t pstate_read(CPUARMState *env)
+{
+    int ZF;
+
+    ZF = (env->ZF == 0);
+    return (env->NF & 0x80000000) | (ZF << 30)
+        | (env->CF << 29) | ((env->VF & 0x80000000) >> 3)
+        | env->pstate | env->daif;
+}
+
+static inline void pstate_write(CPUARMState *env, uint32_t val)
+{
+    env->ZF = (~val) & PSTATE_Z;
+    env->NF = val;
+    env->CF = (val >> 29) & 1;
+    env->VF = (val << 3) & 0x80000000;
+    env->daif = val & PSTATE_DAIF;
+    env->pstate = val & ~CACHED_PSTATE_BITS;
+}
+#endif  // 0
+
 /* Return the current CPSR value.  */
 uint32_t cpsr_read(CPUARMState *env);
 /* Set the CPSR.  Note that some bits of mask must be all-set or all-clear.  */
@@ -383,6 +505,44 @@
 /* Return the current FPSCR value.  */
 uint32_t vfp_get_fpscr(CPUARMState *env);
 void vfp_set_fpscr(CPUARMState *env, uint32_t val);
+/* For A64 the FPSCR is split into two logically distinct registers,
+ * FPCR and FPSR. However since they still use non-overlapping bits
+ * we store the underlying state in fpscr and just mask on read/write.
+ */
+#define FPSR_MASK 0xf800009f
+#define FPCR_MASK 0x07f79f00
+static inline uint32_t vfp_get_fpsr(CPUARMState *env)
+{
+    return vfp_get_fpscr(env) & FPSR_MASK;
+}
+
+static inline void vfp_set_fpsr(CPUARMState *env, uint32_t val)
+{
+    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPSR_MASK) | (val & FPSR_MASK);
+    vfp_set_fpscr(env, new_fpscr);
+}
+
+static inline uint32_t vfp_get_fpcr(CPUARMState *env)
+{
+    return vfp_get_fpscr(env) & FPCR_MASK;
+}
+
+static inline void vfp_set_fpcr(CPUARMState *env, uint32_t val)
+{
+    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPCR_MASK) | (val & FPCR_MASK);
+    vfp_set_fpscr(env, new_fpscr);
+}
+
+enum arm_fprounding {
+    FPROUNDING_TIEEVEN,
+    FPROUNDING_POSINF,
+    FPROUNDING_NEGINF,
+    FPROUNDING_ZERO,
+    FPROUNDING_TIEAWAY,
+    FPROUNDING_ODD
+};
+
+int arm_rmode_to_sf(int rmode);
 
 enum arm_cpu_mode {
   ARM_CPU_MODE_USR = 0x10,
@@ -468,10 +628,31 @@
  *    or via MRRC/MCRR?)
  * We allow 4 bits for opc1 because MRRC/MCRR have a 4 bit field.
  * (In this case crn and opc2 should be zero.)
+ * For AArch64, there is no 32/64 bit size distinction;
+ * instead all registers have a 2 bit op0, 3 bit op1 and op2,
+ * and 4 bit CRn and CRm. The encoding patterns are chosen
+ * to be easy to convert to and from the KVM encodings, and also
+ * so that the hashtable can contain both AArch32 and AArch64
+ * registers (to allow for interprocessing where we might run
+ * 32 bit code on a 64 bit core).
  */
+/* This bit is private to our hashtable cpreg; in KVM register
+ * IDs the AArch64/32 distinction is the KVM_REG_ARM/ARM64
+ * in the upper bits of the 64 bit ID.
+ */
+#define CP_REG_AA64_SHIFT 28
+#define CP_REG_AA64_MASK (1 << CP_REG_AA64_SHIFT)
 #define ENCODE_CP_REG(cp, is64, crn, crm, opc1, opc2)   \
     (((cp) << 16) | ((is64) << 15) | ((crn) << 11) |    \
      ((crm) << 7) | ((opc1) << 3) | (opc2))
+#define ENCODE_AA64_CP_REG(cp, crn, crm, op0, op1, op2) \
+    (CP_REG_AA64_MASK |                                 \
+     ((cp) << CP_REG_ARM_COPROC_SHIFT) |                \
+     ((op0) << CP_REG_ARM64_SYSREG_OP0_SHIFT) |         \
+     ((op1) << CP_REG_ARM64_SYSREG_OP1_SHIFT) |         \
+     ((crn) << CP_REG_ARM64_SYSREG_CRN_SHIFT) |         \
+     ((crm) << CP_REG_ARM64_SYSREG_CRM_SHIFT) |         \
+     ((op2) << CP_REG_ARM64_SYSREG_OP2_SHIFT))
 
 /* Convert a full 64 bit KVM register ID to the truncated 32 bit
  * version used as a key for the coprocessor register hashtable
@@ -479,7 +660,9 @@
 static inline uint32_t kvm_to_cpreg_id(uint64_t kvmid)
 {
     uint32_t cpregid = kvmid;
-    if ((kvmid & CP_REG_SIZE_MASK) == CP_REG_SIZE_U64) {
+    if ((kvmid & CP_REG_ARCH_MASK) == CP_REG_ARM64) {
+        cpregid |= CP_REG_AA64_MASK;
+    } else if ((kvmid & CP_REG_SIZE_MASK) == CP_REG_SIZE_U64) {
         cpregid |= (1 << 15);
     }
     return cpregid;
@@ -490,11 +673,18 @@
  */
 static inline uint64_t cpreg_to_kvm_id(uint32_t cpregid)
 {
-    uint64_t kvmid = cpregid & ~(1 << 15);
+    uint64_t kvmid;
+
+    if (cpregid & CP_REG_AA64_MASK) {
+        kvmid = cpregid & ~CP_REG_AA64_MASK;
+        kvmid |= CP_REG_SIZE_U64 | CP_REG_ARM64;
+    } else {
+        kvmid = cpregid & ~(1 << 15);
     if (cpregid & (1 << 15)) {
         kvmid |= CP_REG_SIZE_U64 | CP_REG_ARM;
     } else {
         kvmid |= CP_REG_SIZE_U32 | CP_REG_ARM;
+        }
     }
     return kvmid;
 }
@@ -530,6 +720,20 @@
 /* Mask of only the flag bits in a type field */
 #define ARM_CP_FLAG_MASK 0x7f
 
+/* Valid values for ARMCPRegInfo state field, indicating which of
+ * the AArch32 and AArch64 execution states this register is visible in.
+ * If the reginfo doesn't explicitly specify then it is AArch32 only.
+ * If the reginfo is declared to be visible in both states then a second
+ * reginfo is synthesised for the AArch32 view of the AArch64 register,
+ * such that the AArch32 view is the lower 32 bits of the AArch64 one.
+ * Note that we rely on the values of these enums as we iterate through
+ * the various states in some places.
+ */
+enum {
+    ARM_CP_STATE_AA32 = 0,
+    ARM_CP_STATE_AA64 = 1,
+    ARM_CP_STATE_BOTH = 2,
+};
 /* Return true if cptype is a valid type field. This is used to try to
  * catch errors where the sentinel has been accidentally left off the end
  * of a list of registers.
@@ -551,6 +755,8 @@
  * (ie anything visible in PL2 is visible in S-PL1, some things are only
  * visible in S-PL1) but "Secure PL1" is a bit of a mouthful, we bend the
  * terminology a little and call this PL3.
+ * In AArch64 things are somewhat simpler as the PLx bits line up exactly
+ * with the ELx exception levels.
  *
  * If access permissions for a register are more complex than can be
  * described with these bits, then use a laxer set of restrictions, and
@@ -708,7 +914,7 @@
  * @cpu: ARMCPU
  *
  * For each register listed in the ARMCPU cpreg_indexes list, write
- * its value from the cpreg_values list into the ARMCPUARMState structure.
+ * its value from the cpreg_values list into the ARMCPUState structure.
  * This updates TCG's working data structures from KVM data or
  * from incoming migration state.
  *
@@ -724,7 +930,7 @@
  * @cpu: ARMCPU
  *
  * For each register listed in the ARMCPU cpreg_indexes list, write
- * its value from the ARMCPUARMState structure into the cpreg_values list.
+ * its value from the ARMCPUState structure into the cpreg_values list.
  * This is used to copy info from TCG's working data structures into
  * KVM or for outbound migration.
  *
@@ -852,7 +1058,9 @@
 #define ARM_TBFLAG_BSWAP_CODE_SHIFT 16
 #define ARM_TBFLAG_BSWAP_CODE_MASK  (1 << ARM_TBFLAG_BSWAP_CODE_SHIFT)
 
-/* Bit usage when in AArch64 state: currently no bits defined */
+/* Bit usage when in AArch64 state */
+#define ARM_TBFLAG_AA64_EL_SHIFT    0
+#define ARM_TBFLAG_AA64_EL_MASK     (0x3 << ARM_TBFLAG_AA64_EL_SHIFT)
 
 /* some convenience accessor macros */
 #define ARM_TBFLAG_AARCH64_STATE(F) \
@@ -871,6 +1079,8 @@
     (((F) & ARM_TBFLAG_CONDEXEC_MASK) >> ARM_TBFLAG_CONDEXEC_SHIFT)
 #define ARM_TBFLAG_BSWAP_CODE(F) \
     (((F) & ARM_TBFLAG_BSWAP_CODE_MASK) >> ARM_TBFLAG_BSWAP_CODE_SHIFT)
+#define ARM_TBFLAG_AA64_EL(F) \
+    (((F) & ARM_TBFLAG_AA64_EL_MASK) >> ARM_TBFLAG_AA64_EL_SHIFT)
 
 static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
diff --git a/target-arm/kvm-consts.h b/target-arm/kvm-consts.h
index 2bba0bd..6009a33 100644
--- a/target-arm/kvm-consts.h
+++ b/target-arm/kvm-consts.h
@@ -29,12 +29,14 @@
 #define CP_REG_SIZE_U32        0x0020000000000000ULL
 #define CP_REG_SIZE_U64        0x0030000000000000ULL
 #define CP_REG_ARM             0x4000000000000000ULL
+#define CP_REG_ARCH_MASK       0xff00000000000000ULL
 
 MISMATCH_CHECK(CP_REG_SIZE_SHIFT, KVM_REG_SIZE_SHIFT)
 MISMATCH_CHECK(CP_REG_SIZE_MASK, KVM_REG_SIZE_MASK)
 MISMATCH_CHECK(CP_REG_SIZE_U32, KVM_REG_SIZE_U32)
 MISMATCH_CHECK(CP_REG_SIZE_U64, KVM_REG_SIZE_U64)
 MISMATCH_CHECK(CP_REG_ARM, KVM_REG_ARM)
+MISMATCH_CHECK(CP_REG_ARCH_MASK, KVM_REG_ARCH_MASK)
 
 #define PSCI_FN_BASE 0x95c1ba5e
 #define PSCI_FN(n) (PSCI_FN_BASE + (n))
@@ -48,15 +50,64 @@
 MISMATCH_CHECK(PSCI_FN_CPU_ON, KVM_PSCI_FN_CPU_ON)
 MISMATCH_CHECK(PSCI_FN_MIGRATE, KVM_PSCI_FN_MIGRATE)
 
+/* Note that KVM uses overlapping values for AArch32 and AArch64
+ * target CPU numbers. AArch32 targets:
+ */
 #define QEMU_KVM_ARM_TARGET_CORTEX_A15 0
+#define QEMU_KVM_ARM_TARGET_CORTEX_A7 1
+
+/* AArch64 targets: */
+#define QEMU_KVM_ARM_TARGET_AEM_V8 0
+#define QEMU_KVM_ARM_TARGET_FOUNDATION_V8 1
+#define QEMU_KVM_ARM_TARGET_CORTEX_A57 2
 
 /* There's no kernel define for this: sentinel value which
  * matches no KVM target value for either 64 or 32 bit
  */
 #define QEMU_KVM_ARM_TARGET_NONE UINT_MAX
 
-#ifndef TARGET_AARCH64
+#ifdef TARGET_AARCH64
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_AEM_V8, KVM_ARM_TARGET_AEM_V8)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_FOUNDATION_V8, KVM_ARM_TARGET_FOUNDATION_V8)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A57, KVM_ARM_TARGET_CORTEX_A57)
+#else
 MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A15, KVM_ARM_TARGET_CORTEX_A15)
+MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A7, KVM_ARM_TARGET_CORTEX_A7)
+#endif
+
+#define CP_REG_ARM64                   0x6000000000000000ULL
+#define CP_REG_ARM_COPROC_MASK         0x000000000FFF0000
+#define CP_REG_ARM_COPROC_SHIFT        16
+#define CP_REG_ARM64_SYSREG            (0x0013 << CP_REG_ARM_COPROC_SHIFT)
+#define CP_REG_ARM64_SYSREG_OP0_MASK   0x000000000000c000
+#define CP_REG_ARM64_SYSREG_OP0_SHIFT  14
+#define CP_REG_ARM64_SYSREG_OP1_MASK   0x0000000000003800
+#define CP_REG_ARM64_SYSREG_OP1_SHIFT  11
+#define CP_REG_ARM64_SYSREG_CRN_MASK   0x0000000000000780
+#define CP_REG_ARM64_SYSREG_CRN_SHIFT  7
+#define CP_REG_ARM64_SYSREG_CRM_MASK   0x0000000000000078
+#define CP_REG_ARM64_SYSREG_CRM_SHIFT  3
+#define CP_REG_ARM64_SYSREG_OP2_MASK   0x0000000000000007
+#define CP_REG_ARM64_SYSREG_OP2_SHIFT  0
+
+/* No kernel define but it's useful to QEMU */
+#define CP_REG_ARM64_SYSREG_CP (CP_REG_ARM64_SYSREG >> CP_REG_ARM_COPROC_SHIFT)
+
+#ifdef TARGET_AARCH64
+MISMATCH_CHECK(CP_REG_ARM64, KVM_REG_ARM64)
+MISMATCH_CHECK(CP_REG_ARM_COPROC_MASK, KVM_REG_ARM_COPROC_MASK)
+MISMATCH_CHECK(CP_REG_ARM_COPROC_SHIFT, KVM_REG_ARM_COPROC_SHIFT)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP0_MASK)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP0_SHIFT, KVM_REG_ARM64_SYSREG_OP0_SHIFT)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP1_SHIFT, KVM_REG_ARM64_SYSREG_OP1_SHIFT)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_CRN_SHIFT, KVM_REG_ARM64_SYSREG_CRN_SHIFT)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRM_MASK)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_CRM_SHIFT, KVM_REG_ARM64_SYSREG_CRM_SHIFT)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK)
+MISMATCH_CHECK(CP_REG_ARM64_SYSREG_OP2_SHIFT, KVM_REG_ARM64_SYSREG_OP2_SHIFT)
 #endif
 
 #undef MISMATCH_CHECK
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index e3c4644..5ca3805 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -53,10 +53,10 @@
     return val;
 }
 
-#undef env
 
 #if !defined(CONFIG_USER_ONLY)
 
+#include "exec/softmmu_exec.h"
 
 #define MMUSUFFIX _mmu
 
@@ -73,8 +73,9 @@
 #include "exec/softmmu_template.h"
 
 /* try to fill the TLB and return an exception if error. If retaddr is
-   NULL, it means that the function was called in C code (i.e. not
-   from generated code or from helper.c) */
+ * NULL, it means that the function was called in C code (i.e. not
+ * from generated code or from helper.c)
+ */
 void tlb_fill(CPUARMState *env, target_ulong addr, int is_write, int mmu_idx,
               uintptr_t retaddr)
 {
@@ -397,6 +398,12 @@
     return (int32_t)x >> shift;
 }
 
+/* ??? Flag setting arithmetic is awkward because we need to do comparisons.
+   The only way to do that in TCG is a conditional branch, which clobbers
+   all our temporaries.  For now implement these as helper functions.  */
+
+/* Similarly for variable shift instructions.  */
+
 uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i)
 {
     int shift = i & 0xff;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 2dc14b9..955eaf2 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -29,6 +29,7 @@
 #include "disas/disas.h"
 #include "tcg-op.h"
 #include "qemu/log.h"
+#include "qemu/bitops.h"
 
 #include "helper.h"
 #define GEN_HELPER 1
@@ -43,31 +44,11 @@
 #define ENABLE_ARCH_6K   arm_feature(env, ARM_FEATURE_V6K)
 #define ENABLE_ARCH_6T2   arm_feature(env, ARM_FEATURE_THUMB2)
 #define ENABLE_ARCH_7     arm_feature(env, ARM_FEATURE_V7)
+#define ENABLE_ARCH_8     arm_feature(env, ARM_FEATURE_V8)
 
 #define ARCH(x) do { if (!ENABLE_ARCH_##x) goto illegal_op; } while(0)
 
-/* internal defines */
-typedef struct DisasContext {
-    target_ulong pc;
-    int is_jmp;
-    /* Nonzero if this instruction has been conditionally skipped.  */
-    int condjmp;
-    /* The label that will be jumped to when the instruction is skipped.  */
-    int condlabel;
-    /* Thumb-2 condtional execution bits.  */
-    int condexec_mask;
-    int condexec_cond;
-    struct TranslationBlock *tb;
-    int singlestep_enabled;
-    int thumb;
-#if !defined(CONFIG_USER_ONLY)
-    int user;
-#endif
-    int vfp_enabled;
-    int vec_len;
-    int vec_stride;
-} DisasContext;
-
+#include "translate.h"
 static uint32_t gen_opc_condexec_bits[OPC_BUF_SIZE];
 
 #if defined(CONFIG_USER_ONLY)
@@ -76,13 +57,7 @@
 #define IS_USER(s) (s->user)
 #endif
 
-/* These instructions trap after executing, so defer them until after the
-   conditional executions state has been updated.  */
-#define DISAS_WFI 4
-#define DISAS_SWI 5
-#define DISAS_SMC 6
-
-static TCGv_ptr cpu_env;
+TCGv_ptr cpu_env;
 /* We reuse the same 64-bit temporaries for efficiency.  */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
 static TCGv_i32 cpu_R[16];
@@ -95,7 +70,7 @@
 #endif
 
 /* FIXME:  These should be removed.  */
-static TCGv cpu_F0s, cpu_F1s;
+static TCGv_i32 cpu_F0s, cpu_F1s;
 static TCGv_i64 cpu_F0d, cpu_F1d;
 
 #include "exec/gen-icount.h"
@@ -130,16 +105,16 @@
 #endif
 }
 
-static inline TCGv load_cpu_offset(int offset)
+static inline TCGv_i32 load_cpu_offset(int offset)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_ld_i32(tmp, cpu_env, offset);
     return tmp;
 }
 
 #define load_cpu_field(name) load_cpu_offset(offsetof(CPUARMState, name))
 
-static inline void store_cpu_offset(TCGv var, int offset)
+static inline void store_cpu_offset(TCGv_i32 var, int offset)
 {
     tcg_gen_st_i32(var, cpu_env, offset);
     tcg_temp_free_i32(var);
@@ -149,11 +124,11 @@
     store_cpu_offset(var, offsetof(CPUARMState, name))
 
 /* Set a variable to the value of a CPU register.  */
-static void load_reg_var(DisasContext *s, TCGv var, int reg)
+static void load_reg_var(DisasContext *s, TCGv_i32 var, int reg)
 {
     if (reg == 15) {
         uint32_t addr;
-        /* normaly, since we updated PC, we need only to add one insn */
+        /* normally, since we updated PC, we need only to add one insn */
         if (s->thumb)
             addr = (long)s->pc + 2;
         else
@@ -165,16 +140,16 @@
 }
 
 /* Create a new temporary and set it to the value of a CPU register.  */
-static inline TCGv load_reg(DisasContext *s, int reg)
+static inline TCGv_i32 load_reg(DisasContext *s, int reg)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     load_reg_var(s, tmp, reg);
     return tmp;
 }
 
 /* Set a CPU register.  The source must be a temporary and will be
    marked as dead.  */
-static void store_reg(DisasContext *s, int reg, TCGv var)
+static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
 {
     if (reg == 15) {
         tcg_gen_andi_i32(var, var, ~1);
@@ -194,9 +169,9 @@
 #define gen_uxtb16(var) gen_helper_uxtb16(var, var)
 
 
-static inline void gen_set_cpsr(TCGv var, uint32_t mask)
+static inline void gen_set_cpsr(TCGv_i32 var, uint32_t mask)
 {
-    TCGv tmp_mask = tcg_const_i32(mask);
+    TCGv_i32 tmp_mask = tcg_const_i32(mask);
     gen_helper_cpsr_write(cpu_env, var, tmp_mask);
     tcg_temp_free_i32(tmp_mask);
 }
@@ -205,16 +180,16 @@
 
 static void gen_exception(int excp)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_movi_i32(tmp, excp);
     gen_helper_exception(cpu_env, tmp);
     tcg_temp_free_i32(tmp);
 }
 
-static void gen_smul_dual(TCGv a, TCGv b)
+static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
 {
-    TCGv tmp1 = tcg_temp_new_i32();
-    TCGv tmp2 = tcg_temp_new_i32();
+    TCGv_i32 tmp1 = tcg_temp_new_i32();
+    TCGv_i32 tmp2 = tcg_temp_new_i32();
     tcg_gen_ext16s_i32(tmp1, a);
     tcg_gen_ext16s_i32(tmp2, b);
     tcg_gen_mul_i32(tmp1, tmp1, tmp2);
@@ -227,9 +202,9 @@
 }
 
 /* Byteswap each halfword.  */
-static void gen_rev16(TCGv var)
+static void gen_rev16(TCGv_i32 var)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_shri_i32(tmp, var, 8);
     tcg_gen_andi_i32(tmp, tmp, 0x00ff00ff);
     tcg_gen_shli_i32(var, var, 8);
@@ -239,7 +214,7 @@
 }
 
 /* Byteswap low halfword and sign extend.  */
-static void gen_revsh(TCGv var)
+static void gen_revsh(TCGv_i32 var)
 {
     tcg_gen_ext16u_i32(var, var);
     tcg_gen_bswap16_i32(var, var);
@@ -247,7 +222,7 @@
 }
 
 /* Unsigned bitfield extract.  */
-static void gen_ubfx(TCGv var, int shift, uint32_t mask)
+static void gen_ubfx(TCGv_i32 var, int shift, uint32_t mask)
 {
     if (shift)
         tcg_gen_shri_i32(var, var, shift);
@@ -255,7 +230,7 @@
 }
 
 /* Signed bitfield extract.  */
-static void gen_sbfx(TCGv var, int shift, int width)
+static void gen_sbfx(TCGv_i32 var, int shift, int width)
 {
     uint32_t signbit;
 
@@ -269,17 +244,9 @@
     }
 }
 
-/* Bitfield insertion.  Insert val into base.  Clobbers base and val.  */
-static void gen_bfi(TCGv dest, TCGv base, TCGv val, int shift, uint32_t mask)
-{
-    tcg_gen_andi_i32(val, val, mask);
-    tcg_gen_shli_i32(val, val, shift);
-    tcg_gen_andi_i32(base, base, ~(mask << shift));
-    tcg_gen_or_i32(dest, base, val);
-}
 
 /* Return (b << 32) + a. Mark inputs as dead */
-static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv b)
+static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv_i32 b)
 {
     TCGv_i64 tmp64 = tcg_temp_new_i64();
 
@@ -293,7 +260,7 @@
 }
 
 /* Return (b << 32) - a. Mark inputs as dead. */
-static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv b)
+static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv_i32 b)
 {
     TCGv_i64 tmp64 = tcg_temp_new_i64();
 
@@ -309,7 +276,7 @@
 /* FIXME: Most targets have native widening multiplication.
    It would be good to use that instead of a full wide multiply.  */
 /* 32x32->64 multiply.  Marks inputs as dead.  */
-static TCGv_i64 gen_mulu_i64_i32(TCGv a, TCGv b)
+static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_i64 tmp1 = tcg_temp_new_i64();
     TCGv_i64 tmp2 = tcg_temp_new_i64();
@@ -323,7 +290,7 @@
     return tmp1;
 }
 
-static TCGv_i64 gen_muls_i64_i32(TCGv a, TCGv b)
+static TCGv_i64 gen_muls_i64_i32(TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_i64 tmp1 = tcg_temp_new_i64();
     TCGv_i64 tmp2 = tcg_temp_new_i64();
@@ -338,9 +305,9 @@
 }
 
 /* Swap low and high halfwords.  */
-static void gen_swap_half(TCGv var)
+static void gen_swap_half(TCGv_i32 var)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_shri_i32(tmp, var, 16);
     tcg_gen_shli_i32(var, var, 16);
     tcg_gen_or_i32(var, var, tmp);
@@ -354,9 +321,9 @@
     t0 = (t0 + t1) ^ tmp;
  */
 
-static void gen_add16(TCGv t0, TCGv t1)
+static void gen_add16(TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_xor_i32(tmp, t0, t1);
     tcg_gen_andi_i32(tmp, tmp, 0x8000);
     tcg_gen_andi_i32(t0, t0, ~0x8000);
@@ -370,25 +337,25 @@
 #define gen_set_CF(var) tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, CF))
 
 /* Set CF to the top bit of var.  */
-static void gen_set_CF_bit31(TCGv var)
+static void gen_set_CF_bit31(TCGv_i32 var)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_shri_i32(tmp, var, 31);
     gen_set_CF(tmp);
     tcg_temp_free_i32(tmp);
 }
 
 /* Set N and Z flags from var.  */
-static inline void gen_logic_CC(TCGv var)
+static inline void gen_logic_CC(TCGv_i32 var)
 {
     tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, NF));
     tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, ZF));
 }
 
 /* T0 += T1 + CF.  */
-static void gen_adc(TCGv t0, TCGv t1)
+static void gen_adc(TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tcg_gen_add_i32(t0, t0, t1);
     tmp = load_cpu_field(CF);
     tcg_gen_add_i32(t0, t0, tmp);
@@ -396,9 +363,9 @@
 }
 
 /* dest = T0 + T1 + CF. */
-static void gen_add_carry(TCGv dest, TCGv t0, TCGv t1)
+static void gen_add_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tcg_gen_add_i32(dest, t0, t1);
     tmp = load_cpu_field(CF);
     tcg_gen_add_i32(dest, dest, tmp);
@@ -406,9 +373,9 @@
 }
 
 /* dest = T0 - T1 + CF - 1.  */
-static void gen_sub_carry(TCGv dest, TCGv t0, TCGv t1)
+static void gen_sub_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tcg_gen_sub_i32(dest, t0, t1);
     tmp = load_cpu_field(CF);
     tcg_gen_add_i32(dest, dest, tmp);
@@ -419,9 +386,9 @@
 /* FIXME:  Implement this natively.  */
 #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
 
-static void shifter_out_im(TCGv var, int shift)
+static void shifter_out_im(TCGv_i32 var, int shift)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     if (shift == 0) {
         tcg_gen_andi_i32(tmp, var, 1);
     } else {
@@ -434,7 +401,8 @@
 }
 
 /* Shift by immediate.  Includes special handling for shift == 0.  */
-static inline void gen_arm_shift_im(TCGv var, int shiftop, int shift, int flags)
+static inline void gen_arm_shift_im(TCGv_i32 var, int shiftop,
+                                    int shift, int flags)
 {
     switch (shiftop) {
     case 0: /* LSL */
@@ -472,7 +440,7 @@
                 shifter_out_im(var, shift - 1);
             tcg_gen_rotri_i32(var, var, shift); break;
         } else {
-            TCGv tmp = load_cpu_field(CF);
+            TCGv_i32 tmp = load_cpu_field(CF);
             if (flags)
                 shifter_out_im(var, 0);
             tcg_gen_shri_i32(var, var, 1);
@@ -483,8 +451,8 @@
     }
 };
 
-static inline void gen_arm_shift_reg(TCGv var, int shiftop,
-                                     TCGv shift, int flags)
+static inline void gen_arm_shift_reg(TCGv_i32 var, int shiftop,
+                                     TCGv_i32 shift, int flags)
 {
     if (flags) {
         switch (shiftop) {
@@ -495,9 +463,15 @@
         }
     } else {
         switch (shiftop) {
-        case 0: gen_helper_shl(var, var, shift); break;
-        case 1: gen_helper_shr(var, var, shift); break;
-        case 2: gen_helper_sar(var, var, shift); break;
+        case 0:
+            gen_helper_shl(var, var, shift);
+            break;
+        case 1:
+            gen_helper_shr(var, var, shift);
+            break;
+        case 2:
+            gen_helper_sar(var, var, shift);
+            break;
         case 3: tcg_gen_andi_i32(shift, shift, 0x1f);
                 tcg_gen_rotr_i32(var, var, shift); break;
         }
@@ -514,7 +488,7 @@
     case 4: gen_pas_helper(glue(pfx,add8)); break; \
     case 7: gen_pas_helper(glue(pfx,sub8)); break; \
     }
-static void gen_arm_parallel_addsub(int op1, int op2, TCGv a, TCGv b)
+static void gen_arm_parallel_addsub(int op1, int op2, TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_ptr tmp;
 
@@ -561,7 +535,7 @@
     case 5: gen_pas_helper(glue(pfx,sub16)); break; \
     case 6: gen_pas_helper(glue(pfx,subaddx)); break; \
     }
-static void gen_thumb2_parallel_addsub(int op1, int op2, TCGv a, TCGv b)
+static void gen_thumb2_parallel_addsub(int op1, int op2, TCGv_i32 a, TCGv_i32 b)
 {
     TCGv_ptr tmp;
 
@@ -598,10 +572,14 @@
 }
 #undef PAS_OP
 
-static void gen_test_cc(int cc, int label)
+/*
+ * generate a conditional branch based on ARM condition code cc.
+ * This is common between ARM and Aarch64 targets.
+ */
+void arm_gen_test_cc(int cc, int label)
 {
-    TCGv tmp;
-    TCGv tmp2;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
     int inv;
 
     switch (cc) {
@@ -718,7 +696,7 @@
 /* Set PC and Thumb state from an immediate address.  */
 static inline void gen_bx_im(DisasContext *s, uint32_t addr)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
 
     s->is_jmp = DISAS_UPDATE;
     if (s->thumb != (addr & 1)) {
@@ -731,7 +709,7 @@
 }
 
 /* Set PC and Thumb state from var.  var is marked as dead.  */
-static inline void gen_bx(DisasContext *s, TCGv var)
+static inline void gen_bx(DisasContext *s, TCGv_i32 var)
 {
     s->is_jmp = DISAS_UPDATE;
     tcg_gen_andi_i32(cpu_R[15], var, ~1);
@@ -743,7 +721,7 @@
    to r15 in ARM architecture v7 and above. The source must be a temporary
    and will be marked as dead. */
 static inline void store_reg_bx(CPUARMState *env, DisasContext *s,
-                                int reg, TCGv var)
+                                int reg, TCGv_i32 var)
 {
     if (reg == 15 && ENABLE_ARCH_7) {
         gen_bx(s, var);
@@ -757,7 +735,7 @@
  * the results of a LDR/LDM/POP into r15, and corresponds to the cases
  * in the ARM ARM which use the LoadWritePC() pseudocode function. */
 static inline void store_reg_from_load(CPUARMState *env, DisasContext *s,
-                                int reg, TCGv var)
+                                       int reg, TCGv_i32 var)
 {
     if (reg == 15 && ENABLE_ARCH_5) {
         gen_bx(s, var);
@@ -772,64 +750,85 @@
     s->is_jmp = DISAS_SMC;
 }
 
-static inline TCGv gen_ld8s(TCGv addr, int index)
-{
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_qemu_ld8s(tmp, addr, index);
-    return tmp;
-}
-static inline TCGv gen_ld8u(TCGv addr, int index)
-{
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_qemu_ld8u(tmp, addr, index);
-    return tmp;
-}
-static inline TCGv gen_ld16s(TCGv addr, int index)
-{
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_qemu_ld16s(tmp, addr, index);
-    return tmp;
-}
-static inline TCGv gen_ld16u(TCGv addr, int index)
-{
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_qemu_ld16u(tmp, addr, index);
-    return tmp;
-}
-static inline TCGv gen_ld32(TCGv addr, int index)
-{
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_qemu_ld32u(tmp, addr, index);
-    return tmp;
-}
-static inline TCGv_i64 gen_ld64(TCGv addr, int index)
-{
-    TCGv_i64 tmp = tcg_temp_new_i64();
-    tcg_gen_qemu_ld64(tmp, addr, index);
-    return tmp;
-}
-static inline void gen_st8(TCGv val, TCGv addr, int index)
-{
-    tcg_gen_qemu_st8(val, addr, index);
-    tcg_temp_free_i32(val);
-}
-static inline void gen_st16(TCGv val, TCGv addr, int index)
-{
-    tcg_gen_qemu_st16(val, addr, index);
-    tcg_temp_free_i32(val);
-}
-static inline void gen_st32(TCGv val, TCGv addr, int index)
-{
-    tcg_gen_qemu_st32(val, addr, index);
-    tcg_temp_free_i32(val);
-}
-static inline void gen_st64(TCGv_i64 val, TCGv addr, int index)
-{
-    tcg_gen_qemu_st64(val, addr, index);
-    tcg_temp_free_i64(val);
+/* Abstractions of "generate code to do a guest load/store for
+ * AArch32", where a vaddr is always 32 bits (and is zero
+ * extended if we're a 64 bit core) and  data is also
+ * 32 bits unless specifically doing a 64 bit access.
+ * These functions work like tcg_gen_qemu_{ld,st}* except
+ * that the address argument is TCGv_i32 rather than TCGv.
+ */
+#if TARGET_LONG_BITS == 32
+
+#define DO_GEN_LD(SUFF, OPC)                                             \
+static inline void gen_aa32_ld##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
+{                                                                        \
+    tcg_gen_qemu_ld_i32(val, addr, index, OPC);                          \
 }
 
-static inline void gen_set_pc_im(uint32_t val)
+#define DO_GEN_ST(SUFF, OPC)                                             \
+static inline void gen_aa32_st##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
+{                                                                        \
+    tcg_gen_qemu_st_i32(val, addr, index, OPC);                          \
+}
+
+static inline void gen_aa32_ld64(TCGv_i64 val, TCGv_i32 addr, int index)
+{
+    tcg_gen_qemu_ld_i64(val, addr, index, MO_TEQ);
+}
+
+static inline void gen_aa32_st64(TCGv_i64 val, TCGv_i32 addr, int index)
+{
+    tcg_gen_qemu_st_i64(val, addr, index, MO_TEQ);
+}
+
+#else
+
+#define DO_GEN_LD(SUFF, OPC)                                             \
+static inline void gen_aa32_ld##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
+{                                                                        \
+    TCGv addr64 = tcg_temp_new();                                        \
+    tcg_gen_extu_i32_i64(addr64, addr);                                  \
+    tcg_gen_qemu_ld_i32(val, addr64, index, OPC);                        \
+    tcg_temp_free(addr64);                                               \
+}
+
+#define DO_GEN_ST(SUFF, OPC)                                             \
+static inline void gen_aa32_st##SUFF(TCGv_i32 val, TCGv_i32 addr, int index) \
+{                                                                        \
+    TCGv addr64 = tcg_temp_new();                                        \
+    tcg_gen_extu_i32_i64(addr64, addr);                                  \
+    tcg_gen_qemu_st_i32(val, addr64, index, OPC);                        \
+    tcg_temp_free(addr64);                                               \
+}
+
+static inline void gen_aa32_ld64(TCGv_i64 val, TCGv_i32 addr, int index)
+{
+    TCGv addr64 = tcg_temp_new();
+    tcg_gen_extu_i32_i64(addr64, addr);
+    tcg_gen_qemu_ld_i64(val, addr64, index, MO_TEQ);
+    tcg_temp_free(addr64);
+}
+
+static inline void gen_aa32_st64(TCGv_i64 val, TCGv_i32 addr, int index)
+{
+    TCGv addr64 = tcg_temp_new();
+    tcg_gen_extu_i32_i64(addr64, addr);
+    tcg_gen_qemu_st_i64(val, addr64, index, MO_TEQ);
+    tcg_temp_free(addr64);
+}
+
+#endif
+
+DO_GEN_LD(8s, MO_SB)
+DO_GEN_LD(8u, MO_UB)
+DO_GEN_LD(16s, MO_TESW)
+DO_GEN_LD(16u, MO_TEUW)
+DO_GEN_LD(32u, MO_TEUL)
+DO_GEN_ST(8, MO_UB)
+DO_GEN_ST(16, MO_TEUW)
+DO_GEN_ST(32, MO_TEUL)
+
+static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
     tcg_gen_movi_i32(cpu_R[15], val);
 }
@@ -842,10 +841,10 @@
 }
 
 static inline void gen_add_data_offset(DisasContext *s, unsigned int insn,
-                                       TCGv var)
+                                       TCGv_i32 var)
 {
     int val, rm, shift, shiftop;
-    TCGv offset;
+    TCGv_i32 offset;
 
     if (!(insn & (1 << 25))) {
         /* immediate */
@@ -870,10 +869,10 @@
 }
 
 static inline void gen_add_datah_offset(DisasContext *s, unsigned int insn,
-                                        int extra, TCGv var)
+                                        int extra, TCGv_i32 var)
 {
     int val, rm;
-    TCGv offset;
+    TCGv_i32 offset;
 
     if (insn & (1 << 22)) {
         /* immediate */
@@ -1035,7 +1034,7 @@
 #define VFP_GEN_FIX(name) \
 static inline void gen_vfp_##name(int dp, int shift, int neon) \
 { \
-    TCGv tmp_shift = tcg_const_i32(shift); \
+    TCGv_i32 tmp_shift = tcg_const_i32(shift); \
     TCGv_ptr statusptr = get_fpstatus_ptr(neon); \
     if (dp) { \
         gen_helper_vfp_##name##d(cpu_F0d, cpu_F0d, tmp_shift, statusptr); \
@@ -1055,20 +1054,22 @@
 VFP_GEN_FIX(ulto)
 #undef VFP_GEN_FIX
 
-static inline void gen_vfp_ld(DisasContext *s, int dp, TCGv addr)
+static inline void gen_vfp_ld(DisasContext *s, int dp, TCGv_i32 addr)
 {
-    if (dp)
-        tcg_gen_qemu_ld64(cpu_F0d, addr, IS_USER(s));
-    else
-        tcg_gen_qemu_ld32u(cpu_F0s, addr, IS_USER(s));
+    if (dp) {
+        gen_aa32_ld64(cpu_F0d, addr, IS_USER(s));
+    } else {
+        gen_aa32_ld32u(cpu_F0s, addr, IS_USER(s));
+    }
 }
 
-static inline void gen_vfp_st(DisasContext *s, int dp, TCGv addr)
+static inline void gen_vfp_st(DisasContext *s, int dp, TCGv_i32 addr)
 {
-    if (dp)
-        tcg_gen_qemu_st64(cpu_F0d, addr, IS_USER(s));
-    else
-        tcg_gen_qemu_st32(cpu_F0s, addr, IS_USER(s));
+    if (dp) {
+        gen_aa32_st64(cpu_F0d, addr, IS_USER(s));
+    } else {
+        gen_aa32_st32(cpu_F0s, addr, IS_USER(s));
+    }
 }
 
 static inline long
@@ -1095,14 +1096,14 @@
     return vfp_reg_offset(0, sreg);
 }
 
-static TCGv neon_load_reg(int reg, int pass)
+static TCGv_i32 neon_load_reg(int reg, int pass)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_ld_i32(tmp, cpu_env, neon_reg_offset(reg, pass));
     return tmp;
 }
 
-static void neon_store_reg(int reg, int pass, TCGv var)
+static void neon_store_reg(int reg, int pass, TCGv_i32 var)
 {
     tcg_gen_st_i32(var, cpu_env, neon_reg_offset(reg, pass));
     tcg_temp_free_i32(var);
@@ -1159,14 +1160,14 @@
     tcg_gen_st_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg]));
 }
 
-static inline TCGv iwmmxt_load_creg(int reg)
+static inline TCGv_i32 iwmmxt_load_creg(int reg)
 {
-    TCGv var = tcg_temp_new_i32();
+    TCGv_i32 var = tcg_temp_new_i32();
     tcg_gen_ld_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg]));
     return var;
 }
 
-static inline void iwmmxt_store_creg(int reg, TCGv var)
+static inline void iwmmxt_store_creg(int reg, TCGv_i32 var)
 {
     tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg]));
     tcg_temp_free_i32(var);
@@ -1284,7 +1285,7 @@
 
 static void gen_op_iwmmxt_set_mup(void)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]);
     tcg_gen_ori_i32(tmp, tmp, 2);
     store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]);
@@ -1292,7 +1293,7 @@
 
 static void gen_op_iwmmxt_set_cup(void)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]);
     tcg_gen_ori_i32(tmp, tmp, 1);
     store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]);
@@ -1300,7 +1301,7 @@
 
 static void gen_op_iwmmxt_setpsr_nz(void)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     gen_helper_iwmmxt_setpsr_nz(tmp, cpu_M0);
     store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCASF]);
 }
@@ -1312,11 +1313,12 @@
     tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1);
 }
 
-static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn, TCGv dest)
+static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn,
+                                     TCGv_i32 dest)
 {
     int rd;
     uint32_t offset;
-    TCGv tmp;
+    TCGv_i32 tmp;
 
     rd = (insn >> 16) & 0xf;
     tmp = load_reg(s, rd);
@@ -1346,10 +1348,10 @@
     return 0;
 }
 
-static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv dest)
+static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest)
 {
     int rd = (insn >> 0) & 0xf;
-    TCGv tmp;
+    TCGv_i32 tmp;
 
     if (insn & (1 << 8)) {
         if (rd < ARM_IWMMXT_wCGR0 || rd > ARM_IWMMXT_wCGR3) {
@@ -1374,8 +1376,8 @@
 {
     int rd, wrd;
     int rdhi, rdlo, rd0, rd1, i;
-    TCGv addr;
-    TCGv tmp, tmp2, tmp3;
+    TCGv_i32 addr;
+    TCGv_i32 tmp, tmp2, tmp3;
 
     if ((insn & 0x0e000e00) == 0x0c000000) {
         if ((insn & 0x0fe00ff0) == 0x0c400000) {
@@ -1404,22 +1406,24 @@
         if (insn & ARM_CP_RW_BIT) {
             if ((insn >> 28) == 0xf) {			/* WLDRW wCx */
                 tmp = tcg_temp_new_i32();
-                tcg_gen_qemu_ld32u(tmp, addr, IS_USER(s));
+                gen_aa32_ld32u(tmp, addr, IS_USER(s));
                 iwmmxt_store_creg(wrd, tmp);
             } else {
                 i = 1;
                 if (insn & (1 << 8)) {
                     if (insn & (1 << 22)) {		/* WLDRD */
-                        tcg_gen_qemu_ld64(cpu_M0, addr, IS_USER(s));
+                        gen_aa32_ld64(cpu_M0, addr, IS_USER(s));
                         i = 0;
                     } else {				/* WLDRW wRd */
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        tmp = tcg_temp_new_i32();
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                     }
                 } else {
+                    tmp = tcg_temp_new_i32();
                     if (insn & (1 << 22)) {		/* WLDRH */
-                        tmp = gen_ld16u(addr, IS_USER(s));
+                        gen_aa32_ld16u(tmp, addr, IS_USER(s));
                     } else {				/* WLDRB */
-                        tmp = gen_ld8u(addr, IS_USER(s));
+                        gen_aa32_ld8u(tmp, addr, IS_USER(s));
                     }
                 }
                 if (i) {
@@ -1431,28 +1435,28 @@
         } else {
             if ((insn >> 28) == 0xf) {			/* WSTRW wCx */
                 tmp = iwmmxt_load_creg(wrd);
-                gen_st32(tmp, addr, IS_USER(s));
+                gen_aa32_st32(tmp, addr, IS_USER(s));
             } else {
                 gen_op_iwmmxt_movq_M0_wRn(wrd);
                 tmp = tcg_temp_new_i32();
                 if (insn & (1 << 8)) {
                     if (insn & (1 << 22)) {		/* WSTRD */
-                        tcg_temp_free_i32(tmp);
-                        tcg_gen_qemu_st64(cpu_M0, addr, IS_USER(s));
+                        gen_aa32_st64(cpu_M0, addr, IS_USER(s));
                     } else {				/* WSTRW wRd */
                         tcg_gen_trunc_i64_i32(tmp, cpu_M0);
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
                     }
                 } else {
                     if (insn & (1 << 22)) {		/* WSTRH */
                         tcg_gen_trunc_i64_i32(tmp, cpu_M0);
-                        gen_st16(tmp, addr, IS_USER(s));
+                        gen_aa32_st16(tmp, addr, IS_USER(s));
                     } else {				/* WSTRB */
                         tcg_gen_trunc_i64_i32(tmp, cpu_M0);
-                        gen_st8(tmp, addr, IS_USER(s));
+                        gen_aa32_st8(tmp, addr, IS_USER(s));
                     }
                 }
             }
+            tcg_temp_free_i32(tmp);
         }
         tcg_temp_free_i32(addr);
         return 0;
@@ -1727,12 +1731,12 @@
             tmp3 = tcg_const_i32((insn & 1) << 5);
             break;
         default:
-            TCGV_UNUSED(tmp2);
-            TCGV_UNUSED(tmp3);
+            TCGV_UNUSED_I32(tmp2);
+            TCGV_UNUSED_I32(tmp3);
         }
         gen_helper_iwmmxt_insr(cpu_M0, cpu_M0, tmp, tmp2, tmp3);
-        tcg_temp_free(tmp3);
-        tcg_temp_free(tmp2);
+        tcg_temp_free_i32(tmp3);
+        tcg_temp_free_i32(tmp2);
         tcg_temp_free_i32(tmp);
         gen_op_iwmmxt_movq_wRn_M0(wrd);
         gen_op_iwmmxt_set_mup();
@@ -2191,7 +2195,7 @@
         tmp = tcg_const_i32((insn >> 20) & 3);
         iwmmxt_load_reg(cpu_V1, rd1);
         gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, tmp);
-        tcg_temp_free(tmp);
+        tcg_temp_free_i32(tmp);
         gen_op_iwmmxt_movq_wRn_M0(wrd);
         gen_op_iwmmxt_set_mup();
         break;
@@ -2247,7 +2251,7 @@
         gen_op_iwmmxt_movq_M0_wRn(rd0);
         tmp = tcg_const_i32(((insn >> 16) & 0xf0) | (insn & 0x0f));
         gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp);
-        tcg_temp_free(tmp);
+        tcg_temp_free_i32(tmp);
         gen_op_iwmmxt_movq_wRn_M0(wrd);
         gen_op_iwmmxt_set_mup();
         gen_op_iwmmxt_set_cup();
@@ -2377,7 +2381,7 @@
 static int disas_dsp_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
 {
     int acc, rd0, rd1, rdhi, rdlo;
-    TCGv tmp, tmp2;
+    TCGv_i32 tmp, tmp2;
 
     if ((insn & 0x0ff00f10) == 0x0e200010) {
         /* Multiply with Internal Accumulate Format */
@@ -2446,27 +2450,27 @@
    instruction is not defined.  */
 static int disas_cp_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
 {
-    TCGv tmp, tmp2;
+    TCGv_i32 tmp, tmp2;
     uint32_t rd = (insn >> 12) & 0xf;
     uint32_t cp = (insn >> 8) & 0xf;
 
     if (insn & ARM_CP_RW_BIT) {
         if (!env->cp[cp].cp_read)
             return 1;
-        gen_set_pc_im(s->pc);
+        gen_set_pc_im(s, s->pc);
         tmp = tcg_temp_new_i32();
         tmp2 = tcg_const_i32(insn);
         gen_helper_get_cp(tmp, cpu_env, tmp2);
-        tcg_temp_free(tmp2);
+        tcg_temp_free_i32(tmp2);
         store_reg(s, rd, tmp);
     } else {
         if (!env->cp[cp].cp_write)
             return 1;
-        gen_set_pc_im(s->pc);
+        gen_set_pc_im(s, s->pc);
         tmp = load_reg(s, rd);
         tmp2 = tcg_const_i32(insn);
         gen_helper_set_cp(cpu_env, tmp2, tmp);
-        tcg_temp_free(tmp2);
+        tcg_temp_free_i32(tmp2);
         tcg_temp_free_i32(tmp);
     }
     return 0;
@@ -2504,7 +2508,7 @@
 
 static int cp15_tls_load_store(CPUARMState *env, DisasContext *s, uint32_t insn, uint32_t rd)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     int cpn = (insn >> 16) & 0xf;
     int cpm = insn & 0xf;
     int op = ((insn >> 5) & 7) | ((insn >> 18) & 0x38);
@@ -2556,7 +2560,7 @@
 static int disas_cp15_insn(CPUARMState *env, DisasContext *s, uint32_t insn)
 {
     uint32_t rd;
-    TCGv tmp, tmp2;
+    TCGv_i32 tmp, tmp2;
 
     /* M profile cores use memory mapped registers instead of cp15.  */
     if (arm_feature(env, ARM_FEATURE_M))
@@ -2589,7 +2593,7 @@
         }
         if (!arm_feature(env, ARM_FEATURE_V7)) {
             /* Wait for interrupt.  */
-            gen_set_pc_im(s->pc);
+            gen_set_pc_im(s, s->pc);
             s->is_jmp = DISAS_WFI;
         }
         return 0;
@@ -2599,7 +2603,7 @@
          */
         if (!IS_USER(s) && !arm_feature(env, ARM_FEATURE_V6)) {
             /* Wait for interrupt.  */
-            gen_set_pc_im(s->pc);
+            gen_set_pc_im(s, s->pc);
             s->is_jmp = DISAS_WFI;
             return 0;
         }
@@ -2680,22 +2684,22 @@
 #define VFP_DREG_M(reg, insn) VFP_DREG(reg, insn,  0,  5)
 
 /* Move between integer and VFP cores.  */
-static TCGv gen_vfp_mrs(void)
+static TCGv_i32 gen_vfp_mrs(void)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_mov_i32(tmp, cpu_F0s);
     return tmp;
 }
 
-static void gen_vfp_msr(TCGv tmp)
+static void gen_vfp_msr(TCGv_i32 tmp)
 {
     tcg_gen_mov_i32(cpu_F0s, tmp);
     tcg_temp_free_i32(tmp);
 }
 
-static void gen_neon_dup_u8(TCGv var, int shift)
+static void gen_neon_dup_u8(TCGv_i32 var, int shift)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     if (shift)
         tcg_gen_shri_i32(var, var, shift);
     tcg_gen_ext8u_i32(var, var);
@@ -2706,39 +2710,39 @@
     tcg_temp_free_i32(tmp);
 }
 
-static void gen_neon_dup_low16(TCGv var)
+static void gen_neon_dup_low16(TCGv_i32 var)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_ext16u_i32(var, var);
     tcg_gen_shli_i32(tmp, var, 16);
     tcg_gen_or_i32(var, var, tmp);
     tcg_temp_free_i32(tmp);
 }
 
-static void gen_neon_dup_high16(TCGv var)
+static void gen_neon_dup_high16(TCGv_i32 var)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_andi_i32(var, var, 0xffff0000);
     tcg_gen_shri_i32(tmp, var, 16);
     tcg_gen_or_i32(var, var, tmp);
     tcg_temp_free_i32(tmp);
 }
 
-static TCGv gen_load_and_replicate(DisasContext *s, TCGv addr, int size)
+static TCGv_i32 gen_load_and_replicate(DisasContext *s, TCGv_i32 addr, int size)
 {
     /* Load a single Neon element and replicate into a 32 bit TCG reg */
-    TCGv tmp;
+    TCGv_i32 tmp = tcg_temp_new_i32();
     switch (size) {
     case 0:
-        tmp = gen_ld8u(addr, IS_USER(s));
+        gen_aa32_ld8u(tmp, addr, IS_USER(s));
         gen_neon_dup_u8(tmp, 0);
         break;
     case 1:
-        tmp = gen_ld16u(addr, IS_USER(s));
+        gen_aa32_ld16u(tmp, addr, IS_USER(s));
         gen_neon_dup_low16(tmp);
         break;
     case 2:
-        tmp = gen_ld32(addr, IS_USER(s));
+        gen_aa32_ld32u(tmp, addr, IS_USER(s));
         break;
     default: /* Avoid compiler warnings.  */
         abort();
@@ -2752,9 +2756,9 @@
 {
     uint32_t rd, rn, rm, op, i, n, offset, delta_d, delta_m, bank_mask;
     int dp, veclen;
-    TCGv addr;
-    TCGv tmp;
-    TCGv tmp2;
+    TCGv_i32 addr;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
 
     if (!arm_feature(env, ARM_FEATURE_VFP))
         return 1;
@@ -2848,12 +2852,12 @@
                         switch (size) {
                         case 0:
                             tmp2 = neon_load_reg(rn, pass);
-                            gen_bfi(tmp, tmp2, tmp, offset, 0xff);
+                            tcg_gen_deposit_i32(tmp, tmp2, tmp, offset, 8);
                             tcg_temp_free_i32(tmp2);
                             break;
                         case 1:
                             tmp2 = neon_load_reg(rn, pass);
-                            gen_bfi(tmp, tmp2, tmp, offset, 0xffff);
+                            tcg_gen_deposit_i32(tmp, tmp2, tmp, offset, 16);
                             tcg_temp_free_i32(tmp2);
                             break;
                         case 2:
@@ -2927,7 +2931,6 @@
                     }
                 } else {
                     /* arm->vfp */
-                    tmp = load_reg(s, rd);
                     if (insn & (1 << 21)) {
                         rn >>= 1;
                         /* system register */
@@ -2938,6 +2941,7 @@
                             /* Writes are ignored.  */
                             break;
                         case ARM_VFP_FPSCR:
+                            tmp = load_reg(s, rd);
                             gen_helper_vfp_set_fpscr(cpu_env, tmp);
                             tcg_temp_free_i32(tmp);
                             gen_lookup_tb(s);
@@ -2947,18 +2951,21 @@
                                 return 1;
                             /* TODO: VFP subarchitecture support.
                              * For now, keep the EN bit only */
+                            tmp = load_reg(s, rd);
                             tcg_gen_andi_i32(tmp, tmp, 1 << 30);
                             store_cpu_field(tmp, vfp.xregs[rn]);
                             gen_lookup_tb(s);
                             break;
                         case ARM_VFP_FPINST:
                         case ARM_VFP_FPINST2:
+                            tmp = load_reg(s, rd);
                             store_cpu_field(tmp, vfp.xregs[rn]);
                             break;
                         default:
                             return 1;
                         }
                     } else {
+                        tmp = load_reg(s, rd);
                         gen_vfp_msr(tmp);
                         gen_mov_vreg_F0(0, rn);
                     }
@@ -3178,19 +3185,19 @@
                     case 3: /* sqrt */
                         gen_vfp_sqrt(dp);
                         break;
-                    case 4: /* vcvtb.f32.f16 */
+                    case 4: /* vcvtb.f32.f16, vcvtb.f64.f16 */
                         tmp = gen_vfp_mrs();
                         tcg_gen_ext16u_i32(tmp, tmp);
                         gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env);
                         tcg_temp_free_i32(tmp);
                         break;
-                    case 5: /* vcvtt.f32.f16 */
+                    case 5: /* vcvtt.f32.f16, vcvtt.f64.f16 */
                         tmp = gen_vfp_mrs();
                         tcg_gen_shri_i32(tmp, tmp, 16);
                         gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env);
                         tcg_temp_free_i32(tmp);
                         break;
-                    case 6: /* vcvtb.f16.f32 */
+                    case 6: /* vcvtb.f16.f32, vcvtb.f16.f64 */
                         tmp = tcg_temp_new_i32();
                         gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
                         gen_mov_F0_vreg(0, rd);
@@ -3200,7 +3207,7 @@
                         tcg_temp_free_i32(tmp2);
                         gen_vfp_msr(tmp);
                         break;
-                    case 7: /* vcvtt.f16.f32 */
+                    case 7: /* vcvtt.f16.f32, vcvtt.f16.f64 */
                         tmp = tcg_temp_new_i32();
                         gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
                         tcg_gen_shli_i32(tmp, tmp, 16);
@@ -3289,26 +3296,25 @@
                         gen_vfp_toul(dp, 32 - rm, 0);
                         break;
                     default: /* undefined */
-                        printf ("rn:%d\n", rn);
                         return 1;
                     }
                     break;
                 default: /* undefined */
-                    printf ("op:%d\n", op);
                     return 1;
                 }
 
                 /* Write back the result.  */
-                if (op == 15 && (rn >= 8 && rn <= 11))
-                    ; /* Comparison, do nothing.  */
-                else if (op == 15 && dp && ((rn & 0x1c) == 0x18))
+                if (op == 15 && (rn >= 8 && rn <= 11)) {
+                    /* Comparison, do nothing.  */
+                } else if (op == 15 && dp && ((rn & 0x1c) == 0x18)) {
                     /* VCVT double to int: always integer result. */
                     gen_mov_vreg_F0(0, rd);
-                else if (op == 15 && rn == 15)
+                } else if (op == 15 && rn == 15) {
                     /* conversion */
                     gen_mov_vreg_F0(!dp, rd);
-                else
+                } else {
                     gen_mov_vreg_F0(dp, rd);
+                }
 
                 /* break out of the loop if we have finished  */
                 if (veclen == 0)
@@ -3459,7 +3465,6 @@
                     offset = 8;
                 else
                     offset = 4;
-                tmp = tcg_const_i32(offset);
                 for (i = 0; i < n; i++) {
                     if (insn & ARM_CP_RW_BIT) {
                         /* load */
@@ -3470,9 +3475,8 @@
                         gen_mov_F0_vreg(dp, rd + i);
                         gen_vfp_st(s, dp, addr);
                     }
-                    tcg_gen_add_i32(addr, addr, tmp);
+                    tcg_gen_addi_i32(addr, addr, offset);
                 }
-                tcg_temp_free_i32(tmp);
                 if (w) {
                     /* writeback */
                     if (insn & (1 << 24))
@@ -3498,17 +3502,17 @@
     return 0;
 }
 
-static inline void gen_goto_tb(DisasContext *s, int n, uint32_t dest)
+static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
 {
     TranslationBlock *tb;
 
     tb = s->tb;
     if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)) {
         tcg_gen_goto_tb(n);
-        gen_set_pc_im(dest);
-        tcg_gen_exit_tb((tcg_target_long)tb + n);
+        gen_set_pc_im(s, dest);
+        tcg_gen_exit_tb((uintptr_t)tb + n);
     } else {
-        gen_set_pc_im(dest);
+        gen_set_pc_im(s, dest);
         tcg_gen_exit_tb(0);
     }
 }
@@ -3526,7 +3530,7 @@
     }
 }
 
-static inline void gen_mulxy(TCGv t0, TCGv t1, int x, int y)
+static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y)
 {
     if (x)
         tcg_gen_sari_i32(t0, t0, 16);
@@ -3573,9 +3577,9 @@
 }
 
 /* Returns nonzero if access to the PSR is not permitted. Marks t0 as dead. */
-static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv t0)
+static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv_i32 t0)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     if (spsr) {
         /* ??? This is also undefined in system mode.  */
         if (IS_USER(s))
@@ -3597,16 +3601,16 @@
 /* Returns nonzero if access to the PSR is not permitted.  */
 static int gen_set_psr_im(DisasContext *s, uint32_t mask, int spsr, uint32_t val)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tmp = tcg_temp_new_i32();
     tcg_gen_movi_i32(tmp, val);
     return gen_set_psr(s, mask, spsr, tmp);
 }
 
 /* Generate an old-style exception return. Marks pc as dead. */
-static void gen_exception_return(DisasContext *s, TCGv pc)
+static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     store_reg(s, 15, pc);
     tmp = load_cpu_field(spsr);
     gen_set_cpsr(tmp, 0xffffffff);
@@ -3615,7 +3619,7 @@
 }
 
 /* Generate a v6 exception return.  Marks both values as dead.  */
-static void gen_rfe(DisasContext *s, TCGv pc, TCGv cpsr)
+static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
 {
     gen_set_cpsr(cpsr, 0xffffffff);
     tcg_temp_free_i32(cpsr);
@@ -3628,7 +3632,7 @@
 {
     if (s->condexec_mask) {
         uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1);
-        TCGv tmp = tcg_temp_new_i32();
+        TCGv_i32 tmp = tcg_temp_new_i32();
         tcg_gen_movi_i32(tmp, val);
         store_cpu_field(tmp, condexec_bits);
     }
@@ -3637,7 +3641,7 @@
 static void gen_exception_insn(DisasContext *s, int offset, int excp)
 {
     gen_set_condexec(s);
-    gen_set_pc_im(s->pc - offset);
+    gen_set_pc_im(s, s->pc - offset);
     gen_exception(excp);
     s->is_jmp = DISAS_JUMP;
 }
@@ -3646,12 +3650,16 @@
 {
     switch (val) {
     case 3: /* wfi */
-        gen_set_pc_im(s->pc);
+        gen_set_pc_im(s, s->pc);
         s->is_jmp = DISAS_WFI;
         break;
     case 2: /* wfe */
+        gen_set_pc_im(s, s->pc);
+        s->is_jmp = DISAS_WFE;
+        break;
     case 4: /* sev */
-        /* TODO: Implement SEV and WFE.  May help SMP performance.  */
+    case 5: /* sevl */
+        /* TODO: Implement SEV, SEVL and WFE.  May help SMP performance.  */
     default: /* nop */
         break;
     }
@@ -3659,7 +3667,7 @@
 
 #define CPU_V001 cpu_V0, cpu_V0, cpu_V1
 
-static inline void gen_neon_add(int size, TCGv t0, TCGv t1)
+static inline void gen_neon_add(int size, TCGv_i32 t0, TCGv_i32 t1)
 {
     switch (size) {
     case 0: gen_helper_neon_add_u8(t0, t0, t1); break;
@@ -3669,7 +3677,7 @@
     }
 }
 
-static inline void gen_neon_rsb(int size, TCGv t0, TCGv t1)
+static inline void gen_neon_rsb(int size, TCGv_i32 t0, TCGv_i32 t1)
 {
     switch (size) {
     case 0: gen_helper_neon_sub_u8(t0, t1, t0); break;
@@ -3731,22 +3739,22 @@
     default: return 1; \
     }} while (0)
 
-static TCGv neon_load_scratch(int scratch)
+static TCGv_i32 neon_load_scratch(int scratch)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, vfp.scratch[scratch]));
     return tmp;
 }
 
-static void neon_store_scratch(int scratch, TCGv var)
+static void neon_store_scratch(int scratch, TCGv_i32 var)
 {
     tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, vfp.scratch[scratch]));
     tcg_temp_free_i32(var);
 }
 
-static inline TCGv neon_get_scalar(int size, int reg)
+static inline TCGv_i32 neon_get_scalar(int size, int reg)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     if (size == 1) {
         tmp = neon_load_reg(reg & 7, reg >> 4);
         if (reg & 8) {
@@ -3762,7 +3770,7 @@
 
 static int gen_neon_unzip(int rd, int rm, int size, int q)
 {
-    TCGv tmp, tmp2;
+    TCGv_i32 tmp, tmp2;
     if (!q && size == 2) {
         return 1;
     }
@@ -3801,7 +3809,7 @@
 
 static int gen_neon_zip(int rd, int rm, int size, int q)
 {
-    TCGv tmp, tmp2;
+    TCGv_i32 tmp, tmp2;
     if (!q && size == 2) {
         return 1;
     }
@@ -3838,9 +3846,9 @@
     return 0;
 }
 
-static void gen_neon_trn_u8(TCGv t0, TCGv t1)
+static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv rd, tmp;
+    TCGv_i32 rd, tmp;
 
     rd = tcg_temp_new_i32();
     tmp = tcg_temp_new_i32();
@@ -3860,9 +3868,9 @@
     tcg_temp_free_i32(rd);
 }
 
-static void gen_neon_trn_u16(TCGv t0, TCGv t1)
+static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
 {
-    TCGv rd, tmp;
+    TCGv_i32 rd, tmp;
 
     rd = tcg_temp_new_i32();
     tmp = tcg_temp_new_i32();
@@ -3913,9 +3921,9 @@
     int pass;
     int load;
     int shift;
-    TCGv addr;
-    TCGv tmp;
-    TCGv tmp2;
+    TCGv_i32 addr;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
 
     if (!s->vfp_enabled)
       return 1;
@@ -4062,22 +4070,24 @@
             load_reg_var(s, addr, rn);
             for (reg = 0; reg < nregs; reg++) {
                 if (load) {
+                    tmp = tcg_temp_new_i32();
                     switch (size) {
                     case 0:
-                        tmp = gen_ld8u(addr, IS_USER(s));
+                        gen_aa32_ld8u(tmp, addr, IS_USER(s));
                         break;
                     case 1:
-                        tmp = gen_ld16u(addr, IS_USER(s));
+                        gen_aa32_ld16u(tmp, addr, IS_USER(s));
                         break;
                     case 2:
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                         break;
                     default: /* Avoid compiler warnings.  */
                         abort();
                     }
                     if (size != 2) {
                         tmp2 = neon_load_reg(rd, pass);
-                        gen_bfi(tmp, tmp2, tmp, shift, size ? 0xffff : 0xff);
+                        tcg_gen_deposit_i32(tmp, tmp2, tmp,
+                                            shift, size ? 16 : 8);
                         tcg_temp_free_i32(tmp2);
                     }
                     neon_store_reg(rd, pass, tmp);
@@ -4087,15 +4097,16 @@
                         tcg_gen_shri_i32(tmp, tmp, shift);
                     switch (size) {
                     case 0:
-                        gen_st8(tmp, addr, IS_USER(s));
+                        gen_aa32_st8(tmp, addr, IS_USER(s));
                         break;
                     case 1:
-                        gen_st16(tmp, addr, IS_USER(s));
+                        gen_aa32_st16(tmp, addr, IS_USER(s));
                         break;
                     case 2:
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
                         break;
                     }
+                    tcg_temp_free_i32(tmp);
                 }
                 rd += stride;
                 tcg_gen_addi_i32(addr, addr, 1 << size);
@@ -4105,13 +4116,13 @@
         }
     }
     if (rm != 15) {
-        TCGv base;
+        TCGv_i32 base;
 
         base = load_reg(s, rn);
         if (rm == 13) {
             tcg_gen_addi_i32(base, base, stride);
         } else {
-            TCGv index;
+            TCGv_i32 index;
             index = load_reg(s, rm);
             tcg_gen_add_i32(base, base, index);
             tcg_temp_free_i32(index);
@@ -4122,14 +4133,14 @@
 }
 
 /* Bitwise select.  dest = c ? t : f.  Clobbers T and F.  */
-static void gen_neon_bsl(TCGv dest, TCGv t, TCGv f, TCGv c)
+static void gen_neon_bsl(TCGv_i32 dest, TCGv_i32 t, TCGv_i32 f, TCGv_i32 c)
 {
     tcg_gen_and_i32(t, t, c);
     tcg_gen_andc_i32(f, f, c);
     tcg_gen_or_i32(dest, t, f);
 }
 
-static inline void gen_neon_narrow(int size, TCGv dest, TCGv_i64 src)
+static inline void gen_neon_narrow(int size, TCGv_i32 dest, TCGv_i64 src)
 {
     switch (size) {
     case 0: gen_helper_neon_narrow_u8(dest, src); break;
@@ -4139,7 +4150,7 @@
     }
 }
 
-static inline void gen_neon_narrow_sats(int size, TCGv dest, TCGv_i64 src)
+static inline void gen_neon_narrow_sats(int size, TCGv_i32 dest, TCGv_i64 src)
 {
     switch (size) {
     case 0: gen_helper_neon_narrow_sat_s8(dest, cpu_env, src); break;
@@ -4149,7 +4160,7 @@
     }
 }
 
-static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src)
+static inline void gen_neon_narrow_satu(int size, TCGv_i32 dest, TCGv_i64 src)
 {
     switch (size) {
     case 0: gen_helper_neon_narrow_sat_u8(dest, cpu_env, src); break;
@@ -4159,7 +4170,7 @@
     }
 }
 
-static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src)
+static inline void gen_neon_unarrow_sats(int size, TCGv_i32 dest, TCGv_i64 src)
 {
     switch (size) {
     case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break;
@@ -4169,7 +4180,7 @@
     }
 }
 
-static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
+static inline void gen_neon_shift_narrow(int size, TCGv_i32 var, TCGv_i32 shift,
                                          int q, int u)
 {
     if (q) {
@@ -4203,7 +4214,7 @@
     }
 }
 
-static inline void gen_neon_widen(TCGv_i64 dest, TCGv src, int size, int u)
+static inline void gen_neon_widen(TCGv_i64 dest, TCGv_i32 src, int size, int u)
 {
     if (u) {
         switch (size) {
@@ -4262,7 +4273,8 @@
     }
 }
 
-static inline void gen_neon_mull(TCGv_i64 dest, TCGv a, TCGv b, int size, int u)
+static inline void gen_neon_mull(TCGv_i64 dest, TCGv_i32 a, TCGv_i32 b,
+                                 int size, int u)
 {
     TCGv_i64 tmp;
 
@@ -4292,7 +4304,8 @@
     }
 }
 
-static void gen_neon_narrow_op(int op, int u, int size, TCGv dest, TCGv_i64 src)
+static void gen_neon_narrow_op(int op, int u, int size,
+                               TCGv_i32 dest, TCGv_i64 src)
 {
     if (op) {
         if (u) {
@@ -4337,6 +4350,7 @@
 #define NEON_3R_VPMIN 21
 #define NEON_3R_VQDMULH_VQRDMULH 22
 #define NEON_3R_VPADD 23
+#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */
 #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */
 #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */
 #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */
@@ -4369,6 +4383,7 @@
     [NEON_3R_VPMIN] = 0x7,
     [NEON_3R_VQDMULH_VQRDMULH] = 0x6,
     [NEON_3R_VPADD] = 0x7,
+    [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */
@@ -4386,6 +4401,8 @@
 #define NEON_2RM_VREV16 2
 #define NEON_2RM_VPADDL 4
 #define NEON_2RM_VPADDL_U 5
+#define NEON_2RM_AESE 6 /* Includes AESD */
+#define NEON_2RM_AESMC 7 /* Includes AESIMC */
 #define NEON_2RM_VCLS 8
 #define NEON_2RM_VCLZ 9
 #define NEON_2RM_VCNT 10
@@ -4415,8 +4432,22 @@
 #define NEON_2RM_VMOVN 36 /* Includes VQMOVN, VQMOVUN */
 #define NEON_2RM_VQMOVN 37 /* Includes VQMOVUN */
 #define NEON_2RM_VSHLL 38
+#define NEON_2RM_VRINTN 40
+#define NEON_2RM_VRINTX 41
+#define NEON_2RM_VRINTA 42
+#define NEON_2RM_VRINTZ 43
 #define NEON_2RM_VCVT_F16_F32 44
+#define NEON_2RM_VRINTM 45
 #define NEON_2RM_VCVT_F32_F16 46
+#define NEON_2RM_VRINTP 47
+#define NEON_2RM_VCVTAU 48
+#define NEON_2RM_VCVTAS 49
+#define NEON_2RM_VCVTNU 50
+#define NEON_2RM_VCVTNS 51
+#define NEON_2RM_VCVTPU 52
+#define NEON_2RM_VCVTPS 53
+#define NEON_2RM_VCVTMU 54
+#define NEON_2RM_VCVTMS 55
 #define NEON_2RM_VRECPE 56
 #define NEON_2RM_VRSQRTE 57
 #define NEON_2RM_VRECPE_F 58
@@ -4443,6 +4474,8 @@
     [NEON_2RM_VREV16] = 0x1,
     [NEON_2RM_VPADDL] = 0x7,
     [NEON_2RM_VPADDL_U] = 0x7,
+    [NEON_2RM_AESE] = 0x1,
+    [NEON_2RM_AESMC] = 0x1,
     [NEON_2RM_VCLS] = 0x7,
     [NEON_2RM_VCLZ] = 0x7,
     [NEON_2RM_VCNT] = 0x1,
@@ -4472,8 +4505,22 @@
     [NEON_2RM_VMOVN] = 0x7,
     [NEON_2RM_VQMOVN] = 0x7,
     [NEON_2RM_VSHLL] = 0x7,
+    [NEON_2RM_VRINTN] = 0x4,
+    [NEON_2RM_VRINTX] = 0x4,
+    [NEON_2RM_VRINTA] = 0x4,
+    [NEON_2RM_VRINTZ] = 0x4,
     [NEON_2RM_VCVT_F16_F32] = 0x2,
+    [NEON_2RM_VRINTM] = 0x4,
     [NEON_2RM_VCVT_F32_F16] = 0x2,
+    [NEON_2RM_VRINTP] = 0x4,
+    [NEON_2RM_VCVTAU] = 0x4,
+    [NEON_2RM_VCVTAS] = 0x4,
+    [NEON_2RM_VCVTNU] = 0x4,
+    [NEON_2RM_VCVTNS] = 0x4,
+    [NEON_2RM_VCVTPU] = 0x4,
+    [NEON_2RM_VCVTPS] = 0x4,
+    [NEON_2RM_VCVTMU] = 0x4,
+    [NEON_2RM_VCVTMS] = 0x4,
     [NEON_2RM_VRECPE] = 0x4,
     [NEON_2RM_VRSQRTE] = 0x4,
     [NEON_2RM_VRECPE_F] = 0x4,
@@ -4501,7 +4548,7 @@
     int pairwise;
     int u;
     uint32_t imm, mask;
-    TCGv tmp, tmp2, tmp3, tmp4, tmp5;
+    TCGv_i32 tmp, tmp2, tmp3, tmp4, tmp5;
     TCGv_i64 tmp64;
 
     if (!s->vfp_enabled)
@@ -4964,7 +5011,7 @@
                     size--;
             }
             shift = (insn >> 16) & ((1 << (3 + size)) - 1);
-            /* To avoid excessive dumplication of ops we implement shift
+            /* To avoid excessive duplication of ops we implement shift
                by immediate using the variable shift operations.  */
             if (op < 8) {
                 /* Shift by immediate:
@@ -5063,7 +5110,8 @@
                     } else { /* size < 3 */
                         /* Operands in T0 and T1.  */
                         tmp = neon_load_reg(rm, pass);
-                        tmp2 = tcg_const_i32(imm);
+                        tmp2 = tcg_temp_new_i32();
+                        tcg_gen_movi_i32(tmp2, imm);
                         switch (op) {
                         case 0:  /* VSHR */
                         case 1:  /* VSRA */
@@ -5438,11 +5486,11 @@
                     tmp = neon_load_reg(rn, 1);
                     neon_store_scratch(2, tmp);
                 }
-                TCGV_UNUSED(tmp3);
+                TCGV_UNUSED_I32(tmp3);
                 for (pass = 0; pass < 2; pass++) {
                     if (src1_wide) {
                         neon_load_reg64(cpu_V0, rn + pass);
-                        TCGV_UNUSED(tmp);
+                        TCGV_UNUSED_I32(tmp);
                     } else {
                         if (pass == 1 && rd == rn) {
                             tmp = neon_load_scratch(2);
@@ -5455,7 +5503,7 @@
                     }
                     if (src2_wide) {
                         neon_load_reg64(cpu_V1, rm + pass);
-                        TCGV_UNUSED(tmp2);
+                        TCGV_UNUSED_I32(tmp2);
                     } else {
                         if (pass == 1 && rd == rm) {
                             tmp2 = neon_load_scratch(2);
@@ -5869,7 +5917,7 @@
                     if (rm & 1) {
                         return 1;
                     }
-                    TCGV_UNUSED(tmp2);
+                    TCGV_UNUSED_I32(tmp2);
                     for (pass = 0; pass < 2; pass++) {
                         neon_load_reg64(cpu_V0, rm + pass);
                         tmp = tcg_temp_new_i32();
@@ -5951,7 +5999,7 @@
                         if (neon_2rm_is_float_op(op)) {
                             tcg_gen_ld_f32(cpu_F0s, cpu_env,
                                            neon_reg_offset(rm, pass));
-                            TCGV_UNUSED(tmp);
+                            TCGV_UNUSED_I32(tmp);
                         } else {
                             tmp = neon_load_reg(rm, pass);
                         }
@@ -6024,7 +6072,7 @@
                             case 2: gen_helper_neon_cgt_s32(tmp, tmp, tmp2); break;
                             default: abort();
                             }
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             if (op == NEON_2RM_VCLE0) {
                                 tcg_gen_not_i32(tmp, tmp);
                             }
@@ -6037,7 +6085,7 @@
                             case 2: gen_helper_neon_cge_s32(tmp, tmp, tmp2); break;
                             default: abort();
                             }
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             if (op == NEON_2RM_VCLT0) {
                                 tcg_gen_not_i32(tmp, tmp);
                             }
@@ -6050,7 +6098,7 @@
                             case 2: gen_helper_neon_ceq_u32(tmp, tmp, tmp2); break;
                             default: abort();
                             }
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             break;
                         case NEON_2RM_VABS:
                             switch(size) {
@@ -6063,14 +6111,14 @@
                         case NEON_2RM_VNEG:
                             tmp2 = tcg_const_i32(0);
                             gen_neon_rsb(size, tmp, tmp2);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             break;
                         case NEON_2RM_VCGT0_F:
                         {
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
                             gen_helper_neon_cgt_f32(tmp, tmp, tmp2, fpstatus);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             break;
                         }
                         case NEON_2RM_VCGE0_F:
@@ -6078,7 +6126,7 @@
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
                             gen_helper_neon_cge_f32(tmp, tmp, tmp2, fpstatus);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             tcg_temp_free_ptr(fpstatus);
                             break;
                         }
@@ -6087,7 +6135,7 @@
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
                             gen_helper_neon_ceq_f32(tmp, tmp, tmp2, fpstatus);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             tcg_temp_free_ptr(fpstatus);
                             break;
                         }
@@ -6096,7 +6144,7 @@
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
                             gen_helper_neon_cge_f32(tmp, tmp2, tmp, fpstatus);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             tcg_temp_free_ptr(fpstatus);
                             break;
                         }
@@ -6105,7 +6153,7 @@
                             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
                             gen_helper_neon_cgt_f32(tmp, tmp2, tmp, fpstatus);
-                            tcg_temp_free(tmp2);
+                            tcg_temp_free_i32(tmp2);
                             tcg_temp_free_ptr(fpstatus);
                             break;
                         }
@@ -6240,7 +6288,7 @@
     int op1 = (insn >> 21) & 7;
     int op2 = (insn >> 5) & 7;
     int rt = (insn >> 12) & 0xf;
-    TCGv tmp;
+    TCGv_i32 tmp;
 
     /* Minimal set of debug registers, since we don't support debug */
     if (op1 == 0 && crn == 0 && op2 == 0) {
@@ -6300,7 +6348,7 @@
     int op1 = (insn >> 21) & 7;
     int op2 = (insn >> 5) & 7;
     int rt = (insn >> 12) & 0xf;
-    TCGv tmp;
+    TCGv_i32 tmp;
 
     /* Minimal set of debug registers, since we don't support debug */
     if (op1 == 0 && crn == 0 && op2 == 0) {
@@ -6393,7 +6441,7 @@
 /* Store a 64-bit value to a register pair.  Clobbers val.  */
 static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     tmp = tcg_temp_new_i32();
     tcg_gen_trunc_i64_i32(tmp, val);
     store_reg(s, rlow, tmp);
@@ -6407,7 +6455,7 @@
 static void gen_addq_lo(DisasContext *s, TCGv_i64 val, int rlow)
 {
     TCGv_i64 tmp;
-    TCGv tmp2;
+    TCGv_i32 tmp2;
 
     /* Load value and extend to 64 bits.  */
     tmp = tcg_temp_new_i64();
@@ -6422,8 +6470,8 @@
 static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh)
 {
     TCGv_i64 tmp;
-    TCGv tmpl;
-    TCGv tmph;
+    TCGv_i32 tmpl;
+    TCGv_i32 tmph;
 
     /* Load 64-bit value rd:rn.  */
     tmpl = load_reg(s, rlow);
@@ -6439,7 +6487,7 @@
 /* Set N and Z flags from a 64-bit value.  */
 static void gen_logicq_cc(TCGv_i64 val)
 {
-    TCGv tmp = tcg_temp_new_i32();
+    TCGv_i32 tmp = tcg_temp_new_i32();
     gen_helper_logicq_cc(tmp, val);
     gen_logic_CC(tmp);
     tcg_temp_free_i32(tmp);
@@ -6447,7 +6495,7 @@
 
 /* Load/Store exclusive instructions are implemented by remembering
    the value/address loaded, and seeing if these are the same
-   when the store is performed. This should be is sufficient to implement
+   when the store is performed. This should be sufficient to implement
    the architecturally mandated semantics, and avoids having to monitor
    regular stores.
 
@@ -6455,20 +6503,20 @@
    this sequence is effectively atomic.  In user emulation mode we
    throw an exception and handle the atomic operation elsewhere.  */
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
-                               TCGv addr, int size)
+                               TCGv_i32 addr, int size)
 {
-    TCGv tmp;
+    TCGv_i32 tmp = tcg_temp_new_i32();
 
     switch (size) {
     case 0:
-        tmp = gen_ld8u(addr, IS_USER(s));
+        gen_aa32_ld8u(tmp, addr, IS_USER(s));
         break;
     case 1:
-        tmp = gen_ld16u(addr, IS_USER(s));
+        gen_aa32_ld16u(tmp, addr, IS_USER(s));
         break;
     case 2:
     case 3:
-        tmp = gen_ld32(addr, IS_USER(s));
+        gen_aa32_ld32u(tmp, addr, IS_USER(s));
         break;
     default:
         abort();
@@ -6476,9 +6524,9 @@
     tcg_gen_mov_i32(cpu_exclusive_val, tmp);
     store_reg(s, rt, tmp);
     if (size == 3) {
-        TCGv tmp2 = tcg_temp_new_i32();
+        TCGv_i32 tmp2 = tcg_temp_new_i32();
         tcg_gen_addi_i32(tmp2, addr, 4);
-        tmp = gen_ld32(tmp2, IS_USER(s));
+        gen_aa32_ld32u(tmp, tmp2, IS_USER(s));
         tcg_temp_free_i32(tmp2);
         tcg_gen_mov_i32(cpu_exclusive_high, tmp);
         store_reg(s, rt2, tmp);
@@ -6493,7 +6541,7 @@
 
 #ifdef CONFIG_USER_ONLY
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv addr, int size)
+                                TCGv_i32 addr, int size)
 {
     tcg_gen_mov_i32(cpu_exclusive_test, addr);
     tcg_gen_movi_i32(cpu_exclusive_info,
@@ -6502,9 +6550,9 @@
 }
 #else
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv addr, int size)
+                                TCGv_i32 addr, int size)
 {
-    TCGv tmp;
+    TCGv_i32 tmp;
     int done_label;
     int fail_label;
 
@@ -6517,49 +6565,52 @@
     fail_label = gen_new_label();
     done_label = gen_new_label();
     tcg_gen_brcond_i32(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
+    tmp = tcg_temp_new_i32();
     switch (size) {
     case 0:
-        tmp = gen_ld8u(addr, IS_USER(s));
+        gen_aa32_ld8u(tmp, addr, IS_USER(s));
         break;
     case 1:
-        tmp = gen_ld16u(addr, IS_USER(s));
+        gen_aa32_ld16u(tmp, addr, IS_USER(s));
         break;
     case 2:
     case 3:
-        tmp = gen_ld32(addr, IS_USER(s));
+        gen_aa32_ld32u(tmp, addr, IS_USER(s));
         break;
     default:
         abort();
     }
     tcg_gen_brcond_i32(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
-    tcg_temp_free_i32(tmp);
     if (size == 3) {
-        TCGv tmp2 = tcg_temp_new_i32();
+        TCGv_i32 tmp2 = tcg_temp_new_i32();
         tcg_gen_addi_i32(tmp2, addr, 4);
-        tmp = gen_ld32(tmp2, IS_USER(s));
+        gen_aa32_ld32u(tmp, tmp2, IS_USER(s));
         tcg_temp_free_i32(tmp2);
         tcg_gen_brcond_i32(TCG_COND_NE, tmp, cpu_exclusive_high, fail_label);
-        tcg_temp_free_i32(tmp);
     }
+    tcg_temp_free_i32(tmp);
+
     tmp = load_reg(s, rt);
     switch (size) {
     case 0:
-        gen_st8(tmp, addr, IS_USER(s));
+        gen_aa32_st8(tmp, addr, IS_USER(s));
         break;
     case 1:
-        gen_st16(tmp, addr, IS_USER(s));
+        gen_aa32_st16(tmp, addr, IS_USER(s));
         break;
     case 2:
     case 3:
-        gen_st32(tmp, addr, IS_USER(s));
+        gen_aa32_st32(tmp, addr, IS_USER(s));
         break;
     default:
         abort();
     }
+    tcg_temp_free_i32(tmp);
     if (size == 3) {
         tcg_gen_addi_i32(addr, addr, 4);
         tmp = load_reg(s, rt2);
-        gen_st32(tmp, addr, IS_USER(s));
+        gen_aa32_st32(tmp, addr, IS_USER(s));
+        tcg_temp_free_i32(tmp);
     }
     tcg_gen_movi_i32(cpu_R[rd], 0);
     tcg_gen_br(done_label);
@@ -6573,10 +6624,10 @@
 static void disas_arm_insn(CPUARMState * env, DisasContext *s)
 {
     unsigned int cond, insn, val, op1, i, shift, rm, rs, rn, rd, sh;
-    TCGv tmp;
-    TCGv tmp2;
-    TCGv tmp3;
-    TCGv addr;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
+    TCGv_i32 tmp3;
+    TCGv_i32 addr;
     TCGv_i64 tmp64;
 
     insn = cpu_ldl_code(env, s->pc);
@@ -6683,10 +6734,10 @@
             if (offset)
                 tcg_gen_addi_i32(addr, addr, offset);
             tmp = load_reg(s, 14);
-            gen_st32(tmp, addr, 0);
+            gen_aa32_st32(tmp, addr, 0);
             tmp = load_cpu_field(spsr);
             tcg_gen_addi_i32(addr, addr, 4);
-            gen_st32(tmp, addr, 0);
+            gen_aa32_st32(tmp, addr, 0);
             if (insn & (1 << 21)) {
                 /* Base writeback.  */
                 switch (i) {
@@ -6725,9 +6776,11 @@
             if (offset)
                 tcg_gen_addi_i32(addr, addr, offset);
             /* Load PC into tmp and CPSR into tmp2.  */
-            tmp = gen_ld32(addr, 0);
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld32u(tmp, addr, 0);
             tcg_gen_addi_i32(addr, addr, 4);
-            tmp2 = gen_ld32(addr, 0);
+            tmp2 = tcg_temp_new_i32();
+            gen_aa32_ld32u(tmp2, addr, 0);
             if (insn & (1 << 21)) {
                 /* Base writeback.  */
                 switch (i) {
@@ -6809,7 +6862,7 @@
         /* if not always execute, we generate a conditional jump to
            next instruction */
         s->condlabel = gen_new_label();
-        gen_test_cc(cond ^ 1, s->condlabel);
+        arm_gen_test_cc(cond ^ 1, s->condlabel);
         s->condjmp = 1;
     }
     if ((insn & 0x0f900000) == 0x03000000) {
@@ -7034,7 +7087,7 @@
             rn = (insn >> 16) & 0xf;
             tmp = load_reg(s, rn);
         } else {
-            TCGV_UNUSED(tmp);
+            TCGV_UNUSED_I32(tmp);
         }
         rd = (insn >> 12) & 0xf;
         switch(op1) {
@@ -7252,10 +7305,11 @@
                     if (insn & (1 << 23)) {
                         /* load/store exclusive */
                         op1 = (insn >> 21) & 0x3;
-                        if (op1)
+                        if (op1) {
                             ARCH(6K);
-                        else
+                        } else {
                             ARCH(6);
+                        }
                         addr = tcg_temp_local_new_i32();
                         load_reg_var(s, addr, rn);
                         if (insn & (1 << 20)) {
@@ -7294,7 +7348,7 @@
                                 abort();
                             }
                         }
-                        tcg_temp_free(addr);
+                        tcg_temp_free_i32(addr);
                     } else {
                         /* SWP instruction */
                         rm = (insn) & 0xf;
@@ -7304,13 +7358,15 @@
                            so it is good enough.  */
                         addr = load_reg(s, rn);
                         tmp = load_reg(s, rm);
+                        tmp2 = tcg_temp_new_i32();
                         if (insn & (1 << 22)) {
-                            tmp2 = gen_ld8u(addr, IS_USER(s));
-                            gen_st8(tmp, addr, IS_USER(s));
+                            gen_aa32_ld8u(tmp2, addr, IS_USER(s));
+                            gen_aa32_st8(tmp, addr, IS_USER(s));
                         } else {
-                            tmp2 = gen_ld32(addr, IS_USER(s));
-                            gen_st32(tmp, addr, IS_USER(s));
+                            gen_aa32_ld32u(tmp2, addr, IS_USER(s));
+                            gen_aa32_st32(tmp, addr, IS_USER(s));
                         }
+                        tcg_temp_free_i32(tmp);
                         tcg_temp_free_i32(addr);
                         store_reg(s, rd, tmp2);
                     }
@@ -7327,16 +7383,17 @@
                 address_offset = 0;
                 if (insn & (1 << 20)) {
                     /* load */
+                    tmp = tcg_temp_new_i32();
                     switch(sh) {
                     case 1:
-                        tmp = gen_ld16u(addr, IS_USER(s));
+                        gen_aa32_ld16u(tmp, addr, IS_USER(s));
                         break;
                     case 2:
-                        tmp = gen_ld8s(addr, IS_USER(s));
+                        gen_aa32_ld8s(tmp, addr, IS_USER(s));
                         break;
                     default:
                     case 3:
-                        tmp = gen_ld16s(addr, IS_USER(s));
+                        gen_aa32_ld16s(tmp, addr, IS_USER(s));
                         break;
                     }
                     load = 1;
@@ -7346,17 +7403,20 @@
                     if (sh & 1) {
                         /* store */
                         tmp = load_reg(s, rd);
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
+                        tcg_temp_free_i32(tmp);
                         tcg_gen_addi_i32(addr, addr, 4);
                         tmp = load_reg(s, rd + 1);
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
                         load = 0;
                     } else {
                         /* load */
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        tmp = tcg_temp_new_i32();
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                         store_reg(s, rd, tmp);
                         tcg_gen_addi_i32(addr, addr, 4);
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        tmp = tcg_temp_new_i32();
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                         rd++;
                         load = 1;
                     }
@@ -7364,7 +7424,7 @@
                 } else {
                     /* store */
                     tmp = load_reg(s, rd);
-                    gen_st16(tmp, addr, IS_USER(s));
+                    gen_aa32_st16(tmp, addr, IS_USER(s));
                     load = 0;
                 }
                 /* Perform base writeback before the loaded value to
@@ -7489,7 +7549,7 @@
                         case 4: gen_uxtb16(tmp);  break;
                         case 6: gen_uxtb(tmp);    break;
                         case 7: gen_uxth(tmp);    break;
-                        default: tcg_temp_free_i32(tmp); goto illegal_op;
+                        default: goto illegal_op;
                         }
                         if (rn != 15) {
                             tmp2 = load_reg(s, rn);
@@ -7611,7 +7671,7 @@
                         }
                         if (i != 32) {
                             tmp2 = load_reg(s, rd);
-                            gen_bfi(tmp, tmp2, tmp, shift, (1u << i) - 1);
+                            tcg_gen_deposit_i32(tmp, tmp2, tmp, shift, i);
                             tcg_temp_free_i32(tmp2);
                         }
                         store_reg(s, rd, tmp);
@@ -7659,18 +7719,21 @@
                 gen_add_data_offset(s, insn, tmp2);
             if (insn & (1 << 20)) {
                 /* load */
+                tmp = tcg_temp_new_i32();
                 if (insn & (1 << 22)) {
-                    tmp = gen_ld8u(tmp2, i);
+                    gen_aa32_ld8u(tmp, tmp2, i);
                 } else {
-                    tmp = gen_ld32(tmp2, i);
+                    gen_aa32_ld32u(tmp, tmp2, i);
                 }
             } else {
                 /* store */
                 tmp = load_reg(s, rd);
-                if (insn & (1 << 22))
-                    gen_st8(tmp, tmp2, i);
-                else
-                    gen_st32(tmp, tmp2, i);
+                if (insn & (1 << 22)) {
+                    gen_aa32_st8(tmp, tmp2, i);
+                } else {
+                    gen_aa32_st32(tmp, tmp2, i);
+                }
+                tcg_temp_free_i32(tmp);
             }
             if (!(insn & (1 << 24))) {
                 gen_add_data_offset(s, insn, tmp2);
@@ -7689,7 +7752,7 @@
         case 0x09:
             {
                 int j, n, user, loaded_base;
-                TCGv loaded_var;
+                TCGv_i32 loaded_var;
                 /* load/store multiple words */
                 /* XXX: store correct base if write back */
                 user = 0;
@@ -7702,11 +7765,10 @@
                 }
                 rn = (insn >> 16) & 0xf;
                 addr = load_reg(s, rn);
-                tmp3 = tcg_const_i32(4);
 
                 /* compute total size */
                 loaded_base = 0;
-                TCGV_UNUSED(loaded_var);
+                TCGV_UNUSED_I32(loaded_var);
                 n = 0;
                 for(i=0;i<16;i++) {
                     if (insn & (1 << i))
@@ -7716,7 +7778,7 @@
                 if (insn & (1 << 23)) {
                     if (insn & (1 << 24)) {
                         /* pre increment */
-                        tcg_gen_add_i32(addr, addr, tmp3);
+                        tcg_gen_addi_i32(addr, addr, 4);
                     } else {
                         /* post increment */
                     }
@@ -7735,7 +7797,8 @@
                     if (insn & (1 << i)) {
                         if (insn & (1 << 20)) {
                             /* load */
-                            tmp = gen_ld32(addr, IS_USER(s));
+                            tmp = tcg_temp_new_i32();
+                            gen_aa32_ld32u(tmp, addr, IS_USER(s));
                             if (user) {
                                 tmp2 = tcg_const_i32(i);
                                 gen_helper_set_user_reg(cpu_env, tmp2, tmp);
@@ -7762,12 +7825,13 @@
                             } else {
                                 tmp = load_reg(s, i);
                             }
-                            gen_st32(tmp, addr, IS_USER(s));
+                            gen_aa32_st32(tmp, addr, IS_USER(s));
+                            tcg_temp_free_i32(tmp);
                         }
                         j++;
                         /* no need to add after the last transfer */
                         if (j != n)
-                            tcg_gen_add_i32(addr, addr, tmp3);
+                            tcg_gen_addi_i32(addr, addr, 4);
                     }
                 }
                 if (insn & (1 << 21)) {
@@ -7777,7 +7841,7 @@
                             /* pre increment */
                         } else {
                             /* post increment */
-                            tcg_gen_add_i32(addr, addr, tmp3);
+                            tcg_gen_addi_i32(addr, addr, 4);
                         }
                     } else {
                         if (insn & (1 << 24)) {
@@ -7793,7 +7857,6 @@
                 } else {
                     tcg_temp_free_i32(addr);
                 }
-                tcg_temp_free_i32(tmp3);
                 if (loaded_base) {
                     store_reg(s, rn, loaded_var);
                 }
@@ -7827,12 +7890,14 @@
         case 0xd:
         case 0xe:
             /* Coprocessor.  */
-            if (disas_coproc_insn(env, s, insn))
+            if (disas_coproc_insn(env, s, insn)) {
+                /* Coprocessor.  */
                 goto illegal_op;
+            }
             break;
         case 0xf:
             /* swi */
-            gen_set_pc_im(s->pc);
+            gen_set_pc_im(s, s->pc);
             s->is_jmp = DISAS_SWI;
             break;
         default:
@@ -7857,7 +7922,8 @@
    Returns zero if the opcode is valid.  */
 
 static int
-gen_thumb2_data_op(DisasContext *s, int op, int conds, uint32_t shifter_out, TCGv t0, TCGv t1)
+gen_thumb2_data_op(DisasContext *s, int op, int conds, uint32_t shifter_out,
+                   TCGv_i32 t0, TCGv_i32 t1)
 {
     int logic_cc;
 
@@ -7896,14 +7962,16 @@
             gen_adc(t0, t1);
         break;
     case 11: /* sbc */
-        if (conds)
+        if (conds) {
             gen_helper_sbc_cc(t0, cpu_env, t0, t1);
-        else
+        } else {
             gen_sub_carry(t0, t0, t1);
+        }
         break;
     case 13: /* sub */
-        if (conds)
+        if (conds) {
             gen_helper_sub_cc(t0, cpu_env, t0, t1);
+        }
         else
             tcg_gen_sub_i32(t0, t0, t1);
         break;
@@ -7930,10 +7998,10 @@
 {
     uint32_t insn, imm, shift, offset;
     uint32_t rd, rn, rm, rs;
-    TCGv tmp;
-    TCGv tmp2;
-    TCGv tmp3;
-    TCGv addr;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
+    TCGv_i32 tmp3;
+    TCGv_i32 addr;
     TCGv_i64 tmp64;
     int op;
     int shiftop;
@@ -8018,18 +8086,20 @@
                 }
                 if (insn & (1 << 20)) {
                     /* ldrd */
-                    tmp = gen_ld32(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp, addr, IS_USER(s));
                     store_reg(s, rs, tmp);
                     tcg_gen_addi_i32(addr, addr, 4);
-                    tmp = gen_ld32(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp, addr, IS_USER(s));
                     store_reg(s, rd, tmp);
                 } else {
                     /* strd */
                     tmp = load_reg(s, rs);
-                    gen_st32(tmp, addr, IS_USER(s));
+                    gen_aa32_st32(tmp, addr, IS_USER(s));
                     tcg_gen_addi_i32(addr, addr, 4);
                     tmp = load_reg(s, rd);
-                    gen_st32(tmp, addr, IS_USER(s));
+                    gen_aa32_st32(tmp, addr, IS_USER(s));
                 }
                 if (insn & (1 << 21)) {
                     /* Base writeback.  */
@@ -8042,7 +8112,7 @@
                 }
             } else if ((insn & (1 << 23)) == 0) {
                 /* Load/store exclusive word.  */
-                addr = tcg_temp_local_new();
+                addr = tcg_temp_local_new_i32();
                 load_reg_var(s, addr, rn);
                 tcg_gen_addi_i32(addr, addr, (insn & 0xff) << 2);
                 if (insn & (1 << 20)) {
@@ -8050,7 +8120,7 @@
                 } else {
                     gen_store_exclusive(s, rd, rs, 15, addr, 2);
                 }
-                tcg_temp_free(addr);
+                tcg_temp_free_i32(addr);
             } else if ((insn & (1 << 6)) == 0) {
                 /* Table Branch.  */
                 if (rn == 15) {
@@ -8065,10 +8135,12 @@
                     /* tbh */
                     tcg_gen_add_i32(addr, addr, tmp);
                     tcg_temp_free_i32(tmp);
-                    tmp = gen_ld16u(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld16u(tmp, addr, IS_USER(s));
                 } else { /* tbb */
                     tcg_temp_free_i32(tmp);
-                    tmp = gen_ld8u(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld8u(tmp, addr, IS_USER(s));
                 }
                 tcg_temp_free_i32(addr);
                 tcg_gen_shli_i32(tmp, tmp, 1);
@@ -8081,30 +8153,33 @@
                 if (op == 2) {
                     goto illegal_op;
                 }
-                addr = tcg_temp_local_new();
+                addr = tcg_temp_local_new_i32();
                 load_reg_var(s, addr, rn);
                 if (insn & (1 << 20)) {
                     gen_load_exclusive(s, rs, rd, addr, op);
                 } else {
                     gen_store_exclusive(s, rm, rs, rd, addr, op);
                 }
-                tcg_temp_free(addr);
+                tcg_temp_free_i32(addr);
             }
         } else {
             /* Load/store multiple, RFE, SRS.  */
             if (((insn >> 23) & 1) == ((insn >> 24) & 1)) {
                 /* Not available in user mode.  */
-                if (IS_USER(s))
+                if (IS_USER(s)) {
                     goto illegal_op;
+                }
                 if (insn & (1 << 20)) {
                     /* rfe */
                     addr = load_reg(s, rn);
                     if ((insn & (1 << 24)) == 0)
                         tcg_gen_addi_i32(addr, addr, -8);
                     /* Load PC into tmp and CPSR into tmp2.  */
-                    tmp = gen_ld32(addr, 0);
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp, addr, 0);
                     tcg_gen_addi_i32(addr, addr, 4);
-                    tmp2 = gen_ld32(addr, 0);
+                    tmp2 = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp2, addr, 0);
                     if (insn & (1 << 21)) {
                         /* Base writeback.  */
                         if (insn & (1 << 24)) {
@@ -8128,11 +8203,11 @@
                         tcg_gen_addi_i32(addr, addr, -8);
                     }
                     tmp = load_reg(s, 14);
-                    gen_st32(tmp, addr, 0);
+                    gen_aa32_st32(tmp, addr, 0);
                     tcg_gen_addi_i32(addr, addr, 4);
                     tmp = tcg_temp_new_i32();
                     gen_helper_cpsr_read(tmp, cpu_env);
-                    gen_st32(tmp, addr, 0);
+                    gen_aa32_st32(tmp, addr, 0);
                     if (insn & (1 << 21)) {
                         if ((insn & (1 << 24)) == 0) {
                             tcg_gen_addi_i32(addr, addr, -4);
@@ -8148,7 +8223,7 @@
                 }
             } else {
                 int i, loaded_base = 0;
-                TCGv loaded_var;
+                TCGv_i32 loaded_var;
                 /* Load/store multiple.  */
                 addr = load_reg(s, rn);
                 offset = 0;
@@ -8160,14 +8235,14 @@
                     tcg_gen_addi_i32(addr, addr, -offset);
                 }
 
-                TCGV_UNUSED(loaded_var);
-                tmp2 = tcg_const_i32(4);
+                TCGV_UNUSED_I32(loaded_var);
                 for (i = 0; i < 16; i++) {
                     if ((insn & (1 << i)) == 0)
                         continue;
                     if (insn & (1 << 20)) {
                         /* Load.  */
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        tmp = tcg_temp_new_i32();
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                         if (i == 15) {
                             gen_bx(s, tmp);
                         } else if (i == rn) {
@@ -8179,14 +8254,14 @@
                     } else {
                         /* Store.  */
                         tmp = load_reg(s, i);
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
+                        tcg_temp_free_i32(tmp);
                     }
-                    tcg_gen_add_i32(addr, addr, tmp2);
+                    tcg_gen_addi_i32(addr, addr, 4);
                 }
                 if (loaded_base) {
                     store_reg(s, rn, loaded_var);
                 }
-                tcg_temp_free_i32(tmp2);
                 if (insn & (1 << 21)) {
                     /* Base register writeback.  */
                     if (insn & (1 << 24)) {
@@ -8455,8 +8530,9 @@
             tmp2 = load_reg(s, rm);
             if ((op & 0x50) == 0x10) {
                 /* sdiv, udiv */
-                if (!arm_feature(env, ARM_FEATURE_DIV))
+                if (!arm_feature(env, ARM_FEATURE_DIV)) {
                     goto illegal_op;
+                }
                 if (op & 0x20)
                     gen_helper_udiv(tmp, tmp, tmp2);
                 else
@@ -8674,7 +8750,7 @@
                 op = (insn >> 22) & 0xf;
                 /* Generate a conditional jump to next instruction.  */
                 s->condlabel = gen_new_label();
-                gen_test_cc(op ^ 1, s->condlabel);
+                arm_gen_test_cc(op ^ 1, s->condlabel);
                 s->condjmp = 1;
 
                 /* offset[11:1] = insn[10:0] */
@@ -8728,7 +8804,7 @@
                         imm = imm + 1 - shift;
                         if (imm != 32) {
                             tmp2 = load_reg(s, rd);
-                            gen_bfi(tmp, tmp2, tmp, shift, (1u << imm) - 1);
+                            tcg_gen_deposit_i32(tmp, tmp2, tmp, shift, imm);
                             tcg_temp_free_i32(tmp2);
                         }
                         break;
@@ -8878,14 +8954,16 @@
                     goto illegal_op;
                 }
                 if (rn == 15) {
-                    /* UNPREDICTABLE or unallocated hint */
+                    /* UNPREDICTABLE, unallocated hint or
+                     * PLD/PLDW/PLI (literal)
+                     */
                     return 0;
                 }
                 if (op1 & 1) {
-                    return 0; /* PLD* or unallocated hint */
+                    return 0; /* PLD/PLDW/PLI or unallocated hint */
                 }
                 if ((op2 == 0) || ((op2 & 0x3c) == 0x30)) {
-                    return 0; /* PLD* or unallocated hint */
+                    return 0; /* PLD/PLDW/PLI or unallocated hint */
                 }
                 /* UNDEF space, or an UNPREDICTABLE */
                 return 1;
@@ -8952,13 +9030,25 @@
         }
         if (insn & (1 << 20)) {
             /* Load.  */
+            tmp = tcg_temp_new_i32();
             switch (op) {
-            case 0: tmp = gen_ld8u(addr, user); break;
-            case 4: tmp = gen_ld8s(addr, user); break;
-            case 1: tmp = gen_ld16u(addr, user); break;
-            case 5: tmp = gen_ld16s(addr, user); break;
-            case 2: tmp = gen_ld32(addr, user); break;
+            case 0:
+                gen_aa32_ld8u(tmp, addr, user);
+                break;
+            case 4:
+                gen_aa32_ld8s(tmp, addr, user);
+                break;
+            case 1:
+                gen_aa32_ld16u(tmp, addr, user);
+                break;
+            case 5:
+                gen_aa32_ld16s(tmp, addr, user);
+                break;
+            case 2:
+                gen_aa32_ld32u(tmp, addr, user);
+                break;
             default:
+                tcg_temp_free_i32(tmp);
                 tcg_temp_free_i32(addr);
                 goto illegal_op;
             }
@@ -8971,13 +9061,21 @@
             /* Store.  */
             tmp = load_reg(s, rs);
             switch (op) {
-            case 0: gen_st8(tmp, addr, user); break;
-            case 1: gen_st16(tmp, addr, user); break;
-            case 2: gen_st32(tmp, addr, user); break;
+            case 0:
+                gen_aa32_st8(tmp, addr, user);
+                break;
+            case 1:
+                gen_aa32_st16(tmp, addr, user);
+                break;
+            case 2:
+                gen_aa32_st32(tmp, addr, user);
+                break;
             default:
+                tcg_temp_free_i32(tmp);
                 tcg_temp_free_i32(addr);
                 goto illegal_op;
             }
+            tcg_temp_free_i32(tmp);
         }
         if (postinc)
             tcg_gen_addi_i32(addr, addr, imm);
@@ -9001,15 +9099,15 @@
     uint32_t val, insn, op, rm, rn, rd, shift, cond;
     int32_t offset;
     int i;
-    TCGv tmp;
-    TCGv tmp2;
-    TCGv addr;
+    TCGv_i32 tmp;
+    TCGv_i32 tmp2;
+    TCGv_i32 addr;
 
     if (s->condexec_mask) {
         cond = s->condexec_cond;
         if (cond != 0x0e) {     /* Skip conditional when condition is AL. */
           s->condlabel = gen_new_label();
-          gen_test_cc(cond ^ 1, s->condlabel);
+          arm_gen_test_cc(cond ^ 1, s->condlabel);
           s->condjmp = 1;
         }
     }
@@ -9107,7 +9205,8 @@
             val &= ~(uint32_t)2;
             addr = tcg_temp_new_i32();
             tcg_gen_movi_i32(addr, val);
-            tmp = gen_ld32(addr, IS_USER(s));
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld32u(tmp, addr, IS_USER(s));
             tcg_temp_free_i32(addr);
             store_reg(s, rd, tmp);
             break;
@@ -9172,7 +9271,7 @@
         } else if (op != 0xf) { /* mvn doesn't read its first operand */
             tmp = load_reg(s, rd);
         } else {
-            TCGV_UNUSED(tmp);
+            TCGV_UNUSED_I32(tmp);
         }
 
         tmp2 = load_reg(s, rm);
@@ -9212,16 +9311,18 @@
             }
             break;
         case 0x5: /* adc */
-            if (s->condexec_mask)
+            if (s->condexec_mask) {
                 gen_adc(tmp, tmp2);
-            else
+            } else {
                 gen_helper_adc_cc(tmp, cpu_env, tmp, tmp2);
+            }
             break;
         case 0x6: /* sbc */
-            if (s->condexec_mask)
+            if (s->condexec_mask) {
                 gen_sub_carry(tmp, tmp, tmp2);
-            else
+            } else {
                 gen_helper_sbc_cc(tmp, cpu_env, tmp, tmp2);
+            }
             break;
         case 0x7: /* ror */
             if (s->condexec_mask) {
@@ -9300,37 +9401,42 @@
         tcg_gen_add_i32(addr, addr, tmp);
         tcg_temp_free_i32(tmp);
 
-        if (op < 3) /* store */
+        if (op < 3) { /* store */
             tmp = load_reg(s, rd);
-
+        } else {
+            tmp = tcg_temp_new_i32();
+        }
         switch (op) {
         case 0: /* str */
-            gen_st32(tmp, addr, IS_USER(s));
+            gen_aa32_st32(tmp, addr, IS_USER(s));
             break;
         case 1: /* strh */
-            gen_st16(tmp, addr, IS_USER(s));
+            gen_aa32_st16(tmp, addr, IS_USER(s));
             break;
         case 2: /* strb */
-            gen_st8(tmp, addr, IS_USER(s));
+            gen_aa32_st8(tmp, addr, IS_USER(s));
             break;
         case 3: /* ldrsb */
-            tmp = gen_ld8s(addr, IS_USER(s));
+            gen_aa32_ld8s(tmp, addr, IS_USER(s));
             break;
         case 4: /* ldr */
-            tmp = gen_ld32(addr, IS_USER(s));
+            gen_aa32_ld32u(tmp, addr, IS_USER(s));
             break;
         case 5: /* ldrh */
-            tmp = gen_ld16u(addr, IS_USER(s));
+            gen_aa32_ld16u(tmp, addr, IS_USER(s));
             break;
         case 6: /* ldrb */
-            tmp = gen_ld8u(addr, IS_USER(s));
+            gen_aa32_ld8u(tmp, addr, IS_USER(s));
             break;
         case 7: /* ldrsh */
-            tmp = gen_ld16s(addr, IS_USER(s));
+            gen_aa32_ld16s(tmp, addr, IS_USER(s));
             break;
         }
-        if (op >= 3) /* load */
+        if (op >= 3) { /* load */
             store_reg(s, rd, tmp);
+        } else {
+            tcg_temp_free_i32(tmp);
+        }
         tcg_temp_free_i32(addr);
         break;
 
@@ -9344,12 +9450,14 @@
 
         if (insn & (1 << 11)) {
             /* load */
-            tmp = gen_ld32(addr, IS_USER(s));
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld32u(tmp, addr, IS_USER(s));
             store_reg(s, rd, tmp);
         } else {
             /* store */
             tmp = load_reg(s, rd);
-            gen_st32(tmp, addr, IS_USER(s));
+            gen_aa32_st32(tmp, addr, IS_USER(s));
+            tcg_temp_free_i32(tmp);
         }
         tcg_temp_free_i32(addr);
         break;
@@ -9364,12 +9472,14 @@
 
         if (insn & (1 << 11)) {
             /* load */
-            tmp = gen_ld8u(addr, IS_USER(s));
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld8u(tmp, addr, IS_USER(s));
             store_reg(s, rd, tmp);
         } else {
             /* store */
             tmp = load_reg(s, rd);
-            gen_st8(tmp, addr, IS_USER(s));
+            gen_aa32_st8(tmp, addr, IS_USER(s));
+            tcg_temp_free_i32(tmp);
         }
         tcg_temp_free_i32(addr);
         break;
@@ -9384,12 +9494,14 @@
 
         if (insn & (1 << 11)) {
             /* load */
-            tmp = gen_ld16u(addr, IS_USER(s));
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld16u(tmp, addr, IS_USER(s));
             store_reg(s, rd, tmp);
         } else {
             /* store */
             tmp = load_reg(s, rd);
-            gen_st16(tmp, addr, IS_USER(s));
+            gen_aa32_st16(tmp, addr, IS_USER(s));
+            tcg_temp_free_i32(tmp);
         }
         tcg_temp_free_i32(addr);
         break;
@@ -9403,12 +9515,14 @@
 
         if (insn & (1 << 11)) {
             /* load */
-            tmp = gen_ld32(addr, IS_USER(s));
+            tmp = tcg_temp_new_i32();
+            gen_aa32_ld32u(tmp, addr, IS_USER(s));
             store_reg(s, rd, tmp);
         } else {
             /* store */
             tmp = load_reg(s, rd);
-            gen_st32(tmp, addr, IS_USER(s));
+            gen_aa32_st32(tmp, addr, IS_USER(s));
+            tcg_temp_free_i32(tmp);
         }
         tcg_temp_free_i32(addr);
         break;
@@ -9470,37 +9584,39 @@
             if ((insn & (1 << 11)) == 0) {
                 tcg_gen_addi_i32(addr, addr, -offset);
             }
-            tmp2 = tcg_const_i32(4);
             for (i = 0; i < 8; i++) {
                 if (insn & (1 << i)) {
                     if (insn & (1 << 11)) {
                         /* pop */
-                        tmp = gen_ld32(addr, IS_USER(s));
+                        tmp = tcg_temp_new_i32();
+                        gen_aa32_ld32u(tmp, addr, IS_USER(s));
                         store_reg(s, i, tmp);
                     } else {
                         /* push */
                         tmp = load_reg(s, i);
-                        gen_st32(tmp, addr, IS_USER(s));
+                        gen_aa32_st32(tmp, addr, IS_USER(s));
+                        tcg_temp_free_i32(tmp);
                     }
                     /* advance to the next address.  */
-                    tcg_gen_add_i32(addr, addr, tmp2);
+                    tcg_gen_addi_i32(addr, addr, 4);
                 }
             }
-            TCGV_UNUSED(tmp);
+            TCGV_UNUSED_I32(tmp);
             if (insn & (1 << 8)) {
                 if (insn & (1 << 11)) {
                     /* pop pc */
-                    tmp = gen_ld32(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp, addr, IS_USER(s));
                     /* don't set the pc until the rest of the instruction
                        has completed */
                 } else {
                     /* push lr */
                     tmp = load_reg(s, 14);
-                    gen_st32(tmp, addr, IS_USER(s));
+                    gen_aa32_st32(tmp, addr, IS_USER(s));
+                    tcg_temp_free_i32(tmp);
                 }
-                tcg_gen_add_i32(addr, addr, tmp2);
+                tcg_gen_addi_i32(addr, addr, 4);
             }
-            tcg_temp_free_i32(tmp2);
             if ((insn & (1 << 11)) == 0) {
                 tcg_gen_addi_i32(addr, addr, -offset);
             }
@@ -9579,10 +9695,11 @@
                 tcg_temp_free_i32(tmp);
                 gen_lookup_tb(s);
             } else {
-                if (insn & (1 << 4))
+                if (insn & (1 << 4)) {
                     shift = CPSR_A | CPSR_I | CPSR_F;
-                else
+                } else {
                     shift = 0;
+                }
                 gen_set_psr_im(s, ((insn & 7) << 6), 0, shift);
             }
             break;
@@ -9595,15 +9712,16 @@
     case 12:
     {
         /* load/store multiple */
-        TCGv loaded_var;
-        TCGV_UNUSED(loaded_var);
+        TCGv_i32 loaded_var;
+        TCGV_UNUSED_I32(loaded_var);
         rn = (insn >> 8) & 0x7;
         addr = load_reg(s, rn);
         for (i = 0; i < 8; i++) {
             if (insn & (1 << i)) {
                 if (insn & (1 << 11)) {
                     /* load */
-                    tmp = gen_ld32(addr, IS_USER(s));
+                    tmp = tcg_temp_new_i32();
+                    gen_aa32_ld32u(tmp, addr, IS_USER(s));
                     if (i == rn) {
                         loaded_var = tmp;
                     } else {
@@ -9612,7 +9730,8 @@
                 } else {
                     /* store */
                     tmp = load_reg(s, i);
-                    gen_st32(tmp, addr, IS_USER(s));
+                    gen_aa32_st32(tmp, addr, IS_USER(s));
+                    tcg_temp_free_i32(tmp);
                 }
                 /* advance to the next address */
                 tcg_gen_addi_i32(addr, addr, 4);
@@ -9638,13 +9757,13 @@
 
         if (cond == 0xf) {
             /* swi */
-            gen_set_pc_im(s->pc);
+            gen_set_pc_im(s, s->pc);
             s->is_jmp = DISAS_SWI;
             break;
         }
         /* generate a conditional jump to next instruction */
         s->condlabel = gen_new_label();
-        gen_test_cc(cond ^ 1, s->condlabel);
+        arm_gen_test_cc(cond ^ 1, s->condlabel);
         s->condjmp = 1;
 
         /* jump to the offset */
@@ -9771,7 +9890,7 @@
        complications trying to do it at the end of the block.  */
     if (dc->condexec_mask || dc->condexec_cond)
       {
-        TCGv tmp = tcg_temp_new_i32();
+        TCGv_i32 tmp = tcg_temp_new_i32();
         tcg_gen_movi_i32(tmp, 0);
         store_cpu_field(tmp, condexec_bits);
       }
@@ -9848,7 +9967,8 @@
         }
 
         if (tcg_check_temp_count()) {
-            fprintf(stderr, "TCG temporary leak before %08x\n", dc->pc);
+            fprintf(stderr, "TCG temporary leak before "TARGET_FMT_lx"\n",
+                    dc->pc);
         }
 
         /* Translation stops when a conditional branch is encountered.
@@ -9888,7 +10008,7 @@
             gen_set_label(dc->condlabel);
         }
         if (dc->condjmp || !dc->is_jmp) {
-            gen_set_pc_im(dc->pc);
+            gen_set_pc_im(dc, dc->pc);
             dc->condjmp = 0;
         }
         gen_set_condexec(dc);
@@ -9904,7 +10024,7 @@
     } else {
         /* While branches must always occur at the end of an IT block,
            there are a few other things that can cause us to terminate
-           the TB in the middel of an IT block:
+           the TB in the middle of an IT block:
             - Exception generating instructions (bkpt, swi, undefined).
             - Page boundaries.
             - Hardware watchpoints.
@@ -9985,19 +10105,6 @@
 {
     CPUARMState *env = cpu->env_ptr;
     int i;
-#if 0
-    union {
-        uint32_t i;
-        float s;
-    } s0, s1;
-    CPU_DoubleU d;
-    /* ??? This assumes float64 and double have the same layout.
-       Oh well, it's only debug dumps.  */
-    union {
-        float64 f64;
-        double d;
-    } d0;
-#endif
     uint32_t psr;
 
     for(i=0;i<16;i++) {
@@ -10018,18 +10125,23 @@
                 cpu_mode_names[psr & 0xf], (psr & 0x10) ? 32 : 26);
 
 #if 0
-    for (i = 0; i < 16; i++) {
-        d.d = env->vfp.regs[i];
-        s0.i = d.l.lower;
-        s1.i = d.l.upper;
-        d0.f64 = d.d;
-        cpu_fprintf(f, "s%02d=%08x(%8g) s%02d=%08x(%8g) d%02d=%08x%08x(%8g)\n",
-                    i * 2, (int)s0.i, s0.s,
-                    i * 2 + 1, (int)s1.i, s1.s,
-                    i, (int)(uint32_t)d.l.upper, (int)(uint32_t)d.l.lower,
-                    d0.d);
+    if (flags & CPU_DUMP_FPU) {
+        int numvfpregs = 0;
+        if (arm_feature(env, ARM_FEATURE_VFP)) {
+            numvfpregs += 16;
+        }
+        if (arm_feature(env, ARM_FEATURE_VFP3)) {
+            numvfpregs += 16;
+        }
+        for (i = 0; i < numvfpregs; i++) {
+            uint64_t v = float64_val(env->vfp.regs[i]);
+            cpu_fprintf(f, "s%02d=%08x s%02d=%08x d%02d=%016" PRIx64 "\n",
+                        i * 2, (uint32_t)v,
+                        i * 2 + 1, (uint32_t)(v >> 32),
+                        i, v);
+        }
+        cpu_fprintf(f, "FPSCR: %08x\n", (int)env->vfp.xregs[ARM_VFP_FPSCR]);
     }
-    cpu_fprintf(f, "FPSCR: %08x\n", (int)env->vfp.xregs[ARM_VFP_FPSCR]);
 #endif
 }
 
diff --git a/target-arm/translate.h b/target-arm/translate.h
new file mode 100644
index 0000000..4761e41
--- /dev/null
+++ b/target-arm/translate.h
@@ -0,0 +1,69 @@
+#ifndef TARGET_ARM_TRANSLATE_H
+#define TARGET_ARM_TRANSLATE_H
+/* internal defines */
+typedef struct DisasContext {
+    target_ulong pc;
+    int is_jmp;
+    /* Nonzero if this instruction has been conditionally skipped.  */
+    int condjmp;
+    /* The label that will be jumped to when the instruction is skipped.  */
+    int condlabel;
+    /* Thumb-2 conditional execution bits.  */
+    int condexec_mask;
+    int condexec_cond;
+    struct TranslationBlock *tb;
+    int singlestep_enabled;
+    int thumb;
+#if !defined(CONFIG_USER_ONLY)
+    int user;
+#endif
+    int vfp_enabled;
+    int vec_len;
+    int vec_stride;
+#ifdef CONFIG_ANDROID_MEMCHECK
+    int search_pc;
+#endif
+} DisasContext;
+
+extern TCGv_ptr cpu_env;
+
+/* target-specific extra values for is_jmp */
+/* These instructions trap after executing, so the A32/T32 decoder must
+ * defer them until after the conditional execution state has been updated.
+ * WFI also needs special handling when single-stepping.
+ */
+#define DISAS_WFI 4
+#define DISAS_SWI 5
+#define DISAS_SMC 6
+/* For instructions which unconditionally cause an exception we can skip
+ * emitting unreachable code at the end of the TB in the A64 decoder
+ */
+#define DISAS_EXC 6
+/* WFE */
+#define DISAS_WFE 7
+
+#ifdef TARGET_AARCH64
+void a64_translate_init(void);
+void gen_intermediate_code_internal_a64(ARMCPU *cpu,
+                                        TranslationBlock *tb,
+                                        bool search_pc);
+void gen_a64_set_pc_im(uint64_t val);
+#else
+static inline void a64_translate_init(void)
+{
+}
+
+static inline void gen_intermediate_code_internal_a64(ARMCPU *cpu,
+                                                      TranslationBlock *tb,
+                                                      bool search_pc)
+{
+}
+
+static inline void gen_a64_set_pc_im(uint64_t val)
+{
+}
+#endif
+
+void arm_gen_test_cc(int cc, int label);
+
+#endif /* TARGET_ARM_TRANSLATE_H */