Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20140609-1' into staging

----------------------------------------------------------------
target-arm queue:
 * support -bios option in vexpress boards
 * register the Cortex-A57 impdef system registers
 * fix handling of UXN bit in ARMv8 page tables
 * complete support of crypto insns in A32/T32
 * implement CRC and crypto insns in A64
 * fix bugs in generic timer control register

----------------------------------------------------------------

# gpg: Signature made Mon 09 Jun 2014 16:08:26 BST using RSA key ID 14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"

* remotes/pmaydell/tags/pull-target-arm-20140609-1:
  target-arm: Delete unused iwmmxt_msadb helper
  target-arm: Fix errors in writes to generic timer control registers
  target-arm: A64: Implement two-register SHA instructions
  target-arm: A64: Implement 3-register SHA instructions
  target-arm: A64: Implement AES instructions
  target-arm: A32/T32: Mask CRC value in calling code, not helper
  target-arm: A64: Implement CRC instructions
  target-arm: VFPv4 implies half-precision extension
  target-arm: Clean up handling of ARMv8 optional feature bits
  target-arm: Remove unnecessary setting of feature bits
  target-arm: arm_any_initfn() should never set ARM_FEATURE_AARCH64
  target-arm: A64: Use PMULL feature bit for PMULL
  target-arm: add support for v8 VMULL.P64 instruction
  target-arm: Allow 3reg_wide undefreq to encode more bad size options
  target-arm: add support for v8 SHA1 and SHA256 instructions
  target-arm: Correct handling of UXN bit in ARMv8 LPAE page tables
  target-arm: Prepare cpreg writefns/readfns for EL3/SecExt
  target-arm/cpu64.c: Actually register Cortex-A57 impdef registers
  vexpress: Add support for the -bios flag to provide firmware

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c
index 33ff422..f311595 100644
--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@@ -28,6 +28,7 @@
 #include "net/net.h"
 #include "sysemu/sysemu.h"
 #include "hw/boards.h"
+#include "hw/loader.h"
 #include "exec/address-spaces.h"
 #include "sysemu/blockdev.h"
 #include "hw/block/flash.h"
@@ -528,6 +529,18 @@
     daughterboard->init(daughterboard, machine->ram_size, machine->cpu_model,
                         pic);
 
+    /*
+     * If a bios file was provided, attempt to map it into memory
+     */
+    if (bios_name) {
+        const char *fn = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
+        if (!fn || load_image_targphys(fn, map[VE_NORFLASH0],
+                                       VEXPRESS_FLASH_SIZE) < 0) {
+            error_report("Could not load ROM image '%s'", bios_name);
+            exit(1);
+        }
+    }
+
     /* Motherboard peripherals: the wiring is the same but the
      * addresses vary between the legacy and A-Series memory maps.
      */
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 995f999..68b9793 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -468,6 +468,9 @@
     uint32_t hwcaps = 0;
 
     GET_FEATURE(ARM_FEATURE_V8_AES, ARM_HWCAP2_ARM_AES);
+    GET_FEATURE(ARM_FEATURE_V8_PMULL, ARM_HWCAP2_ARM_PMULL);
+    GET_FEATURE(ARM_FEATURE_V8_SHA1, ARM_HWCAP2_ARM_SHA1);
+    GET_FEATURE(ARM_FEATURE_V8_SHA256, ARM_HWCAP2_ARM_SHA2);
     GET_FEATURE(ARM_FEATURE_CRC, ARM_HWCAP2_ARM_CRC32);
     return hwcaps;
 }
@@ -536,7 +539,11 @@
     /* probe for the extra features */
 #define GET_FEATURE(feat, hwcap) \
     do { if (arm_feature(&cpu->env, feat)) { hwcaps |= hwcap; } } while (0)
-    GET_FEATURE(ARM_FEATURE_V8_AES, ARM_HWCAP_A64_PMULL);
+    GET_FEATURE(ARM_FEATURE_V8_AES, ARM_HWCAP_A64_AES);
+    GET_FEATURE(ARM_FEATURE_V8_PMULL, ARM_HWCAP_A64_PMULL);
+    GET_FEATURE(ARM_FEATURE_V8_SHA1, ARM_HWCAP_A64_SHA1);
+    GET_FEATURE(ARM_FEATURE_V8_SHA256, ARM_HWCAP_A64_SHA2);
+    GET_FEATURE(ARM_FEATURE_CRC, ARM_HWCAP_A64_CRC32);
 #undef GET_FEATURE
 
     return hwcaps;
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 794dcb9..b877835 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -316,7 +316,6 @@
         set_feature(env, ARM_FEATURE_V7);
         set_feature(env, ARM_FEATURE_ARM_DIV);
         set_feature(env, ARM_FEATURE_LPAE);
-        set_feature(env, ARM_FEATURE_V8_AES);
     }
     if (arm_feature(env, ARM_FEATURE_V7)) {
         set_feature(env, ARM_FEATURE_VAPA);
@@ -349,6 +348,7 @@
     }
     if (arm_feature(env, ARM_FEATURE_VFP4)) {
         set_feature(env, ARM_FEATURE_VFP3);
+        set_feature(env, ARM_FEATURE_VFP_FP16);
     }
     if (arm_feature(env, ARM_FEATURE_VFP3)) {
         set_feature(env, ARM_FEATURE_VFP);
@@ -745,7 +745,6 @@
     cpu->dtb_compatible = "arm,cortex-a15";
     set_feature(&cpu->env, ARM_FEATURE_V7);
     set_feature(&cpu->env, ARM_FEATURE_VFP4);
-    set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
     set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
     set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
@@ -954,15 +953,13 @@
     ARMCPU *cpu = ARM_CPU(obj);
     set_feature(&cpu->env, ARM_FEATURE_V8);
     set_feature(&cpu->env, ARM_FEATURE_VFP4);
-    set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
     set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
-    set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
-    set_feature(&cpu->env, ARM_FEATURE_V7MP);
+    set_feature(&cpu->env, ARM_FEATURE_V8_AES);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA1);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
+    set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
     set_feature(&cpu->env, ARM_FEATURE_CRC);
-#ifdef TARGET_AARCH64
-    set_feature(&cpu->env, ARM_FEATURE_AARCH64);
-#endif
     cpu->midr = 0xffffffff;
 }
 #endif
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 7d8332e..79e7f82 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -635,6 +635,9 @@
     ARM_FEATURE_CBAR_RO, /* has cp15 CBAR and it is read-only */
     ARM_FEATURE_EL2, /* has EL2 Virtualization support */
     ARM_FEATURE_EL3, /* has EL3 Secure monitor support */
+    ARM_FEATURE_V8_SHA1, /* implements SHA1 part of v8 Crypto Extensions */
+    ARM_FEATURE_V8_SHA256, /* implements SHA256 part of v8 Crypto Extensions */
+    ARM_FEATURE_V8_PMULL, /* implements PMULL part of v8 Crypto Extensions */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 8daa622..8b2081c 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -93,11 +93,15 @@
 
     set_feature(&cpu->env, ARM_FEATURE_V8);
     set_feature(&cpu->env, ARM_FEATURE_VFP4);
-    set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
     set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
     set_feature(&cpu->env, ARM_FEATURE_AARCH64);
     set_feature(&cpu->env, ARM_FEATURE_CBAR_RO);
+    set_feature(&cpu->env, ARM_FEATURE_V8_AES);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA1);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
+    set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
+    set_feature(&cpu->env, ARM_FEATURE_CRC);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A57;
     cpu->midr = 0x411fd070;
     cpu->reset_fpsid = 0x41034070;
@@ -128,6 +132,7 @@
     cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */
     cpu->ccsidr[2] = 0x70ffe07a; /* 2048KB L2 cache */
     cpu->dcz_blocksize = 4; /* 64 bytes */
+    define_arm_cp_regs(cpu, cortexa57_cp_reginfo);
 }
 
 #ifdef CONFIG_USER_ONLY
@@ -137,11 +142,13 @@
 
     set_feature(&cpu->env, ARM_FEATURE_V8);
     set_feature(&cpu->env, ARM_FEATURE_VFP4);
-    set_feature(&cpu->env, ARM_FEATURE_VFP_FP16);
     set_feature(&cpu->env, ARM_FEATURE_NEON);
-    set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
-    set_feature(&cpu->env, ARM_FEATURE_V7MP);
     set_feature(&cpu->env, ARM_FEATURE_AARCH64);
+    set_feature(&cpu->env, ARM_FEATURE_V8_AES);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA1);
+    set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
+    set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
+    set_feature(&cpu->env, ARM_FEATURE_CRC);
     cpu->ctr = 0x80030003; /* 32 byte I and D cacheline size, VIPT icache */
     cpu->dcz_blocksize = 7; /*  512 bytes */
 }
diff --git a/target-arm/crypto_helper.c b/target-arm/crypto_helper.c
index d8898ed..3e4b5f7 100644
--- a/target-arm/crypto_helper.c
+++ b/target-arm/crypto_helper.c
@@ -1,7 +1,7 @@
 /*
  * crypto_helper.c - emulate v8 Crypto Extensions instructions
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -15,9 +15,9 @@
 #include "exec/exec-all.h"
 #include "exec/helper-proto.h"
 
-union AES_STATE {
+union CRYPTO_STATE {
     uint8_t    bytes[16];
-    uint32_t   cols[4];
+    uint32_t   words[4];
     uint64_t   l[2];
 };
 
@@ -99,11 +99,11 @@
         /* ShiftRows permutation vector for decryption */
         { 0, 13, 10,  7, 4, 1, 14, 11, 8,  5, 2, 15, 12, 9, 6,  3 },
     };
-    union AES_STATE rk = { .l = {
+    union CRYPTO_STATE rk = { .l = {
         float64_val(env->vfp.regs[rm]),
         float64_val(env->vfp.regs[rm + 1])
     } };
-    union AES_STATE st = { .l = {
+    union CRYPTO_STATE st = { .l = {
         float64_val(env->vfp.regs[rd]),
         float64_val(env->vfp.regs[rd + 1])
     } };
@@ -260,7 +260,7 @@
         0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
         0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d,
     } };
-    union AES_STATE st = { .l = {
+    union CRYPTO_STATE st = { .l = {
         float64_val(env->vfp.regs[rm]),
         float64_val(env->vfp.regs[rm + 1])
     } };
@@ -269,7 +269,7 @@
     assert(decrypt < 2);
 
     for (i = 0; i < 16; i += 4) {
-        st.cols[i >> 2] = cpu_to_le32(
+        st.words[i >> 2] = cpu_to_le32(
             mc[decrypt][st.bytes[i]] ^
             rol32(mc[decrypt][st.bytes[i + 1]], 8) ^
             rol32(mc[decrypt][st.bytes[i + 2]], 16) ^
@@ -279,3 +279,246 @@
     env->vfp.regs[rd] = make_float64(st.l[0]);
     env->vfp.regs[rd + 1] = make_float64(st.l[1]);
 }
+
+/*
+ * SHA-1 logical functions
+ */
+
+static uint32_t cho(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & (y ^ z)) ^ z;
+}
+
+static uint32_t par(uint32_t x, uint32_t y, uint32_t z)
+{
+    return x ^ y ^ z;
+}
+
+static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | ((x | y) & z);
+}
+
+void HELPER(crypto_sha1_3reg)(CPUARMState *env, uint32_t rd, uint32_t rn,
+                              uint32_t rm, uint32_t op)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE n = { .l = {
+        float64_val(env->vfp.regs[rn]),
+        float64_val(env->vfp.regs[rn + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+
+    if (op == 3) { /* sha1su0 */
+        d.l[0] ^= d.l[1] ^ m.l[0];
+        d.l[1] ^= n.l[0] ^ m.l[1];
+    } else {
+        int i;
+
+        for (i = 0; i < 4; i++) {
+            uint32_t t;
+
+            switch (op) {
+            case 0: /* sha1c */
+                t = cho(d.words[1], d.words[2], d.words[3]);
+                break;
+            case 1: /* sha1p */
+                t = par(d.words[1], d.words[2], d.words[3]);
+                break;
+            case 2: /* sha1m */
+                t = maj(d.words[1], d.words[2], d.words[3]);
+                break;
+            default:
+                g_assert_not_reached();
+            }
+            t += rol32(d.words[0], 5) + n.words[0] + m.words[i];
+
+            n.words[0] = d.words[3];
+            d.words[3] = d.words[2];
+            d.words[2] = ror32(d.words[1], 2);
+            d.words[1] = d.words[0];
+            d.words[0] = t;
+        }
+    }
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
+
+void HELPER(crypto_sha1h)(CPUARMState *env, uint32_t rd, uint32_t rm)
+{
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+
+    m.words[0] = ror32(m.words[0], 2);
+    m.words[1] = m.words[2] = m.words[3] = 0;
+
+    env->vfp.regs[rd] = make_float64(m.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(m.l[1]);
+}
+
+void HELPER(crypto_sha1su1)(CPUARMState *env, uint32_t rd, uint32_t rm)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+
+    d.words[0] = rol32(d.words[0] ^ m.words[1], 1);
+    d.words[1] = rol32(d.words[1] ^ m.words[2], 1);
+    d.words[2] = rol32(d.words[2] ^ m.words[3], 1);
+    d.words[3] = rol32(d.words[3] ^ d.words[0], 1);
+
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
+
+/*
+ * The SHA-256 logical functions, according to
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ */
+
+static uint32_t S0(uint32_t x)
+{
+    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
+}
+
+static uint32_t S1(uint32_t x)
+{
+    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
+}
+
+static uint32_t s0(uint32_t x)
+{
+    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
+}
+
+static uint32_t s1(uint32_t x)
+{
+    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
+}
+
+void HELPER(crypto_sha256h)(CPUARMState *env, uint32_t rd, uint32_t rn,
+                            uint32_t rm)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE n = { .l = {
+        float64_val(env->vfp.regs[rn]),
+        float64_val(env->vfp.regs[rn + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        uint32_t t = cho(n.words[0], n.words[1], n.words[2]) + n.words[3]
+                     + S1(n.words[0]) + m.words[i];
+
+        n.words[3] = n.words[2];
+        n.words[2] = n.words[1];
+        n.words[1] = n.words[0];
+        n.words[0] = d.words[3] + t;
+
+        t += maj(d.words[0], d.words[1], d.words[2]) + S0(d.words[0]);
+
+        d.words[3] = d.words[2];
+        d.words[2] = d.words[1];
+        d.words[1] = d.words[0];
+        d.words[0] = t;
+    }
+
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
+
+void HELPER(crypto_sha256h2)(CPUARMState *env, uint32_t rd, uint32_t rn,
+                             uint32_t rm)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE n = { .l = {
+        float64_val(env->vfp.regs[rn]),
+        float64_val(env->vfp.regs[rn + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        uint32_t t = cho(d.words[0], d.words[1], d.words[2]) + d.words[3]
+                     + S1(d.words[0]) + m.words[i];
+
+        d.words[3] = d.words[2];
+        d.words[2] = d.words[1];
+        d.words[1] = d.words[0];
+        d.words[0] = n.words[3 - i] + t;
+    }
+
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
+
+void HELPER(crypto_sha256su0)(CPUARMState *env, uint32_t rd, uint32_t rm)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+
+    d.words[0] += s0(d.words[1]);
+    d.words[1] += s0(d.words[2]);
+    d.words[2] += s0(d.words[3]);
+    d.words[3] += s0(m.words[0]);
+
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
+
+void HELPER(crypto_sha256su1)(CPUARMState *env, uint32_t rd, uint32_t rn,
+                              uint32_t rm)
+{
+    union CRYPTO_STATE d = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    union CRYPTO_STATE n = { .l = {
+        float64_val(env->vfp.regs[rn]),
+        float64_val(env->vfp.regs[rn + 1])
+    } };
+    union CRYPTO_STATE m = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+
+    d.words[0] += s1(m.words[2]) + n.words[1];
+    d.words[1] += s1(m.words[3]) + n.words[2];
+    d.words[2] += s1(d.words[0]) + n.words[3];
+    d.words[3] += s1(d.words[1]) + m.words[0];
+
+    env->vfp.regs[rd] = make_float64(d.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(d.l[1]);
+}
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index cccda74..2b4ce6a 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -24,6 +24,8 @@
 #include "sysemu/sysemu.h"
 #include "qemu/bitops.h"
 #include "internals.h"
+#include "qemu/crc32c.h"
+#include <zlib.h> /* For crc32 */
 
 /* C2.4.7 Multiply and divide */
 /* special cases for 0 and LLONG_MIN are mandated by the standard */
@@ -186,36 +188,6 @@
     return result;
 }
 
-/* Helper function for 64 bit polynomial multiply case:
- * perform PolynomialMult(op1, op2) and return either the top or
- * bottom half of the 128 bit result.
- */
-uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
-{
-    int bitnum;
-    uint64_t res = 0;
-
-    for (bitnum = 0; bitnum < 64; bitnum++) {
-        if (op1 & (1ULL << bitnum)) {
-            res ^= op2 << bitnum;
-        }
-    }
-    return res;
-}
-uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
-{
-    int bitnum;
-    uint64_t res = 0;
-
-    /* bit 0 of op1 can't influence the high 64 bits at all */
-    for (bitnum = 1; bitnum < 64; bitnum++) {
-        if (op1 & (1ULL << bitnum)) {
-            res ^= op2 >> (64 - bitnum);
-        }
-    }
-    return res;
-}
-
 /* 64bit/double versions of the neon float compare functions */
 uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
 {
@@ -438,6 +410,34 @@
     return r;
 }
 
+/* 64-bit versions of the CRC helpers. Note that although the operation
+ * (and the prototypes of crc32c() and crc32() mean that only the bottom
+ * 32 bits of the accumulator and result are used, we pass and return
+ * uint64_t for convenience of the generated code. Unlike the 32-bit
+ * instruction set versions, val may genuinely have 64 bits of data in it.
+ * The upper bytes of val (above the number specified by 'bytes') must have
+ * been zeroed out by the caller.
+ */
+uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
+{
+    uint8_t buf[8];
+
+    stq_le_p(buf, val);
+
+    /* zlib crc32 converts the accumulator and output to one's complement.  */
+    return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
+}
+
+uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
+{
+    uint8_t buf[8];
+
+    stq_le_p(buf, val);
+
+    /* Linux crc32c converts the output to one's complement.  */
+    return crc32c(acc, buf, bytes) ^ 0xffffffff;
+}
+
 /* Handle a CPU exception.  */
 void aarch64_cpu_do_interrupt(CPUState *cs)
 {
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 3f05bed..1d3d10f 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -28,8 +28,6 @@
 DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
 DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
 DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
-DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
-DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
@@ -46,3 +44,5 @@
 DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
+DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
+DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 95af624..050c409 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -319,7 +319,7 @@
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
 
-    env->cp15.c3 = value;
+    raw_write(env, ri, value);
     tlb_flush(CPU(cpu), 1); /* Flush TLB as domain not tracked in TLB */
 }
 
@@ -327,12 +327,12 @@
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
 
-    if (env->cp15.c13_fcse != value) {
+    if (raw_read(env, ri) != value) {
         /* Unlike real hardware the qemu TLB uses virtual addresses,
          * not modified virtual addresses, so this causes a TLB flush.
          */
         tlb_flush(CPU(cpu), 1);
-        env->cp15.c13_fcse = value;
+        raw_write(env, ri, value);
     }
 }
 
@@ -341,7 +341,7 @@
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
 
-    if (env->cp15.contextidr_el1 != value && !arm_feature(env, ARM_FEATURE_MPU)
+    if (raw_read(env, ri) != value && !arm_feature(env, ARM_FEATURE_MPU)
         && !extended_addresses_enabled(env)) {
         /* For VMSA (when not using the LPAE long descriptor page table
          * format) this register includes the ASID, so do a TLB flush.
@@ -349,7 +349,7 @@
          */
         tlb_flush(CPU(cpu), 1);
     }
-    env->cp15.contextidr_el1 = value;
+    raw_write(env, ri, value);
 }
 
 static void tlbiall_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -693,7 +693,7 @@
 static void csselr_write(CPUARMState *env, const ARMCPRegInfo *ri,
                          uint64_t value)
 {
-    env->cp15.c0_cssel = value & 0xf;
+    raw_write(env, ri, value & 0xf);
 }
 
 static uint64_t isr_read(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -1040,16 +1040,16 @@
     int timeridx = ri->crm & 1;
     uint32_t oldval = env->cp15.c14_timer[timeridx].ctl;
 
-    env->cp15.c14_timer[timeridx].ctl = value & 3;
+    env->cp15.c14_timer[timeridx].ctl = deposit64(oldval, 0, 2, value);
     if ((oldval ^ value) & 1) {
         /* Enable toggled */
         gt_recalc_timer(cpu, timeridx);
-    } else if ((oldval & value) & 2) {
+    } else if ((oldval ^ value) & 2) {
         /* IMASK toggled: don't need to recalculate,
          * just set the interrupt line based on ISTATUS
          */
         qemu_set_irq(cpu->gt_timer_outputs[timeridx],
-                     (oldval & 4) && (value & 2));
+                     (oldval & 4) && !(value & 2));
     }
 }
 
@@ -1216,11 +1216,11 @@
 static void par_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
     if (arm_feature(env, ARM_FEATURE_LPAE)) {
-        env->cp15.par_el1 = value;
+        raw_write(env, ri, value);
     } else if (arm_feature(env, ARM_FEATURE_V7)) {
-        env->cp15.par_el1 = value & 0xfffff6ff;
+        raw_write(env, ri, value & 0xfffff6ff);
     } else {
-        env->cp15.par_el1 = value & 0xfffff1ff;
+        raw_write(env, ri, value & 0xfffff1ff);
     }
 }
 
@@ -1423,7 +1423,7 @@
      * for long-descriptor tables the TTBCR fields are used differently
      * and the c2_mask and c2_base_mask values are meaningless.
      */
-    env->cp15.c2_control = value;
+    raw_write(env, ri, value);
     env->cp15.c2_mask = ~(((uint32_t)0xffffffffu) >> maskshift);
     env->cp15.c2_base_mask = ~((uint32_t)0x3fffu >> maskshift);
 }
@@ -1445,7 +1445,7 @@
 static void vmsa_ttbcr_reset(CPUARMState *env, const ARMCPRegInfo *ri)
 {
     env->cp15.c2_base_mask = 0xffffc000u;
-    env->cp15.c2_control = 0;
+    raw_write(env, ri, 0);
     env->cp15.c2_mask = 0;
 }
 
@@ -1456,7 +1456,7 @@
 
     /* For AArch64 the A1 bit could result in a change of ASID, so TLB flush. */
     tlb_flush(CPU(cpu), 1);
-    env->cp15.c2_control = value;
+    raw_write(env, ri, value);
 }
 
 static void vmsa_ttbr_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -2151,14 +2151,14 @@
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
 
-    if (env->cp15.c1_sys == value) {
+    if (raw_read(env, ri) == value) {
         /* Skip the TLB flush if nothing actually changed; Linux likes
          * to do a lot of pointless SCTLR writes.
          */
         return;
     }
 
-    env->cp15.c1_sys = value;
+    raw_write(env, ri, value);
     /* ??? Lots of these bits are not implemented.  */
     /* This may enable/disable the MMU, so do a TLB flush.  */
     tlb_flush(CPU(cpu), 1);
@@ -3929,13 +3929,8 @@
         page_size = (1 << ((granule_sz * (4 - level)) + 3));
         descaddr |= (address & (page_size - 1));
         /* Extract attributes from the descriptor and merge with table attrs */
-        if (arm_feature(env, ARM_FEATURE_V8)) {
-            attrs = extract64(descriptor, 2, 10)
-                | (extract64(descriptor, 53, 11) << 10);
-        } else {
-            attrs = extract64(descriptor, 2, 10)
-                | (extract64(descriptor, 52, 12) << 10);
-        }
+        attrs = extract64(descriptor, 2, 10)
+            | (extract64(descriptor, 52, 12) << 10);
         attrs |= extract32(tableattrs, 0, 2) << 11; /* XN, PXN */
         attrs |= extract32(tableattrs, 3, 1) << 5; /* APTable[1] => AP[2] */
         /* The sense of AP[1] vs APTable[0] is reversed, as APTable[0] == 1
@@ -3961,8 +3956,12 @@
         goto do_fault;
     }
     *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
-    if (attrs & (1 << 12) || (!is_user && (attrs & (1 << 11)))) {
-        /* XN or PXN */
+    if ((arm_feature(env, ARM_FEATURE_V8) && is_user && (attrs & (1 << 12))) ||
+        (!arm_feature(env, ARM_FEATURE_V8) && (attrs & (1 << 12))) ||
+        (!is_user && (attrs & (1 << 11)))) {
+        /* XN/UXN or PXN. Since we only implement EL0/EL1 we unconditionally
+         * treat XN/UXN as UXN for v8.
+         */
         if (access_type == 2) {
             goto do_fault;
         }
@@ -5560,28 +5559,15 @@
     return rmode;
 }
 
-static void crc_init_buffer(uint8_t *buf, uint32_t val, uint32_t bytes)
-{
-    memset(buf, 0, 4);
-
-    if (bytes == 1) {
-        buf[0] = val & 0xff;
-    } else if (bytes == 2) {
-        buf[0] = val & 0xff;
-        buf[1] = (val >> 8) & 0xff;
-    } else {
-        buf[0] = val & 0xff;
-        buf[1] = (val >> 8) & 0xff;
-        buf[2] = (val >> 16) & 0xff;
-        buf[3] = (val >> 24) & 0xff;
-    }
-}
-
+/* CRC helpers.
+ * The upper bytes of val (above the number specified by 'bytes') must have
+ * been zeroed out by the caller.
+ */
 uint32_t HELPER(crc32)(uint32_t acc, uint32_t val, uint32_t bytes)
 {
     uint8_t buf[4];
 
-    crc_init_buffer(buf, val, bytes);
+    stl_le_p(buf, val);
 
     /* zlib crc32 converts the accumulator and output to one's complement.  */
     return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
@@ -5591,7 +5577,7 @@
 {
     uint8_t buf[4];
 
-    crc_init_buffer(buf, val, bytes);
+    stl_le_p(buf, val);
 
     /* Linux crc32c converts the output to one's complement.  */
     return crc32c(acc, buf, bytes) ^ 0xffffffff;
diff --git a/target-arm/helper.h b/target-arm/helper.h
index b63fd0f..facfcd2 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -456,8 +456,6 @@
 DEF_HELPER_3(iwmmxt_avgw0, i64, env, i64, i64)
 DEF_HELPER_3(iwmmxt_avgw1, i64, env, i64, i64)
 
-DEF_HELPER_2(iwmmxt_msadb, i64, i64, i64)
-
 DEF_HELPER_3(iwmmxt_align, i64, i64, i64, i32)
 DEF_HELPER_4(iwmmxt_insr, i64, i64, i32, i32, i32)
 
@@ -512,10 +510,22 @@
 DEF_HELPER_4(crypto_aese, void, env, i32, i32, i32)
 DEF_HELPER_4(crypto_aesmc, void, env, i32, i32, i32)
 
+DEF_HELPER_5(crypto_sha1_3reg, void, env, i32, i32, i32, i32)
+DEF_HELPER_3(crypto_sha1h, void, env, i32, i32)
+DEF_HELPER_3(crypto_sha1su1, void, env, i32, i32)
+
+DEF_HELPER_4(crypto_sha256h, void, env, i32, i32, i32)
+DEF_HELPER_4(crypto_sha256h2, void, env, i32, i32, i32)
+DEF_HELPER_3(crypto_sha256su0, void, env, i32, i32)
+DEF_HELPER_4(crypto_sha256su1, void, env, i32, i32, i32)
+
 DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
 DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
 DEF_HELPER_2(dc_zva, void, env, i64)
 
+DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #endif
diff --git a/target-arm/iwmmxt_helper.c b/target-arm/iwmmxt_helper.c
index 398cbcb..a506914 100644
--- a/target-arm/iwmmxt_helper.c
+++ b/target-arm/iwmmxt_helper.c
@@ -369,15 +369,6 @@
 #undef IWMMXT_OP_AVGW
 #undef AVGW
 
-uint64_t HELPER(iwmmxt_msadb)(uint64_t a, uint64_t b)
-{
-    a =  ((((a >> 0 ) & 0xffff) * ((b >> 0) & 0xffff) +
-           ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff)) & 0xffffffff) |
-         ((((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) +
-           ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff)) << 32);
-    return a;
-}
-
 uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n)
 {
     a >>= n << 3;
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 492e500..47d13e9 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -2211,3 +2211,33 @@
     env->vfp.regs[rm] = make_float64(m0);
     env->vfp.regs[rd] = make_float64(d0);
 }
+
+/* Helper function for 64 bit polynomial multiply case:
+ * perform PolynomialMult(op1, op2) and return either the top or
+ * bottom half of the 128 bit result.
+ */
+uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
+{
+    int bitnum;
+    uint64_t res = 0;
+
+    for (bitnum = 0; bitnum < 64; bitnum++) {
+        if (op1 & (1ULL << bitnum)) {
+            res ^= op2 << bitnum;
+        }
+    }
+    return res;
+}
+uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
+{
+    int bitnum;
+    uint64_t res = 0;
+
+    /* bit 0 of op1 can't influence the high 64 bits at all */
+    for (bitnum = 1; bitnum < 64; bitnum++) {
+        if (op1 & (1ULL << bitnum)) {
+            res ^= op2 >> (64 - bitnum);
+        }
+    }
+    return res;
+}
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index a9c4633..63ad787 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -85,6 +85,8 @@
 typedef void NeonGenTwoSingleOPFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
 typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
+typedef void CryptoTwoOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void CryptoThreeOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
 
 /* initialize TCG globals.  */
 void a64_translate_init(void)
@@ -3774,6 +3776,54 @@
     tcg_temp_free_i64(tcg_shift);
 }
 
+/* CRC32[BHWX], CRC32C[BHWX] */
+static void handle_crc32(DisasContext *s,
+                         unsigned int sf, unsigned int sz, bool crc32c,
+                         unsigned int rm, unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_acc, tcg_val;
+    TCGv_i32 tcg_bytes;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_CRC)
+        || (sf == 1 && sz != 3)
+        || (sf == 0 && sz == 3)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (sz == 3) {
+        tcg_val = cpu_reg(s, rm);
+    } else {
+        uint64_t mask;
+        switch (sz) {
+        case 0:
+            mask = 0xFF;
+            break;
+        case 1:
+            mask = 0xFFFF;
+            break;
+        case 2:
+            mask = 0xFFFFFFFF;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        tcg_val = new_tmp_a64(s);
+        tcg_gen_andi_i64(tcg_val, cpu_reg(s, rm), mask);
+    }
+
+    tcg_acc = cpu_reg(s, rn);
+    tcg_bytes = tcg_const_i32(1 << sz);
+
+    if (crc32c) {
+        gen_helper_crc32c_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes);
+    } else {
+        gen_helper_crc32_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes);
+    }
+
+    tcg_temp_free_i32(tcg_bytes);
+}
+
 /* C3.5.8 Data-processing (2 source)
  *   31   30  29 28             21 20  16 15    10 9    5 4    0
  * +----+---+---+-----------------+------+--------+------+------+
@@ -3821,8 +3871,12 @@
     case 21:
     case 22:
     case 23: /* CRC32 */
-        unsupported_encoding(s, insn);
+    {
+        int sz = extract32(opcode, 0, 2);
+        bool crc32c = extract32(opcode, 2, 1);
+        handle_crc32(s, sf, sz, crc32c, rm, rn, rd);
         break;
+    }
     default:
         unallocated_encoding(s);
         break;
@@ -8574,7 +8628,7 @@
             return;
         }
         if (size == 3) {
-            if (!arm_dc_feature(s, ARM_FEATURE_V8_AES)) {
+            if (!arm_dc_feature(s, ARM_FEATURE_V8_PMULL)) {
                 unallocated_encoding(s);
                 return;
             }
@@ -10497,7 +10551,55 @@
  */
 static void disas_crypto_aes(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    int decrypt;
+    TCGv_i32 tcg_rd_regno, tcg_rn_regno, tcg_decrypt;
+    CryptoThreeOpEnvFn *genfn;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_V8_AES)
+        || size != 0) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 0x4: /* AESE */
+        decrypt = 0;
+        genfn = gen_helper_crypto_aese;
+        break;
+    case 0x6: /* AESMC */
+        decrypt = 0;
+        genfn = gen_helper_crypto_aesmc;
+        break;
+    case 0x5: /* AESD */
+        decrypt = 1;
+        genfn = gen_helper_crypto_aese;
+        break;
+    case 0x7: /* AESIMC */
+        decrypt = 1;
+        genfn = gen_helper_crypto_aesmc;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    /* Note that we convert the Vx register indexes into the
+     * index within the vfp.regs[] array, so we can share the
+     * helper with the AArch32 instructions.
+     */
+    tcg_rd_regno = tcg_const_i32(rd << 1);
+    tcg_rn_regno = tcg_const_i32(rn << 1);
+    tcg_decrypt = tcg_const_i32(decrypt);
+
+    genfn(cpu_env, tcg_rd_regno, tcg_rn_regno, tcg_decrypt);
+
+    tcg_temp_free_i32(tcg_rd_regno);
+    tcg_temp_free_i32(tcg_rn_regno);
+    tcg_temp_free_i32(tcg_decrypt);
 }
 
 /* C3.6.20 Crypto three-reg SHA
@@ -10508,7 +10610,64 @@
  */
 static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 3);
+    int rm = extract32(insn, 16, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    CryptoThreeOpEnvFn *genfn;
+    TCGv_i32 tcg_rd_regno, tcg_rn_regno, tcg_rm_regno;
+    int feature = ARM_FEATURE_V8_SHA256;
+
+    if (size != 0) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 0: /* SHA1C */
+    case 1: /* SHA1P */
+    case 2: /* SHA1M */
+    case 3: /* SHA1SU0 */
+        genfn = NULL;
+        feature = ARM_FEATURE_V8_SHA1;
+        break;
+    case 4: /* SHA256H */
+        genfn = gen_helper_crypto_sha256h;
+        break;
+    case 5: /* SHA256H2 */
+        genfn = gen_helper_crypto_sha256h2;
+        break;
+    case 6: /* SHA256SU1 */
+        genfn = gen_helper_crypto_sha256su1;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!arm_dc_feature(s, feature)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_rd_regno = tcg_const_i32(rd << 1);
+    tcg_rn_regno = tcg_const_i32(rn << 1);
+    tcg_rm_regno = tcg_const_i32(rm << 1);
+
+    if (genfn) {
+        genfn(cpu_env, tcg_rd_regno, tcg_rn_regno, tcg_rm_regno);
+    } else {
+        TCGv_i32 tcg_opcode = tcg_const_i32(opcode);
+
+        gen_helper_crypto_sha1_3reg(cpu_env, tcg_rd_regno,
+                                    tcg_rn_regno, tcg_rm_regno, tcg_opcode);
+        tcg_temp_free_i32(tcg_opcode);
+    }
+
+    tcg_temp_free_i32(tcg_rd_regno);
+    tcg_temp_free_i32(tcg_rn_regno);
+    tcg_temp_free_i32(tcg_rm_regno);
 }
 
 /* C3.6.21 Crypto two-reg SHA
@@ -10519,7 +10678,49 @@
  */
 static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn)
 {
-    unsupported_encoding(s, insn);
+    int size = extract32(insn, 22, 2);
+    int opcode = extract32(insn, 12, 5);
+    int rn = extract32(insn, 5, 5);
+    int rd = extract32(insn, 0, 5);
+    CryptoTwoOpEnvFn *genfn;
+    int feature;
+    TCGv_i32 tcg_rd_regno, tcg_rn_regno;
+
+    if (size != 0) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 0: /* SHA1H */
+        feature = ARM_FEATURE_V8_SHA1;
+        genfn = gen_helper_crypto_sha1h;
+        break;
+    case 1: /* SHA1SU1 */
+        feature = ARM_FEATURE_V8_SHA1;
+        genfn = gen_helper_crypto_sha1su1;
+        break;
+    case 2: /* SHA256SU0 */
+        feature = ARM_FEATURE_V8_SHA256;
+        genfn = gen_helper_crypto_sha256su0;
+        break;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!arm_dc_feature(s, feature)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_rd_regno = tcg_const_i32(rd << 1);
+    tcg_rn_regno = tcg_const_i32(rn << 1);
+
+    genfn(cpu_env, tcg_rd_regno, tcg_rn_regno);
+
+    tcg_temp_free_i32(tcg_rd_regno);
+    tcg_temp_free_i32(tcg_rn_regno);
 }
 
 /* C3.6 Data processing - SIMD, inc Crypto
diff --git a/target-arm/translate.c b/target-arm/translate.c
index d499caa..cf4e767 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -1382,8 +1382,6 @@
 IWMMXT_OP_ENV(avgw0)
 IWMMXT_OP_ENV(avgw1)
 
-IWMMXT_OP(msadb)
-
 IWMMXT_OP_ENV(packuw)
 IWMMXT_OP_ENV(packul)
 IWMMXT_OP_ENV(packuq)
@@ -4776,6 +4774,7 @@
 #define NEON_3R_VPMIN 21
 #define NEON_3R_VQDMULH_VQRDMULH 22
 #define NEON_3R_VPADD 23
+#define NEON_3R_SHA 24 /* SHA1C,SHA1P,SHA1M,SHA1SU0,SHA256H{2},SHA256SU1 */
 #define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */
 #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */
 #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */
@@ -4809,6 +4808,7 @@
     [NEON_3R_VPMIN] = 0x7,
     [NEON_3R_VQDMULH_VQRDMULH] = 0x6,
     [NEON_3R_VPADD] = 0x7,
+    [NEON_3R_SHA] = 0xf, /* size field encodes op type */
     [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */
@@ -4842,6 +4842,7 @@
 #define NEON_2RM_VCEQ0 18
 #define NEON_2RM_VCLE0 19
 #define NEON_2RM_VCLT0 20
+#define NEON_2RM_SHA1H 21
 #define NEON_2RM_VABS 22
 #define NEON_2RM_VNEG 23
 #define NEON_2RM_VCGT0_F 24
@@ -4858,6 +4859,7 @@
 #define NEON_2RM_VMOVN 36 /* Includes VQMOVN, VQMOVUN */
 #define NEON_2RM_VQMOVN 37 /* Includes VQMOVUN */
 #define NEON_2RM_VSHLL 38
+#define NEON_2RM_SHA1SU1 39 /* Includes SHA256SU0 */
 #define NEON_2RM_VRINTN 40
 #define NEON_2RM_VRINTX 41
 #define NEON_2RM_VRINTA 42
@@ -4918,6 +4920,7 @@
     [NEON_2RM_VCEQ0] = 0x7,
     [NEON_2RM_VCLE0] = 0x7,
     [NEON_2RM_VCLT0] = 0x7,
+    [NEON_2RM_SHA1H] = 0x4,
     [NEON_2RM_VABS] = 0x7,
     [NEON_2RM_VNEG] = 0x7,
     [NEON_2RM_VCGT0_F] = 0x4,
@@ -4934,6 +4937,7 @@
     [NEON_2RM_VMOVN] = 0x7,
     [NEON_2RM_VQMOVN] = 0x7,
     [NEON_2RM_VSHLL] = 0x7,
+    [NEON_2RM_SHA1SU1] = 0x4,
     [NEON_2RM_VRINTN] = 0x4,
     [NEON_2RM_VRINTX] = 0x4,
     [NEON_2RM_VRINTA] = 0x4,
@@ -5011,6 +5015,49 @@
         if (q && ((rd | rn | rm) & 1)) {
             return 1;
         }
+        /*
+         * The SHA-1/SHA-256 3-register instructions require special treatment
+         * here, as their size field is overloaded as an op type selector, and
+         * they all consume their input in a single pass.
+         */
+        if (op == NEON_3R_SHA) {
+            if (!q) {
+                return 1;
+            }
+            if (!u) { /* SHA-1 */
+                if (!arm_feature(env, ARM_FEATURE_V8_SHA1)) {
+                    return 1;
+                }
+                tmp = tcg_const_i32(rd);
+                tmp2 = tcg_const_i32(rn);
+                tmp3 = tcg_const_i32(rm);
+                tmp4 = tcg_const_i32(size);
+                gen_helper_crypto_sha1_3reg(cpu_env, tmp, tmp2, tmp3, tmp4);
+                tcg_temp_free_i32(tmp4);
+            } else { /* SHA-256 */
+                if (!arm_feature(env, ARM_FEATURE_V8_SHA256) || size == 3) {
+                    return 1;
+                }
+                tmp = tcg_const_i32(rd);
+                tmp2 = tcg_const_i32(rn);
+                tmp3 = tcg_const_i32(rm);
+                switch (size) {
+                case 0:
+                    gen_helper_crypto_sha256h(cpu_env, tmp, tmp2, tmp3);
+                    break;
+                case 1:
+                    gen_helper_crypto_sha256h2(cpu_env, tmp, tmp2, tmp3);
+                    break;
+                case 2:
+                    gen_helper_crypto_sha256su1(cpu_env, tmp, tmp2, tmp3);
+                    break;
+                }
+            }
+            tcg_temp_free_i32(tmp);
+            tcg_temp_free_i32(tmp2);
+            tcg_temp_free_i32(tmp3);
+            return 0;
+        }
         if (size == 3 && op != NEON_3R_LOGIC) {
             /* 64-bit element instructions. */
             for (pass = 0; pass < (q ? 2 : 1); pass++) {
@@ -5905,10 +5952,11 @@
                 int src1_wide;
                 int src2_wide;
                 int prewiden;
-                /* undefreq: bit 0 : UNDEF if size != 0
-                 *           bit 1 : UNDEF if size == 0
-                 *           bit 2 : UNDEF if U == 1
-                 * Note that [1:0] set implies 'always UNDEF'
+                /* undefreq: bit 0 : UNDEF if size == 0
+                 *           bit 1 : UNDEF if size == 1
+                 *           bit 2 : UNDEF if size == 2
+                 *           bit 3 : UNDEF if U == 1
+                 * Note that [2:0] set implies 'always UNDEF'
                  */
                 int undefreq;
                 /* prewiden, src1_wide, src2_wide, undefreq */
@@ -5922,13 +5970,13 @@
                     {0, 1, 1, 0}, /* VSUBHN */
                     {0, 0, 0, 0}, /* VABDL */
                     {0, 0, 0, 0}, /* VMLAL */
-                    {0, 0, 0, 6}, /* VQDMLAL */
+                    {0, 0, 0, 9}, /* VQDMLAL */
                     {0, 0, 0, 0}, /* VMLSL */
-                    {0, 0, 0, 6}, /* VQDMLSL */
+                    {0, 0, 0, 9}, /* VQDMLSL */
                     {0, 0, 0, 0}, /* Integer VMULL */
-                    {0, 0, 0, 2}, /* VQDMULL */
-                    {0, 0, 0, 5}, /* Polynomial VMULL */
-                    {0, 0, 0, 3}, /* Reserved: always UNDEF */
+                    {0, 0, 0, 1}, /* VQDMULL */
+                    {0, 0, 0, 0xa}, /* Polynomial VMULL */
+                    {0, 0, 0, 7}, /* Reserved: always UNDEF */
                 };
 
                 prewiden = neon_3reg_wide[op][0];
@@ -5936,9 +5984,8 @@
                 src2_wide = neon_3reg_wide[op][2];
                 undefreq = neon_3reg_wide[op][3];
 
-                if (((undefreq & 1) && (size != 0)) ||
-                    ((undefreq & 2) && (size == 0)) ||
-                    ((undefreq & 4) && u)) {
+                if ((undefreq & (1 << size)) ||
+                    ((undefreq & 8) && u)) {
                     return 1;
                 }
                 if ((src1_wide && (rn & 1)) ||
@@ -5947,6 +5994,30 @@
                     return 1;
                 }
 
+                /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
+                 * outside the loop below as it only performs a single pass.
+                 */
+                if (op == 14 && size == 2) {
+                    TCGv_i64 tcg_rn, tcg_rm, tcg_rd;
+
+                    if (!arm_feature(env, ARM_FEATURE_V8_PMULL)) {
+                        return 1;
+                    }
+                    tcg_rn = tcg_temp_new_i64();
+                    tcg_rm = tcg_temp_new_i64();
+                    tcg_rd = tcg_temp_new_i64();
+                    neon_load_reg64(tcg_rn, rn);
+                    neon_load_reg64(tcg_rm, rm);
+                    gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm);
+                    neon_store_reg64(tcg_rd, rd);
+                    gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm);
+                    neon_store_reg64(tcg_rd, rd + 1);
+                    tcg_temp_free_i64(tcg_rn);
+                    tcg_temp_free_i64(tcg_rm);
+                    tcg_temp_free_i64(tcg_rd);
+                    return 0;
+                }
+
                 /* Avoid overlapping operands.  Wide source operands are
                    always aligned so will never overlap with wide
                    destinations in problematic ways.  */
@@ -6486,6 +6557,41 @@
                     tcg_temp_free_i32(tmp2);
                     tcg_temp_free_i32(tmp3);
                     break;
+                case NEON_2RM_SHA1H:
+                    if (!arm_feature(env, ARM_FEATURE_V8_SHA1)
+                        || ((rm | rd) & 1)) {
+                        return 1;
+                    }
+                    tmp = tcg_const_i32(rd);
+                    tmp2 = tcg_const_i32(rm);
+
+                    gen_helper_crypto_sha1h(cpu_env, tmp, tmp2);
+
+                    tcg_temp_free_i32(tmp);
+                    tcg_temp_free_i32(tmp2);
+                    break;
+                case NEON_2RM_SHA1SU1:
+                    if ((rm | rd) & 1) {
+                            return 1;
+                    }
+                    /* bit 6 (q): set -> SHA256SU0, cleared -> SHA1SU1 */
+                    if (q) {
+                        if (!arm_feature(env, ARM_FEATURE_V8_SHA256)) {
+                            return 1;
+                        }
+                    } else if (!arm_feature(env, ARM_FEATURE_V8_SHA1)) {
+                        return 1;
+                    }
+                    tmp = tcg_const_i32(rd);
+                    tmp2 = tcg_const_i32(rm);
+                    if (q) {
+                        gen_helper_crypto_sha256su0(cpu_env, tmp, tmp2);
+                    } else {
+                        gen_helper_crypto_sha1su1(cpu_env, tmp, tmp2);
+                    }
+                    tcg_temp_free_i32(tmp);
+                    tcg_temp_free_i32(tmp2);
+                    break;
                 default:
                 elementwise:
                     for (pass = 0; pass < (q ? 4 : 2); pass++) {
@@ -7698,6 +7804,11 @@
 
             tmp = load_reg(s, rn);
             tmp2 = load_reg(s, rm);
+            if (op1 == 0) {
+                tcg_gen_andi_i32(tmp2, tmp2, 0xff);
+            } else if (op1 == 1) {
+                tcg_gen_andi_i32(tmp2, tmp2, 0xffff);
+            }
             tmp3 = tcg_const_i32(1 << op1);
             if (c & 0x2) {
                 gen_helper_crc32c(tmp, tmp, tmp2, tmp3);
@@ -9330,6 +9441,11 @@
                     }
 
                     tmp2 = load_reg(s, rm);
+                    if (sz == 0) {
+                        tcg_gen_andi_i32(tmp2, tmp2, 0xff);
+                    } else if (sz == 1) {
+                        tcg_gen_andi_i32(tmp2, tmp2, 0xffff);
+                    }
                     tmp3 = tcg_const_i32(1 << sz);
                     if (c) {
                         gen_helper_crc32c(tmp, tmp, tmp2, tmp3);