target-mips & softmmu: Misaligned Memory Accesses for R6/MSA

This patch is the last in a series of three patches that reduce the
overhead of misaligned memory accesses for MIPS R6 and MSA, where such
accesses must be handled in hardware.

The patch integrates three commits from QEMU ToT that are required to
enable support for unaligned accesses for R6 and MSA. One of those
commits adds the probe_write() function, which probes whether the
specified guest write access is permitted. If the access is not
permitted, an exception is taken in the same way as for a real write
access (and the function does not return). Otherwise the function
returns, and a valid entry for this access is present in the TLB.
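
For illustration, a minimal sketch (mirroring the ensure_writable_pages()
helper added in target-mips/op_helper.c below) of how probe_write() is used
to pre-validate both guest pages that an MSA store may touch, so the store
either completes fully or faults before any element is written:

    /* Sketch, system-mode only: probe both pages of a possibly
     * page-spanning vector store before performing it. */
    static inline void ensure_writable_pages(CPUMIPSState *env,
                                             target_ulong addr,
                                             int mmu_idx,
                                             uintptr_t retaddr)
    {
    #if !defined(CONFIG_USER_ONLY)
        if (unlikely(MSA_PAGESPAN(addr))) {
            /* first page */
            probe_write(env, addr, mmu_idx, retaddr);
            /* second page */
            probe_write(env, (addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE,
                        mmu_idx, retaddr);
        }
    #endif
    }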

List of squashed commits from QEMU ToT:

adc370a target-mips: Misaligned memory accesses for MSA
3b4afc9 softmmu: Add probe_write()
be3a8c5 target-mips: Misaligned memory accesses for R6

Change-Id: I00d745a0a6ba45f7e56666b25f732f6faf0e5080
Signed-off-by: Goran Ferenc <goran.ferenc@imgtec.com>
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 0844885..9954e2b 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -104,6 +104,8 @@
                   hwaddr paddr, int prot,
                   int mmu_idx, target_ulong size);
 void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr);
+void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+                 uintptr_t retaddr);
 #else
 static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
 {
diff --git a/softmmu_template.h b/softmmu_template.h
index 8ec7a0e..df3e171 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -504,6 +504,28 @@
     helper_te_st_name(env, addr, val, oi, GETRA());
 }
 
+#if DATA_SIZE == 1
+/* Probe for whether the specified guest write access is permitted.
+ * If it is not permitted then an exception will be taken in the same
+ * way as if this were a real write access (and we will not return).
+ * Otherwise the function will return, and there will be a valid
+ * entry in the TLB for this access.
+ */
+void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
+                 uintptr_t retaddr)
+{
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+
+    if ((addr & TARGET_PAGE_MASK)
+        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+        /* TLB entry is for a different page */
+        if (!VICTIM_TLB_HIT(addr_write)) {
+            tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
+        }
+    }
+}
+#endif
 #endif /* !defined(SOFTMMU_CODE_ACCESS) */
 
 #undef READ_ACCESS_TYPE
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 3687df4..b8716de 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -931,5 +931,11 @@
 DEF_HELPER_4(msa_ffint_s_df, void, env, i32, i32, i32)
 DEF_HELPER_4(msa_ffint_u_df, void, env, i32, i32, i32)
 
-DEF_HELPER_5(msa_ld_df, void, env, i32, i32, i32, s32)
-DEF_HELPER_5(msa_st_df, void, env, i32, i32, i32, s32)
+#define MSALDST_PROTO(type)                         \
+DEF_HELPER_3(msa_ld_ ## type, void, env, i32, tl)   \
+DEF_HELPER_3(msa_st_ ## type, void, env, i32, tl)
+MSALDST_PROTO(b)
+MSALDST_PROTO(h)
+MSALDST_PROTO(w)
+MSALDST_PROTO(d)
+#undef MSALDST_PROTO
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index e7e7d1b..26cabac 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -90,10 +90,10 @@
     }                                                                   \
 }
 #endif
-HELPER_LD(lbu, ldub, uint8_t)
-HELPER_LD(lhu, lduw, uint16_t)
 HELPER_LD(lw, ldl, int32_t)
+#if defined(TARGET_MIPS64)
 HELPER_LD(ld, ldq, int64_t)
+#endif
 #undef HELPER_LD
 
 #if defined(CONFIG_USER_ONLY)
@@ -118,9 +118,10 @@
 }
 #endif
 HELPER_ST(sb, stb, uint8_t)
-HELPER_ST(sh, stw, uint16_t)
 HELPER_ST(sw, stl, uint32_t)
+#if defined(TARGET_MIPS64)
 HELPER_ST(sd, stq, uint64_t)
+#endif
 #undef HELPER_ST
 
 target_ulong helper_clo (target_ulong arg1)
@@ -3684,72 +3685,82 @@
 /* Element-by-element access macros */
 #define DF_ELEMENTS(df) (MSA_WRLEN / DF_BITS(df))
 
-void helper_msa_ld_df(CPUMIPSState *env, uint32_t df, uint32_t wd, uint32_t rs,
-                     int32_t s10)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    target_ulong addr = env->active_tc.gpr[rs] + (s10 << df);
-    int i;
+#if !defined(CONFIG_USER_ONLY)
+#define MEMOP_IDX(DF)                                           \
+        TCGMemOpIdx oi = make_memop_idx(MO_TE | DF | MO_UNALN,  \
+                                        cpu_mmu_index(env));
+#else
+#define MEMOP_IDX(DF)
+#endif
 
-    switch (df) {
-    case DF_BYTE:
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
-            pwd->b[i] = do_lbu(env, addr + (i << DF_BYTE),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_HALF:
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {
-            pwd->h[i] = do_lhu(env, addr + (i << DF_HALF),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_WORD:
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {
-            pwd->w[i] = do_lw(env, addr + (i << DF_WORD),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_DOUBLE:
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {
-            pwd->d[i] = do_ld(env, addr + (i << DF_DOUBLE),
-                                env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    }
+#define MSA_LD_DF(DF, TYPE, LD_INSN, ...)                               \
+void helper_msa_ld_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
+                            target_ulong addr)                          \
+{                                                                       \
+    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
+    wr_t wx;                                                            \
+    int i;                                                              \
+    MEMOP_IDX(DF)                                                       \
+    for (i = 0; i < DF_ELEMENTS(DF); i++) {                             \
+        wx.TYPE[i] = LD_INSN(env, addr + (i << DF), ##__VA_ARGS__);     \
+    }                                                                   \
+    memcpy(pwd, &wx, sizeof(wr_t));                                     \
 }
 
-void helper_msa_st_df(CPUMIPSState *env, uint32_t df, uint32_t wd, uint32_t rs,
-                     int32_t s10)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    target_ulong addr = env->active_tc.gpr[rs] + (s10 << df);
-    int i;
+#if !defined(CONFIG_USER_ONLY)
+MSA_LD_DF(DF_BYTE,   b, helper_ret_ldub_mmu, oi, GETRA())
+MSA_LD_DF(DF_HALF,   h, helper_ret_lduw_mmu, oi, GETRA())
+MSA_LD_DF(DF_WORD,   w, helper_ret_ldul_mmu, oi, GETRA())
+MSA_LD_DF(DF_DOUBLE, d, helper_ret_ldq_mmu,  oi, GETRA())
+#else
+MSA_LD_DF(DF_BYTE,   b, cpu_ldub_data)
+MSA_LD_DF(DF_HALF,   h, cpu_lduw_data)
+MSA_LD_DF(DF_WORD,   w, cpu_ldl_data)
+MSA_LD_DF(DF_DOUBLE, d, cpu_ldq_data)
+#endif
 
-    switch (df) {
-    case DF_BYTE:
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
-            do_sb(env, addr + (i << DF_BYTE), pwd->b[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_HALF:
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {
-            do_sh(env, addr + (i << DF_HALF), pwd->h[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_WORD:
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {
-            do_sw(env, addr + (i << DF_WORD), pwd->w[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
-    case DF_DOUBLE:
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {
-            do_sd(env, addr + (i << DF_DOUBLE), pwd->d[i],
-                    env->hflags & MIPS_HFLAG_KSU);
-        }
-        break;
+#define MSA_PAGESPAN(x) \
+        ((((x) & ~TARGET_PAGE_MASK) + MSA_WRLEN/8 - 1) >= TARGET_PAGE_SIZE)
+
+static inline void ensure_writable_pages(CPUMIPSState *env,
+                                         target_ulong addr,
+                                         int mmu_idx,
+                                         uintptr_t retaddr)
+{
+#if !defined(CONFIG_USER_ONLY)
+    target_ulong page_addr;
+    if (unlikely(MSA_PAGESPAN(addr))) {
+        /* first page */
+        probe_write(env, addr, mmu_idx, retaddr);
+        /* second page */
+        page_addr = (addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+        probe_write(env, page_addr, mmu_idx, retaddr);
     }
+#endif
 }
+
+#define MSA_ST_DF(DF, TYPE, ST_INSN, ...)                               \
+void helper_msa_st_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
+                            target_ulong addr)                          \
+{                                                                       \
+    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
+    int mmu_idx = cpu_mmu_index(env);                                   \
+    int i;                                                              \
+    MEMOP_IDX(DF)                                                       \
+    ensure_writable_pages(env, addr, mmu_idx, GETRA());                 \
+    for (i = 0; i < DF_ELEMENTS(DF); i++) {                             \
+        ST_INSN(env, addr + (i << DF), pwd->TYPE[i], ##__VA_ARGS__);    \
+    }                                                                   \
+}
+
+#if !defined(CONFIG_USER_ONLY)
+MSA_ST_DF(DF_BYTE,   b, helper_ret_stb_mmu, oi, GETRA())
+MSA_ST_DF(DF_HALF,   h, helper_ret_stw_mmu, oi, GETRA())
+MSA_ST_DF(DF_WORD,   w, helper_ret_stl_mmu, oi, GETRA())
+MSA_ST_DF(DF_DOUBLE, d, helper_ret_stq_mmu, oi, GETRA())
+#else
+MSA_ST_DF(DF_BYTE,   b, cpu_stb_data)
+MSA_ST_DF(DF_HALF,   h, cpu_stw_data)
+MSA_ST_DF(DF_WORD,   w, cpu_stl_data)
+MSA_ST_DF(DF_DOUBLE, d, cpu_stq_data)
+#endif
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 1b93fc0..44b5fcc 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -2171,6 +2171,7 @@
     int32_t CP0_Config1;
     /* Routine used to access memory */
     int mem_idx;
+    TCGMemOp default_tcg_memop_mask;
     uint32_t hflags, saved_hflags;
     int bstate;
     target_ulong btarget;
@@ -2848,12 +2849,14 @@
     switch (opc) {
 #if defined(TARGET_MIPS64)
     case OPC_LWU:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUL |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lwu";
         break;
     case OPC_LD:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEQ |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "ld";
         break;
@@ -2924,17 +2927,20 @@
         opn = "lwpc";
         break;
     case OPC_LW:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESL);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESL |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lw";
         break;
     case OPC_LH:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESW);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TESW |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lh";
         break;
     case OPC_LHU:
-        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUW);
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_TEUW |
+                           ctx->default_tcg_memop_mask);
         gen_store_gpr(t0, rt);
         opn = "lhu";
         break;
@@ -3018,7 +3024,8 @@
     switch (opc) {
 #if defined(TARGET_MIPS64)
     case OPC_SD:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEQ);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEQ |
+                           ctx->default_tcg_memop_mask);
         opn = "sd";
         break;
     case OPC_SDL:
@@ -3033,11 +3040,13 @@
         break;
 #endif
     case OPC_SW:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUL);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUL |
+                           ctx->default_tcg_memop_mask);
         opn = "sw";
         break;
     case OPC_SH:
-        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUW);
+        tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_TEUW |
+                           ctx->default_tcg_memop_mask);
         opn = "sh";
         break;
     case OPC_SB:
@@ -3114,7 +3123,8 @@
     case OPC_LWC1:
         {
             TCGv_i32 fp0 = tcg_temp_new_i32();
-            tcg_gen_qemu_ld_i32(fp0, t0, ctx->mem_idx, MO_TESL);
+            tcg_gen_qemu_ld_i32(fp0, t0, ctx->mem_idx, MO_TESL |
+                                ctx->default_tcg_memop_mask);
             gen_store_fpr32(ctx, fp0, ft);
             tcg_temp_free_i32(fp0);
         }
@@ -3124,7 +3134,8 @@
         {
             TCGv_i32 fp0 = tcg_temp_new_i32();
             gen_load_fpr32(ctx, fp0, ft);
-            tcg_gen_qemu_st_i32(fp0, t0, ctx->mem_idx, MO_TEUL);
+            tcg_gen_qemu_st_i32(fp0, t0, ctx->mem_idx, MO_TEUL |
+                                ctx->default_tcg_memop_mask);
             tcg_temp_free_i32(fp0);
         }
         opn = "swc1";
@@ -3132,7 +3143,8 @@
     case OPC_LDC1:
         {
             TCGv_i64 fp0 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(fp0, t0, ctx->mem_idx, MO_TEQ);
+            tcg_gen_qemu_ld_i64(fp0, t0, ctx->mem_idx, MO_TEQ |
+                                ctx->default_tcg_memop_mask);
             gen_store_fpr64(ctx, fp0, ft);
             tcg_temp_free_i64(fp0);
         }
@@ -3142,7 +3154,8 @@
         {
             TCGv_i64 fp0 = tcg_temp_new_i64();
             gen_load_fpr64(ctx, fp0, ft);
-            tcg_gen_qemu_st_i64(fp0, t0, ctx->mem_idx, MO_TEQ);
+            tcg_gen_qemu_st_i64(fp0, t0, ctx->mem_idx, MO_TEQ |
+                                ctx->default_tcg_memop_mask);
             tcg_temp_free_i64(fp0);
         }
         opn = "sdc1";
@@ -19086,32 +19099,39 @@
             uint8_t wd = (ctx->opcode >> 6) & 0x1f;
             uint8_t df = (ctx->opcode >> 0) & 0x3;
 
-            TCGv_i32 tdf = tcg_const_i32(df);
             TCGv_i32 twd = tcg_const_i32(wd);
-            TCGv_i32 trs = tcg_const_i32(rs);
-            TCGv_i32 ts10 = tcg_const_i32(s10);
+            TCGv taddr = tcg_temp_new();
+            gen_base_offset_addr(ctx, taddr, rs, s10 << df);
 
             switch (MASK_MSA_MINOR(opcode)) {
             case OPC_LD_B:
+                gen_helper_msa_ld_b(cpu_env, twd, taddr);
+                break;
             case OPC_LD_H:
+                gen_helper_msa_ld_h(cpu_env, twd, taddr);
+                break;
             case OPC_LD_W:
+                gen_helper_msa_ld_w(cpu_env, twd, taddr);
+                break;
             case OPC_LD_D:
-                save_cpu_state(ctx, 1);
-                gen_helper_msa_ld_df(cpu_env, tdf, twd, trs, ts10);
+                gen_helper_msa_ld_d(cpu_env, twd, taddr);
                 break;
             case OPC_ST_B:
+                gen_helper_msa_st_b(cpu_env, twd, taddr);
+                break;
             case OPC_ST_H:
+                gen_helper_msa_st_h(cpu_env, twd, taddr);
+                break;
             case OPC_ST_W:
+                gen_helper_msa_st_w(cpu_env, twd, taddr);
+                break;
             case OPC_ST_D:
-                save_cpu_state(ctx, 1);
-                gen_helper_msa_st_df(cpu_env, tdf, twd, trs, ts10);
+                gen_helper_msa_st_d(cpu_env, twd, taddr);
                 break;
             }
 
             tcg_temp_free_i32(twd);
-            tcg_temp_free_i32(tdf);
-            tcg_temp_free_i32(trs);
-            tcg_temp_free_i32(ts10);
+            tcg_temp_free(taddr);
         }
         break;
     default:
@@ -19798,6 +19818,8 @@
 #else
         ctx.mem_idx = ctx.hflags & MIPS_HFLAG_KSU;
 #endif
+    ctx.default_tcg_memop_mask = (ctx.insn_flags & ISA_MIPS32R6) ?
+                                 MO_UNALN : MO_ALIGN;
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
     if (max_insns == 0)
diff --git a/target-mips/translate_init.c b/target-mips/translate_init.c
index 860b2c3..eae4f0e 100644
--- a/target-mips/translate_init.c
+++ b/target-mips/translate_init.c
@@ -558,7 +558,7 @@
     },
     {
         /* A generic CPU supporting MIPS64 Release 6 ISA.
-           FIXME: Support IEEE 754-2008 FP and misaligned memory accesses.
+           FIXME: Support IEEE 754-2008 FP.
                   Eventually this should be replaced by a real CPU model. */
         .name = "MIPS64R6-generic",
         .CP0_PRid = 0x00010000,