[MIPS64] target-mips : implement unaligned loads using TCG

original commit from QEMU ToT :

commit fc40787abcf8452b8f50d92b7a13243a12972c7a
Author: Aurelien Jarno <aurelien@aurel32.net>
Date:   Tue Oct 9 21:53:21 2012 +0200

target-mips: implement unaligned loads using TCG

Load/store from helpers should be avoided as they are quite
inefficient. Rewrite unaligned loads instructions using TCG and
aligned loads. The number of actual loads operations to implement
an unaligned load instruction is reduced from up to 8 to 1.

Note: As we can't rely on shift by 32 or 64 undefined behaviour,
the code loads already shift by one constants.

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>

Change-Id: I5d034506490ccd00b350f2eb9ecf86284ed7a15e
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 3e6557a..31a77ac 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -5,13 +5,9 @@
 DEF_HELPER_1(interrupt_restart, void, env)
 
 #ifdef TARGET_MIPS64
-DEF_HELPER_4(ldl, tl, env, tl, tl, int)
-DEF_HELPER_4(ldr, tl, env, tl, tl, int)
 DEF_HELPER_4(sdl, void, env, tl, tl, int)
 DEF_HELPER_4(sdr, void, env, tl, tl, int)
 #endif
-DEF_HELPER_4(lwl, tl, env, tl, tl, int)
-DEF_HELPER_4(lwr, tl, env, tl, tl, int)
 DEF_HELPER_4(swl, void, env, tl, tl, int)
 DEF_HELPER_4(swr, void, env, tl, tl, int)
 
@@ -29,8 +25,8 @@
 #ifdef TARGET_MIPS64
 DEF_HELPER_FLAGS_1(dclo, TCG_CALL_NO_RWG_SE, tl, tl)
 DEF_HELPER_FLAGS_1(dclz, TCG_CALL_NO_RWG_SE, tl, tl)
-DEF_HELPER_2(dmult, void, tl, tl)
-DEF_HELPER_2(dmultu, void, tl, tl)
+DEF_HELPER_3(dmult, void, env, tl, tl)
+DEF_HELPER_3(dmultu, void, env, tl, tl)
 #endif
 
 DEF_HELPER_3(muls, tl, env, tl, tl)
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index 9f8adfd..66c2cab 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -415,56 +415,6 @@
 #define GET_OFFSET(addr, offset) (addr - (offset))
 #endif
 
-target_ulong helper_lwl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
-                        int mem_idx)
-{
-    target_ulong tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
-
-    if (GET_LMASK(arg2) <= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
-    }
-
-    if (GET_LMASK(arg2) <= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
-    }
-
-    if (GET_LMASK(arg2) == 0) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00) | tmp;
-    }
-    return (int32_t)arg1;
-}
-
-target_ulong helper_lwr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
-                        int mem_idx)
-{
-    target_ulong tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0xFFFFFF00) | tmp;
-
-    if (GET_LMASK(arg2) >= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FF) | (tmp << 8);
-    }
-
-    if (GET_LMASK(arg2) >= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFF) | (tmp << 16);
-    }
-
-    if (GET_LMASK(arg2) == 3) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
-        arg1 = (arg1 & 0x00FFFFFF) | (tmp << 24);
-    }
-    return (int32_t)arg1;
-}
-
 void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                 int mem_idx)
 {
@@ -505,98 +455,6 @@
 #define GET_LMASK64(v) (((v) & 7) ^ 7)
 #endif
 
-target_ulong helper_ldl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
-                        int mem_idx)
-{
-    uint64_t tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
-
-    if (GET_LMASK64(arg2) <= 6) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 1), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
-    }
-
-    if (GET_LMASK64(arg2) <= 5) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 2), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
-    }
-
-    if (GET_LMASK64(arg2) <= 4) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
-    }
-
-    if (GET_LMASK64(arg2) <= 3) {
-        tmp = do_lbu(GET_OFFSET(arg2, 4), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
-    }
-
-    if (GET_LMASK64(arg2) <= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 5), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
-    }
-
-    if (GET_LMASK64(arg2) <= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 6), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp << 8);
-    }
-
-    if (GET_LMASK64(arg2) == 0) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, 7), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
-    }
-
-    return arg1;
-}
-
-target_ulong helper_ldr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
-                        int mem_idx)
-{
-    uint64_t tmp;
-
-    tmp = do_lbu(env, arg2, mem_idx);
-    arg1 = (arg1 & 0xFFFFFFFFFFFFFF00ULL) | tmp;
-
-    if (GET_LMASK64(arg2) >= 1) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -1), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFFFF00FFULL) | (tmp  << 8);
-    }
-
-    if (GET_LMASK64(arg2) >= 2) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -2), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFFFF00FFFFULL) | (tmp << 16);
-    }
-
-    if (GET_LMASK64(arg2) >= 3) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -3), mem_idx);
-        arg1 = (arg1 & 0xFFFFFFFF00FFFFFFULL) | (tmp << 24);
-    }
-
-    if (GET_LMASK64(arg2) >= 4) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -4), mem_idx);
-        arg1 = (arg1 & 0xFFFFFF00FFFFFFFFULL) | (tmp << 32);
-    }
-
-    if (GET_LMASK64(arg2) >= 5) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -5), mem_idx);
-        arg1 = (arg1 & 0xFFFF00FFFFFFFFFFULL) | (tmp << 40);
-    }
-
-    if (GET_LMASK64(arg2) >= 6) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -6), mem_idx);
-        arg1 = (arg1 & 0xFF00FFFFFFFFFFFFULL) | (tmp << 48);
-    }
-
-    if (GET_LMASK64(arg2) == 7) {
-        tmp = do_lbu(env, GET_OFFSET(arg2, -7), mem_idx);
-        arg1 = (arg1 & 0x00FFFFFFFFFFFFFFULL) | (tmp << 56);
-    }
-
-    return arg1;
-}
-
 void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
                  int mem_idx)
 {
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 683c936..798517b 100755
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -988,8 +988,10 @@
                       int base, int16_t offset)
 {
     const char * __attribute__((unused)) opn = "ldst";
-    TCGv t0 = tcg_temp_new();
-    TCGv t1 = tcg_temp_new();
+    TCGv t0, t1, t2;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
 
     if (base == 0) {
         tcg_gen_movi_tl(t0, offset);
@@ -1028,10 +1030,22 @@
         opn = "sd";
         break;
     case OPC_LDL:
-        save_cpu_state(ctx, 1);
+        tcg_gen_andi_tl(t1, t0, 7);
+#ifndef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 7);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~7);
+        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
+        tcg_gen_shl_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 63);
+        t2 = tcg_const_tl(0x7fffffffffffffffull);
+        tcg_gen_shr_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_4i(ldl, t1, cpu_env, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
+        gen_store_gpr(t0, rt);
         opn = "ldl";
         break;
     case OPC_SDL:
@@ -1041,10 +1055,22 @@
         opn = "sdl";
         break;
     case OPC_LDR:
-        save_cpu_state(ctx, 1);
+        tcg_gen_andi_tl(t1, t0, 7);
+#ifdef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 7);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~7);
+        tcg_gen_qemu_ld64(t0, t0, ctx->mem_idx);
+        tcg_gen_shr_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 63);
+        t2 = tcg_const_tl(0xfffffffffffffffeull);
+        tcg_gen_shl_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_4i(ldr, t1, cpu_env, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
+        gen_store_gpr(t0, rt);
         opn = "ldr";
         break;
     case OPC_SDR:
@@ -1103,10 +1129,23 @@
         opn = "lbu";
         break;
     case OPC_LWL:
-        save_cpu_state(ctx, 1);
+        tcg_gen_andi_tl(t1, t0, 3);
+#ifndef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 3);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~3);
+        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
+        tcg_gen_shl_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 31);
+        t2 = tcg_const_tl(0x7fffffffull);
+        tcg_gen_shr_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_4i(lwl, t1, cpu_env, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
+        tcg_gen_ext32s_tl(t0, t0);
+        gen_store_gpr(t0, rt);
         opn = "lwl";
         break;
     case OPC_SWL:
@@ -1116,10 +1155,22 @@
         opn = "swl";
         break;
     case OPC_LWR:
-        save_cpu_state(ctx, 1);
+        tcg_gen_andi_tl(t1, t0, 3);
+#ifdef TARGET_WORDS_BIGENDIAN
+        tcg_gen_xori_tl(t1, t1, 3);
+#endif
+        tcg_gen_shli_tl(t1, t1, 3);
+        tcg_gen_andi_tl(t0, t0, ~3);
+        tcg_gen_qemu_ld32u(t0, t0, ctx->mem_idx);
+        tcg_gen_shr_tl(t0, t0, t1);
+        tcg_gen_xori_tl(t1, t1, 31);
+        t2 = tcg_const_tl(0xfffffffeull);
+        tcg_gen_shl_tl(t2, t2, t1);
         gen_load_gpr(t1, rt);
-        gen_helper_4i(lwr, t1, cpu_env, t1, t0, ctx->mem_idx);
-        gen_store_gpr(t1, rt);
+        tcg_gen_and_tl(t1, t1, t2);
+        tcg_temp_free(t2);
+        tcg_gen_or_tl(t0, t0, t1);
+        gen_store_gpr(t0, rt);
         opn = "lwr";
         break;
     case OPC_SWR:
@@ -2138,11 +2189,11 @@
         opn = "ddivu";
         break;
     case OPC_DMULT:
-        gen_helper_dmult(t0, t1);
+        gen_helper_dmult(cpu_env, t0, t1);
         opn = "dmult";
         break;
     case OPC_DMULTU:
-        gen_helper_dmultu(t0, t1);
+        gen_helper_dmultu(cpu_env, t0, t1);
         opn = "dmultu";
         break;
 #endif
@@ -4091,17 +4142,17 @@
             break;
         case 1:
             check_insn(env, ctx, ASE_MT);
-            gen_helper_mfc0_mvpcontrol(arg);
+            gen_helper_mfc0_mvpcontrol(arg, cpu_env);
             rn = "MVPControl";
             break;
         case 2:
             check_insn(env, ctx, ASE_MT);
-            gen_helper_mfc0_mvpconf0(arg);
+            gen_helper_mfc0_mvpconf0(arg, cpu_env);
             rn = "MVPConf0";
             break;
         case 3:
             check_insn(env, ctx, ASE_MT);
-            gen_helper_mfc0_mvpconf1(arg);
+            gen_helper_mfc0_mvpconf1(arg, cpu_env);
             rn = "MVPConf1";
             break;
         default:
@@ -4111,7 +4162,7 @@
     case 1:
         switch (sel) {
         case 0:
-            gen_helper_mfc0_random(arg);
+            gen_helper_mfc0_random(arg, cpu_env);
             rn = "Random";
             break;
         case 1:
@@ -4161,12 +4212,12 @@
             break;
         case 1:
             check_insn(env, ctx, ASE_MT);
-            gen_helper_mfc0_tcstatus(arg);
+            gen_helper_mfc0_tcstatus(arg, cpu_env);
             rn = "TCStatus";
             break;
         case 2:
             check_insn(env, ctx, ASE_MT);
-            gen_helper_mfc0_tcbind(arg);
+            gen_helper_mfc0_tcbind(arg, cpu_env);
             rn = "TCBind";
             break;
         case 3:
@@ -4299,7 +4350,7 @@
             /* Mark as an IO operation because we read the time.  */
             if (use_icount)
                 gen_io_start();
-            gen_helper_mfc0_count(arg);
+            gen_helper_mfc0_count(arg, cpu_env);
             if (use_icount) {
                 gen_io_end();
                 ctx->bstate = BS_STOP;
@@ -4436,7 +4487,7 @@
     case 18:
         switch (sel) {
         case 0 ... 7:
-            gen_helper_1i(dmfc0_watchlo, arg, sel);
+            gen_helper_2i(dmfc0_watchlo, arg, cpu_env, sel);
             rn = "WatchLo";
             break;
         default:
@@ -4446,7 +4497,7 @@
     case 19:
         switch (sel) {
         case 0 ... 7:
-            gen_helper_1i(mfc0_watchhi, arg, sel);
+            gen_helper_2i(mfc0_watchhi, arg, cpu_env, sel);
             rn = "WatchHi";
             break;
         default: