Merge remote-tracking branch 'remotes/kvm/uq/master' into staging

* remotes/kvm/uq/master:
  pc: port 92 reset requires a low->high transition
  cpu: make CPU_INTERRUPT_RESET available on all targets
  apic: do not accept SIPI on the bootstrap processor
  target-i386: preserve FPU and MSR state on INIT
  target-i386: fix set of registers zeroed on reset
  kvm: forward INIT signals coming from the chipset
  kvm: reset state from the CPU's reset method
  target-i386: the x86 CPL is stored in CS.selector - auto update hflags accordingly.
  target-i386: set eflags prior to calling cpu_x86_load_seg_cache() in seg_helper.c
  target-i386: set eflags and cr0 prior to calling cpu_x86_load_seg_cache() in smm_helper.c
  target-i386: set eflags prior to calling svm_load_seg_cache() in svm_helper.c
  pci-assign: limit # of msix vectors
  pci-assign: Fix a bug when map MSI-X table memory failed
  kvm: make one_reg helpers available for everyone
  target-i386: Remove unused data from local array

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/Makefile b/Makefile
index a120aab..d830483 100644
--- a/Makefile
+++ b/Makefile
@@ -148,10 +148,6 @@
 
 all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all modules
 
-vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
-
-vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
-
 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
 qemu-options.def: $(SRC_PATH)/qemu-options.hx
@@ -195,8 +191,6 @@
 
 recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES)
 
-bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS)
-
 $(BUILD_DIR)/version.o: $(SRC_PATH)/version.rc $(BUILD_DIR)/config-host.h | $(BUILD_DIR)/version.lo
 	$(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<,"  RC    version.o")
 $(BUILD_DIR)/version.lo: $(SRC_PATH)/version.rc $(BUILD_DIR)/config-host.h
@@ -384,17 +378,25 @@
 install-datadir install-localstatedir
 	$(INSTALL_DIR) "$(DESTDIR)$(bindir)"
 ifneq ($(TOOLS),)
-	$(INSTALL_PROG) $(STRIP_OPT) $(TOOLS) "$(DESTDIR)$(bindir)"
+	$(INSTALL_PROG) $(TOOLS) "$(DESTDIR)$(bindir)"
+ifneq ($(STRIP),)
+	$(STRIP) $(TOOLS:%="$(DESTDIR)$(bindir)/%")
+endif
 endif
 ifneq ($(CONFIG_MODULES),)
 	$(INSTALL_DIR) "$(DESTDIR)$(qemu_moddir)"
-	for s in $(patsubst %.mo,%$(DSOSUF),$(modules-m)); do \
-		$(INSTALL_PROG) $(STRIP_OPT) $$s "$(DESTDIR)$(qemu_moddir)/$$(echo $$s | tr / -)"; \
+	for s in $(modules-m:.mo=$(DSOSUF)); do \
+		t="$(DESTDIR)$(qemu_moddir)/$$(echo $$s | tr / -)"; \
+		$(INSTALL_LIB) $$s "$$t"; \
+		test -z "$(STRIP)" || $(STRIP) "$$t"; \
 	done
 endif
 ifneq ($(HELPERS-y),)
 	$(INSTALL_DIR) "$(DESTDIR)$(libexecdir)"
-	$(INSTALL_PROG) $(STRIP_OPT) $(HELPERS-y) "$(DESTDIR)$(libexecdir)"
+	$(INSTALL_PROG) $(HELPERS-y) "$(DESTDIR)$(libexecdir)"
+ifneq ($(STRIP),)
+	$(STRIP) $(HELPERS-y:%="$(DESTDIR)$(libexecdir)/%")
+endif
 endif
 ifneq ($(BLOBS),)
 	set -e; for x in $(BLOBS); do \
diff --git a/Makefile.objs b/Makefile.objs
index a6e0e2a..b897e1d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,8 @@
 libcacard-y += libcacard/vcard_emul_type.o
 libcacard-y += libcacard/card_7816.o
 libcacard-y += libcacard/vcardt.o
+libcacard/vcard_emul_nss.o-cflags := $(NSS_CFLAGS)
+libcacard/vcard_emul_nss.o-libs := $(NSS_LIBS)
 
 ######################################################################
 # Target independent part of system emulation. The long term path is to
@@ -64,9 +66,11 @@
 
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
+bt-host.o-cflags := $(BLUEZ_CFLAGS)
 
 common-obj-y += dma-helpers.o
 common-obj-y += vl.o
+vl.o-cflags := $(GPROF_CFLAGS) $(SDL_CFLAGS)
 common-obj-y += tpm.o
 
 common-obj-$(CONFIG_SLIRP) += slirp/
diff --git a/Makefile.target b/Makefile.target
index 6d8fde8..9986047 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -16,19 +16,22 @@
 ifdef CONFIG_USER_ONLY
 # user emulator name
 QEMU_PROG=qemu-$(TARGET_NAME)
+QEMU_PROG_BUILD = $(QEMU_PROG)
 else
 # system emulator name
+QEMU_PROG=qemu-system-$(TARGET_NAME)$(EXESUF)
 ifneq (,$(findstring -mwindows,$(libs_softmmu)))
 # Terminate program name with a 'w' because the linker builds a windows executable.
 QEMU_PROGW=qemu-system-$(TARGET_NAME)w$(EXESUF)
-endif # windows executable
-QEMU_PROG=qemu-system-$(TARGET_NAME)$(EXESUF)
+$(QEMU_PROG): $(QEMU_PROGW)
+	$(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG),"  GEN   $(TARGET_DIR)$(QEMU_PROG)")
+QEMU_PROG_BUILD = $(QEMU_PROGW)
+else
+QEMU_PROG_BUILD = $(QEMU_PROG)
+endif
 endif
 
-PROGS=$(QEMU_PROG)
-ifdef QEMU_PROGW
-PROGS+=$(QEMU_PROGW)
-endif
+PROGS=$(QEMU_PROG) $(QEMU_PROGW)
 STPFILES=
 
 config-target.h: config-target.h-timestamp
@@ -140,10 +143,7 @@
 %/translate.o: QEMU_CFLAGS += $(TRANSLATE_OPT_CFLAGS)
 
 dummy := $(call unnest-vars,,obj-y)
-
-# we are making another call to unnest-vars with different vars, protect obj-y,
-# it can be overriden in subdir Makefile.objs
-obj-y-save := $(obj-y)
+all-obj-y := $(obj-y)
 
 block-obj-y :=
 common-obj-y :=
@@ -153,27 +153,16 @@
                block-obj-m \
                common-obj-y \
                common-obj-m)
-
-# Now restore obj-y
-obj-y := $(obj-y-save)
-
-all-obj-y = $(obj-y) $(common-obj-y)
+all-obj-y += $(common-obj-y)
 all-obj-$(CONFIG_SOFTMMU) += $(block-obj-y)
 
 ifndef CONFIG_HAIKU
 LIBS+=-lm
 endif
 
-ifdef QEMU_PROGW
-# The linker builds a windows executable. Make also a console executable.
-$(QEMU_PROGW): $(all-obj-y) ../libqemuutil.a ../libqemustub.a
+# build either PROG or PROGW
+$(QEMU_PROG_BUILD): $(all-obj-y) ../libqemuutil.a ../libqemustub.a
 	$(call LINK,$^)
-$(QEMU_PROG): $(QEMU_PROGW)
-	$(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG),"  GEN   $(TARGET_DIR)$(QEMU_PROG)")
-else
-$(QEMU_PROG): $(all-obj-y) ../libqemuutil.a ../libqemustub.a
-	$(call LINK,$^)
-endif
 
 gdbstub-xml.c: $(TARGET_XML_FILES) $(SRC_PATH)/scripts/feature_to_c.sh
 	$(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES),"  GEN   $(TARGET_DIR)$@")
@@ -194,9 +183,9 @@
 
 install: all
 ifneq ($(PROGS),)
-	$(INSTALL) -m 755 $(PROGS) "$(DESTDIR)$(bindir)"
+	$(INSTALL_PROG) $(PROGS) "$(DESTDIR)$(bindir)"
 ifneq ($(STRIP),)
-	$(STRIP) $(patsubst %,"$(DESTDIR)$(bindir)/%",$(PROGS))
+	$(STRIP) $(PROGS:%="$(DESTDIR)$(bindir)/%")
 endif
 endif
 ifdef CONFIG_TRACE_SYSTEMTAP
diff --git a/audio/Makefile.objs b/audio/Makefile.objs
index d71a877..26a0ac9 100644
--- a/audio/Makefile.objs
+++ b/audio/Makefile.objs
@@ -14,4 +14,4 @@
 common-obj-y += wavcapture.o
 
 $(obj)/audio.o $(obj)/fmodaudio.o: QEMU_CFLAGS += $(FMOD_CFLAGS)
-$(obj)/sdlaudio.o: QEMU_CFLAGS += $(SDL_CFLAGS)
+sdlaudio.o-cflags := $(SDL_CFLAGS)
diff --git a/backends/Makefile.objs b/backends/Makefile.objs
index 42557d5..591ddcf 100644
--- a/backends/Makefile.objs
+++ b/backends/Makefile.objs
@@ -3,6 +3,6 @@
 
 common-obj-y += msmouse.o
 common-obj-$(CONFIG_BRLAPI) += baum.o
-$(obj)/baum.o: QEMU_CFLAGS += $(SDL_CFLAGS) 
+baum.o-cflags := $(SDL_CFLAGS)
 
 common-obj-$(CONFIG_TPM) += tpm.o
diff --git a/bsd-user/main.c b/bsd-user/main.c
index 9f895b4..4ba61da 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -43,7 +43,7 @@
 #endif
 
 static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
-const char *qemu_uname_release = CONFIG_UNAME_RELEASE;
+const char *qemu_uname_release;
 extern char **environ;
 enum BSDType bsd_type;
 
diff --git a/configure b/configure
index 3c11dd0..e565e59 100755
--- a/configure
+++ b/configure
@@ -3470,10 +3470,10 @@
 #include <pk11pub.h>
 int main(void) { PK11_FreeSlot(0); return 0; }
 EOF
-    smartcard_includes="-I\$(SRC_PATH)/libcacard"
-    libcacard_libs="$($pkg_config --libs nss 2>/dev/null) $glib_libs"
-    libcacard_cflags="$($pkg_config --cflags nss 2>/dev/null) $glib_cflags"
-    test_cflags="$libcacard_cflags"
+    # FIXME: do not include $glib_* in here
+    nss_libs="$($pkg_config --libs nss 2>/dev/null) $glib_libs"
+    nss_cflags="$($pkg_config --cflags nss 2>/dev/null) $glib_cflags"
+    test_cflags="$nss_cflags"
     # The header files in nss < 3.13.3 have a bug which causes them to
     # emit a warning. If we're going to compile QEMU with -Werror, then
     # test that the headers don't have this bug. Otherwise we would pass
@@ -3483,11 +3483,8 @@
     fi
     if test -n "$libtool" &&
        $pkg_config --atleast-version=3.12.8 nss && \
-      compile_prog "$test_cflags" "$libcacard_libs"; then
+      compile_prog "$test_cflags" "$nss_libs"; then
         smartcard_nss="yes"
-        QEMU_CFLAGS="$QEMU_CFLAGS $libcacard_cflags"
-        QEMU_INCLUDES="$QEMU_INCLUDES $smartcard_includes"
-        libs_softmmu="$libcacard_libs $libs_softmmu"
     else
         if test "$smartcard_nss" = "yes"; then
             feature_not_found "nss"
@@ -4501,8 +4498,8 @@
 
 if test "$smartcard_nss" = "yes" ; then
   echo "CONFIG_SMARTCARD_NSS=y" >> $config_host_mak
-  echo "libcacard_libs=$libcacard_libs" >> $config_host_mak
-  echo "libcacard_cflags=$libcacard_cflags" >> $config_host_mak
+  echo "NSS_LIBS=$nss_libs" >> $config_host_mak
+  echo "NSS_CFLAGS=$nss_cflags" >> $config_host_mak
 fi
 
 if test "$libusb" = "yes" ; then
@@ -4771,12 +4768,6 @@
   echo "GCOV=$gcov_tool" >> $config_host_mak
 fi
 
-iotests_common_env="tests/qemu-iotests/common.env"
-
-echo "# Automatically generated by configure - do not modify" > $iotests_common_env
-echo >> $iotests_common_env
-echo "PYTHON='$python'" >> $iotests_common_env
-
 # use included Linux headers
 if test "$linux" = "yes" ; then
   mkdir -p linux-headers
diff --git a/disas/Makefile.objs b/disas/Makefile.objs
index 41c2374..8dae4da 100644
--- a/disas/Makefile.objs
+++ b/disas/Makefile.objs
@@ -4,7 +4,7 @@
 common-obj-$(CONFIG_ARM_A64_DIS) += arm-a64.o
 common-obj-$(CONFIG_ARM_A64_DIS) += libvixl/
 libvixldir = $(SRC_PATH)/disas/libvixl
-$(obj)/arm-a64.o: QEMU_CFLAGS += -I$(libvixldir)
+arm-a64.o-cflags := -I$(libvixldir)
 common-obj-$(CONFIG_CRIS_DIS) += cris.o
 common-obj-$(CONFIG_HPPA_DIS) += hppa.o
 common-obj-$(CONFIG_I386_DIS) += i386.o
diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs
index 17d460c..3fe4dff 100644
--- a/hw/usb/Makefile.objs
+++ b/hw/usb/Makefile.objs
@@ -24,6 +24,7 @@
 common-obj-y                          += dev-smartcard-reader.o
 common-obj-y                          += ccid-card-passthru.o
 common-obj-$(CONFIG_SMARTCARD_NSS)    += ccid-card-emulated.o
+ccid-card-emulated.o-cflags := -I$(SRC_PATH)/libcacard
 endif
 
 ifeq ($(CONFIG_POSIX),y)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index f9ac332..444b4d9 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -145,7 +145,7 @@
 #define CF_COUNT_MASK  0x7fff
 #define CF_LAST_IO     0x8000 /* Last insn may be an IO access.  */
 
-    uint8_t *tc_ptr;    /* pointer to the translated code */
+    void *tc_ptr;    /* pointer to the translated code */
     /* next matching tb for physical address. */
     struct TranslationBlock *phys_hash_next;
     /* first and second physical page containing code. The lower bit
@@ -229,7 +229,7 @@
 static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
 {
     /* patch the branch destination */
-    *(uint32_t *)jmp_addr = addr - (jmp_addr + 4);
+    stl_le_p((void*)jmp_addr, addr - (jmp_addr + 4));
     /* no need to flush icache explicitly */
 }
 #elif defined(__aarch64__)
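
Note on the exec-all.h hunk above: tb_set_jmp_target1() now patches the x86 branch displacement through stl_le_p() instead of a raw *(uint32_t *) store, which keeps the write little-endian and safe for unaligned code pointers. A minimal sketch of what an stl_le_p-style helper does, assuming the usual byte-wise little-endian semantics (the example_ name is invented for illustration, this is not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Store a 32-bit value as four little-endian bytes via memcpy, so the
       write is well-defined even if 'ptr' is not 4-byte aligned. */
    static inline void example_stl_le_p(void *ptr, uint32_t v)
    {
        uint8_t buf[4] = {
            (uint8_t)v, (uint8_t)(v >> 8), (uint8_t)(v >> 16), (uint8_t)(v >> 24)
        };
        memcpy(ptr, buf, sizeof(buf));
    }
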
diff --git a/libcacard/Makefile b/libcacard/Makefile
index 6b06448..881b222 100644
--- a/libcacard/Makefile
+++ b/libcacard/Makefile
@@ -25,7 +25,6 @@
 
 libcacard.la: LDFLAGS += -rpath $(libdir) -no-undefined \
 	-export-syms $(SRC_PATH)/libcacard/libcacard.syms
-libcacard.la: LIBS = $(libcacard_libs)
 libcacard.la: $(libcacard-lobj-y)
 	$(call LINK,$^)
 
diff --git a/rules.mak b/rules.mak
index 5c454d8..b12d312 100644
--- a/rules.mak
+++ b/rules.mak
@@ -45,7 +45,7 @@
 else
 LIBTOOL += $(if $(V),,--quiet)
 %.lo: %.c
-	$(call quiet-command,$(LIBTOOL) --mode=compile --tag=CC $(CC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<,"  lt CC $@")
+	$(call quiet-command,$(LIBTOOL) --mode=compile --tag=CC $(CC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($*.o-cflags) -c -o $@ $<,"  lt CC $@")
 %.lo: %.rc
 	$(call quiet-command,$(LIBTOOL) --mode=compile --tag=RC $(WINDRES) -I. -o $@ $<,"lt RC   $(TARGET_DIR)$@")
 %.lo: %.dtrace
@@ -57,7 +57,7 @@
        $(call expand-objs,$1) \
        $(if $(filter %.lo %.la,$1),$(version-lobj-y),$(version-obj-y)) \
        $(if $(filter %.lo %.la,$1),$(LIBTOOLFLAGS)) \
-       $(call extract-libs,$1) $(LIBS),$(if $(filter %.lo %.la,$1),"lt LINK ", "  LINK  ")"$(TARGET_DIR)$@")
+       $(call extract-libs,$(1:.lo=.o)) $(LIBS),$(if $(filter %.lo %.la,$1),"lt LINK ", "  LINK  ")"$(TARGET_DIR)$@")
 endif
 
 %.asm: %.S
@@ -67,13 +67,13 @@
 	$(call quiet-command,$(AS) $(ASFLAGS) -o $@ $<,"  AS    $(TARGET_DIR)$@")
 
 %.o: %.cc
-	$(call quiet-command,$(CXX) $(QEMU_INCLUDES) $(QEMU_CXXFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<,"  CXX   $(TARGET_DIR)$@")
+	$(call quiet-command,$(CXX) $(QEMU_INCLUDES) $(QEMU_CXXFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($@-cflags) -c -o $@ $<,"  CXX   $(TARGET_DIR)$@")
 
 %.o: %.cpp
-	$(call quiet-command,$(CXX) $(QEMU_INCLUDES) $(QEMU_CXXFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<,"  CXX   $(TARGET_DIR)$@")
+	$(call quiet-command,$(CXX) $(QEMU_INCLUDES) $(QEMU_CXXFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($@-cflags) -c -o $@ $<,"  CXX   $(TARGET_DIR)$@")
 
 %.o: %.m
-	$(call quiet-command,$(OBJCC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) -c -o $@ $<,"  OBJC  $(TARGET_DIR)$@")
+	$(call quiet-command,$(OBJCC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(QEMU_DGFLAGS) $(CFLAGS) $($@-cflags) -c -o $@ $<,"  OBJC  $(TARGET_DIR)$@")
 
 %.o: %.dtrace
 	$(call quiet-command,dtrace -o $@ -G -s $<, "  GEN   $(TARGET_DIR)$@")
@@ -175,16 +175,16 @@
 endef
 
 define fix-obj-vars
-$(foreach v,$($1), \
+$(if $2, $(foreach v,$($1), \
 	$(if $($v-cflags), \
-		$(eval $2$v-cflags := $($v-cflags)) \
+		$(eval $2/$v-cflags := $($v-cflags)) \
 		$(eval $v-cflags := )) \
 	$(if $($v-libs), \
-		$(eval $2$v-libs := $($v-libs)) \
+		$(eval $2/$v-libs := $($v-libs)) \
 		$(eval $v-libs := )) \
 	$(if $($v-objs), \
-		$(eval $2$v-objs := $(addprefix $2,$($v-objs))) \
-		$(eval $v-objs := )))
+		$(eval $2/$v-objs := $(addprefix $2/,$($v-objs))) \
+		$(eval $v-objs := ))))
 endef
 
 define unnest-dir
@@ -192,7 +192,7 @@
 $(eval obj-parent-$1 := $(obj))
 $(eval obj := $(if $(obj),$(obj)/$1,$1))
 $(eval include $(SRC_PATH)/$1/Makefile.objs)
-$(foreach v,$(nested-vars),$(call fix-obj-vars,$v,$(if $(obj),$(obj)/)))
+$(foreach v,$(nested-vars),$(call fix-obj-vars,$v,$(obj)))
 $(eval obj := $(obj-parent-$1))
 $(eval obj-parent-$1 := )
 $(foreach var,$(nested-vars),$(call pop-var,$(var),$1/))
@@ -228,6 +228,7 @@
 define unnest-vars
 $(eval obj := $1)
 $(eval nested-vars := $2)
+$(foreach v,$(nested-vars),$(call fix-obj-vars,$v,$(obj)))
 $(eval old-nested-dirs := )
 $(call unnest-vars-1)
 $(if $1,$(foreach v,$(nested-vars),$(eval \
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 02625e3..032b0fd 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -6708,41 +6708,63 @@
         }
     bt_op:
         tcg_gen_andi_tl(cpu_T[1], cpu_T[1], (1 << (3 + ot)) - 1);
+        tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
         switch(op) {
         case 0:
-            tcg_gen_shr_tl(cpu_cc_src, cpu_T[0], cpu_T[1]);
-            tcg_gen_movi_tl(cpu_cc_dst, 0);
             break;
         case 1:
-            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
             tcg_gen_movi_tl(cpu_tmp0, 1);
             tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
             tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
             break;
         case 2:
-            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
             tcg_gen_movi_tl(cpu_tmp0, 1);
             tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
-            tcg_gen_not_tl(cpu_tmp0, cpu_tmp0);
-            tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
             break;
         default:
         case 3:
-            tcg_gen_shr_tl(cpu_tmp4, cpu_T[0], cpu_T[1]);
             tcg_gen_movi_tl(cpu_tmp0, 1);
             tcg_gen_shl_tl(cpu_tmp0, cpu_tmp0, cpu_T[1]);
             tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
             break;
         }
-        set_cc_op(s, CC_OP_SARB + ot);
         if (op != 0) {
             if (mod != 3) {
                 gen_op_st_v(s, ot, cpu_T[0], cpu_A0);
             } else {
                 gen_op_mov_reg_v(ot, rm, cpu_T[0]);
             }
+        }
+
+        /* Delay all CC updates until after the store above.  Note that
+           C is the result of the test, Z is unchanged, and the others
+           are all undefined.  */
+        switch (s->cc_op) {
+        case CC_OP_MULB ... CC_OP_MULQ:
+        case CC_OP_ADDB ... CC_OP_ADDQ:
+        case CC_OP_ADCB ... CC_OP_ADCQ:
+        case CC_OP_SUBB ... CC_OP_SUBQ:
+        case CC_OP_SBBB ... CC_OP_SBBQ:
+        case CC_OP_LOGICB ... CC_OP_LOGICQ:
+        case CC_OP_INCB ... CC_OP_INCQ:
+        case CC_OP_DECB ... CC_OP_DECQ:
+        case CC_OP_SHLB ... CC_OP_SHLQ:
+        case CC_OP_SARB ... CC_OP_SARQ:
+        case CC_OP_BMILGB ... CC_OP_BMILGQ:
+            /* Z was going to be computed from the non-zero status of CC_DST.
+               We can get that same Z value (and the new C value) by leaving
+               CC_DST alone, setting CC_SRC, and using a CC_OP_SAR of the
+               same width.  */
             tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4);
-            tcg_gen_movi_tl(cpu_cc_dst, 0);
+            set_cc_op(s, ((s->cc_op - CC_OP_MULB) & 3) + CC_OP_SARB);
+            break;
+        default:
+            /* Otherwise, generate EFLAGS and replace the C bit.  */
+            gen_compute_eflags(s);
+            tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, cpu_tmp4,
+                               ctz32(CC_C), 1);
+            break;
         }
         break;
     case 0x1bc: /* bsf / tzcnt */
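
The comment added in the translate.c hunk above pins down the flag semantics for BT/BTS/BTR/BTC: CF becomes the tested bit, ZF keeps the value implied by the previous op's CC_DST, and the remaining flags are undefined. As a hypothetical C rendering of those two pieces (helper names invented; CF is bit 0 of EFLAGS, which is why the slow-path deposit lands at ctz32(CC_C)):

    /* After a bit-test, CF is bit 'bit_index' of the original operand;
       cpu_tmp4/CC_SRC end up holding operand >> bit_index, so its bit 0
       is the carry. */
    static inline unsigned example_bt_carry(unsigned long operand, unsigned bit_index)
    {
        return (unsigned)((operand >> bit_index) & 1);
    }

    /* Slow path: EFLAGS is fully recomputed, then only the CF bit is
       replaced by the freshly tested bit. */
    static inline unsigned long example_replace_cf(unsigned long eflags, unsigned cf)
    {
        return (eflags & ~1UL) | (cf & 1);
    }
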
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 0a580b6..77bb6d9 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -63,40 +63,34 @@
 # endif
 #endif
 
-static inline void reloc_pc26(void *code_ptr, intptr_t target)
+static inline void reloc_pc26(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
-    intptr_t offset = (target - (intptr_t)code_ptr) / 4;
+    ptrdiff_t offset = target - code_ptr;
+    assert(offset == sextract64(offset, 0, 26));
     /* read instruction, mask away previous PC_REL26 parameter contents,
        set the proper offset, then write back the instruction. */
-    uint32_t insn = *(uint32_t *)code_ptr;
-    insn = deposit32(insn, 0, 26, offset);
-    *(uint32_t *)code_ptr = insn;
+    *code_ptr = deposit32(*code_ptr, 0, 26, offset);
 }
 
-static inline void reloc_pc19(void *code_ptr, intptr_t target)
+static inline void reloc_pc19(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
-    intptr_t offset = (target - (intptr_t)code_ptr) / 4;
-    /* read instruction, mask away previous PC_REL19 parameter contents,
-       set the proper offset, then write back the instruction. */
-    uint32_t insn = *(uint32_t *)code_ptr;
-    insn = deposit32(insn, 5, 19, offset);
-    *(uint32_t *)code_ptr = insn;
+    ptrdiff_t offset = target - code_ptr;
+    assert(offset == sextract64(offset, 0, 19));
+    *code_ptr = deposit32(*code_ptr, 5, 19, offset);
 }
 
-static inline void patch_reloc(uint8_t *code_ptr, int type,
+static inline void patch_reloc(tcg_insn_unit *code_ptr, int type,
                                intptr_t value, intptr_t addend)
 {
-    value += addend;
-
+    assert(addend == 0);
     switch (type) {
     case R_AARCH64_JUMP26:
     case R_AARCH64_CALL26:
-        reloc_pc26(code_ptr, value);
+        reloc_pc26(code_ptr, (tcg_insn_unit *)value);
         break;
     case R_AARCH64_CONDBR19:
-        reloc_pc19(code_ptr, value);
+        reloc_pc19(code_ptr, (tcg_insn_unit *)value);
         break;
-
     default:
         tcg_abort();
     }
@@ -794,15 +788,10 @@
     }
 }
 
-static inline void tcg_out_goto(TCGContext *s, intptr_t target)
+static inline void tcg_out_goto(TCGContext *s, tcg_insn_unit *target)
 {
-    intptr_t offset = (target - (intptr_t)s->code_ptr) / 4;
-
-    if (offset < -0x02000000 || offset >= 0x02000000) {
-        /* out of 26bit range */
-        tcg_abort();
-    }
-
+    ptrdiff_t offset = target - s->code_ptr;
+    assert(offset == sextract64(offset, 0, 26));
     tcg_out_insn(s, 3206, B, offset);
 }
 
@@ -828,29 +817,23 @@
     tcg_out_insn(s, 3207, BLR, reg);
 }
 
-static inline void tcg_out_call(TCGContext *s, intptr_t target)
+static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 {
-    intptr_t offset = (target - (intptr_t)s->code_ptr) / 4;
-
-    if (offset < -0x02000000 || offset >= 0x02000000) { /* out of 26bit rng */
-        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, target);
-        tcg_out_callr(s, TCG_REG_TMP);
-    } else {
+    ptrdiff_t offset = target - s->code_ptr;
+    if (offset == sextract64(offset, 0, 26)) {
         tcg_out_insn(s, 3206, BL, offset);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
+        tcg_out_callr(s, TCG_REG_TMP);
     }
 }
 
 void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
-    intptr_t target = addr;
-    intptr_t offset = (target - (intptr_t)jmp_addr) / 4;
+    tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
+    tcg_insn_unit *target = (tcg_insn_unit *)addr;
 
-    if (offset < -0x02000000 || offset >= 0x02000000) {
-        /* out of 26bit range */
-        tcg_abort();
-    }
-
-    patch_reloc((uint8_t *)jmp_addr, R_AARCH64_JUMP26, target, 0);
+    reloc_pc26(code_ptr, target);
     flush_icache_range(jmp_addr, jmp_addr + 4);
 }
 
@@ -862,7 +845,7 @@
         tcg_out_reloc(s, s->code_ptr, R_AARCH64_JUMP26, label_index, 0);
         tcg_out_goto_noaddr(s);
     } else {
-        tcg_out_goto(s, l->u.value);
+        tcg_out_goto(s, l->u.value_ptr);
     }
 }
 
@@ -884,9 +867,8 @@
         tcg_out_reloc(s, s->code_ptr, R_AARCH64_CONDBR19, label, 0);
         offset = tcg_in32(s) >> 5;
     } else {
-        offset = l->u.value - (uintptr_t)s->code_ptr;
-        offset >>= 2;
-        assert(offset >= -0x40000 && offset < 0x40000);
+        offset = l->u.value_ptr - s->code_ptr;
+        assert(offset == sextract64(offset, 0, 19));
     }
 
     if (need_cmp) {
@@ -982,7 +964,7 @@
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_LEUW] = helper_le_lduw_mmu,
     [MO_LEUL] = helper_le_ldul_mmu,
@@ -995,7 +977,7 @@
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
     [MO_LEUW] = helper_le_stw_mmu,
     [MO_LEUL] = helper_le_stl_mmu,
@@ -1005,11 +987,11 @@
     [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static inline void tcg_out_adr(TCGContext *s, TCGReg rd, uintptr_t addr)
+static inline void tcg_out_adr(TCGContext *s, TCGReg rd, void *target)
 {
-    addr -= (uintptr_t)s->code_ptr;
-    assert(addr == sextract64(addr, 0, 21));
-    tcg_out_insn(s, 3406, ADR, rd, addr);
+    ptrdiff_t offset = tcg_pcrel_diff(s, target);
+    assert(offset == sextract64(offset, 0, 21));
+    tcg_out_insn(s, 3406, ADR, rd, offset);
 }
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1017,20 +999,20 @@
     TCGMemOp opc = lb->opc;
     TCGMemOp size = opc & MO_SIZE;
 
-    reloc_pc19(lb->label_ptr[0], (intptr_t)s->code_ptr);
+    reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
     tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
-    tcg_out_adr(s, TCG_REG_X3, (intptr_t)lb->raddr);
-    tcg_out_call(s, (intptr_t)qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_adr(s, TCG_REG_X3, lb->raddr);
+    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
     if (opc & MO_SIGN) {
         tcg_out_sxt(s, TCG_TYPE_I64, size, lb->datalo_reg, TCG_REG_X0);
     } else {
         tcg_out_mov(s, size == MO_64, lb->datalo_reg, TCG_REG_X0);
     }
 
-    tcg_out_goto(s, (intptr_t)lb->raddr);
+    tcg_out_goto(s, lb->raddr);
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1038,21 +1020,21 @@
     TCGMemOp opc = lb->opc;
     TCGMemOp size = opc & MO_SIZE;
 
-    reloc_pc19(lb->label_ptr[0], (intptr_t)s->code_ptr);
+    reloc_pc19(lb->label_ptr[0], s->code_ptr);
 
     tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_X0, TCG_AREG0);
     tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
     tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
-    tcg_out_adr(s, TCG_REG_X4, (intptr_t)lb->raddr);
-    tcg_out_call(s, (intptr_t)qemu_st_helpers[opc]);
-    tcg_out_goto(s, (intptr_t)lb->raddr);
+    tcg_out_adr(s, TCG_REG_X4, lb->raddr);
+    tcg_out_call(s, qemu_st_helpers[opc]);
+    tcg_out_goto(s, lb->raddr);
 }
 
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
                                 TCGReg data_reg, TCGReg addr_reg,
-                                int mem_index,
-                                uint8_t *raddr, uint8_t *label_ptr)
+                                int mem_index, tcg_insn_unit *raddr,
+                                tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
@@ -1070,7 +1052,8 @@
    the slow path. Generated code returns the host addend in X1,
    clobbers X0,X2,X3,TMP. */
 static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp s_bits,
-                             uint8_t **label_ptr, int mem_index, bool is_read)
+                             tcg_insn_unit **label_ptr, int mem_index,
+                             bool is_read)
 {
     TCGReg base = TCG_AREG0;
     int tlb_offset = is_read ?
@@ -1218,7 +1201,7 @@
 {
 #ifdef CONFIG_SOFTMMU
     TCGMemOp s_bits = memop & MO_SIZE;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 
     tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 1);
     tcg_out_qemu_ld_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
@@ -1235,7 +1218,7 @@
 {
 #ifdef CONFIG_SOFTMMU
     TCGMemOp s_bits = memop & MO_SIZE;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 
     tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr, mem_index, 0);
     tcg_out_qemu_st_direct(s, memop, data_reg, addr_reg, TCG_REG_X1);
@@ -1247,7 +1230,7 @@
 #endif /* CONFIG_SOFTMMU */
 }
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
                        const TCGArg args[TCG_MAX_OP_ARGS],
@@ -1270,7 +1253,7 @@
     switch (opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
-        tcg_out_goto(s, (intptr_t)tb_ret_addr);
+        tcg_out_goto(s, tb_ret_addr);
         break;
 
     case INDEX_op_goto_tb:
@@ -1278,19 +1261,11 @@
 #error "USE_DIRECT_JUMP required for aarch64"
 #endif
         assert(s->tb_jmp_offset != NULL); /* consistency for USE_DIRECT_JUMP */
-        s->tb_jmp_offset[a0] = s->code_ptr - s->code_buf;
+        s->tb_jmp_offset[a0] = tcg_current_code_size(s);
         /* actual branch destination will be patched by
            aarch64_tb_set_jmp_target later, beware retranslation. */
         tcg_out_goto_noaddr(s);
-        s->tb_next_offset[a0] = s->code_ptr - s->code_buf;
-        break;
-
-    case INDEX_op_call:
-        if (const_args[0]) {
-            tcg_out_call(s, a0);
-        } else {
-            tcg_out_callr(s, a0);
-        }
+        s->tb_next_offset[a0] = tcg_current_code_size(s);
         break;
 
     case INDEX_op_br:
@@ -1613,13 +1588,12 @@
         tcg_out_insn(s, 3508, SMULH, TCG_TYPE_I64, a0, a1, a2);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_mov_i32:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_movi_i64:
-    case INDEX_op_movi_i32:
-        /* Always implemented with tcg_out_mov/i, never with tcg_out_op.  */
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        /* Opcode not implemented.  */
         tcg_abort();
     }
 
@@ -1629,15 +1603,8 @@
 static const TCGTargetOpDef aarch64_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_mov_i64, { "r", "r" } },
-
-    { INDEX_op_movi_i32, { "r" } },
-    { INDEX_op_movi_i64, { "r" } },
-
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
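
Several hunks above replace open-coded range checks such as "offset < -0x02000000 || offset >= 0x02000000" with assert(offset == sextract64(offset, 0, N)): a value fits an N-bit signed immediate exactly when sign-extending its low N bits reproduces it. A small sketch, assuming sextract64() extracts bits [start, start+length) and sign-extends them as QEMU's bitops helper does (example_ names are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static inline int64_t example_sextract64(uint64_t value, int start, int length)
    {
        return (int64_t)(value << (64 - length - start)) >> (64 - length);
    }

    static inline void example_check_b_range(int64_t offset_in_insns)
    {
        /* AArch64 B/BL take a signed 26-bit offset counted in 4-byte insns. */
        assert(offset_in_insns == example_sextract64(offset_in_insns, 0, 26));
    }
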
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index a1d4322..a32aea6 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -13,6 +13,7 @@
 #ifndef TCG_TARGET_AARCH64
 #define TCG_TARGET_AARCH64 1
 
+#define TCG_TARGET_INSN_UNIT_SIZE  4
 #undef TCG_TARGET_STACK_GROWSUP
 
 typedef enum {
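
TCG_TARGET_INSN_UNIT_SIZE, introduced here and in the other tcg-target.h hunks below, is what allows code pointers to change from uint8_t * to tcg_insn_unit * throughout this series: with a unit size of 4, tcg_insn_unit is presumably a 32-bit word, so pointer differences such as tcg_current_code_size() count instructions rather than bytes. A toy sketch of that bookkeeping (type and names assumed for illustration, not taken from the patch):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t insn_unit;   /* assumed shape when TCG_TARGET_INSN_UNIT_SIZE == 4 */

    /* Subtracting insn_unit pointers yields an instruction count, which is
       what "s->code_ptr - s->code_buf" now means on these targets. */
    static inline size_t example_code_size(const insn_unit *buf, const insn_unit *ptr)
    {
        return (size_t)(ptr - buf);
    }
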
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 7535175..538ca2a 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -115,36 +115,18 @@
 
 #define TCG_REG_TMP  TCG_REG_R12
 
-static inline void reloc_abs32(void *code_ptr, intptr_t target)
+static inline void reloc_pc24(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
-    *(uint32_t *) code_ptr = target;
+    ptrdiff_t offset = (tcg_ptr_byte_diff(target, code_ptr) - 8) >> 2;
+    *code_ptr = (*code_ptr & ~0xffffff) | (offset & 0xffffff);
 }
 
-static inline void reloc_pc24(void *code_ptr, intptr_t target)
-{
-    uint32_t offset = ((target - ((intptr_t)code_ptr + 8)) >> 2);
-
-    *(uint32_t *) code_ptr = ((*(uint32_t *) code_ptr) & ~0xffffff)
-                             | (offset & 0xffffff);
-}
-
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    switch (type) {
-    case R_ARM_ABS32:
-        reloc_abs32(code_ptr, value);
-        break;
-
-    case R_ARM_CALL:
-    case R_ARM_JUMP24:
-    default:
-        tcg_abort();
-
-    case R_ARM_PC24:
-        reloc_pc24(code_ptr, value);
-        break;
-    }
+    assert(type == R_ARM_PC24);
+    assert(addend == 0);
+    reloc_pc24(code_ptr, (tcg_insn_unit *)value);
 }
 
 #define TCG_CT_CONST_ARM  0x100
@@ -379,20 +361,18 @@
 
 static inline void tcg_out_b_noaddr(TCGContext *s, int cond)
 {
-    /* We pay attention here to not modify the branch target by skipping
-       the corresponding bytes. This ensure that caches and memory are
+    /* We pay attention here to not modify the branch target by masking
+       the corresponding bytes.  This ensures that caches and memory are
        kept coherent during retranslation. */
-    s->code_ptr += 3;
-    tcg_out8(s, (cond << 4) | 0x0a);
+    tcg_out32(s, deposit32(*s->code_ptr, 24, 8, (cond << 4) | 0x0a));
 }
 
 static inline void tcg_out_bl_noaddr(TCGContext *s, int cond)
 {
-    /* We pay attention here to not modify the branch target by skipping
-       the corresponding bytes. This ensure that caches and memory are
+    /* We pay attention here to not modify the branch target by masking
+       the corresponding bytes.  This ensures that caches and memory are
        kept coherent during retranslation. */
-    s->code_ptr += 3;
-    tcg_out8(s, (cond << 4) | 0x0b);
+    tcg_out32(s, deposit32(*s->code_ptr, 24, 8, (cond << 4) | 0x0b));
 }
 
 static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset)
@@ -1010,20 +990,21 @@
  * with the code buffer limited to 16MB we wouldn't need the long case.
  * But we also use it for the tail-call to the qemu_ld/st helpers, which does.
  */
-static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
+static inline void tcg_out_goto(TCGContext *s, int cond, tcg_insn_unit *addr)
 {
-    int32_t disp = addr - (tcg_target_long) s->code_ptr;
+    intptr_t addri = (intptr_t)addr;
+    ptrdiff_t disp = tcg_pcrel_diff(s, addr);
 
-    if ((addr & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) {
+    if ((addri & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) {
         tcg_out_b(s, cond, disp);
         return;
     }
 
-    tcg_out_movi32(s, cond, TCG_REG_TMP, addr);
+    tcg_out_movi32(s, cond, TCG_REG_TMP, addri);
     if (use_armv5t_instructions) {
         tcg_out_bx(s, cond, TCG_REG_TMP);
     } else {
-        if (addr & 1) {
+        if (addri & 1) {
             tcg_abort();
         }
         tcg_out_mov_reg(s, cond, TCG_REG_PC, TCG_REG_TMP);
@@ -1032,39 +1013,28 @@
 
 /* The call case is mostly used for helpers - so it's not unreasonable
  * for them to be beyond branch range */
-static inline void tcg_out_call(TCGContext *s, uint32_t addr)
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *addr)
 {
-    int32_t val;
+    intptr_t addri = (intptr_t)addr;
+    ptrdiff_t disp = tcg_pcrel_diff(s, addr);
 
-    val = addr - (tcg_target_long) s->code_ptr;
-    if (val - 8 < 0x02000000 && val - 8 >= -0x02000000) {
-        if (addr & 1) {
+    if (disp - 8 < 0x02000000 && disp - 8 >= -0x02000000) {
+        if (addri & 1) {
             /* Use BLX if the target is in Thumb mode */
             if (!use_armv5t_instructions) {
                 tcg_abort();
             }
-            tcg_out_blx_imm(s, val);
+            tcg_out_blx_imm(s, disp);
         } else {
-            tcg_out_bl(s, COND_AL, val);
+            tcg_out_bl(s, COND_AL, disp);
         }
     } else if (use_armv7_instructions) {
-        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addr);
+        tcg_out_movi32(s, COND_AL, TCG_REG_TMP, addri);
         tcg_out_blx(s, COND_AL, TCG_REG_TMP);
     } else {
         tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R14, TCG_REG_PC, 4);
         tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-        tcg_out32(s, addr);
-    }
-}
-
-static inline void tcg_out_callr(TCGContext *s, int cond, int arg)
-{
-    if (use_armv5t_instructions) {
-        tcg_out_blx(s, cond, arg);
-    } else {
-        tcg_out_dat_reg(s, cond, ARITH_MOV, TCG_REG_R14, 0,
-                        TCG_REG_PC, SHIFT_IMM_LSL(0));
-        tcg_out_bx(s, cond, arg);
+        tcg_out32(s, addri);
     }
 }
 
@@ -1073,9 +1043,9 @@
     TCGLabel *l = &s->labels[label_index];
 
     if (l->has_value) {
-        tcg_out_goto(s, cond, l->u.value);
+        tcg_out_goto(s, cond, l->u.value_ptr);
     } else {
-        tcg_out_reloc(s, s->code_ptr, R_ARM_PC24, label_index, 31337);
+        tcg_out_reloc(s, s->code_ptr, R_ARM_PC24, label_index, 0);
         tcg_out_b_noaddr(s, cond);
     }
 }
@@ -1084,7 +1054,7 @@
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_SB]   = helper_ret_ldsb_mmu,
 
@@ -1104,7 +1074,7 @@
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
     [MO_LEUW] = helper_le_stw_mmu,
     [MO_LEUL] = helper_le_stl_mmu,
@@ -1256,7 +1226,7 @@
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
                                 TCGReg datalo, TCGReg datahi, TCGReg addrlo,
                                 TCGReg addrhi, int mem_index,
-                                uint8_t *raddr, uint8_t *label_ptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
@@ -1275,9 +1245,9 @@
 {
     TCGReg argreg, datalo, datahi;
     TCGMemOp opc = lb->opc;
-    uintptr_t func;
+    void *func;
 
-    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+    reloc_pc24(lb->label_ptr[0], s->code_ptr);
 
     argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0);
     if (TARGET_LONG_BITS == 64) {
@@ -1292,9 +1262,9 @@
        icache usage.  For pre-armv6, use the signed helpers since we do
        not have a single insn sign-extend.  */
     if (use_armv6_instructions) {
-        func = (uintptr_t)qemu_ld_helpers[opc & ~MO_SIGN];
+        func = qemu_ld_helpers[opc & ~MO_SIGN];
     } else {
-        func = (uintptr_t)qemu_ld_helpers[opc];
+        func = qemu_ld_helpers[opc];
         if (opc & MO_SIGN) {
             opc = MO_UL;
         }
@@ -1328,7 +1298,7 @@
         break;
     }
 
-    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+    tcg_out_goto(s, COND_AL, lb->raddr);
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1336,7 +1306,7 @@
     TCGReg argreg, datalo, datahi;
     TCGMemOp opc = lb->opc;
 
-    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+    reloc_pc24(lb->label_ptr[0], s->code_ptr);
 
     argreg = TCG_REG_R0;
     argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
@@ -1368,7 +1338,7 @@
     argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
     /* Tail-call to the helper, which will return to the fast path.  */
-    tcg_out_goto(s, COND_AL, (uintptr_t)qemu_st_helpers[opc]);
+    tcg_out_goto(s, COND_AL, qemu_st_helpers[opc]);
 }
 #endif /* SOFTMMU */
 
@@ -1499,7 +1469,7 @@
 #ifdef CONFIG_SOFTMMU
     int mem_index;
     TCGReg addend;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
     datalo = *args++;
@@ -1628,7 +1598,7 @@
 #ifdef CONFIG_SOFTMMU
     int mem_index;
     TCGReg addend;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
     datalo = *args++;
@@ -1660,7 +1630,7 @@
 #endif
 }
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                 const TCGArg *args, const int *const_args)
@@ -1670,51 +1640,21 @@
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        if (use_armv7_instructions || check_fit_imm(args[0])) {
-            tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
-            tcg_out_goto(s, COND_AL, (tcg_target_ulong) tb_ret_addr);
-        } else {
-            uint8_t *ld_ptr = s->code_ptr;
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_PC, 0);
-            tcg_out_goto(s, COND_AL, (tcg_target_ulong) tb_ret_addr);
-            *ld_ptr = (uint8_t) (s->code_ptr - ld_ptr) - 8;
-            tcg_out32(s, args[0]);
-        }
+        tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]);
+        tcg_out_goto(s, COND_AL, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* Direct jump method */
-#if defined(USE_DIRECT_JUMP)
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
             tcg_out_b_noaddr(s, COND_AL);
-#else
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            tcg_out32(s, 0);
-#endif
         } else {
             /* Indirect jump method */
-#if 1
-            c = (int) (s->tb_next + args[0]) - ((int) s->code_ptr + 8);
-            if (c > 0xfff || c < -0xfff) {
-                tcg_out_movi32(s, COND_AL, TCG_REG_R0,
-                                (tcg_target_long) (s->tb_next + args[0]));
-                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
-            } else
-                tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, c);
-#else
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_PC, 0);
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0);
-            tcg_out32(s, (tcg_target_long) (s->tb_next + args[0]));
-#endif
+            intptr_t ptr = (intptr_t)(s->tb_next + args[0]);
+            tcg_out_movi32(s, COND_AL, TCG_REG_R0, ptr & ~0xfff);
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, ptr & 0xfff);
         }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
-        break;
-    case INDEX_op_call:
-        if (const_args[0])
-            tcg_out_call(s, args[0]);
-        else
-            tcg_out_callr(s, COND_AL, args[0]);
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         tcg_out_goto_label(s, COND_AL, args[0]);
@@ -1745,13 +1685,6 @@
         tcg_out_st32(s, COND_AL, args[0], args[1], args[2]);
         break;
 
-    case INDEX_op_mov_i32:
-        tcg_out_dat_reg(s, COND_AL, ARITH_MOV,
-                        args[0], 0, args[1], SHIFT_IMM_LSL(0));
-        break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi32(s, COND_AL, args[0], args[1]);
-        break;
     case INDEX_op_movcond_i32:
         /* Constraints mean that v2 is always in the same register as dest,
          * so we only need to do "if condition passed, move v1 to dest".
@@ -1967,6 +1900,9 @@
         tcg_out_udiv(s, COND_AL, args[0], args[1], args[2]);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
@@ -1975,12 +1911,8 @@
 static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
-
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
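
In the ARM hunks above, the retranslation-safe branch stubs now use deposit32() to rewrite only the condition/opcode byte while preserving the 24-bit offset field already present in the buffer, and reloc_pc24() later fills that field with (byte_offset - 8) >> 2 because the ARM PC reads as the instruction address plus 8. A hedged sketch of the encoding (illustrative helpers, not the patch's code):

    #include <stddef.h>
    #include <stdint.h>

    /* Roughly mirrors tcg_out_b_noaddr()/tcg_out_bl_noaddr(): keep the low
       24 bits, set the top byte to cond | B (0xa) or BL (0xb). */
    static inline uint32_t example_arm_branch_stub(uint32_t old_insn, unsigned cond, int link)
    {
        uint32_t top = (cond << 4) | (link ? 0x0bu : 0x0au);
        return (old_insn & 0x00ffffffu) | (top << 24);
    }

    /* The 24-bit field is a word offset relative to PC+8, as in reloc_pc24(). */
    static inline uint32_t example_arm_pc24(ptrdiff_t target_minus_insn_bytes)
    {
        return (uint32_t)((target_minus_insn_bytes - 8) >> 2) & 0x00ffffffu;
    }
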
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 1bc5dac..73f10c4 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -26,6 +26,7 @@
 #define TCG_TARGET_ARM 1
 
 #undef TCG_TARGET_STACK_GROWSUP
+#define TCG_TARGET_INSN_UNIT_SIZE 4
 
 typedef enum {
     TCG_REG_R0 = 0,
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 34ece1f..a373073 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -139,9 +139,9 @@
 # define have_bmi2 0
 #endif
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
     value += addend;
@@ -151,14 +151,14 @@
         if (value != (int32_t)value) {
             tcg_abort();
         }
-        *(uint32_t *)code_ptr = value;
+        tcg_patch32(code_ptr, value);
         break;
     case R_386_PC8:
         value -= (uintptr_t)code_ptr;
         if (value != (int8_t)value) {
             tcg_abort();
         }
-        *(uint8_t *)code_ptr = value;
+        tcg_patch8(code_ptr, value);
         break;
     default:
         tcg_abort();
@@ -859,7 +859,7 @@
     TCGLabel *l = &s->labels[label_index];
 
     if (l->has_value) {
-        val = l->u.value - (intptr_t)s->code_ptr;
+        val = tcg_pcrel_diff(s, l->u.value_ptr);
         val1 = val - 2;
         if ((int8_t)val1 == val1) {
             if (opc == -1) {
@@ -1099,26 +1099,26 @@
 }
 #endif
 
-static void tcg_out_branch(TCGContext *s, int call, uintptr_t dest)
+static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
 {
-    intptr_t disp = dest - (intptr_t)s->code_ptr - 5;
+    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
 
     if (disp == (int32_t)disp) {
         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
         tcg_out32(s, disp);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, dest);
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
         tcg_out_modrm(s, OPC_GRP5,
                       call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
     }
 }
 
-static inline void tcg_out_calli(TCGContext *s, uintptr_t dest)
+static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
 {
     tcg_out_branch(s, 1, dest);
 }
 
-static void tcg_out_jmp(TCGContext *s, uintptr_t dest)
+static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
 {
     tcg_out_branch(s, 0, dest);
 }
@@ -1127,7 +1127,7 @@
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_LEUW] = helper_le_lduw_mmu,
     [MO_LEUL] = helper_le_ldul_mmu,
@@ -1140,7 +1140,7 @@
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
     [MO_LEUW] = helper_le_stw_mmu,
     [MO_LEUL] = helper_le_stl_mmu,
@@ -1173,7 +1173,7 @@
 
 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                     int mem_index, TCGMemOp s_bits,
-                                    uint8_t **label_ptr, int which)
+                                    tcg_insn_unit **label_ptr, int which)
 {
     const TCGReg r0 = TCG_REG_L0;
     const TCGReg r1 = TCG_REG_L1;
@@ -1247,8 +1247,8 @@
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
                                 TCGReg datalo, TCGReg datahi,
                                 TCGReg addrlo, TCGReg addrhi,
-                                int mem_index, uint8_t *raddr,
-                                uint8_t **label_ptr)
+                                int mem_index, tcg_insn_unit *raddr,
+                                tcg_insn_unit **label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
@@ -1273,12 +1273,12 @@
 {
     TCGMemOp opc = l->opc;
     TCGReg data_reg;
-    uint8_t **label_ptr = &l->label_ptr[0];
+    tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
     /* resolve label address */
-    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
     }
 
     if (TCG_TARGET_REG_BITS == 32) {
@@ -1308,7 +1308,7 @@
                      (uintptr_t)l->raddr);
     }
 
-    tcg_out_calli(s, (uintptr_t)qemu_ld_helpers[opc & ~MO_SIGN]);
+    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
 
     data_reg = l->datalo_reg;
     switch (opc & MO_SSIZE) {
@@ -1346,7 +1346,7 @@
     }
 
     /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_jmp(s, (uintptr_t)l->raddr);
+    tcg_out_jmp(s, l->raddr);
 }
 
 /*
@@ -1356,13 +1356,13 @@
 {
     TCGMemOp opc = l->opc;
     TCGMemOp s_bits = opc & MO_SIZE;
-    uint8_t **label_ptr = &l->label_ptr[0];
+    tcg_insn_unit **label_ptr = &l->label_ptr[0];
     TCGReg retaddr;
 
     /* resolve label address */
-    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
     }
 
     if (TCG_TARGET_REG_BITS == 32) {
@@ -1413,7 +1413,7 @@
 
     /* "Tail call" to the helper, with the return address back inline.  */
     tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, (uintptr_t)qemu_st_helpers[opc]);
+    tcg_out_jmp(s, qemu_st_helpers[opc]);
 }
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
@@ -1534,7 +1534,7 @@
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
     TCGMemOp s_bits;
-    uint8_t *label_ptr[2];
+    tcg_insn_unit *label_ptr[2];
 #endif
 
     datalo = *args++;
@@ -1665,7 +1665,7 @@
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
     TCGMemOp s_bits;
-    uint8_t *label_ptr[2];
+    tcg_insn_unit *label_ptr[2];
 #endif
 
     datalo = *args++;
@@ -1731,35 +1731,24 @@
     switch(opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, args[0]);
-        tcg_out_jmp(s, (uintptr_t)tb_ret_addr);
+        tcg_out_jmp(s, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
             tcg_out8(s, OPC_JMP_long); /* jmp im */
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
             tcg_out32(s, 0);
         } else {
             /* indirect jump method */
             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                  (intptr_t)(s->tb_next + args[0]));
         }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
-        break;
-    case INDEX_op_call:
-        if (const_args[0]) {
-            tcg_out_calli(s, args[0]);
-        } else {
-            /* call *reg */
-            tcg_out_modrm(s, OPC_GRP5, EXT5_CALLN_Ev, args[0]);
-        }
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         tcg_out_jxx(s, JCC_JMP, args[0], 0);
         break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
     OP_32_64(ld8u):
         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
         tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]);
@@ -2009,9 +1998,6 @@
         tcg_out_setcond2(s, args, const_args);
         break;
 #else /* TCG_TARGET_REG_BITS == 64 */
-    case INDEX_op_movi_i64:
-        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
-        break;
     case INDEX_op_ld32s_i64:
         tcg_out_modrm_offset(s, OPC_MOVSLQ, args[0], args[1], args[2]);
         break;
@@ -2068,6 +2054,11 @@
         }
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_movi_i64:
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
@@ -2078,10 +2069,7 @@
 static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
@@ -2135,8 +2123,6 @@
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
 #else
-    { INDEX_op_mov_i64, { "r", "r" } },
-    { INDEX_op_movi_i64, { "r" } },
     { INDEX_op_ld8u_i64, { "r", "r" } },
     { INDEX_op_ld8s_i64, { "r", "r" } },
     { INDEX_op_ld16u_i64, { "r", "r" } },
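
For the i386 backend above, tcg_out_branch() keeps its "- 5" adjustment because the rel32 operand of the 5-byte CALL (E8) / JMP (E9) encodings is relative to the address of the following instruction; the label and TLB-miss fixups now go through tcg_patch32()/tcg_patch8() instead of raw pointer casts. A tiny illustrative helper for the displacement math (name invented, not QEMU code):

    #include <stdint.h>

    /* rel32 is measured from the end of the 5-byte instruction. */
    static inline int32_t example_x86_rel32(const uint8_t *opcode_addr, const uint8_t *dest)
    {
        return (int32_t)(dest - (opcode_addr + 5));
    }
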
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index dbeb16d..6c94e5c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -24,6 +24,8 @@
 #ifndef TCG_TARGET_I386 
 #define TCG_TARGET_I386 1
 
+#define TCG_TARGET_INSN_UNIT_SIZE  1
+
 #ifdef __x86_64__
 # define TCG_TARGET_REG_BITS  64
 # define TCG_TARGET_NB_REGS   16
diff --git a/tcg/ia64/tcg-target.c b/tcg/ia64/tcg-target.c
index 1f523d6..6bc9924 100644
--- a/tcg/ia64/tcg-target.c
+++ b/tcg/ia64/tcg-target.c
@@ -692,112 +692,32 @@
 
 
 /*
- * Relocations
+ * Relocations - Note that we never encode branches elsewhere than slot 2.
  */
 
-static inline void reloc_pcrel21b(void *pc, intptr_t target)
+static void reloc_pcrel21b_slot2(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    uint64_t imm;
-    int64_t disp;
-    int slot;
+    uint64_t imm = target - pc;
 
-    slot = (intptr_t)pc & 3;
-    pc = (void *)((intptr_t)pc & ~3);
-
-    disp = target - (intptr_t)pc;
-    imm = (uint64_t) disp >> 4;
-
-    switch(slot) {
-    case 0:
-        *(uint64_t *)(pc + 0) = (*(uint64_t *)(pc + 8) & 0xfffffdc00003ffffull)
-                                | ((imm & 0x100000) << 21)  /* s */
-                                | ((imm & 0x0fffff) << 18); /* imm20b */
-        break;
-    case 1:
-        *(uint64_t *)(pc + 8) = (*(uint64_t *)(pc + 8) & 0xfffffffffffb8000ull)
-                                | ((imm & 0x100000) >> 2)   /* s */
-                                | ((imm & 0x0fffe0) >> 5);  /* imm20b */
-        *(uint64_t *)(pc + 0) = (*(uint64_t *)(pc + 0) & 0x07ffffffffffffffull)
-                                | ((imm & 0x00001f) << 59); /* imm20b */
-        break;
-    case 2:
-        *(uint64_t *)(pc + 8) = (*(uint64_t *)(pc + 8) & 0xf700000fffffffffull)
-                                | ((imm & 0x100000) << 39)  /* s */
-                                | ((imm & 0x0fffff) << 36); /* imm20b */
-        break;
-    }
+    pc->hi = (pc->hi & 0xf700000fffffffffull)
+             | ((imm & 0x100000) << 39)  /* s */
+             | ((imm & 0x0fffff) << 36); /* imm20b */
 }
 
-static inline uint64_t get_reloc_pcrel21b (void *pc)
+static uint64_t get_reloc_pcrel21b_slot2(tcg_insn_unit *pc)
 {
-    int64_t low, high;
-    int slot;
+    int64_t high = pc->hi;
 
-    slot = (tcg_target_long) pc & 3;
-    pc = (void *)((tcg_target_long) pc & ~3);
-
-    low  = (*(uint64_t *)(pc + 0));
-    high = (*(uint64_t *)(pc + 8));
-
-    switch(slot) {
-    case 0:
-        return ((low >> 21) & 0x100000) + /* s */
-               ((low >> 18) & 0x0fffff);  /* imm20b */
-    case 1:
-        return ((high << 2) & 0x100000) + /* s */
-               ((high << 5) & 0x0fffe0) + /* imm20b */
-               ((low >> 59) & 0x00001f);  /* imm20b */
-    case 2:
-        return ((high >> 39) & 0x100000) + /* s */
-               ((high >> 36) & 0x0fffff);  /* imm20b */
-    default:
-        tcg_abort();
-    }
+    return ((high >> 39) & 0x100000) + /* s */
+           ((high >> 36) & 0x0fffff);  /* imm20b */
 }
 
-static inline void reloc_pcrel60b(void *pc, intptr_t target)
-{
-    int64_t disp;
-    uint64_t imm;
-
-    disp = target - (intptr_t)pc;
-    imm = (uint64_t) disp >> 4;
-
-    *(uint64_t *)(pc + 8) = (*(uint64_t *)(pc + 8) & 0xf700000fff800000ull)
-                             |  (imm & 0x0800000000000000ull)         /* s */
-                             | ((imm & 0x07fffff000000000ull) >> 36)  /* imm39 */
-                             | ((imm & 0x00000000000fffffull) << 36); /* imm20b */
-    *(uint64_t *)(pc + 0) = (*(uint64_t *)(pc + 0) & 0x00003fffffffffffull)
-                             | ((imm & 0x0000000ffff00000ull) << 28); /* imm39 */
-}
-
-static inline uint64_t get_reloc_pcrel60b (void *pc)
-{
-    int64_t low, high;
-
-    low  = (*(uint64_t *)(pc + 0));
-    high = (*(uint64_t *)(pc + 8));
-
-    return ((high)       & 0x0800000000000000ull) + /* s */
-           ((high >> 36) & 0x00000000000fffffull) + /* imm20b */
-           ((high << 36) & 0x07fffff000000000ull) + /* imm39 */
-           ((low >> 28)  & 0x0000000ffff00000ull);  /* imm39 */
-}
-
-
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    value += addend;
-    switch (type) {
-    case R_IA64_PCREL21B:
-        reloc_pcrel21b(code_ptr, value);
-        break;
-    case R_IA64_PCREL60B:
-        reloc_pcrel60b(code_ptr, value);
-    default:
-        tcg_abort();
-    }
+    assert(addend == 0);
+    assert(type == R_IA64_PCREL21B);
+    reloc_pcrel21b_slot2(code_ptr, (tcg_insn_unit *)value);
 }
 
 /*
@@ -861,7 +781,7 @@
  * Code generation
  */
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 static inline void tcg_out_bundle(TCGContext *s, int template,
                                   uint64_t slot0, uint64_t slot1,
@@ -872,9 +792,10 @@
     slot1 &= 0x1ffffffffffull; /* 41 bits */
     slot2 &= 0x1ffffffffffull; /* 41 bits */
 
-    *(uint64_t *)(s->code_ptr + 0) = (slot1 << 46) | (slot0 << 5) | template;
-    *(uint64_t *)(s->code_ptr + 8) = (slot2 << 23) | (slot1 >> 18);
-    s->code_ptr += 16;
+    *s->code_ptr++ = (tcg_insn_unit){
+        (slot1 << 46) | (slot0 << 5) | template,
+        (slot2 << 23) | (slot1 >> 18)
+    };
 }
 
 static inline uint64_t tcg_opc_mov_a(int qp, TCGReg dst, TCGReg src)
@@ -909,33 +830,34 @@
 static void tcg_out_br(TCGContext *s, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
+    uint64_t imm;
 
     /* We pay attention here to not modify the branch target by reading
        the existing value and using it again. This ensures that caches and
        memory are kept coherent during retranslation. */
+    if (l->has_value) {
+        imm = l->u.value_ptr - s->code_ptr;
+    } else {
+        imm = get_reloc_pcrel21b_slot2(s->code_ptr);
+        tcg_out_reloc(s, s->code_ptr, R_IA64_PCREL21B, label_index, 0);
+    }
+
     tcg_out_bundle(s, mmB,
                    INSN_NOP_M,
                    INSN_NOP_M,
-                   tcg_opc_b1 (TCG_REG_P0, OPC_BR_SPTK_MANY_B1,
-                               get_reloc_pcrel21b(s->code_ptr + 2)));
-
-    if (l->has_value) {
-        reloc_pcrel21b((s->code_ptr - 16) + 2, l->u.value);
-    } else {
-        tcg_out_reloc(s, (s->code_ptr - 16) + 2,
-                      R_IA64_PCREL21B, label_index, 0);
-    }
+                   tcg_opc_b1(TCG_REG_P0, OPC_BR_SPTK_MANY_B1, imm));
 }
 
-static inline void tcg_out_calli(TCGContext *s, uintptr_t addr)
+static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *desc)
 {
+    uintptr_t func = desc->lo, gp = desc->hi, disp;
+
     /* Look through the function descriptor.  */
-    uintptr_t disp, *desc = (uintptr_t *)addr;
     tcg_out_bundle(s, mlx,
                    INSN_NOP_M,
-                   tcg_opc_l2 (desc[1]),
-                   tcg_opc_x2 (TCG_REG_P0, OPC_MOVL_X2, TCG_REG_R1, desc[1]));
-    disp = (desc[0] - (uintptr_t)s->code_ptr) >> 4;
+                   tcg_opc_l2 (gp),
+                   tcg_opc_x2 (TCG_REG_P0, OPC_MOVL_X2, TCG_REG_R1, gp));
+    disp = (tcg_insn_unit *)func - s->code_ptr;
     tcg_out_bundle(s, mLX,
                    INSN_NOP_M,
                    tcg_opc_l4 (disp),
@@ -943,23 +865,8 @@
                                TCG_REG_B0, disp));
 }
 
-static inline void tcg_out_callr(TCGContext *s, TCGReg addr)
-{
-    tcg_out_bundle(s, MmI,
-                   tcg_opc_m1 (TCG_REG_P0, OPC_LD8_M1, TCG_REG_R2, addr),
-                   tcg_opc_a4 (TCG_REG_P0, OPC_ADDS_A4, TCG_REG_R3, 8, addr),
-                   tcg_opc_i21(TCG_REG_P0, OPC_MOV_I21,
-                               TCG_REG_B6, TCG_REG_R2, 0));
-    tcg_out_bundle(s, mmB,
-                   tcg_opc_m1 (TCG_REG_P0, OPC_LD8_M1, TCG_REG_R1, TCG_REG_R3),
-                   INSN_NOP_M,
-                   tcg_opc_b5 (TCG_REG_P0, OPC_BR_CALL_SPTK_MANY_B5,
-                               TCG_REG_B0, TCG_REG_B6));
-}
-
 static void tcg_out_exit_tb(TCGContext *s, tcg_target_long arg)
 {
-    int64_t disp;
     uint64_t imm, opc1;
 
     /* At least arg == 0 is a common operation.  */
@@ -970,8 +877,7 @@
         opc1 = INSN_NOP_M;
     }
 
-    disp = tb_ret_addr - s->code_ptr;
-    imm = (uint64_t)disp >> 4;
+    imm = tb_ret_addr - s->code_ptr;
 
     tcg_out_bundle(s, mLX,
                    opc1,
@@ -1000,7 +906,7 @@
                        tcg_opc_b4 (TCG_REG_P0, OPC_BR_SPTK_MANY_B4,
                                    TCG_REG_B6));
     }
-    s->tb_next_offset[arg] = s->code_ptr - s->code_buf;
+    s->tb_next_offset[arg] = tcg_current_code_size(s);
 }
 
 static inline void tcg_out_jmp(TCGContext *s, TCGArg addr)
@@ -1521,19 +1427,22 @@
                                   TCGReg arg2, int label_index, int cmp4)
 {
     TCGLabel *l = &s->labels[label_index];
+    uint64_t imm;
+
+    /* We pay attention here to not modify the branch target by reading
+       the existing value and using it again. This ensures that caches and
+       memory are kept coherent during retranslation. */
+    if (l->has_value) {
+        imm = l->u.value_ptr - s->code_ptr;
+    } else {
+        imm = get_reloc_pcrel21b_slot2(s->code_ptr);
+        tcg_out_reloc(s, s->code_ptr, R_IA64_PCREL21B, label_index, 0);
+    }
 
     tcg_out_bundle(s, miB,
                    INSN_NOP_M,
                    tcg_opc_cmp_a(TCG_REG_P0, cond, arg1, arg2, cmp4),
-                   tcg_opc_b1(TCG_REG_P6, OPC_BR_DPTK_FEW_B1,
-                              get_reloc_pcrel21b(s->code_ptr + 2)));
-
-    if (l->has_value) {
-        reloc_pcrel21b((s->code_ptr - 16) + 2, l->u.value);
-    } else {
-        tcg_out_reloc(s, (s->code_ptr - 16) + 2,
-                      R_IA64_PCREL21B, label_index, 0);
-    }
+                   tcg_opc_b1(TCG_REG_P6, OPC_BR_DPTK_FEW_B1, imm));
 }
 
 static inline void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGArg ret,
@@ -1646,7 +1555,7 @@
 typedef struct TCGLabelQemuLdst {
     bool is_ld;
     TCGMemOp size;
-    uint8_t *label_ptr;     /* label pointers to be updated */
+    tcg_insn_unit *label_ptr;     /* label pointers to be updated */
 } TCGLabelQemuLdst;
 
 typedef struct TCGBackendData {
@@ -1660,7 +1569,7 @@
 }
 
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
-                                uint8_t *label_ptr)
+                                tcg_insn_unit *label_ptr)
 {
     TCGBackendData *be = s->be;
     TCGLabelQemuLdst *l = &be->ldst_labels[be->nb_ldst_labels++];
@@ -1683,43 +1592,44 @@
         helper_le_ldul_mmu,
         helper_le_ldq_mmu,
     };
-    uintptr_t thunks[8] = { };
+    tcg_insn_unit *thunks[8] = { };
     TCGBackendData *be = s->be;
     size_t i, n = be->nb_ldst_labels;
 
     for (i = 0; i < n; i++) {
         TCGLabelQemuLdst *l = &be->ldst_labels[i];
         long x = l->is_ld * 4 + l->size;
-        uintptr_t dest = thunks[x];
+        tcg_insn_unit *dest = thunks[x];
 
         /* The out-of-line thunks are all the same; load the return address
            from B0, load the GP, and branch to the code.  Note that we are
            always post-call, so the register window has rolled, so we're
            using incoming parameter register numbers, not outgoing.  */
-        if (dest == 0) {
-            uintptr_t disp, *desc = (uintptr_t *)helpers[x];
+        if (dest == NULL) {
+            uintptr_t *desc = (uintptr_t *)helpers[x];
+            uintptr_t func = desc[0], gp = desc[1], disp;
 
-            thunks[x] = dest = (uintptr_t)s->code_ptr;
+            thunks[x] = dest = s->code_ptr;
 
             tcg_out_bundle(s, mlx,
                            INSN_NOP_M,
-                           tcg_opc_l2 (desc[1]),
+                           tcg_opc_l2 (gp),
                            tcg_opc_x2 (TCG_REG_P0, OPC_MOVL_X2,
-                                       TCG_REG_R1, desc[1]));
+                                       TCG_REG_R1, gp));
             tcg_out_bundle(s, mii,
                            INSN_NOP_M,
                            INSN_NOP_I,
                            tcg_opc_i22(TCG_REG_P0, OPC_MOV_I22,
                                        l->is_ld ? TCG_REG_R35 : TCG_REG_R36,
                                        TCG_REG_B0));
-            disp = (desc[0] - (uintptr_t)s->code_ptr) >> 4;
+            disp = (tcg_insn_unit *)func - s->code_ptr;
             tcg_out_bundle(s, mLX,
                            INSN_NOP_M,
                            tcg_opc_l3 (disp),
                            tcg_opc_x3 (TCG_REG_P0, OPC_BRL_SPTK_MANY_X3, disp));
         }
 
-        reloc_pcrel21b(l->label_ptr, dest);
+        reloc_pcrel21b_slot2(l->label_ptr, dest);
     }
 }
 
@@ -1731,7 +1641,7 @@
     int addr_reg, data_reg, mem_index;
     TCGMemOp opc, s_bits;
     uint64_t fin1, fin2;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
@@ -1765,13 +1675,13 @@
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
                    tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R58, mem_index));
-    label_ptr = s->code_ptr + 2;
+    label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m1 (TCG_REG_P6, opc_ld_m1[s_bits],
                                TCG_REG_R8, TCG_REG_R2),
                    INSN_NOP_I,
                    tcg_opc_b3 (TCG_REG_P7, OPC_BR_CALL_SPNT_FEW_B3, TCG_REG_B0,
-                               get_reloc_pcrel21b(label_ptr)));
+                               get_reloc_pcrel21b_slot2(label_ptr)));
 
     add_qemu_ldst_label(s, 1, opc, label_ptr);
 
@@ -1792,7 +1702,7 @@
     int mem_index;
     uint64_t pre1, pre2;
     TCGMemOp opc, s_bits;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 
     data_reg = args[0];
     addr_reg = args[1];
@@ -1827,13 +1737,13 @@
                    tcg_opc_a1 (TCG_REG_P6, OPC_ADD_A1, TCG_REG_R2,
                                TCG_REG_R2, TCG_REG_R57),
                    tcg_opc_movi_a(TCG_REG_P7, TCG_REG_R59, mem_index));
-    label_ptr = s->code_ptr + 2;
+    label_ptr = s->code_ptr;
     tcg_out_bundle(s, miB,
                    tcg_opc_m4 (TCG_REG_P6, opc_st_m4[s_bits],
                                TCG_REG_R58, TCG_REG_R2),
                    INSN_NOP_I,
                    tcg_opc_b3 (TCG_REG_P7, OPC_BR_CALL_SPNT_FEW_B3, TCG_REG_B0,
-                               get_reloc_pcrel21b(label_ptr)));
+                               get_reloc_pcrel21b_slot2(label_ptr)));
 
     add_qemu_ldst_label(s, 0, opc, label_ptr);
 }
@@ -2085,24 +1995,10 @@
     case INDEX_op_br:
         tcg_out_br(s, args[0]);
         break;
-    case INDEX_op_call:
-        if (likely(const_args[0])) {
-            tcg_out_calli(s, args[0]);
-        } else {
-            tcg_out_callr(s, args[0]);
-        }
-        break;
     case INDEX_op_goto_tb:
         tcg_out_goto_tb(s, args[0]);
         break;
 
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
-    case INDEX_op_movi_i64:
-        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
-        break;
-
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
         tcg_out_ld_rel(s, OPC_LD1_M1, args[0], args[1], args[2]);
@@ -2312,6 +2208,11 @@
         tcg_out_qemu_st(s, args);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_movi_i64:
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
@@ -2319,13 +2220,9 @@
 
 static const TCGTargetOpDef ia64_op_defs[] = {
     { INDEX_op_br, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
-
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
@@ -2367,9 +2264,6 @@
     { INDEX_op_setcond_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_movcond_i32, { "r", "rZ", "rZ", "rI", "rI" } },
 
-    { INDEX_op_mov_i64, { "r", "r" } },
-    { INDEX_op_movi_i64, { "r" } },
-
     { INDEX_op_ld8u_i64, { "r", "r" } },
     { INDEX_op_ld8s_i64, { "r", "r" } },
     { INDEX_op_ld16u_i64, { "r", "r" } },
@@ -2442,8 +2336,11 @@
                   CPU_TEMP_BUF_NLONGS * sizeof(long));
 
     /* First emit adhoc function descriptor */
-    *(uint64_t *)(s->code_ptr) = (uint64_t)s->code_ptr + 16; /* entry point */
-    s->code_ptr += 16; /* skip GP */
+    *s->code_ptr = (tcg_insn_unit){
+        (uint64_t)(s->code_ptr + 1), /* entry point */
+        0                            /* skip gp */
+    };
+    s->code_ptr++;
 
     /* prologue */
     tcg_out_bundle(s, miI,
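
The ia64 hunks above collapse the old per-slot relocation code to the slot-2 form only, now operating on a whole 16-byte bundle as one tcg_insn_unit. As a quick check of that bit layout (the sign bit lands in bit 59 of the high word, imm20b in bits 36..55), here is a minimal, compilable round-trip sketch; the bundle struct and helper names are illustrative stand-ins, not code from the patch.

    /* Sketch only: mirrors the field placement used by reloc_pcrel21b_slot2
       and get_reloc_pcrel21b_slot2 above. */
    #include <assert.h>
    #include <stdint.h>

    typedef struct {
        uint64_t lo;
        uint64_t hi;
    } bundle;

    static void put_imm21(bundle *b, uint64_t imm)
    {
        b->hi = (b->hi & 0xf700000fffffffffull)
                | ((imm & 0x100000) << 39)   /* s bit -> bit 59 of hi */
                | ((imm & 0x0fffff) << 36);  /* imm20b -> bits 36..55 of hi */
    }

    static uint64_t get_imm21(const bundle *b)
    {
        return ((b->hi >> 39) & 0x100000)    /* s */
             + ((b->hi >> 36) & 0x0fffff);   /* imm20b */
    }

    int main(void)
    {
        bundle b = { 0, ~0ull };             /* unrelated slot bits are preserved */
        put_imm21(&b, 0x123456);             /* any 21-bit value round-trips */
        assert(get_imm21(&b) == 0x123456);
        return 0;
    }
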
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index d834beb..3a59b50 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -25,6 +25,12 @@
 #ifndef TCG_TARGET_IA64 
 #define TCG_TARGET_IA64 1
 
+#define TCG_TARGET_INSN_UNIT_SIZE 16
+typedef struct {
+    uint64_t lo __attribute__((aligned(16)));
+    uint64_t hi;
+} tcg_insn_unit;
+
 /* We only map the first 64 registers */
 #define TCG_TARGET_NB_REGS 64
 typedef enum {
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 37241b2..0ae495c 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -108,83 +108,38 @@
     TCG_REG_V1
 };
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
-static inline uint32_t reloc_lo16_val(void *pc, intptr_t target)
+static inline uint32_t reloc_pc16_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    return target & 0xffff;
+    /* Let the compiler perform the right-shift as part of the arithmetic.  */
+    ptrdiff_t disp = target - (pc + 1);
+    assert(disp == (int16_t)disp);
+    return disp & 0xffff;
 }
 
-static inline void reloc_lo16(void *pc, intptr_t target)
+static inline void reloc_pc16(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0xffff)
-                       | reloc_lo16_val(pc, target);
+    *pc = deposit32(*pc, 0, 16, reloc_pc16_val(pc, target));
 }
 
-static inline uint32_t reloc_hi16_val(void *pc, intptr_t target)
+static inline uint32_t reloc_26_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    return (target >> 16) & 0xffff;
+    assert((((uintptr_t)pc ^ (uintptr_t)target) & 0xf0000000) == 0);
+    return ((uintptr_t)target >> 2) & 0x3ffffff;
 }
 
-static inline void reloc_hi16(void *pc, intptr_t target)
+static inline void reloc_26(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0xffff)
-                       | reloc_hi16_val(pc, target);
+    *pc = deposit32(*pc, 0, 26, reloc_26_val(pc, target));
 }
 
-static inline uint32_t reloc_pc16_val(void *pc, intptr_t target)
-{
-    int32_t disp;
-
-    disp = target - (intptr_t)pc - 4;
-    if (disp != (disp << 14) >> 14) {
-        tcg_abort ();
-    }
-
-    return (disp >> 2) & 0xffff;
-}
-
-static inline void reloc_pc16 (void *pc, tcg_target_long target)
-{
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0xffff)
-                       | reloc_pc16_val(pc, target);
-}
-
-static inline uint32_t reloc_26_val (void *pc, tcg_target_long target)
-{
-    if ((((tcg_target_long)pc + 4) & 0xf0000000) != (target & 0xf0000000)) {
-        tcg_abort ();
-    }
-
-    return (target >> 2) & 0x3ffffff;
-}
-
-static inline void reloc_pc26(void *pc, intptr_t target)
-{
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0x3ffffff)
-                       | reloc_26_val(pc, target);
-}
-
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    value += addend;
-    switch(type) {
-    case R_MIPS_LO16:
-        reloc_lo16(code_ptr, value);
-        break;
-    case R_MIPS_HI16:
-        reloc_hi16(code_ptr, value);
-        break;
-    case R_MIPS_PC16:
-        reloc_pc16(code_ptr, value);
-        break;
-    case R_MIPS_26:
-        reloc_pc26(code_ptr, value);
-        break;
-    default:
-        tcg_abort();
-    }
+    assert(type == R_MIPS_PC16);
+    assert(addend == 0);
+    reloc_pc16(code_ptr, (tcg_insn_unit *)value);
 }
 
 /* parse target specific constraints */
@@ -198,11 +153,6 @@
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
         break;
-    case 'C':
-        ct->ct |= TCG_CT_REG;
-        tcg_regset_clear(ct->u.regs);
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_T9);
-        break;
     case 'L': /* qemu_ld output arg constraint */
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
@@ -374,7 +324,7 @@
     /* We pay attention here to not modify the branch target by reading
        the existing value and using it again. This ensures that caches and
        memory are kept coherent during retranslation. */
-    uint16_t offset = (uint16_t)(*(uint32_t *) s->code_ptr);
+    uint16_t offset = (uint16_t)*s->code_ptr;
 
     tcg_out_opc_imm(s, opc, rt, rs, offset);
 }
@@ -663,9 +613,9 @@
         break;
     }
     if (l->has_value) {
-        reloc_pc16(s->code_ptr - 4, l->u.value);
+        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
     } else {
-        tcg_out_reloc(s, s->code_ptr - 4, R_MIPS_PC16, label_index, 0);
+        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, label_index, 0);
     }
     tcg_out_nop(s);
 }
@@ -676,7 +626,7 @@
                             TCGArg arg2, TCGArg arg3, TCGArg arg4,
                             int label_index)
 {
-    void *label_ptr;
+    tcg_insn_unit *label_ptr;
 
     switch(cond) {
     case TCG_COND_NE:
@@ -733,7 +683,7 @@
         tcg_abort();
     }
 
-    reloc_pc16(label_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label_ptr, s->code_ptr);
 }
 
 static void tcg_out_movcond(TCGContext *s, TCGCond cond, TCGReg ret,
@@ -945,12 +895,12 @@
 {
     TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
-    void *label1_ptr, *label2_ptr;
+    tcg_insn_unit *label1_ptr, *label2_ptr;
     int arg_num;
     int mem_index, s_bits;
     int addr_meml;
 # if TARGET_LONG_BITS == 64
-    uint8_t *label3_ptr;
+    tcg_insn_unit *label3_ptr;
     TCGReg addr_regh;
     int addr_memh;
 # endif
@@ -1011,7 +961,7 @@
     tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
     tcg_out_nop(s);
 
-    reloc_pc16(label3_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label3_ptr, s->code_ptr);
 # else
     label1_ptr = s->code_ptr;
     tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
@@ -1060,7 +1010,7 @@
     tcg_out_nop(s);
 
     /* label1: fast path */
-    reloc_pc16(label1_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label1_ptr, s->code_ptr);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
                     offsetof(CPUArchState, tlb_table[mem_index][0].addend));
@@ -1121,7 +1071,7 @@
     }
 
 #if defined(CONFIG_SOFTMMU)
-    reloc_pc16(label2_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label2_ptr, s->code_ptr);
 #endif
 }
 
@@ -1130,14 +1080,14 @@
 {
     TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
-    uint8_t *label1_ptr, *label2_ptr;
+    tcg_insn_unit *label1_ptr, *label2_ptr;
     int arg_num;
     int mem_index, s_bits;
     int addr_meml;
 #endif
 #if TARGET_LONG_BITS == 64
 # if defined(CONFIG_SOFTMMU)
-    uint8_t *label3_ptr;
+    tcg_insn_unit *label3_ptr;
     TCGReg addr_regh;
     int addr_memh;
 # endif
@@ -1200,7 +1150,7 @@
     tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
     tcg_out_nop(s);
 
-    reloc_pc16(label3_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label3_ptr, s->code_ptr);
 # else
     label1_ptr = s->code_ptr;
     tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
@@ -1241,7 +1191,7 @@
     tcg_out_nop(s);
 
     /* label1: fast path */
-    reloc_pc16(label1_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label1_ptr, s->code_ptr);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
                     offsetof(CPUArchState, tlb_table[mem_index][0].addend));
@@ -1293,17 +1243,24 @@
     }
 
 #if defined(CONFIG_SOFTMMU)
-    reloc_pc16(label2_ptr, (tcg_target_long) s->code_ptr);
+    reloc_pc16(label2_ptr, s->code_ptr);
 #endif
 }
 
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
+{
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T9, (intptr_t)target);
+    tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
+    tcg_out_nop(s);
+}
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
     switch(opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_V0, args[0]);
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, (tcg_target_long)tb_ret_addr);
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT, (uintptr_t)tb_ret_addr);
         tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_AT, 0);
         tcg_out_nop(s);
         break;
@@ -1313,28 +1270,18 @@
             tcg_abort();
         } else {
             /* indirect jump method */
-            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT, (tcg_target_long)(s->tb_next + args[0]));
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_AT,
+                         (uintptr_t)(s->tb_next + args[0]));
             tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_AT, TCG_REG_AT, 0);
             tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_AT, 0);
         }
         tcg_out_nop(s);
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
-        break;
-    case INDEX_op_call:
-        tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, args[0], 0);
-        tcg_out_nop(s);
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO, args[0]);
         break;
 
-    case INDEX_op_mov_i32:
-        tcg_out_mov(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
-
     case INDEX_op_ld8u_i32:
         tcg_out_ldst(s, OPC_LBU, args[0], args[1], args[2]);
         break;
@@ -1582,6 +1529,9 @@
         tcg_out_qemu_st(s, args, 3);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
@@ -1590,11 +1540,8 @@
 static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "C" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
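
In the MIPS conversion above, reloc_pc16_val now relies on pointer subtraction over 4-byte tcg_insn_units, so the displacement comes out already scaled and is measured from the delay-slot instruction (pc + 1). A small stand-alone sketch of that arithmetic, with made-up buffer positions (the names here are illustrative, not TCG APIs):

    /* Sketch only: same displacement rule as the new reloc_pc16_val above. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t insn_unit;              /* TCG_TARGET_INSN_UNIT_SIZE is 4 */

    static uint32_t pc16_val(insn_unit *pc, insn_unit *target)
    {
        /* Pointer subtraction is already in 4-byte units; the reference point
           is pc + 1 because the branch is taken relative to its delay slot.  */
        ptrdiff_t disp = target - (pc + 1);
        assert(disp == (int16_t)disp);
        return disp & 0xffff;
    }

    int main(void)
    {
        insn_unit buf[16];
        assert(pc16_val(&buf[2], &buf[10]) == 7);       /* 7 insns forward */
        assert(pc16_val(&buf[10], &buf[2]) == 0xfff7);  /* -9, low 16 bits */
        return 0;
    }
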
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 9576db5..c6d2267 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -26,6 +26,7 @@
 #ifndef TCG_TARGET_MIPS 
 #define TCG_TARGET_MIPS 1
 
+#define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
 
 typedef enum {
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 0302f4f..3a504a1 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -513,12 +513,8 @@
 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
 {
-    int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args;
-    tcg_target_ulong mask, affected;
-    TCGOpcode op;
-    const TCGOpDef *def;
+    int nb_ops, op_index, nb_temps, nb_globals;
     TCGArg *gen_args;
-    TCGArg tmp;
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -532,22 +528,27 @@
     nb_ops = tcg_opc_ptr - s->gen_opc_buf;
     gen_args = args;
     for (op_index = 0; op_index < nb_ops; op_index++) {
-        op = s->gen_opc_buf[op_index];
-        def = &tcg_op_defs[op];
-        /* Do copy propagation */
+        TCGOpcode op = s->gen_opc_buf[op_index];
+        const TCGOpDef *def = &tcg_op_defs[op];
+        tcg_target_ulong mask, affected;
+        int nb_oargs, nb_iargs, nb_args, i;
+        TCGArg tmp;
+
         if (op == INDEX_op_call) {
-            int nb_oargs = args[0] >> 16;
-            int nb_iargs = args[0] & 0xffff;
-            for (i = nb_oargs + 1; i < nb_oargs + nb_iargs + 1; i++) {
-                if (temps[args[i]].state == TCG_TEMP_COPY) {
-                    args[i] = find_better_copy(s, args[i]);
-                }
-            }
+            *gen_args++ = tmp = *args++;
+            nb_oargs = tmp >> 16;
+            nb_iargs = tmp & 0xffff;
+            nb_args = nb_oargs + nb_iargs + def->nb_cargs;
         } else {
-            for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
-                if (temps[args[i]].state == TCG_TEMP_COPY) {
-                    args[i] = find_better_copy(s, args[i]);
-                }
+            nb_oargs = def->nb_oargs;
+            nb_iargs = def->nb_iargs;
+            nb_args = def->nb_args;
+        }
+
+        /* Do copy propagation */
+        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+            if (temps[args[i]].state == TCG_TEMP_COPY) {
+                args[i] = find_better_copy(s, args[i]);
             }
         }
 
@@ -882,7 +883,7 @@
 
         CASE_OP_32_64(qemu_ld):
             {
-                TCGMemOp mop = args[def->nb_oargs + def->nb_iargs];
+                TCGMemOp mop = args[nb_oargs + nb_iargs];
                 if (!(mop & MO_SIGN)) {
                     mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
@@ -900,15 +901,15 @@
         }
 
         if (mask == 0) {
-            assert(def->nb_oargs == 1);
+            assert(nb_oargs == 1);
             s->gen_opc_buf[op_index] = op_to_movi(op);
             tcg_opt_gen_movi(gen_args, args[0], 0);
-            args += def->nb_oargs + def->nb_iargs + def->nb_cargs;
+            args += nb_args;
             gen_args += 2;
             continue;
         }
         if (affected == 0) {
-            assert(def->nb_oargs == 1);
+            assert(nb_oargs == 1);
             if (temps_are_copies(args[0], args[1])) {
                 s->gen_opc_buf[op_index] = INDEX_op_nop;
             } else if (temps[args[1]].state != TCG_TEMP_CONST) {
@@ -920,7 +921,7 @@
                 tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
                 gen_args += 2;
             }
-            args += def->nb_iargs + 1;
+            args += nb_args;
             continue;
         }
 
@@ -1246,24 +1247,13 @@
             break;
 
         case INDEX_op_call:
-            nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
-            if (!(args[nb_call_args + 1] & (TCG_CALL_NO_READ_GLOBALS |
-                                            TCG_CALL_NO_WRITE_GLOBALS))) {
+            if (!(args[nb_oargs + nb_iargs + 1]
+                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     reset_temp(i);
                 }
             }
-            for (i = 0; i < (args[0] >> 16); i++) {
-                reset_temp(args[i + 1]);
-            }
-            i = nb_call_args + 3;
-            while (i) {
-                *gen_args = *args;
-                args++;
-                gen_args++;
-                i--;
-            }
-            break;
+            goto do_reset_output;
 
         default:
         do_default:
@@ -1275,7 +1265,8 @@
             if (def->flags & TCG_OPF_BB_END) {
                 reset_all_temps(nb_temps);
             } else {
-                for (i = 0; i < def->nb_oargs; i++) {
+        do_reset_output:
+                for (i = 0; i < nb_oargs; i++) {
                     reset_temp(args[i]);
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
@@ -1284,11 +1275,11 @@
                     }
                 }
             }
-            for (i = 0; i < def->nb_args; i++) {
+            for (i = 0; i < nb_args; i++) {
                 gen_args[i] = args[i];
             }
-            args += def->nb_args;
-            gen_args += def->nb_args;
+            args += nb_args;
+            gen_args += nb_args;
             break;
         }
     }
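
The optimize.c rework above unpacks the call op's argument counts once, from the packed first word, and then reuses nb_oargs/nb_iargs/nb_args everywhere the old code re-derived them. The packing itself is just high/low 16-bit fields, as this trivial sketch shows (pack_call_counts is an illustrative helper, not a TCG function):

    /* Sketch only: the same unpacking as "tmp >> 16" / "tmp & 0xffff" above. */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t pack_call_counts(uint32_t nb_oargs, uint32_t nb_iargs)
    {
        return (nb_oargs << 16) | (nb_iargs & 0xffff);
    }

    int main(void)
    {
        uint32_t tmp = pack_call_counts(1, 3);  /* e.g. one result, three inputs */
        assert((tmp >> 16) == 1);               /* nb_oargs */
        assert((tmp & 0xffff) == 3);            /* nb_iargs */
        return 0;
    }
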
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 83d9340..436b65b 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -24,7 +24,7 @@
 
 #include "tcg-be-ldst.h"
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 #if defined _CALL_DARWIN || defined __APPLE__
 #define TCG_TARGET_CALL_DARWIN
@@ -171,50 +171,47 @@
     TCG_REG_R31
 };
 
-static uint32_t reloc_pc24_val (void *pc, tcg_target_long target)
+static inline bool in_range_b(tcg_target_long target)
 {
-    tcg_target_long disp;
+    return target == sextract32(target, 0, 26);
+}
 
-    disp = target - (tcg_target_long) pc;
-    if ((disp << 6) >> 6 != disp)
-        tcg_abort ();
-
+static uint32_t reloc_pc24_val(tcg_insn_unit *pc, tcg_insn_unit *target)
+{
+    ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
+    assert(in_range_b(disp));
     return disp & 0x3fffffc;
 }
 
-static void reloc_pc24 (void *pc, tcg_target_long target)
+static void reloc_pc24(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0x3fffffc)
-        | reloc_pc24_val (pc, target);
+    *pc = (*pc & ~0x3fffffc) | reloc_pc24_val(pc, target);
 }
 
-static uint16_t reloc_pc14_val (void *pc, tcg_target_long target)
+static uint16_t reloc_pc14_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    tcg_target_long disp;
-
-    disp = target - (tcg_target_long) pc;
-    if (disp != (int16_t) disp)
-        tcg_abort ();
-
+    ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
+    assert(disp == (int16_t) disp);
     return disp & 0xfffc;
 }
 
-static void reloc_pc14 (void *pc, tcg_target_long target)
+static void reloc_pc14(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *) pc = (*(uint32_t *) pc & ~0xfffc)
-        | reloc_pc14_val (pc, target);
+    *pc = (*pc & ~0xfffc) | reloc_pc14_val(pc, target);
 }
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    value += addend;
+    tcg_insn_unit *target = (tcg_insn_unit *)value;
+
+    assert(addend == 0);
     switch (type) {
     case R_PPC_REL14:
-        reloc_pc14 (code_ptr, value);
+        reloc_pc14(code_ptr, target);
         break;
     case R_PPC_REL24:
-        reloc_pc24 (code_ptr, value);
+        reloc_pc24(code_ptr, target);
         break;
     default:
         tcg_abort();
@@ -480,47 +477,36 @@
     }
 }
 
-static void tcg_out_b (TCGContext *s, int mask, tcg_target_long target)
+static void tcg_out_b(TCGContext *s, int mask, tcg_insn_unit *target)
 {
-    tcg_target_long disp;
-
-    disp = target - (tcg_target_long) s->code_ptr;
-    if ((disp << 6) >> 6 == disp)
-        tcg_out32 (s, B | (disp & 0x3fffffc) | mask);
-    else {
-        tcg_out_movi (s, TCG_TYPE_I32, 0, (tcg_target_long) target);
-        tcg_out32 (s, MTSPR | RS (0) | CTR);
-        tcg_out32 (s, BCCTR | BO_ALWAYS | mask);
+    ptrdiff_t disp = tcg_pcrel_diff(s, target);
+    if (in_range_b(disp)) {
+        tcg_out32(s, B | (disp & 0x3fffffc) | mask);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, (uintptr_t)target);
+        tcg_out32(s, MTSPR | RS(TCG_REG_R0) | CTR);
+        tcg_out32(s, BCCTR | BO_ALWAYS | mask);
     }
 }
 
-static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg,
-                          int lk)
+static void tcg_out_call1(TCGContext *s, tcg_insn_unit *target, int lk)
 {
 #ifdef _CALL_AIX
-    int reg;
-
-    if (const_arg) {
-        reg = 2;
-        tcg_out_movi (s, TCG_TYPE_I32, reg, arg);
-    }
-    else reg = arg;
-
-    tcg_out32 (s, LWZ | RT (0) | RA (reg));
-    tcg_out32 (s, MTSPR | RA (0) | CTR);
-    tcg_out32 (s, LWZ | RT (2) | RA (reg) | 4);
-    tcg_out32 (s, BCCTR | BO_ALWAYS | lk);
+    tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, (uintptr_t)target);
+    tcg_out32(s, LWZ | RT(TCG_REG_R0) | RA(TCG_REG_R2));
+    tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
+    tcg_out32(s, LWZ | RT(TCG_REG_R2) | RA(TCG_REG_R2) | 4);
+    tcg_out32(s, BCCTR | BO_ALWAYS | lk);
 #else
-    if (const_arg) {
-        tcg_out_b (s, lk, arg);
-    }
-    else {
-        tcg_out32 (s, MTSPR | RS (arg) | LR);
-        tcg_out32 (s, BCLR | BO_ALWAYS | lk);
-    }
+    tcg_out_b(s, lk, target);
 #endif
 }
 
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
+{
+    tcg_out_call1(s, target, LK);
+}
+
 #if defined(CONFIG_SOFTMMU)
 
 static void add_qemu_ldst_label (TCGContext *s,
@@ -531,8 +517,8 @@
                                  int addrlo_reg,
                                  int addrhi_reg,
                                  int mem_index,
-                                 uint8_t *raddr,
-                                 uint8_t *label_ptr)
+                                 tcg_insn_unit *raddr,
+                                 tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
@@ -550,7 +536,7 @@
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_LEUW] = helper_le_lduw_mmu,
     [MO_LEUL] = helper_le_ldul_mmu,
@@ -563,7 +549,7 @@
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
     [MO_LEUW] = helper_le_stw_mmu,
     [MO_LEUL] = helper_le_stl_mmu,
@@ -573,8 +559,8 @@
     [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static void *ld_trampolines[16];
-static void *st_trampolines[16];
+static tcg_insn_unit *ld_trampolines[16];
+static tcg_insn_unit *st_trampolines[16];
 
 /* Perform the TLB load and compare.  Branches to the slow path, placing the
    address of the branch in *LABEL_PTR.  Loads the addend of the TLB into R0.
@@ -582,14 +568,15 @@
 
 static void tcg_out_tlb_check(TCGContext *s, TCGReg r0, TCGReg r1, TCGReg r2,
                               TCGReg addrlo, TCGReg addrhi, TCGMemOp s_bits,
-                              int mem_index, int is_load, uint8_t **label_ptr)
+                              int mem_index, int is_load,
+                              tcg_insn_unit **label_ptr)
 {
     int cmp_off =
         (is_load
          ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
          : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
-    uint16_t retranst;
+    tcg_insn_unit retranst;
     TCGReg base = TCG_AREG0;
 
     /* Extract the page index, shifted into place for tlb index.  */
@@ -648,7 +635,7 @@
        This address cannot be used for a tail call, but it's shorter
        than forming an address from scratch.  */
     *label_ptr = s->code_ptr;
-    retranst = ((uint16_t *) s->code_ptr)[1] & ~3;
+    retranst = *s->code_ptr & 0xfffc;
     tcg_out32(s, BC | BI(7, CR_EQ) | retranst | BO_COND_FALSE | LK);
 }
 #endif
@@ -659,7 +646,7 @@
     TCGMemOp opc, bswap;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
     datalo = *args++;
@@ -731,7 +718,7 @@
     TCGMemOp opc, bswap, s_bits;
 #ifdef CONFIG_SOFTMMU
     int mem_index;
-    uint8_t *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
     datalo = *args++;
@@ -790,7 +777,7 @@
     TCGReg ir, datalo, datahi;
     TCGMemOp opc = l->opc;
 
-    reloc_pc14 (l->label_ptr[0], (uintptr_t)s->code_ptr);
+    reloc_pc14(l->label_ptr[0], s->code_ptr);
 
     ir = TCG_REG_R4;
     if (TARGET_LONG_BITS == 32) {
@@ -804,7 +791,7 @@
     }
     tcg_out_movi(s, TCG_TYPE_I32, ir++, l->mem_index);
     tcg_out32(s, MFSPR | RT(ir++) | LR);
-    tcg_out_b(s, LK, (uintptr_t)ld_trampolines[opc & ~MO_SIGN]);
+    tcg_out_b(s, LK, ld_trampolines[opc & ~MO_SIGN]);
 
     datalo = l->datalo_reg;
     switch (opc & MO_SSIZE) {
@@ -832,7 +819,7 @@
         }
         break;
     }
-    tcg_out_b (s, 0, (uintptr_t)l->raddr);
+    tcg_out_b(s, 0, l->raddr);
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
@@ -840,7 +827,7 @@
     TCGReg ir, datalo;
     TCGMemOp opc = l->opc;
 
-    reloc_pc14 (l->label_ptr[0], (tcg_target_long) s->code_ptr);
+    reloc_pc14(l->label_ptr[0], s->code_ptr);
 
     ir = TCG_REG_R4;
     if (TARGET_LONG_BITS == 32) {
@@ -878,16 +865,16 @@
 
     tcg_out_movi(s, TCG_TYPE_I32, ir++, l->mem_index);
     tcg_out32(s, MFSPR | RT(ir++) | LR);
-    tcg_out_b(s, LK, (uintptr_t)st_trampolines[opc]);
-    tcg_out_b(s, 0, (uintptr_t)l->raddr);
+    tcg_out_b(s, LK, st_trampolines[opc]);
+    tcg_out_b(s, 0, l->raddr);
 }
 #endif
 
 #ifdef CONFIG_SOFTMMU
-static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
+static void emit_ldst_trampoline(TCGContext *s, tcg_insn_unit *ptr)
 {
-    tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
-    tcg_out_call (s, (tcg_target_long) ptr, 1, 0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_AREG0);
+    tcg_out_call1(s, ptr, 0);
 }
 #endif
 
@@ -909,12 +896,13 @@
 
 #ifdef _CALL_AIX
     {
-        uint32_t addr;
+        uintptr_t addr;
 
         /* First emit adhoc function descriptor */
-        addr = (uint32_t) s->code_ptr + 12;
-        tcg_out32 (s, addr);        /* entry point */
-        s->code_ptr += 8;           /* skip TOC and environment pointer */
+        addr = (uintptr_t)s->code_ptr + 12;
+        tcg_out32(s, addr);        /* entry point */
+        tcg_out32(s, 0);           /* toc */
+        tcg_out32(s, 0);           /* environment pointer */
     }
 #endif
     tcg_out32 (s, MFSPR | RT (0) | LR);
@@ -1065,18 +1053,17 @@
 
 }
 
-static void tcg_out_bc (TCGContext *s, int bc, int label_index)
+static void tcg_out_bc(TCGContext *s, int bc, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
-    if (l->has_value)
-        tcg_out32 (s, bc | reloc_pc14_val (s->code_ptr, l->u.value));
-    else {
-        uint16_t val = *(uint16_t *) &s->code_ptr[2];
-
+    if (l->has_value) {
+        tcg_out32(s, bc | reloc_pc14_val(s->code_ptr, l->u.value_ptr));
+    } else {
         /* Thanks to Andrzej Zaborowski */
-        tcg_out32 (s, bc | (val & 0xfffc));
-        tcg_out_reloc (s, s->code_ptr - 4, R_PPC_REL14, label_index, 0);
+        tcg_insn_unit retrans = *s->code_ptr & 0xfffc;
+        tcg_out_reloc(s, s->code_ptr, R_PPC_REL14, label_index, 0);
+        tcg_out32(s, bc | retrans);
     }
 }
 
@@ -1367,43 +1354,33 @@
 {
     switch (opc) {
     case INDEX_op_exit_tb:
-        tcg_out_movi (s, TCG_TYPE_I32, TCG_REG_R3, args[0]);
-        tcg_out_b (s, 0, (tcg_target_long) tb_ret_addr);
+        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R3, args[0]);
+        tcg_out_b(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
-
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            s->code_ptr += 16;
-        }
-        else {
+            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
+            s->code_ptr += 4;
+        } else {
             tcg_abort ();
         }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         {
             TCGLabel *l = &s->labels[args[0]];
 
             if (l->has_value) {
-                tcg_out_b (s, 0, l->u.value);
-            }
-            else {
-                uint32_t val = *(uint32_t *) s->code_ptr;
-
+                tcg_out_b(s, 0, l->u.value_ptr);
+            } else {
                 /* Thanks to Andrzej Zaborowski */
-                tcg_out32 (s, B | (val & 0x3fffffc));
-                tcg_out_reloc (s, s->code_ptr - 4, R_PPC_REL24, args[0], 0);
+                tcg_insn_unit retrans = *s->code_ptr & 0x3fffffc;
+                tcg_out_reloc(s, s->code_ptr, R_PPC_REL24, args[0], 0);
+                tcg_out32(s, B | retrans);
             }
         }
         break;
-    case INDEX_op_call:
-        tcg_out_call (s, args[0], const_args[0], LK);
-        break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
     case INDEX_op_ld8u_i32:
         tcg_out_ldst (s, args[0], args[1], args[2], LBZ, LBZX);
         break;
@@ -1839,20 +1816,19 @@
                          const_args[2]);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        tcg_dump_ops (s);
-        tcg_abort ();
+        tcg_abort();
     }
 }
 
 static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
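
The new in_range_b above expresses the reach of the PowerPC I-form branch as "the displacement equals its own 26-bit sign extension". An equivalent stand-alone check, written without the QEMU sextract32 helper so it compiles on its own (sketch only, under the usual assumption that the displacement is also a multiple of 4 when actually encoded):

    /* Sketch only: same acceptance set as in_range_b / sextract32(disp, 0, 26). */
    #include <assert.h>
    #include <stdint.h>

    static int fits_26bit_signed(int32_t disp)
    {
        return disp >= -(1 << 25) && disp < (1 << 25);
    }

    int main(void)
    {
        assert(fits_26bit_signed(0x01fffffc));   /* just under +32 MiB: B reaches it */
        assert(!fits_26bit_signed(0x02000000));  /* out of range: go via CTR instead */
        assert(fits_26bit_signed(-0x02000000));  /* -32 MiB is still representable */
        return 0;
    }
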
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 0d4f595..dd7e557 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -25,6 +25,7 @@
 #define TCG_TARGET_PPC 1
 
 #define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_INSN_UNIT_SIZE 4
 
 typedef enum {
     TCG_REG_R0 = 0,
diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 45b1c06..c90ddcd 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -31,7 +31,7 @@
 #define TCG_CT_CONST_ZERO 0x1000
 #define TCG_CT_CONST_MONE 0x2000
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 #if TARGET_LONG_BITS == 32
 #define LD_ADDR LWZ
@@ -168,61 +168,54 @@
     return target == sextract64(target, 0, 26);
 }
 
-static uint32_t reloc_pc24_val(void *pc, tcg_target_long target)
+static uint32_t reloc_pc24_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    tcg_target_long disp;
-
-    disp = target - (tcg_target_long)pc;
+    ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
     assert(in_range_b(disp));
-
     return disp & 0x3fffffc;
 }
 
-static void reloc_pc24(void *pc, tcg_target_long target)
+static void reloc_pc24(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *)pc = (*(uint32_t *)pc & ~0x3fffffc)
-        | reloc_pc24_val(pc, target);
+    *pc = (*pc & ~0x3fffffc) | reloc_pc24_val(pc, target);
 }
 
-static uint16_t reloc_pc14_val(void *pc, tcg_target_long target)
+static uint16_t reloc_pc14_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    tcg_target_long disp;
-
-    disp = target - (tcg_target_long)pc;
-    if (disp != (int16_t) disp) {
-        tcg_abort();
-    }
-
+    ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
+    assert(disp == (int16_t) disp);
     return disp & 0xfffc;
 }
 
-static void reloc_pc14(void *pc, tcg_target_long target)
+static void reloc_pc14(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
-    *(uint32_t *)pc = (*(uint32_t *)pc & ~0xfffc) | reloc_pc14_val(pc, target);
+    *pc = (*pc & ~0xfffc) | reloc_pc14_val(pc, target);
 }
 
 static inline void tcg_out_b_noaddr(TCGContext *s, int insn)
 {
-    unsigned retrans = *(uint32_t *)s->code_ptr & 0x3fffffc;
+    unsigned retrans = *s->code_ptr & 0x3fffffc;
     tcg_out32(s, insn | retrans);
 }
 
 static inline void tcg_out_bc_noaddr(TCGContext *s, int insn)
 {
-    unsigned retrans = *(uint32_t *)s->code_ptr & 0xfffc;
+    unsigned retrans = *s->code_ptr & 0xfffc;
     tcg_out32(s, insn | retrans);
 }
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    value += addend;
+    tcg_insn_unit *target = (tcg_insn_unit *)value;
+
+    assert(addend == 0);
     switch (type) {
     case R_PPC_REL14:
-        reloc_pc14(code_ptr, value);
+        reloc_pc14(code_ptr, target);
         break;
     case R_PPC_REL24:
-        reloc_pc24(code_ptr, value);
+        reloc_pc24(code_ptr, target);
         break;
     default:
         tcg_abort();
@@ -702,61 +695,48 @@
     tcg_out_zori32(s, dst, src, c, XORI, XORIS);
 }
 
-static void tcg_out_b(TCGContext *s, int mask, tcg_target_long target)
+static void tcg_out_b(TCGContext *s, int mask, tcg_insn_unit *target)
 {
-    tcg_target_long disp;
-
-    disp = target - (tcg_target_long)s->code_ptr;
+    ptrdiff_t disp = tcg_pcrel_diff(s, target);
     if (in_range_b(disp)) {
         tcg_out32(s, B | (disp & 0x3fffffc) | mask);
     } else {
-        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R0, (tcg_target_long)target);
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R0, (uintptr_t)target);
         tcg_out32(s, MTSPR | RS(TCG_REG_R0) | CTR);
         tcg_out32(s, BCCTR | BO_ALWAYS | mask);
     }
 }
 
-static void tcg_out_call(TCGContext *s, tcg_target_long arg, int const_arg)
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 {
 #ifdef __APPLE__
-    if (const_arg) {
-        tcg_out_b(s, LK, arg);
-    } else {
-        tcg_out32(s, MTSPR | RS(arg) | LR);
-        tcg_out32(s, BCLR | BO_ALWAYS | LK);
-    }
+    tcg_out_b(s, LK, target);
 #else
-    TCGReg reg = arg;
-    int ofs = 0;
+    /* Look through the descriptor.  If the branch is in range and the TOC
+       value is cheap to build, branch to the entry point directly.  */
+    void *tgt = ((void **)target)[0];
+    uintptr_t toc = ((uintptr_t *)target)[1];
+    intptr_t diff = tcg_pcrel_diff(s, tgt);
 
-    if (const_arg) {
-        /* Look through the descriptor.  If the branch is in range, and we
-           don't have to spend too much effort on building the toc.  */
-        intptr_t tgt = ((intptr_t *)arg)[0];
-        intptr_t toc = ((intptr_t *)arg)[1];
-        intptr_t diff = tgt - (intptr_t)s->code_ptr;
-
-        if (in_range_b(diff) && toc == (uint32_t)toc) {
-            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R2, toc);
-            tcg_out_b(s, LK, tgt);
-            return;
-        }
-
+    if (in_range_b(diff) && toc == (uint32_t)toc) {
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R2, toc);
+        tcg_out_b(s, LK, tgt);
+    } else {
         /* Fold the low bits of the constant into the addresses below.  */
-        ofs = (int16_t)arg;
+        intptr_t arg = (intptr_t)target;
+        int ofs = (int16_t)arg;
+
         if (ofs + 8 < 0x8000) {
             arg -= ofs;
         } else {
             ofs = 0;
         }
-        reg = TCG_REG_R2;
-        tcg_out_movi(s, TCG_TYPE_I64, reg, arg);
+        tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R2, arg);
+        tcg_out32(s, LD | TAI(TCG_REG_R0, TCG_REG_R2, ofs));
+        tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
+        tcg_out32(s, LD | TAI(TCG_REG_R2, TCG_REG_R2, ofs + 8));
+        tcg_out32(s, BCCTR | BO_ALWAYS | LK);
     }
-
-    tcg_out32(s, LD | TAI(TCG_REG_R0, reg, ofs));
-    tcg_out32(s, MTSPR | RA(TCG_REG_R0) | CTR);
-    tcg_out32(s, LD | TAI(TCG_REG_R2, reg, ofs + 8));
-    tcg_out32(s, BCCTR | BO_ALWAYS | LK);
 #endif
 }
 
@@ -844,7 +824,7 @@
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  *                                 int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[16] = {
     [MO_UB]   = helper_ret_ldub_mmu,
     [MO_LEUW] = helper_le_lduw_mmu,
     [MO_LEUL] = helper_le_ldul_mmu,
@@ -857,7 +837,7 @@
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
  *                                 uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[16] = {
     [MO_UB]   = helper_ret_stb_mmu,
     [MO_LEUW] = helper_le_stw_mmu,
     [MO_LEUL] = helper_le_stl_mmu,
@@ -946,7 +926,7 @@
    helper code.  */
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOp opc,
                                 int data_reg, int addr_reg, int mem_index,
-                                uint8_t *raddr, uint8_t *label_ptr)
+                                tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
     TCGLabelQemuLdst *label = new_ldst_label(s);
 
@@ -963,7 +943,7 @@
 {
     TCGMemOp opc = lb->opc;
 
-    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
+    reloc_pc14(lb->label_ptr[0], s->code_ptr);
 
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_AREG0);
 
@@ -974,7 +954,7 @@
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, lb->mem_index);
     tcg_out32(s, MFSPR | RT(TCG_REG_R6) | LR);
 
-    tcg_out_call(s, (tcg_target_long)qemu_ld_helpers[opc & ~MO_SIGN], 1);
+    tcg_out_call(s, qemu_ld_helpers[opc & ~MO_SIGN]);
 
     if (opc & MO_SIGN) {
         uint32_t insn = qemu_exts_opc[opc & MO_SIZE];
@@ -983,7 +963,7 @@
         tcg_out_mov(s, TCG_TYPE_I64, lb->datalo_reg, TCG_REG_R3);
     }
 
-    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
+    tcg_out_b(s, 0, lb->raddr);
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -991,7 +971,7 @@
     TCGMemOp opc = lb->opc;
     TCGMemOp s_bits = opc & MO_SIZE;
 
-    reloc_pc14(lb->label_ptr[0], (uintptr_t)s->code_ptr);
+    reloc_pc14(lb->label_ptr[0], s->code_ptr);
 
     tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, TCG_AREG0);
 
@@ -1004,9 +984,9 @@
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R6, lb->mem_index);
     tcg_out32(s, MFSPR | RT(TCG_REG_R7) | LR);
 
-    tcg_out_call(s, (tcg_target_long)qemu_st_helpers[opc], 1);
+    tcg_out_call(s, qemu_st_helpers[opc]);
 
-    tcg_out_b(s, 0, (uintptr_t)lb->raddr);
+    tcg_out_b(s, 0, lb->raddr);
 }
 #endif /* SOFTMMU */
 
@@ -1017,7 +997,7 @@
     uint32_t insn;
     TCGMemOp s_bits = opc & MO_SIZE;
 #ifdef CONFIG_SOFTMMU
-    void *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
 #ifdef CONFIG_SOFTMMU
@@ -1063,7 +1043,7 @@
     TCGReg rbase;
     uint32_t insn;
 #ifdef CONFIG_SOFTMMU
-    void *label_ptr;
+    tcg_insn_unit *label_ptr;
 #endif
 
 #ifdef CONFIG_SOFTMMU
@@ -1123,7 +1103,8 @@
 #ifndef __APPLE__
     /* First emit adhoc function descriptor */
     tcg_out64(s, (uint64_t)s->code_ptr + 24); /* entry point */
-    s->code_ptr += 16;          /* skip TOC and environment pointer */
+    tcg_out64(s, 0);                          /* toc */
+    tcg_out64(s, 0);                          /* environment pointer */
 #endif
 
     /* Prologue */
@@ -1415,7 +1396,7 @@
     TCGLabel *l = &s->labels[label_index];
 
     if (l->has_value) {
-        tcg_out32(s, bc | reloc_pc14_val(s->code_ptr, l->u.value));
+        tcg_out32(s, bc | reloc_pc14_val(s->code_ptr, l->u.value_ptr));
     } else {
         tcg_out_reloc(s, s->code_ptr, R_PPC_REL14, label_index, 0);
         tcg_out_bc_noaddr(s, bc);
@@ -1478,15 +1459,13 @@
     }
 }
 
-void ppc_tb_set_jmp_target(unsigned long jmp_addr, unsigned long addr)
+void ppc_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
     TCGContext s;
-    unsigned long patch_size;
 
-    s.code_ptr = (uint8_t *) jmp_addr;
-    tcg_out_b(&s, 0, addr);
-    patch_size = s.code_ptr - (uint8_t *) jmp_addr;
-    flush_icache_range(jmp_addr, jmp_addr + patch_size);
+    s.code_buf = s.code_ptr = (tcg_insn_unit *)jmp_addr;
+    tcg_out_b(&s, 0, (tcg_insn_unit *)addr);
+    flush_icache_range(jmp_addr, jmp_addr + tcg_current_code_size(&s));
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
@@ -1498,40 +1477,31 @@
     switch (opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_R3, args[0]);
-        tcg_out_b(s, 0, (tcg_target_long)tb_ret_addr);
+        tcg_out_b(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* Direct jump method.  */
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
-            s->code_ptr += 28;
+            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
+            s->code_ptr += 7;
         } else {
             /* Indirect jump method.  */
             tcg_abort();
         }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         {
             TCGLabel *l = &s->labels[args[0]];
 
             if (l->has_value) {
-                tcg_out_b(s, 0, l->u.value);
+                tcg_out_b(s, 0, l->u.value_ptr);
             } else {
                 tcg_out_reloc(s, s->code_ptr, R_PPC_REL24, args[0], 0);
                 tcg_out_b_noaddr(s, B);
             }
         }
         break;
-    case INDEX_op_call:
-        tcg_out_call(s, args[0], const_args[0]);
-        break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
-    case INDEX_op_movi_i64:
-        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
-        break;
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
         tcg_out_mem_long(s, LBZ, LBZX, args[0], args[1], args[2]);
@@ -2012,8 +1982,12 @@
         tcg_out32(s, MULHD | TAB(args[0], args[1], args[2]));
         break;
 
+    case INDEX_op_mov_i32:   /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i32:  /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_movi_i64:
+    case INDEX_op_call:      /* Always emitted via tcg_out_call.  */
     default:
-        tcg_dump_ops(s);
         tcg_abort();
     }
 }
@@ -2021,14 +1995,8 @@
 static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_mov_i64, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
-    { INDEX_op_movi_i64, { "r" } },
-
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
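
On the non-Apple path, the ppc64 tcg_out_call above always looks through an AIX/ELFv1-style function descriptor: word 0 holds the real entry point, word 1 the TOC value the callee expects in r2. A small sketch of that indirection with a hand-built descriptor (the struct, stub and values are assumptions for illustration only):

    /* Sketch only: mirrors the descriptor reads done before mtctr/bcctrl above. */
    #include <assert.h>
    #include <stdint.h>

    struct fn_desc {
        void     *entry;   /* real code address */
        uintptr_t toc;     /* value the callee wants in r2 */
        uintptr_t env;     /* environment pointer, unused by TCG */
    };

    int main(void)
    {
        static char code_stub[4];                  /* stand-in for generated code */
        struct fn_desc d = { code_stub, 0x12340000u, 0 };
        uintptr_t *words = (uintptr_t *)&d;

        assert((void *)words[0] == (void *)code_stub);  /* ((void **)target)[0] */
        assert(words[1] == d.toc);                      /* ((uintptr_t *)target)[1] */
        return 0;
    }
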
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 3815b84..29f479a 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -25,6 +25,7 @@
 #define TCG_TARGET_PPC64 1
 
 #define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_INSN_UNIT_SIZE 4
 
 typedef enum {
     TCG_REG_R0 = 0,
diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c
index 1d912a7..ebdd074 100644
--- a/tcg/s390/tcg-target.c
+++ b/tcg/s390/tcg-target.c
@@ -320,7 +320,7 @@
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
    int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
+static void * const qemu_ld_helpers[4] = {
     helper_ldb_mmu,
     helper_ldw_mmu,
     helper_ldl_mmu,
@@ -329,7 +329,7 @@
 
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
    uintxx_t val, int mmu_idx) */
-static const void * const qemu_st_helpers[4] = {
+static void * const qemu_st_helpers[4] = {
     helper_stb_mmu,
     helper_stw_mmu,
     helper_stl_mmu,
@@ -337,7 +337,7 @@
 };
 #endif
 
-static uint8_t *tb_ret_addr;
+static tcg_insn_unit *tb_ret_addr;
 
 /* A list of relevant facilities used by this translator.  Some of these
    are required for proper operation, and these are checked at startup.  */
@@ -350,23 +350,20 @@
 
 static uint64_t facilities;
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    intptr_t code_ptr_tl = (intptr_t)code_ptr;
-    intptr_t pcrel2;
-
-    /* ??? Not the usual definition of "addend".  */
-    pcrel2 = (value - (code_ptr_tl + addend)) >> 1;
+    intptr_t pcrel2 = (tcg_insn_unit *)value - (code_ptr - 1);
+    assert(addend == -2);
 
     switch (type) {
     case R_390_PC16DBL:
         assert(pcrel2 == (int16_t)pcrel2);
-        *(int16_t *)code_ptr = pcrel2;
+        tcg_patch16(code_ptr, pcrel2);
         break;
     case R_390_PC32DBL:
         assert(pcrel2 == (int32_t)pcrel2);
-        *(int32_t *)code_ptr = pcrel2;
+        tcg_patch32(code_ptr, pcrel2);
         break;
     default:
         tcg_abort();
@@ -672,7 +669,7 @@
 
     /* Try for PC-relative address load.  */
     if ((sval & 1) == 0) {
-        intptr_t off = (sval - (intptr_t)s->code_ptr) >> 1;
+        ptrdiff_t off = tcg_pcrel_diff(s, (void *)sval) >> 1;
         if (off == (int32_t)off) {
             tcg_out_insn(s, RIL, LARL, ret, off);
             return;
@@ -789,10 +786,10 @@
 /* load data from an absolute host address */
 static void tcg_out_ld_abs(TCGContext *s, TCGType type, TCGReg dest, void *abs)
 {
-    tcg_target_long addr = (tcg_target_long)abs;
+    intptr_t addr = (intptr_t)abs;
 
-    if (facilities & FACILITY_GEN_INST_EXT) {
-        tcg_target_long disp = (addr - (tcg_target_long)s->code_ptr) >> 1;
+    if ((facilities & FACILITY_GEN_INST_EXT) && !(addr & 1)) {
+        ptrdiff_t disp = tcg_pcrel_diff(s, abs) >> 1;
         if (disp == (int32_t)disp) {
             if (type == TCG_TYPE_I32) {
                 tcg_out_insn(s, RIL, LRL, dest, disp);
@@ -1154,15 +1151,15 @@
     tcg_out_risbg(s, dest, src, msb, lsb, ofs, 0);
 }
 
-static void tgen_gotoi(TCGContext *s, int cc, tcg_target_long dest)
+static void tgen_gotoi(TCGContext *s, int cc, tcg_insn_unit *dest)
 {
-    tcg_target_long off = (dest - (tcg_target_long)s->code_ptr) >> 1;
-    if (off > -0x8000 && off < 0x7fff) {
+    ptrdiff_t off = dest - s->code_ptr;
+    if (off == (int16_t)off) {
         tcg_out_insn(s, RI, BRC, cc, off);
     } else if (off == (int32_t)off) {
         tcg_out_insn(s, RIL, BRCL, cc, off);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, dest);
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, (uintptr_t)dest);
         tcg_out_insn(s, RR, BCR, cc, TCG_TMP0);
     }
 }
@@ -1171,15 +1168,15 @@
 {
     TCGLabel* l = &s->labels[labelno];
     if (l->has_value) {
-        tgen_gotoi(s, cc, l->u.value);
+        tgen_gotoi(s, cc, l->u.value_ptr);
     } else if (USE_LONG_BRANCHES) {
         tcg_out16(s, RIL_BRCL | (cc << 4));
         tcg_out_reloc(s, s->code_ptr, R_390_PC32DBL, labelno, -2);
-        s->code_ptr += 4;
+        s->code_ptr += 2;
     } else {
         tcg_out16(s, RI_BRC | (cc << 4));
         tcg_out_reloc(s, s->code_ptr, R_390_PC16DBL, labelno, -2);
-        s->code_ptr += 2;
+        s->code_ptr += 1;
     }
 }
 
@@ -1187,14 +1184,14 @@
                                 TCGReg r1, TCGReg r2, int labelno)
 {
     TCGLabel* l = &s->labels[labelno];
-    tcg_target_long off;
+    intptr_t off;
 
     if (l->has_value) {
-        off = (l->u.value - (tcg_target_long)s->code_ptr) >> 1;
+        off = l->u.value_ptr - s->code_ptr;
     } else {
         /* We need to keep the offset unchanged for retranslation.  */
-        off = ((int16_t *)s->code_ptr)[1];
-        tcg_out_reloc(s, s->code_ptr + 2, R_390_PC16DBL, labelno, -2);
+        off = s->code_ptr[1];
+        tcg_out_reloc(s, s->code_ptr + 1, R_390_PC16DBL, labelno, -2);
     }
 
     tcg_out16(s, (opc & 0xff00) | (r1 << 4) | r2);
@@ -1209,11 +1206,11 @@
     tcg_target_long off;
 
     if (l->has_value) {
-        off = (l->u.value - (tcg_target_long)s->code_ptr) >> 1;
+        off = l->u.value_ptr - s->code_ptr;
     } else {
         /* We need to keep the offset unchanged for retranslation.  */
-        off = ((int16_t *)s->code_ptr)[1];
-        tcg_out_reloc(s, s->code_ptr + 2, R_390_PC16DBL, labelno, -2);
+        off = s->code_ptr[1];
+        tcg_out_reloc(s, s->code_ptr + 1, R_390_PC16DBL, labelno, -2);
     }
 
     tcg_out16(s, (opc & 0xff00) | (r1 << 4) | cc);
@@ -1272,13 +1269,13 @@
     tgen_branch(s, cc, labelno);
 }
 
-static void tgen_calli(TCGContext *s, tcg_target_long dest)
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
 {
-    tcg_target_long off = (dest - (tcg_target_long)s->code_ptr) >> 1;
+    ptrdiff_t off = dest - s->code_ptr;
     if (off == (int32_t)off) {
         tcg_out_insn(s, RIL, BRASL, TCG_REG_R14, off);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, dest);
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_TMP0, (uintptr_t)dest);
         tcg_out_insn(s, RR, BASR, TCG_REG_R14, TCG_TMP0);
     }
 }
@@ -1395,14 +1392,14 @@
 #if defined(CONFIG_SOFTMMU)
 static TCGReg tcg_prepare_qemu_ldst(TCGContext* s, TCGReg data_reg,
                                     TCGReg addr_reg, int mem_index, int opc,
-                                    uint16_t **label2_ptr_p, int is_store)
+                                    tcg_insn_unit **label2_ptr_p, int is_store)
 {
     const TCGReg arg0 = tcg_target_call_iarg_regs[0];
     const TCGReg arg1 = tcg_target_call_iarg_regs[1];
     const TCGReg arg2 = tcg_target_call_iarg_regs[2];
     const TCGReg arg3 = tcg_target_call_iarg_regs[3];
     int s_bits = opc & 3;
-    uint16_t *label1_ptr;
+    tcg_insn_unit *label1_ptr;
     tcg_target_long ofs;
 
     if (TARGET_LONG_BITS == 32) {
@@ -1436,7 +1433,7 @@
         tcg_out_mov(s, TCG_TYPE_I64, arg1, addr_reg);
     }
 
-    label1_ptr = (uint16_t*)s->code_ptr;
+    label1_ptr = s->code_ptr;
 
     /* je label1 (offset will be patched in later) */
     tcg_out_insn(s, RI, BRC, S390_CC_EQ, 0);
@@ -1463,11 +1460,11 @@
         }
         tcg_out_movi(s, TCG_TYPE_I32, arg3, mem_index);
         tcg_out_mov(s, TCG_TYPE_I64, arg0, TCG_AREG0);
-        tgen_calli(s, (tcg_target_ulong)qemu_st_helpers[s_bits]);
+        tcg_out_call(s, qemu_st_helpers[s_bits]);
     } else {
         tcg_out_movi(s, TCG_TYPE_I32, arg2, mem_index);
         tcg_out_mov(s, TCG_TYPE_I64, arg0, TCG_AREG0);
-        tgen_calli(s, (tcg_target_ulong)qemu_ld_helpers[s_bits]);
+        tcg_out_call(s, qemu_ld_helpers[s_bits]);
 
         /* sign extension */
         switch (opc) {
@@ -1488,13 +1485,12 @@
     }
 
     /* jump to label2 (end) */
-    *label2_ptr_p = (uint16_t*)s->code_ptr;
+    *label2_ptr_p = s->code_ptr;
 
     tcg_out_insn(s, RI, BRC, S390_CC_ALWAYS, 0);
 
     /* this is label1, patch branch */
-    *(label1_ptr + 1) = ((unsigned long)s->code_ptr -
-                         (unsigned long)label1_ptr) >> 1;
+    label1_ptr[1] = s->code_ptr - label1_ptr;
 
     ofs = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     assert(ofs < 0x80000);
@@ -1504,11 +1500,10 @@
     return arg1;
 }
 
-static void tcg_finish_qemu_ldst(TCGContext* s, uint16_t *label2_ptr)
+static void tcg_finish_qemu_ldst(TCGContext* s, tcg_insn_unit *label2_ptr)
 {
     /* patch branch */
-    *(label2_ptr + 1) = ((unsigned long)s->code_ptr -
-                         (unsigned long)label2_ptr) >> 1;
+    label2_ptr[1] = s->code_ptr - label2_ptr;
 }
 #else
 static void tcg_prepare_user_ldst(TCGContext *s, TCGReg *addr_reg,
@@ -1535,7 +1530,7 @@
     TCGReg addr_reg, data_reg;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    uint16_t *label2_ptr;
+    tcg_insn_unit *label2_ptr;
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1564,7 +1559,7 @@
     TCGReg addr_reg, data_reg;
 #if defined(CONFIG_SOFTMMU)
     int mem_index;
-    uint16_t *label2_ptr;
+    tcg_insn_unit *label2_ptr;
 #else
     TCGReg index_reg;
     tcg_target_long disp;
@@ -1602,7 +1597,7 @@
     case INDEX_op_exit_tb:
         /* return value */
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, args[0]);
-        tgen_gotoi(s, S390_CC_ALWAYS, (unsigned long)tb_ret_addr);
+        tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
         break;
 
     case INDEX_op_goto_tb:
@@ -1614,22 +1609,7 @@
             /* and go there */
             tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_TMP0);
         }
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
-        break;
-
-    case INDEX_op_call:
-        if (const_args[0]) {
-            tgen_calli(s, args[0]);
-        } else {
-            tcg_out_insn(s, RR, BASR, TCG_REG_R14, args[0]);
-        }
-        break;
-
-    case INDEX_op_mov_i32:
-        tcg_out_mov(s, TCG_TYPE_I32, args[0], args[1]);
-        break;
-    case INDEX_op_movi_i32:
-        tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
 
     OP_32_64(ld8u):
@@ -1864,13 +1844,6 @@
         tcg_out_qemu_st(s, args, LD_UINT64);
         break;
 
-    case INDEX_op_mov_i64:
-        tcg_out_mov(s, TCG_TYPE_I64, args[0], args[1]);
-        break;
-    case INDEX_op_movi_i64:
-        tcg_out_movi(s, TCG_TYPE_I64, args[0], args[1]);
-        break;
-
     case INDEX_op_ld16s_i64:
         tcg_out_mem(s, 0, RXY_LGH, args[0], args[1], TCG_REG_NONE, args[2]);
         break;
@@ -2075,8 +2048,12 @@
         tgen_deposit(s, args[0], args[2], args[3], args[4]);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_movi_i64:
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        fprintf(stderr,"unimplemented opc 0x%x\n",opc);
         tcg_abort();
     }
 }
@@ -2084,12 +2061,8 @@
 static const TCGTargetOpDef s390_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
-
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
@@ -2147,9 +2120,6 @@
     { INDEX_op_qemu_st32, { "L", "L" } },
     { INDEX_op_qemu_st64, { "L", "L" } },
 
-    { INDEX_op_mov_i64, { "r", "r" } },
-    { INDEX_op_movi_i64, { "r" } },
-
     { INDEX_op_ld8u_i64, { "r", "r" } },
     { INDEX_op_ld8s_i64, { "r", "r" } },
     { INDEX_op_ld16u_i64, { "r", "r" } },
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 755c002..5bf733e 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -24,6 +24,8 @@
 #ifndef TCG_TARGET_S390 
 #define TCG_TARGET_S390 1
 
+#define TCG_TARGET_INSN_UNIT_SIZE 2
+
 typedef enum TCGReg {
     TCG_REG_R0 = 0,
     TCG_REG_R1,
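
On s390 the insn unit is the 2-byte halfword, so the PC16DBL displacement that patch_reloc() writes becomes a plain pointer difference: the relocation points at the displacement field one halfword into the instruction, hence the code_ptr - 1 and the addend of -2 bytes asserted above. A self-contained sketch of that arithmetic (the buffer layout and main() are illustrative assumptions):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint16_t tcg_insn_unit;     /* TCG_TARGET_INSN_UNIT_SIZE == 2 */

    static void patch_pc16dbl(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
    {
        /* code_ptr points at the 16-bit displacement field, one halfword
           past the opcode; the CPU measures from the instruction start. */
        intptr_t pcrel2 = target - (code_ptr - 1);
        assert(pcrel2 == (int16_t)pcrel2);
        *code_ptr = (uint16_t)pcrel2;   /* tcg_patch16() in the real code */
    }

    int main(void)
    {
        tcg_insn_unit buf[8] = { 0 };
        patch_pc16dbl(&buf[1], &buf[5]);    /* branch at buf[0], target buf[5] */
        printf("displacement: %d halfwords\n", (int16_t)buf[1]);   /* 5 */
        return 0;
    }
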
diff --git a/tcg/sparc/tcg-target.c b/tcg/sparc/tcg-target.c
index 5468ff5..17ff577 100644
--- a/tcg/sparc/tcg-target.c
+++ b/tcg/sparc/tcg-target.c
@@ -279,37 +279,32 @@
 # define check_fit_ptr  check_fit_i32
 #endif
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
     uint32_t insn;
-    value += addend;
+
+    assert(addend == 0);
+    value = tcg_ptr_byte_diff((tcg_insn_unit *)value, code_ptr);
+
     switch (type) {
-    case R_SPARC_32:
-        if (value != (uint32_t)value) {
-            tcg_abort();
-        }
-        *(uint32_t *)code_ptr = value;
-        break;
     case R_SPARC_WDISP16:
-        value -= (intptr_t)code_ptr;
         if (!check_fit_ptr(value >> 2, 16)) {
             tcg_abort();
         }
-        insn = *(uint32_t *)code_ptr;
+        insn = *code_ptr;
         insn &= ~INSN_OFF16(-1);
         insn |= INSN_OFF16(value);
-        *(uint32_t *)code_ptr = insn;
+        *code_ptr = insn;
         break;
     case R_SPARC_WDISP19:
-        value -= (intptr_t)code_ptr;
         if (!check_fit_ptr(value >> 2, 19)) {
             tcg_abort();
         }
-        insn = *(uint32_t *)code_ptr;
+        insn = *code_ptr;
         insn &= ~INSN_OFF19(-1);
         insn |= INSN_OFF19(value);
-        *(uint32_t *)code_ptr = insn;
+        *code_ptr = insn;
         break;
     default:
         tcg_abort();
@@ -573,10 +568,10 @@
     int off19;
 
     if (l->has_value) {
-        off19 = INSN_OFF19(l->u.value - (unsigned long)s->code_ptr);
+        off19 = INSN_OFF19(tcg_pcrel_diff(s, l->u.value_ptr));
     } else {
         /* Make sure to preserve destinations during retranslation.  */
-        off19 = *(uint32_t *)s->code_ptr & INSN_OFF19(-1);
+        off19 = *s->code_ptr & INSN_OFF19(-1);
         tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP19, label, 0);
     }
     tcg_out_bpcc0(s, scond, flags, off19);
@@ -620,10 +615,10 @@
         int off16;
 
         if (l->has_value) {
-            off16 = INSN_OFF16(l->u.value - (unsigned long)s->code_ptr);
+            off16 = INSN_OFF16(tcg_pcrel_diff(s, l->u.value_ptr));
         } else {
             /* Make sure to preserve destinations during retranslation.  */
-            off16 = *(uint32_t *)s->code_ptr & INSN_OFF16(-1);
+            off16 = *s->code_ptr & INSN_OFF16(-1);
             tcg_out_reloc(s, s->code_ptr, R_SPARC_WDISP16, label, 0);
         }
         tcg_out32(s, INSN_OP(0) | INSN_OP2(3) | BPR_PT | INSN_RS1(arg1)
@@ -740,62 +735,66 @@
     tcg_out_mov(s, TCG_TYPE_I32, rl, tmp);
 }
 
-static void tcg_out_calli(TCGContext *s, uintptr_t dest)
+static void tcg_out_call_nodelay(TCGContext *s, tcg_insn_unit *dest)
 {
-    intptr_t disp = dest - (uintptr_t)s->code_ptr;
+    ptrdiff_t disp = tcg_pcrel_diff(s, dest);
 
     if (disp == (int32_t)disp) {
         tcg_out32(s, CALL | (uint32_t)disp >> 2);
     } else {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, dest & ~0xfff);
-        tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, dest & 0xfff, JMPL);
+        uintptr_t desti = (uintptr_t)dest;
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_T1, desti & ~0xfff);
+        tcg_out_arithi(s, TCG_REG_O7, TCG_REG_T1, desti & 0xfff, JMPL);
     }
 }
 
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
+{
+    tcg_out_call_nodelay(s, dest);
+    tcg_out_nop(s);
+}
+
 #ifdef CONFIG_SOFTMMU
-static uintptr_t qemu_ld_trampoline[16];
-static uintptr_t qemu_st_trampoline[16];
+static tcg_insn_unit *qemu_ld_trampoline[16];
+static tcg_insn_unit *qemu_st_trampoline[16];
 
 static void build_trampolines(TCGContext *s)
 {
-    static uintptr_t const qemu_ld_helpers[16] = {
-        [MO_UB]   = (uintptr_t)helper_ret_ldub_mmu,
-        [MO_SB]   = (uintptr_t)helper_ret_ldsb_mmu,
-        [MO_LEUW] = (uintptr_t)helper_le_lduw_mmu,
-        [MO_LESW] = (uintptr_t)helper_le_ldsw_mmu,
-        [MO_LEUL] = (uintptr_t)helper_le_ldul_mmu,
-        [MO_LEQ]  = (uintptr_t)helper_le_ldq_mmu,
-        [MO_BEUW] = (uintptr_t)helper_be_lduw_mmu,
-        [MO_BESW] = (uintptr_t)helper_be_ldsw_mmu,
-        [MO_BEUL] = (uintptr_t)helper_be_ldul_mmu,
-        [MO_BEQ]  = (uintptr_t)helper_be_ldq_mmu,
+    static void * const qemu_ld_helpers[16] = {
+        [MO_UB]   = helper_ret_ldub_mmu,
+        [MO_SB]   = helper_ret_ldsb_mmu,
+        [MO_LEUW] = helper_le_lduw_mmu,
+        [MO_LESW] = helper_le_ldsw_mmu,
+        [MO_LEUL] = helper_le_ldul_mmu,
+        [MO_LEQ]  = helper_le_ldq_mmu,
+        [MO_BEUW] = helper_be_lduw_mmu,
+        [MO_BESW] = helper_be_ldsw_mmu,
+        [MO_BEUL] = helper_be_ldul_mmu,
+        [MO_BEQ]  = helper_be_ldq_mmu,
     };
-    static uintptr_t const qemu_st_helpers[16] = {
-        [MO_UB]   = (uintptr_t)helper_ret_stb_mmu,
-        [MO_LEUW] = (uintptr_t)helper_le_stw_mmu,
-        [MO_LEUL] = (uintptr_t)helper_le_stl_mmu,
-        [MO_LEQ]  = (uintptr_t)helper_le_stq_mmu,
-        [MO_BEUW] = (uintptr_t)helper_be_stw_mmu,
-        [MO_BEUL] = (uintptr_t)helper_be_stl_mmu,
-        [MO_BEQ]  = (uintptr_t)helper_be_stq_mmu,
+    static void * const qemu_st_helpers[16] = {
+        [MO_UB]   = helper_ret_stb_mmu,
+        [MO_LEUW] = helper_le_stw_mmu,
+        [MO_LEUL] = helper_le_stl_mmu,
+        [MO_LEQ]  = helper_le_stq_mmu,
+        [MO_BEUW] = helper_be_stw_mmu,
+        [MO_BEUL] = helper_be_stl_mmu,
+        [MO_BEQ]  = helper_be_stq_mmu,
     };
 
     int i;
     TCGReg ra;
-    uintptr_t tramp;
 
     for (i = 0; i < 16; ++i) {
-        if (qemu_ld_helpers[i] == 0) {
+        if (qemu_ld_helpers[i] == NULL) {
             continue;
         }
 
         /* May as well align the trampoline.  */
-        tramp = (uintptr_t)s->code_ptr;
-        while (tramp & 15) {
+        while ((uintptr_t)s->code_ptr & 15) {
             tcg_out_nop(s);
-            tramp += 4;
         }
-        qemu_ld_trampoline[i] = tramp;
+        qemu_ld_trampoline[i] = s->code_ptr;
 
         if (SPARC64 || TARGET_LONG_BITS == 32) {
             ra = TCG_REG_O3;
@@ -810,22 +809,20 @@
         /* Set the env operand.  */
         tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
         /* Tail call.  */
-        tcg_out_calli(s, qemu_ld_helpers[i]);
+        tcg_out_call_nodelay(s, qemu_ld_helpers[i]);
         tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
     }
 
     for (i = 0; i < 16; ++i) {
-        if (qemu_st_helpers[i] == 0) {
+        if (qemu_st_helpers[i] == NULL) {
             continue;
         }
 
         /* May as well align the trampoline.  */
-        tramp = (uintptr_t)s->code_ptr;
-        while (tramp & 15) {
+        while ((uintptr_t)s->code_ptr & 15) {
             tcg_out_nop(s);
-            tramp += 4;
         }
-        qemu_st_trampoline[i] = tramp;
+        qemu_st_trampoline[i] = s->code_ptr;
 
         if (SPARC64) {
             ra = TCG_REG_O4;
@@ -859,7 +856,7 @@
         /* Set the env operand.  */
         tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O0, TCG_AREG0);
         /* Tail call.  */
-        tcg_out_calli(s, qemu_st_helpers[i]);
+        tcg_out_call_nodelay(s, qemu_st_helpers[i]);
         tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O7, ra);
     }
 }
@@ -1005,8 +1002,8 @@
 #ifdef CONFIG_SOFTMMU
     TCGMemOp s_bits = memop & MO_SIZE;
     TCGReg addrz, param;
-    uintptr_t func;
-    uint32_t *label_ptr;
+    tcg_insn_unit *func;
+    tcg_insn_unit *label_ptr;
 
     addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
                              offsetof(CPUTLBEntry, addr_read));
@@ -1016,7 +1013,7 @@
        over the TLB Miss case.  */
 
     /* beq,a,pt %[xi]cc, label0 */
-    label_ptr = (uint32_t *)s->code_ptr;
+    label_ptr = s->code_ptr;
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
@@ -1038,8 +1035,8 @@
     } else {
         func = qemu_ld_trampoline[memop];
     }
-    assert(func != 0);
-    tcg_out_calli(s, func);
+    assert(func != NULL);
+    tcg_out_call_nodelay(s, func);
     /* delay slot */
     tcg_out_movi(s, TCG_TYPE_I32, param, memi);
 
@@ -1067,7 +1064,7 @@
         }
     }
 
-    *label_ptr |= INSN_OFF19((uintptr_t)s->code_ptr - (uintptr_t)label_ptr);
+    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
     if (SPARC64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
@@ -1085,8 +1082,8 @@
 #ifdef CONFIG_SOFTMMU
     TCGMemOp s_bits = memop & MO_SIZE;
     TCGReg addrz, param;
-    uintptr_t func;
-    uint32_t *label_ptr;
+    tcg_insn_unit *func;
+    tcg_insn_unit *label_ptr;
 
     addrz = tcg_out_tlb_load(s, addr, memi, s_bits,
                              offsetof(CPUTLBEntry, addr_write));
@@ -1094,7 +1091,7 @@
     /* The fast path is exactly one insn.  Thus we can perform the entire
        TLB Hit in the (annulled) delay slot of the branch over TLB Miss.  */
     /* beq,a,pt %[xi]cc, label0 */
-    label_ptr = (uint32_t *)s->code_ptr;
+    label_ptr = s->code_ptr;
     tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT
                   | (TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC), 0);
     /* delay slot */
@@ -1115,12 +1112,12 @@
     tcg_out_mov(s, TCG_TYPE_REG, param++, data);
 
     func = qemu_st_trampoline[memop];
-    assert(func != 0);
-    tcg_out_calli(s, func);
+    assert(func != NULL);
+    tcg_out_call_nodelay(s, func);
     /* delay slot */
     tcg_out_movi(s, TCG_TYPE_REG, param, memi);
 
-    *label_ptr |= INSN_OFF19((uintptr_t)s->code_ptr - (uintptr_t)label_ptr);
+    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
 #else
     if (SPARC64 && TARGET_LONG_BITS == 32) {
         tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
@@ -1159,26 +1156,16 @@
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
-            uint32_t old_insn = *(uint32_t *)s->code_ptr;
-            s->tb_jmp_offset[a0] = s->code_ptr - s->code_buf;
+            s->tb_jmp_offset[a0] = tcg_current_code_size(s);
             /* Make sure to preserve links during retranslation.  */
-            tcg_out32(s, CALL | (old_insn & ~INSN_OP(-1)));
+            tcg_out32(s, CALL | (*s->code_ptr & ~INSN_OP(-1)));
         } else {
             /* indirect jump method */
             tcg_out_ld_ptr(s, TCG_REG_T1, (uintptr_t)(s->tb_next + a0));
             tcg_out_arithi(s, TCG_REG_G0, TCG_REG_T1, 0, JMPL);
         }
         tcg_out_nop(s);
-        s->tb_next_offset[a0] = s->code_ptr - s->code_buf;
-        break;
-    case INDEX_op_call:
-        if (const_args[0]) {
-            tcg_out_calli(s, a0);
-        } else {
-            tcg_out_arithi(s, TCG_REG_O7, a0, 0, JMPL);
-        }
-        /* delay slot */
-        tcg_out_nop(s);
+        s->tb_next_offset[a0] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         tcg_out_bpcc(s, COND_A, BPCC_PT, a0);
@@ -1373,13 +1360,12 @@
 	tcg_out_arithc(s, a0, TCG_REG_G0, a1, const_args[1], c);
 	break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_mov_i32:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_movi_i64:
-    case INDEX_op_movi_i32:
-        /* Always implemented with tcg_out_mov/i, never with tcg_out_op.  */
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        /* Opcode not implemented.  */
         tcg_abort();
     }
 }
@@ -1387,11 +1373,8 @@
 static const TCGTargetOpDef sparc_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },
-    { INDEX_op_call, { "ri" } },
     { INDEX_op_br, { } },
 
-    { INDEX_op_mov_i32, { "r", "r" } },
-    { INDEX_op_movi_i32, { "r" } },
     { INDEX_op_ld8u_i32, { "r", "r" } },
     { INDEX_op_ld8s_i32, { "r", "r" } },
     { INDEX_op_ld16u_i32, { "r", "r" } },
@@ -1428,8 +1411,6 @@
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rJ" } },
     { INDEX_op_muls2_i32, { "r", "r", "rZ", "rJ" } },
 
-    { INDEX_op_mov_i64, { "R", "R" } },
-    { INDEX_op_movi_i64, { "R" } },
     { INDEX_op_ld8u_i64, { "R", "r" } },
     { INDEX_op_ld8s_i64, { "R", "r" } },
     { INDEX_op_ld16u_i64, { "R", "r" } },
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 3a903db..473bfc7 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -26,6 +26,7 @@
 
 #define TCG_TARGET_REG_BITS 64
 
+#define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
 
 typedef enum {
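
sparc likewise uses 4-byte units; the byte/word split shows up in tcg_out_call_nodelay() above, where the byte displacement from tcg_pcrel_diff() must fit a signed 32-bit range before being shifted into CALL's 30-bit word-displacement field. A sketch of just that encoding step (INSN_OP/CALL mirror the backend's macros, main() is an illustrative assumption):

    #include <stdint.h>
    #include <stdio.h>

    #define INSN_OP(x)  ((uint32_t)(x) << 30)
    #define CALL        INSN_OP(1)

    static uint32_t encode_call(intptr_t byte_disp)
    {
        if (byte_disp != (int32_t)byte_disp) {
            return 0;                   /* backend falls back to movi + JMPL */
        }
        return CALL | ((uint32_t)byte_disp >> 2);
    }

    int main(void)
    {
        printf("0x%08x\n", encode_call(64));    /* CALL .+64 -> 0x40000010 */
        printf("0x%08x\n", encode_call(-8));    /* backward call, disp -2 words */
        return 0;
    }
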
diff --git a/tcg/tcg-be-ldst.h b/tcg/tcg-be-ldst.h
index ad94c0c..49b3de6 100644
--- a/tcg/tcg-be-ldst.h
+++ b/tcg/tcg-be-ldst.h
@@ -31,8 +31,8 @@
     TCGReg datalo_reg;      /* reg index for low word to be loaded or stored */
     TCGReg datahi_reg;      /* reg index for high word to be loaded or stored */
     int mem_index;          /* soft MMU memory index */
-    uint8_t *raddr;         /* gen code addr of the next IR of qemu_ld/st IR */
-    uint8_t *label_ptr[2];  /* label pointers to be updated */
+    tcg_insn_unit *raddr;   /* gen code addr of the next IR of qemu_ld/st IR */
+    tcg_insn_unit *label_ptr[2]; /* label pointers to be updated */
 } TCGLabelQemuLdst;
 
 typedef struct TCGBackendData {
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 6dbbb38..bdd0139 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -390,11 +390,7 @@
 static inline void tcg_gen_helperN(void *func, int flags, int sizemask,
                                    TCGArg ret, int nargs, TCGArg *args)
 {
-    TCGv_ptr fn;
-    fn = tcg_const_ptr(func);
-    tcg_gen_callN(&tcg_ctx, fn, flags, sizemask, ret,
-                  nargs, args);
-    tcg_temp_free_ptr(fn);
+    tcg_gen_callN(&tcg_ctx, func, flags, sizemask, ret, nargs, args);
 }
 
 /* Note: Both tcg_gen_helper32() and tcg_gen_helper64() are currently
@@ -405,29 +401,23 @@
 static inline void tcg_gen_helper32(void *func, int sizemask, TCGv_i32 ret,
                                     TCGv_i32 a, TCGv_i32 b)
 {
-    TCGv_ptr fn;
     TCGArg args[2];
-    fn = tcg_const_ptr(func);
     args[0] = GET_TCGV_I32(a);
     args[1] = GET_TCGV_I32(b);
-    tcg_gen_callN(&tcg_ctx, fn,
+    tcg_gen_callN(&tcg_ctx, func,
                   TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_SIDE_EFFECTS,
                   sizemask, GET_TCGV_I32(ret), 2, args);
-    tcg_temp_free_ptr(fn);
 }
 
 static inline void tcg_gen_helper64(void *func, int sizemask, TCGv_i64 ret,
                                     TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_ptr fn;
     TCGArg args[2];
-    fn = tcg_const_ptr(func);
     args[0] = GET_TCGV_I64(a);
     args[1] = GET_TCGV_I64(b);
-    tcg_gen_callN(&tcg_ctx, fn,
+    tcg_gen_callN(&tcg_ctx, func,
                   TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_SIDE_EFFECTS,
                   sizemask, GET_TCGV_I64(ret), 2, args);
-    tcg_temp_free_ptr(fn);
 }
 
 /* 32 bit ops */
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index adb6ce1..71ba64a 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -40,7 +40,7 @@
 DEF(set_label, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_NOT_PRESENT)
 
 /* variable number of parameters */
-DEF(call, 0, 1, 2, TCG_OPF_CALL_CLOBBER)
+DEF(call, 0, 0, 3, TCG_OPF_CALL_CLOBBER | TCG_OPF_NOT_PRESENT)
 
 DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 
@@ -51,8 +51,8 @@
 # define IMPL64  TCG_OPF_64BIT
 #endif
 
-DEF(mov_i32, 1, 1, 0, 0)
-DEF(movi_i32, 1, 0, 1, 0)
+DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
+DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(setcond_i32, 1, 2, 1, 0)
 DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
 /* load/store */
@@ -110,8 +110,8 @@
 DEF(nand_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_nand_i32))
 DEF(nor_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_nor_i32))
 
-DEF(mov_i64, 1, 1, 0, IMPL64)
-DEF(movi_i64, 1, 0, 1, IMPL64)
+DEF(mov_i64, 1, 1, 0, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
+DEF(movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
 DEF(setcond_i64, 1, 2, 1, IMPL64)
 DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
 /* load/store */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index e71f7a0..0670aff 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -65,7 +65,7 @@
 /* Forward declarations for functions declared in tcg-target.c and used here. */
 static void tcg_target_init(TCGContext *s);
 static void tcg_target_qemu_prologue(TCGContext *s);
-static void patch_reloc(uint8_t *code_ptr, int type, 
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend);
 
 /* The CIE and FDE header definitions will be common to all hosts.  */
@@ -101,6 +101,7 @@
                        const int *const_args);
 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
                        intptr_t arg2);
+static void tcg_out_call(TCGContext *s, tcg_insn_unit *target);
 static int tcg_target_const_match(tcg_target_long val, TCGType type,
                                   const TCGArgConstraint *arg_ct);
 static void tcg_out_tb_init(TCGContext *s);
@@ -117,35 +118,87 @@
 static TCGRegSet tcg_target_available_regs[2];
 static TCGRegSet tcg_target_call_clobber_regs;
 
+#if TCG_TARGET_INSN_UNIT_SIZE == 1
 static inline void tcg_out8(TCGContext *s, uint8_t v)
 {
     *s->code_ptr++ = v;
 }
 
+static inline void tcg_patch8(tcg_insn_unit *p, uint8_t v)
+{
+    *p = v;
+}
+#endif
+
+#if TCG_TARGET_INSN_UNIT_SIZE <= 2
 static inline void tcg_out16(TCGContext *s, uint16_t v)
 {
-    uint8_t *p = s->code_ptr;
-    *(uint16_t *)p = v;
-    s->code_ptr = p + 2;
+    if (TCG_TARGET_INSN_UNIT_SIZE == 2) {
+        *s->code_ptr++ = v;
+    } else {
+        tcg_insn_unit *p = s->code_ptr;
+        memcpy(p, &v, sizeof(v));
+        s->code_ptr = p + (2 / TCG_TARGET_INSN_UNIT_SIZE);
+    }
 }
 
+static inline void tcg_patch16(tcg_insn_unit *p, uint16_t v)
+{
+    if (TCG_TARGET_INSN_UNIT_SIZE == 2) {
+        *p = v;
+    } else {
+        memcpy(p, &v, sizeof(v));
+    }
+}
+#endif
+
+#if TCG_TARGET_INSN_UNIT_SIZE <= 4
 static inline void tcg_out32(TCGContext *s, uint32_t v)
 {
-    uint8_t *p = s->code_ptr;
-    *(uint32_t *)p = v;
-    s->code_ptr = p + 4;
+    if (TCG_TARGET_INSN_UNIT_SIZE == 4) {
+        *s->code_ptr++ = v;
+    } else {
+        tcg_insn_unit *p = s->code_ptr;
+        memcpy(p, &v, sizeof(v));
+        s->code_ptr = p + (4 / TCG_TARGET_INSN_UNIT_SIZE);
+    }
 }
 
+static inline void tcg_patch32(tcg_insn_unit *p, uint32_t v)
+{
+    if (TCG_TARGET_INSN_UNIT_SIZE == 4) {
+        *p = v;
+    } else {
+        memcpy(p, &v, sizeof(v));
+    }
+}
+#endif
+
+#if TCG_TARGET_INSN_UNIT_SIZE <= 8
 static inline void tcg_out64(TCGContext *s, uint64_t v)
 {
-    uint8_t *p = s->code_ptr;
-    *(uint64_t *)p = v;
-    s->code_ptr = p + 8;
+    if (TCG_TARGET_INSN_UNIT_SIZE == 8) {
+        *s->code_ptr++ = v;
+    } else {
+        tcg_insn_unit *p = s->code_ptr;
+        memcpy(p, &v, sizeof(v));
+        s->code_ptr = p + (8 / TCG_TARGET_INSN_UNIT_SIZE);
+    }
 }
 
+static inline void tcg_patch64(tcg_insn_unit *p, uint64_t v)
+{
+    if (TCG_TARGET_INSN_UNIT_SIZE == 8) {
+        *p = v;
+    } else {
+        memcpy(p, &v, sizeof(v));
+    }
+}
+#endif
+
 /* label relocation processing */
 
-static void tcg_out_reloc(TCGContext *s, uint8_t *code_ptr, int type,
+static void tcg_out_reloc(TCGContext *s, tcg_insn_unit *code_ptr, int type,
                           int label_index, intptr_t addend)
 {
     TCGLabel *l;
@@ -168,23 +221,20 @@
     }
 }
 
-static void tcg_out_label(TCGContext *s, int label_index, void *ptr)
+static void tcg_out_label(TCGContext *s, int label_index, tcg_insn_unit *ptr)
 {
-    TCGLabel *l;
-    TCGRelocation *r;
+    TCGLabel *l = &s->labels[label_index];
     intptr_t value = (intptr_t)ptr;
+    TCGRelocation *r;
 
-    l = &s->labels[label_index];
-    if (l->has_value) {
-        tcg_abort();
-    }
-    r = l->u.first_reloc;
-    while (r != NULL) {
+    assert(!l->has_value);
+
+    for (r = l->u.first_reloc; r != NULL; r = r->next) {
         patch_reloc(r->ptr, r->type, value, r->addend);
-        r = r->next;
     }
+
     l->has_value = 1;
-    l->u.value = value;
+    l->u.value_ptr = ptr;
 }
 
 int gen_new_label(void)
@@ -339,7 +389,7 @@
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
-        size_t size = s->code_ptr - s->code_buf;
+        size_t size = tcg_current_code_size(s);
         qemu_log("PROLOGUE: [size=%zu]\n", size);
         log_disas(s->code_buf, size);
         qemu_log("\n");
@@ -656,7 +706,7 @@
 /* Note: we convert the 64 bit args to 32 bit and do some alignment
    and endian swap. Maybe it would be better to do the alignment
    and endian swap in tcg_reg_alloc_call(). */
-void tcg_gen_callN(TCGContext *s, TCGv_ptr func, unsigned int flags,
+void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
                    int sizemask, TCGArg ret, int nargs, TCGArg *args)
 {
     int i;
@@ -783,11 +833,10 @@
         *s->gen_opparam_ptr++ = args[i];
         real_args++;
     }
-    *s->gen_opparam_ptr++ = GET_TCGV_PTR(func);
-
+    *s->gen_opparam_ptr++ = (uintptr_t)func;
     *s->gen_opparam_ptr++ = flags;
 
-    *nparam = (nb_rets << 16) | (real_args + 1);
+    *nparam = (nb_rets << 16) | real_args;
 
     /* total parameters, needed to go backward in the instruction stream */
     *s->gen_opparam_ptr++ = 1 + nb_rets + real_args + 3;
@@ -1194,49 +1243,21 @@
             nb_iargs = arg & 0xffff;
             nb_cargs = def->nb_cargs;
 
-            qemu_log(" %s ", def->name);
-
-            /* function name */
-            qemu_log("%s",
-                     tcg_get_arg_str_idx(s, buf, sizeof(buf),
-                                         args[nb_oargs + nb_iargs - 1]));
-            /* flags */
-            qemu_log(",$0x%" TCG_PRIlx, args[nb_oargs + nb_iargs]);
-            /* nb out args */
-            qemu_log(",$%d", nb_oargs);
-            for(i = 0; i < nb_oargs; i++) {
-                qemu_log(",");
-                qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
+            /* function name, flags, out args */
+            qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name,
+                     tcg_find_helper(s, args[nb_oargs + nb_iargs]),
+                     args[nb_oargs + nb_iargs + 1], nb_oargs);
+            for (i = 0; i < nb_oargs; i++) {
+                qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
                                                    args[i]));
             }
-            for(i = 0; i < (nb_iargs - 1); i++) {
-                qemu_log(",");
-                if (args[nb_oargs + i] == TCG_CALL_DUMMY_ARG) {
-                    qemu_log("<dummy>");
-                } else {
-                    qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
-                                                       args[nb_oargs + i]));
+            for (i = 0; i < nb_iargs; i++) {
+                TCGArg arg = args[nb_oargs + i];
+                const char *t = "<dummy>";
+                if (arg != TCG_CALL_DUMMY_ARG) {
+                    t = tcg_get_arg_str_idx(s, buf, sizeof(buf), arg);
                 }
-            }
-        } else if (c == INDEX_op_movi_i32 || c == INDEX_op_movi_i64) {
-            tcg_target_ulong val;
-            const char *name;
-
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
-            nb_cargs = def->nb_cargs;
-            qemu_log(" %s %s,$", def->name,
-                     tcg_get_arg_str_idx(s, buf, sizeof(buf), args[0]));
-            val = args[1];
-            name = tcg_find_helper(s, val);
-            if (name) {
-                qemu_log("%s", name);
-            } else {
-                if (c == INDEX_op_movi_i32) {
-                    qemu_log("0x%x", (uint32_t)val);
-                } else {
-                    qemu_log("0x%" PRIx64 , (uint64_t)val);
-                }
+                qemu_log(",%s", t);
             }
         } else {
             qemu_log(" %s ", def->name);
@@ -1499,9 +1520,9 @@
    temporaries are removed. */
 static void tcg_liveness_analysis(TCGContext *s)
 {
-    int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops;
+    int i, op_index, nb_args, nb_iargs, nb_oargs, nb_ops;
     TCGOpcode op, op_new, op_new2;
-    TCGArg *args;
+    TCGArg *args, arg;
     const TCGOpDef *def;
     uint8_t *dead_temps, *mem_temps;
     uint16_t dead_args;
@@ -1531,15 +1552,15 @@
 
                 nb_args = args[-1];
                 args -= nb_args;
-                nb_iargs = args[0] & 0xffff;
-                nb_oargs = args[0] >> 16;
-                args++;
-                call_flags = args[nb_oargs + nb_iargs];
+                arg = *args++;
+                nb_iargs = arg & 0xffff;
+                nb_oargs = arg >> 16;
+                call_flags = args[nb_oargs + nb_iargs + 1];
 
                 /* pure functions can be removed if their result is not
                    used */
                 if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
-                    for(i = 0; i < nb_oargs; i++) {
+                    for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
                         if (!dead_temps[arg] || mem_temps[arg]) {
                             goto do_not_remove_call;
@@ -1553,7 +1574,7 @@
                     /* output args are dead */
                     dead_args = 0;
                     sync_args = 0;
-                    for(i = 0; i < nb_oargs; i++) {
+                    for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
                         if (dead_temps[arg]) {
                             dead_args |= (1 << i);
@@ -1576,7 +1597,7 @@
                     }
 
                     /* input args are live */
-                    for(i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
+                    for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
                             if (dead_temps[arg]) {
@@ -2075,13 +2096,12 @@
 {
     TCGRegSet allocated_regs;
     TCGTemp *ts, *ots;
-    const TCGArgConstraint *arg_ct, *oarg_ct;
+    TCGType type;
 
     tcg_regset_set(allocated_regs, s->reserved_regs);
     ots = &s->temps[args[0]];
     ts = &s->temps[args[1]];
-    oarg_ct = &def->args_ct[0];
-    arg_ct = &def->args_ct[1];
+    type = ots->type;
 
     /* If the source value is not in a register, and we're going to be
        forced to have it in a register in order to perform the copy,
@@ -2089,12 +2109,13 @@
        we don't have to reload SOURCE the next time it is used. */
     if (((NEED_SYNC_ARG(0) || ots->fixed_reg) && ts->val_type != TEMP_VAL_REG)
         || ts->val_type == TEMP_VAL_MEM) {
-        ts->reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs);
+        ts->reg = tcg_reg_alloc(s, tcg_target_available_regs[type],
+                                allocated_regs);
         if (ts->val_type == TEMP_VAL_MEM) {
-            tcg_out_ld(s, ts->type, ts->reg, ts->mem_reg, ts->mem_offset);
+            tcg_out_ld(s, type, ts->reg, ts->mem_reg, ts->mem_offset);
             ts->mem_coherent = 1;
         } else if (ts->val_type == TEMP_VAL_CONST) {
-            tcg_out_movi(s, ts->type, ts->reg, ts->val);
+            tcg_out_movi(s, type, ts->reg, ts->val);
         }
         s->reg_to_temp[ts->reg] = args[1];
         ts->val_type = TEMP_VAL_REG;
@@ -2109,7 +2130,7 @@
         if (!ots->mem_allocated) {
             temp_allocate_frame(s, args[0]);
         }
-        tcg_out_st(s, ots->type, ts->reg, ots->mem_reg, ots->mem_offset);
+        tcg_out_st(s, type, ts->reg, ots->mem_reg, ots->mem_offset);
         if (IS_DEAD_ARG(1)) {
             temp_dead(s, args[1]);
         }
@@ -2137,9 +2158,10 @@
                 /* When allocating a new register, make sure to not spill the
                    input one. */
                 tcg_regset_set_reg(allocated_regs, ts->reg);
-                ots->reg = tcg_reg_alloc(s, oarg_ct->u.regs, allocated_regs);
+                ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[type],
+                                         allocated_regs);
             }
-            tcg_out_mov(s, ots->type, ots->reg, ts->reg);
+            tcg_out_mov(s, type, ots->reg, ts->reg);
         }
         ots->val_type = TEMP_VAL_REG;
         ots->mem_coherent = 0;
@@ -2323,26 +2345,27 @@
                               uint16_t dead_args, uint8_t sync_args)
 {
     int nb_iargs, nb_oargs, flags, nb_regs, i, reg, nb_params;
-    TCGArg arg, func_arg;
+    TCGArg arg;
     TCGTemp *ts;
     intptr_t stack_offset;
     size_t call_stack_size;
-    uintptr_t func_addr;
-    int const_func_arg, allocate_args;
+    tcg_insn_unit *func_addr;
+    int allocate_args;
     TCGRegSet allocated_regs;
-    const TCGArgConstraint *arg_ct;
 
     arg = *args++;
 
     nb_oargs = arg >> 16;
     nb_iargs = arg & 0xffff;
-    nb_params = nb_iargs - 1;
+    nb_params = nb_iargs;
 
-    flags = args[nb_oargs + nb_iargs];
+    func_addr = (tcg_insn_unit *)(intptr_t)args[nb_oargs + nb_iargs];
+    flags = args[nb_oargs + nb_iargs + 1];
 
     nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
-    if (nb_regs > nb_params)
+    if (nb_regs > nb_params) {
         nb_regs = nb_params;
+    }
 
     /* assign stack slots first */
     call_stack_size = (nb_params - nb_regs) * sizeof(tcg_target_long);
@@ -2410,40 +2433,6 @@
         }
     }
     
-    /* assign function address */
-    func_arg = args[nb_oargs + nb_iargs - 1];
-    arg_ct = &def->args_ct[0];
-    ts = &s->temps[func_arg];
-    func_addr = ts->val;
-    const_func_arg = 0;
-    if (ts->val_type == TEMP_VAL_MEM) {
-        reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs);
-        tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset);
-        func_arg = reg;
-        tcg_regset_set_reg(allocated_regs, reg);
-    } else if (ts->val_type == TEMP_VAL_REG) {
-        reg = ts->reg;
-        if (!tcg_regset_test_reg(arg_ct->u.regs, reg)) {
-            reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs);
-            tcg_out_mov(s, ts->type, reg, ts->reg);
-        }
-        func_arg = reg;
-        tcg_regset_set_reg(allocated_regs, reg);
-    } else if (ts->val_type == TEMP_VAL_CONST) {
-        if (tcg_target_const_match(func_addr, ts->type, arg_ct)) {
-            const_func_arg = 1;
-            func_arg = func_addr;
-        } else {
-            reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs);
-            tcg_out_movi(s, ts->type, reg, func_addr);
-            func_arg = reg;
-            tcg_regset_set_reg(allocated_regs, reg);
-        }
-    } else {
-        tcg_abort();
-    }
-        
-    
     /* mark dead temporaries and free the associated registers */
     for(i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
         if (IS_DEAD_ARG(i)) {
@@ -2468,7 +2457,7 @@
         save_globals(s, allocated_regs);
     }
 
-    tcg_out_op(s, opc, &func_arg, &const_func_arg);
+    tcg_out_call(s, func_addr);
 
     /* assign output registers and emit moves if needed */
     for(i = 0; i < nb_oargs; i++) {
@@ -2518,7 +2507,8 @@
 #endif
 
 
-static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
+static inline int tcg_gen_code_common(TCGContext *s,
+                                      tcg_insn_unit *gen_code_buf,
                                       long search_pc)
 {
     TCGOpcode opc;
@@ -2633,7 +2623,7 @@
         }
         args += def->nb_args;
     next:
-        if (search_pc >= 0 && search_pc < s->code_ptr - gen_code_buf) {
+        if (search_pc >= 0 && search_pc < tcg_current_code_size(s)) {
             return op_index;
         }
         op_index++;
@@ -2647,7 +2637,7 @@
     return -1;
 }
 
-int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf)
+int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf)
 {
 #ifdef CONFIG_PROFILER
     {
@@ -2666,16 +2656,17 @@
     tcg_gen_code_common(s, gen_code_buf, -1);
 
     /* flush instruction cache */
-    flush_icache_range((uintptr_t)gen_code_buf, (uintptr_t)s->code_ptr);
+    flush_icache_range((uintptr_t)s->code_buf, (uintptr_t)s->code_ptr);
 
-    return s->code_ptr -  gen_code_buf;
+    return tcg_current_code_size(s);
 }
 
 /* Return the index of the micro operation such as the pc after is <
    offset bytes from the start of the TB.  The contents of gen_code_buf must
    not be changed, though writing the same values is ok.
    Return -1 if not found. */
-int tcg_gen_code_search_pc(TCGContext *s, uint8_t *gen_code_buf, long offset)
+int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
+                           long offset)
 {
     return tcg_gen_code_common(s, gen_code_buf, offset);
 }
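
After this change the call op no longer carries its target as a constant TCG temp: tcg_gen_callN() stores the raw function pointer and the flags word directly in the operand stream, and both the liveness pass and tcg_reg_alloc_call() index them off the packed output/input counts. A toy decoder for that layout (the TCGArg stand-in, the operand array and the 0xdeadbeef address are illustrative assumptions):

    #include <stdint.h>
    #include <stdio.h>

    typedef uintptr_t TCGArg;

    static void decode_call(const TCGArg *args)
    {
        TCGArg packed = args[0];        /* (nb_rets << 16) | real_args */
        int nb_oargs  = packed >> 16;
        int nb_iargs  = packed & 0xffff;
        TCGArg func   = args[1 + nb_oargs + nb_iargs];      /* raw pointer */
        TCGArg flags  = args[1 + nb_oargs + nb_iargs + 1];

        printf("oargs=%d iargs=%d func=%#lx flags=%#lx\n",
               nb_oargs, nb_iargs, (unsigned long)func, (unsigned long)flags);
    }

    int main(void)
    {
        /* 1 output, 2 inputs, hypothetical helper address, no flags,
           trailing word = 1 + nb_rets + real_args + 3 for backward walks. */
        TCGArg op[] = { (1u << 16) | 2, 7, 3, 4, 0xdeadbeef, 0, 7 };
        decode_call(op);
        return 0;
    }
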
diff --git a/tcg/tcg.h b/tcg/tcg.h
index a6a2d06..fbc9310 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -146,10 +146,25 @@
 #define tcg_regset_andnot(d, a, b) (d) = (a) & ~(b)
 #define tcg_regset_not(d, a) (d) = ~(a)
 
+#ifndef TCG_TARGET_INSN_UNIT_SIZE
+# error "Missing TCG_TARGET_INSN_UNIT_SIZE"
+#elif TCG_TARGET_INSN_UNIT_SIZE == 1
+typedef uint8_t tcg_insn_unit;
+#elif TCG_TARGET_INSN_UNIT_SIZE == 2
+typedef uint16_t tcg_insn_unit;
+#elif TCG_TARGET_INSN_UNIT_SIZE == 4
+typedef uint32_t tcg_insn_unit;
+#elif TCG_TARGET_INSN_UNIT_SIZE == 8
+typedef uint64_t tcg_insn_unit;
+#else
+/* The port better have done this.  */
+#endif
+
+
 typedef struct TCGRelocation {
     struct TCGRelocation *next;
     int type;
-    uint8_t *ptr;
+    tcg_insn_unit *ptr;
     intptr_t addend;
 } TCGRelocation; 
 
@@ -157,6 +172,7 @@
     int has_value;
     union {
         uintptr_t value;
+        tcg_insn_unit *value_ptr;
         TCGRelocation *first_reloc;
     } u;
 } TCGLabel;
@@ -464,7 +480,7 @@
     int nb_temps;
 
     /* goto_tb support */
-    uint8_t *code_buf;
+    tcg_insn_unit *code_buf;
     uintptr_t *tb_next;
     uint16_t *tb_next_offset;
     uint16_t *tb_jmp_offset; /* != NULL if USE_DIRECT_JUMP */
@@ -485,7 +501,7 @@
     intptr_t frame_end;
     int frame_reg;
 
-    uint8_t *code_ptr;
+    tcg_insn_unit *code_ptr;
     TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
     TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
 
@@ -524,14 +540,17 @@
     uint16_t gen_opc_icount[OPC_BUF_SIZE];
     uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
 
-    /* Code generation */
+    /* Code generation.  Note that we specifically do not use tcg_insn_unit
+       here, because there's too much arithmetic throughout that relies
+       on addition and subtraction working on bytes.  Rely on the GCC
+       extension that allows arithmetic on void*.  */
     int code_gen_max_blocks;
-    uint8_t *code_gen_prologue;
-    uint8_t *code_gen_buffer;
+    void *code_gen_prologue;
+    void *code_gen_buffer;
     size_t code_gen_buffer_size;
     /* threshold to flush the translated code buffer */
     size_t code_gen_buffer_max_size;
-    uint8_t *code_gen_ptr;
+    void *code_gen_ptr;
 
     TBContext tb_ctx;
 
@@ -566,8 +585,9 @@
 void tcg_prologue_init(TCGContext *s);
 void tcg_func_start(TCGContext *s);
 
-int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf);
-int tcg_gen_code_search_pc(TCGContext *s, uint8_t *gen_code_buf, long offset);
+int tcg_gen_code(TCGContext *s, tcg_insn_unit *gen_code_buf);
+int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
+                           long offset);
 
 void tcg_set_frame(TCGContext *s, int reg, intptr_t start, intptr_t size);
 
@@ -705,7 +725,7 @@
 #define tcg_temp_free_ptr(T) tcg_temp_free_i64(TCGV_PTR_TO_NAT(T))
 #endif
 
-void tcg_gen_callN(TCGContext *s, TCGv_ptr func, unsigned int flags,
+void tcg_gen_callN(TCGContext *s, void *func, unsigned int flags,
                    int sizemask, TCGArg ret, int nargs, TCGArg *args);
 
 void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
@@ -724,6 +744,51 @@
 TCGv_i64 tcg_const_local_i64(int64_t val);
 
 /**
+ * tcg_ptr_byte_diff
+ * @a, @b: addresses to be differenced
+ *
+ * There are many places within the TCG backends where we need a byte
+ * difference between two pointers.  While this can be accomplished
+ * with local casting, it's easy to get wrong -- especially if one is
+ * concerned with the signedness of the result.
+ *
+ * This version relies on GCC's void pointer arithmetic to get the
+ * correct result.
+ */
+
+static inline ptrdiff_t tcg_ptr_byte_diff(void *a, void *b)
+{
+    return a - b;
+}
+
+/**
+ * tcg_pcrel_diff
+ * @s: the tcg context
+ * @target: address of the target
+ *
+ * Produce a pc-relative difference, from the current code_ptr
+ * to the destination address.
+ */
+
+static inline ptrdiff_t tcg_pcrel_diff(TCGContext *s, void *target)
+{
+    return tcg_ptr_byte_diff(target, s->code_ptr);
+}
+
+/**
+ * tcg_current_code_size
+ * @s: the tcg context
+ *
+ * Compute the current code size within the translation block.
+ * This is used to fill in qemu's data structures for goto_tb.
+ */
+
+static inline size_t tcg_current_code_size(TCGContext *s)
+{
+    return tcg_ptr_byte_diff(s->code_ptr, s->code_buf);
+}
+
+/**
  * tcg_qemu_tb_exec:
  * @env: CPUArchState * for the CPU
  * @tb_ptr: address of generated code for the TB to execute
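
The three helpers above are the reason code_gen_prologue/buffer/ptr stay as void*: byte arithmetic lives in one place while everything else counts insn units. A standalone sketch with portable casts in place of the GCC void*-arithmetic extension the header relies on (MiniCtx and main() are illustrative assumptions):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint16_t tcg_insn_unit;     /* assume a 2-byte unit, as on s390 */

    typedef struct {
        tcg_insn_unit *code_buf;
        tcg_insn_unit *code_ptr;
    } MiniCtx;

    static ptrdiff_t ptr_byte_diff(void *a, void *b)
    {
        return (char *)a - (char *)b;   /* the real helper subtracts void* */
    }

    static ptrdiff_t pcrel_diff(MiniCtx *s, void *target)
    {
        return ptr_byte_diff(target, s->code_ptr);
    }

    static size_t current_code_size(MiniCtx *s)
    {
        return ptr_byte_diff(s->code_ptr, s->code_buf);
    }

    int main(void)
    {
        tcg_insn_unit buf[16];
        MiniCtx s = { buf, buf + 3 };                  /* 3 halfwords emitted */
        printf("size  = %zu bytes\n", current_code_size(&s));      /* 6 */
        printf("pcrel = %td bytes\n", pcrel_diff(&s, &buf[8]));    /* 10 */
        return 0;
    }
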
diff --git a/tcg/tci/tcg-target.c b/tcg/tci/tcg-target.c
index 47c0b85..9b39231 100644
--- a/tcg/tci/tcg-target.c
+++ b/tcg/tci/tcg-target.c
@@ -59,12 +59,8 @@
 static const TCGTargetOpDef tcg_target_op_defs[] = {
     { INDEX_op_exit_tb, { NULL } },
     { INDEX_op_goto_tb, { NULL } },
-    { INDEX_op_call, { RI } },
     { INDEX_op_br, { NULL } },
 
-    { INDEX_op_mov_i32, { R, R } },
-    { INDEX_op_movi_i32, { R } },
-
     { INDEX_op_ld8u_i32, { R, R } },
     { INDEX_op_ld8s_i32, { R, R } },
     { INDEX_op_ld16u_i32, { R, R } },
@@ -141,9 +137,6 @@
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_mov_i64, { R, R } },
-    { INDEX_op_movi_i64, { R } },
-
     { INDEX_op_ld8u_i64, { R, R } },
     { INDEX_op_ld8s_i64, { R, R } },
     { INDEX_op_ld16u_i64, { R, R } },
@@ -371,14 +364,18 @@
 };
 #endif
 
-static void patch_reloc(uint8_t *code_ptr, int type,
+static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
     /* tcg_out_reloc always uses the same type, addend. */
     assert(type == sizeof(tcg_target_long));
     assert(addend == 0);
     assert(value != 0);
-    *(tcg_target_long *)code_ptr = value;
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_patch32(code_ptr, value);
+    } else {
+        tcg_patch64(code_ptr, value);
+    }
 }
 
 /* Parse target specific constraints. */
@@ -413,8 +410,11 @@
 /* Write value (native size). */
 static void tcg_out_i(TCGContext *s, tcg_target_ulong v)
 {
-    *(tcg_target_ulong *)s->code_ptr = v;
-    s->code_ptr += sizeof(tcg_target_ulong);
+    if (TCG_TARGET_REG_BITS == 32) {
+        tcg_out32(s, v);
+    } else {
+        tcg_out64(s, v);
+    }
 }
 
 /* Write opcode. */
@@ -542,6 +542,11 @@
     old_code_ptr[1] = s->code_ptr - old_code_ptr;
 }
 
+static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
+{
+    tcg_out_ri(s, 1, (uintptr_t)arg);
+}
+
 static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                        const int *const_args)
 {
@@ -557,21 +562,18 @@
         if (s->tb_jmp_offset) {
             /* Direct jump method. */
             assert(args[0] < ARRAY_SIZE(s->tb_jmp_offset));
-            s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
+            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
             tcg_out32(s, 0);
         } else {
             /* Indirect jump method. */
             TODO();
         }
         assert(args[0] < ARRAY_SIZE(s->tb_next_offset));
-        s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
+        s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
         tci_out_label(s, args[0]);
         break;
-    case INDEX_op_call:
-        tcg_out_ri(s, const_args[0], args[0]);
-        break;
     case INDEX_op_setcond_i32:
         tcg_out_r(s, args[0]);
         tcg_out_r(s, args[1]);
@@ -596,9 +598,6 @@
         tcg_out8(s, args[3]);   /* condition */
         break;
 #endif
-    case INDEX_op_movi_i32:
-        TODO(); /* Handled by tcg_out_movi? */
-        break;
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
     case INDEX_op_ld16u_i32:
@@ -654,10 +653,6 @@
         break;
 
 #if TCG_TARGET_REG_BITS == 64
-    case INDEX_op_mov_i64:
-    case INDEX_op_movi_i64:
-        TODO();
-        break;
     case INDEX_op_add_i64:
     case INDEX_op_sub_i64:
     case INDEX_op_mul_i64:
@@ -825,11 +820,12 @@
         tcg_out_i(s, *args);
 #endif
         break;
-    case INDEX_op_end:
-        TODO();
-        break;
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
+    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_movi_i64:
+    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        fprintf(stderr, "Missing: %s\n", tcg_op_defs[opc].name);
         tcg_abort();
     }
     old_code_ptr[1] = s->code_ptr - old_code_ptr;
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 37719e8..0be5acd 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -43,6 +43,7 @@
 #include "config-host.h"
 
 #define TCG_TARGET_INTERPRETER 1
+#define TCG_TARGET_INSN_UNIT_SIZE 1
 
 #if UINTPTR_MAX == UINT32_MAX
 # define TCG_TARGET_REG_BITS 32
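
The interpreter keeps a plain byte stream (unit size 1), so every multi-byte emit or patch takes the memcpy branch of the helpers added in tcg.c above, which also keeps the unaligned stores well-defined. A minimal sketch of the 32-bit case (the buffer and main() are illustrative assumptions; byte order follows the host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint8_t tcg_insn_unit;      /* TCG_TARGET_INSN_UNIT_SIZE == 1 */

    static void patch32(tcg_insn_unit *p, uint32_t v)
    {
        memcpy(p, &v, sizeof(v));       /* unit size != 4, so no direct store */
    }

    int main(void)
    {
        tcg_insn_unit code[8] = { 0 };
        patch32(&code[1], 0xaabbccdd);  /* deliberately unaligned */
        for (int i = 0; i < 8; i++) {
            printf("%02x ", code[i]);
        }
        printf("\n");
        return 0;
    }
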
diff --git a/tests/qemu-iotests/031 b/tests/qemu-iotests/031
index 5aefb88..1d920ea 100755
--- a/tests/qemu-iotests/031
+++ b/tests/qemu-iotests/031
@@ -35,7 +35,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 . ./common.pattern
@@ -57,22 +56,22 @@
     echo === Create image with unknown header extension ===
     echo
     _make_test_img 64M
-    $PYTHON qcow2.py "$TEST_IMG" add-header-ext 0x12345678 "This is a test header extension"
-    $PYTHON qcow2.py "$TEST_IMG" dump-header
+    ./qcow2.py "$TEST_IMG" add-header-ext 0x12345678 "This is a test header extension"
+    ./qcow2.py "$TEST_IMG" dump-header
     _check_test_img
 
     echo
     echo === Rewrite header with no backing file ===
     echo
     $QEMU_IMG rebase -u -b "" "$TEST_IMG"
-    $PYTHON qcow2.py "$TEST_IMG" dump-header
+    ./qcow2.py "$TEST_IMG" dump-header
     _check_test_img
 
     echo
     echo === Add a backing file and format ===
     echo
     $QEMU_IMG rebase -u -b "/some/backing/file/path" -F host_device "$TEST_IMG"
-    $PYTHON qcow2.py "$TEST_IMG" dump-header
+    ./qcow2.py "$TEST_IMG" dump-header
 done
 
 # success, all done
diff --git a/tests/qemu-iotests/036 b/tests/qemu-iotests/036
index 29c35d1..03b6aa9 100755
--- a/tests/qemu-iotests/036
+++ b/tests/qemu-iotests/036
@@ -38,7 +38,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 . ./common.pattern
@@ -54,15 +53,15 @@
 echo === Create image with unknown autoclear feature bit ===
 echo
 _make_test_img 64M
-$PYTHON qcow2.py "$TEST_IMG" set-feature-bit autoclear 63
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" set-feature-bit autoclear 63
+./qcow2.py "$TEST_IMG" dump-header
 
 echo
 echo === Repair image ===
 echo
 _check_test_img -r all
 
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 
 # success, all done
 echo "*** done"
diff --git a/tests/qemu-iotests/039 b/tests/qemu-iotests/039
index b7b7030..b9cbe99 100755
--- a/tests/qemu-iotests/039
+++ b/tests/qemu-iotests/039
@@ -38,7 +38,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 
@@ -59,7 +58,7 @@
 $QEMU_IO -c "write -P 0x5a 0 512" "$TEST_IMG" | _filter_qemu_io
 
 # The dirty bit must not be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 _check_test_img
 
 echo
@@ -74,7 +73,7 @@
 ulimit -c "$old_ulimit"
 
 # The dirty bit must be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 _check_test_img
 
 echo
@@ -83,7 +82,7 @@
 $QEMU_IO -r -c "read -P 0x5a 0 512" "$TEST_IMG" | _filter_qemu_io
 
 # The dirty bit must be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 echo
 echo "== Repairing the image file must succeed =="
@@ -91,7 +90,7 @@
 _check_test_img -r all
 
 # The dirty bit must not be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 echo
 echo "== Data should still be accessible after repair =="
@@ -110,12 +109,12 @@
 ulimit -c "$old_ulimit"
 
 # The dirty bit must be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 $QEMU_IO -c "write 0 512" "$TEST_IMG" | _filter_qemu_io
 
 # The dirty bit must not be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 echo
 echo "== Creating an image file with lazy_refcounts=off =="
@@ -129,7 +128,7 @@
 ulimit -c "$old_ulimit"
 
 # The dirty bit must not be set since lazy_refcounts=off
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 _check_test_img
 
 echo
@@ -145,8 +144,8 @@
 $QEMU_IMG commit "$TEST_IMG"
 
 # The dirty bit must not be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
-$PYTHON qcow2.py "$TEST_IMG".base dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG".base dump-header | grep incompatible_features
 
 _check_test_img
 TEST_IMG="$TEST_IMG".base _check_test_img
diff --git a/tests/qemu-iotests/054 b/tests/qemu-iotests/054
index a5ebf99..c8b7082 100755
--- a/tests/qemu-iotests/054
+++ b/tests/qemu-iotests/054
@@ -35,7 +35,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 
@@ -50,7 +49,7 @@
 echo
 echo "creating too large image (1 EB) using qcow2.py"
 _make_test_img 4G
-$PYTHON qcow2.py "$TEST_IMG" set-header size $((1024 ** 6))
+./qcow2.py "$TEST_IMG" set-header size $((1024 ** 6))
 _check_test_img
 
 # success, all done
diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
index 5447b27..f0116aa 100755
--- a/tests/qemu-iotests/060
+++ b/tests/qemu-iotests/060
@@ -35,7 +35,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 
@@ -69,13 +68,13 @@
 _check_test_img
 
 # The corrupt bit should not be set anyway
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 # Try to write something, thereby forcing the corrupt bit to be set
 $QEMU_IO -c "$OPEN_RW" -c "write -P 0x2a 0 512" | _filter_qemu_io
 
 # The corrupt bit must now be set
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 # Try to open the image R/W (which should fail)
 $QEMU_IO -c "$OPEN_RW" -c "read 0 512" 2>&1 | _filter_qemu_io \
@@ -100,19 +99,19 @@
 # Redirect new data cluster onto refcount block
 poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x02\x00\x00"
 _check_test_img
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 $QEMU_IO -c "$OPEN_RW" -c "write -P 0x2a 0 512" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 # Try to fix it
 _check_test_img -r all
 
 # The corrupt bit should be cleared
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 # Look if it's really really fixed
 $QEMU_IO -c "$OPEN_RW" -c "write -P 0x2a 0 512" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 echo
 echo "=== Testing cluster data reference into inactive L2 table ==="
@@ -125,13 +124,13 @@
 poke_file "$TEST_IMG" "$l2_offset_after_snapshot" \
                       "\x80\x00\x00\x00\x00\x04\x00\x00"
 _check_test_img
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 $QEMU_IO -c "$OPEN_RW" -c "write -P 3 0 512" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 _check_test_img -r all
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 $QEMU_IO -c "$OPEN_RW" -c "write -P 4 0 512" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
 
 # Check data
 $QEMU_IO -c "$OPEN_RO" -c "read -P 4 0 512" | _filter_qemu_io
diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061
index 0de7897..d3a6b38 100755
--- a/tests/qemu-iotests/061
+++ b/tests/qemu-iotests/061
@@ -35,7 +35,6 @@
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 
@@ -49,9 +48,9 @@
 echo
 IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
 $QEMU_IO -c "write -z 0 128k" "$TEST_IMG" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IMG amend -o "compat=0.10" "$TEST_IMG"
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io
 _check_test_img
 
@@ -60,9 +59,9 @@
 echo
 IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
 $QEMU_IO -c "write -P 0x2a 0 128k" -c flush -c abort "$TEST_IMG" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IMG amend -o "compat=0.10" "$TEST_IMG"
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IO -c "read -P 0x2a 0 128k" "$TEST_IMG" | _filter_qemu_io
 _check_test_img
 
@@ -70,11 +69,11 @@
 echo "=== Testing version downgrade with unknown compat/autoclear flags ==="
 echo
 IMGOPTS="compat=1.1" _make_test_img 64M
-$PYTHON qcow2.py "$TEST_IMG" set-feature-bit compatible 42
-$PYTHON qcow2.py "$TEST_IMG" set-feature-bit autoclear 42
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" set-feature-bit compatible 42
+./qcow2.py "$TEST_IMG" set-feature-bit autoclear 42
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IMG amend -o "compat=0.10" "$TEST_IMG"
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 _check_test_img
 
 echo
@@ -82,9 +81,9 @@
 echo
 IMGOPTS="compat=0.10" _make_test_img 64M
 $QEMU_IO -c "write -P 0x2a 42M 64k" "$TEST_IMG" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IMG amend -o "compat=1.1,lazy_refcounts=on,size=128M" "$TEST_IMG"
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IO -c "read -P 0x2a 42M 64k" "$TEST_IMG" | _filter_qemu_io
 _check_test_img
 
@@ -93,9 +92,9 @@
 echo
 IMGOPTS="compat=1.1,lazy_refcounts=on" _make_test_img 64M
 $QEMU_IO -c "write -P 0x2a 0 128k" -c flush -c abort "$TEST_IMG" | _filter_qemu_io
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IMG amend -o "lazy_refcounts=off" "$TEST_IMG"
-$PYTHON qcow2.py "$TEST_IMG" dump-header
+./qcow2.py "$TEST_IMG" dump-header
 $QEMU_IO -c "read -P 0x2a 0 128k" "$TEST_IMG" | _filter_qemu_io
 _check_test_img
 
diff --git a/tests/qemu-iotests/065 b/tests/qemu-iotests/065
index e89b61d..ab5445f 100755
--- a/tests/qemu-iotests/065
+++ b/tests/qemu-iotests/065
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python2
 #
 # Test for additional information emitted by qemu-img info on qcow2
 # images
diff --git a/tests/qemu-iotests/083 b/tests/qemu-iotests/083
index 6a52c96..f764534 100755
--- a/tests/qemu-iotests/083
+++ b/tests/qemu-iotests/083
@@ -29,7 +29,6 @@
 status=1	# failure is the default!
 
 # get standard environment, filters and checks
-. ./common.env
 . ./common.rc
 . ./common.filter
 
@@ -82,7 +81,7 @@
 		nbd_url="nbd:127.0.0.1:$port:exportname=foo"
 	fi
 
-	$PYTHON nbd-fault-injector.py $extra_args "127.0.0.1:$port" "$TEST_DIR/nbd-fault-injector.conf" 2>&1 >/dev/null &
+	./nbd-fault-injector.py $extra_args "127.0.0.1:$port" "$TEST_DIR/nbd-fault-injector.conf" 2>&1 >/dev/null &
 	wait_for_tcp_port "127.0.0.1:$port"
 	$QEMU_IO -c "read 0 512" "$nbd_url" 2>&1 | _filter_qemu_io | filter_nbd
 
diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index ca2ee43..e2ed5a9 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -34,13 +34,6 @@
 # generic initialization
 iam=check
 
-# we need common.env
-if ! . ./common.env
-then
-    echo "$iam: failed to source common.env"
-    exit 1
-fi
-
 # we need common.config
 if ! . ./common.config
 then
@@ -222,16 +215,9 @@
 
         start=`_wallclock`
         $timestamp && echo -n "        ["`date "+%T"`"]"
-
-        if [ "$(head -n 1 $seq)" == "#!/usr/bin/env python" ]; then
-            run_command="$PYTHON $seq"
-        else
-            [ ! -x $seq ] && chmod u+x $seq # ensure we can run it
-            run_command="./$seq"
-        fi
-
+        [ ! -x $seq ] && chmod u+x $seq # ensure we can run it
         MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(($RANDOM % 255 + 1))} \
-                $run_command >$tmp.out 2>&1
+                ./$seq >$tmp.out 2>&1
         sts=$?
         $timestamp && _timestamp
         stop=`_wallclock`
diff --git a/translate-all.c b/translate-all.c
index 5759974..5549a85 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -143,7 +143,7 @@
 int cpu_gen_code(CPUArchState *env, TranslationBlock *tb, int *gen_code_size_ptr)
 {
     TCGContext *s = &tcg_ctx;
-    uint8_t *gen_code_buf;
+    tcg_insn_unit *gen_code_buf;
     int gen_code_size;
 #ifdef CONFIG_PROFILER
     int64_t ti;
@@ -186,8 +186,8 @@
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
-        qemu_log("OUT: [size=%d]\n", *gen_code_size_ptr);
-        log_disas(tb->tc_ptr, *gen_code_size_ptr);
+        qemu_log("OUT: [size=%d]\n", gen_code_size);
+        log_disas(tb->tc_ptr, gen_code_size);
         qemu_log("\n");
         qemu_log_flush();
     }
@@ -235,7 +235,8 @@
     s->tb_jmp_offset = NULL;
     s->tb_next = tb->tb_next;
 #endif
-    j = tcg_gen_code_search_pc(s, (uint8_t *)tc_ptr, searched_pc - tc_ptr);
+    j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
+                               searched_pc - tc_ptr);
     if (j < 0)
         return -1;
     /* now find start of instruction before */
@@ -944,7 +945,6 @@
 {
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb;
-    uint8_t *tc_ptr;
     tb_page_addr_t phys_pc, phys_page2;
     target_ulong virt_page2;
     int code_gen_size;
@@ -959,8 +959,7 @@
         /* Don't forget to invalidate previous TB info.  */
         tcg_ctx.tb_ctx.tb_invalidated_flag = 1;
     }
-    tc_ptr = tcg_ctx.code_gen_ptr;
-    tb->tc_ptr = tc_ptr;
+    tb->tc_ptr = tcg_ctx.code_gen_ptr;
     tb->cs_base = cs_base;
     tb->flags = flags;
     tb->cflags = cflags;
diff --git a/ui/Makefile.objs b/ui/Makefile.objs
index 6f2294e..4af420b 100644
--- a/ui/Makefile.objs
+++ b/ui/Makefile.objs
@@ -17,4 +17,4 @@
 
 $(obj)/sdl.o $(obj)/sdl_zoom.o $(obj)/sdl2.o: QEMU_CFLAGS += $(SDL_CFLAGS)
 
-$(obj)/gtk.o: QEMU_CFLAGS += $(GTK_CFLAGS) $(VTE_CFLAGS)
+gtk.o-cflags := $(GTK_CFLAGS) $(VTE_CFLAGS)