optimize: optimize using nonzero bits

This adds two optimizations using the non-zero bit mask.  First, in some
cases involving shifts or ANDs the result is known to be zero, and the
operation can thus be replaced by a move of zero.  Second, a
zero-extension or an AND with a constant is useless if it would only
zero bits that are already zero, and this can now be detected.

The main advantage of this optimization is that it turns zero-extensions
into moves, thus enabling much better copy propagation (around 1% code
reduction).  For example, here is a "test $0xff0000,%ecx + je" before
optimization:

 mov_i64 tmp0,rcx
 movi_i64 tmp1,$0xff0000
 discard cc_src
 and_i64 cc_dst,tmp0,tmp1
 movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0

and after (without patch on the left, with on the right):

 movi_i64 tmp1,$0xff0000                 movi_i64 tmp1,$0xff0000
 discard cc_src                          discard cc_src
 and_i64 cc_dst,rcx,tmp1                 and_i64 cc_dst,rcx,tmp1
 movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0

Other similar cases: "test %eax, %eax + jne" where eax is already 32-bit
(after optimization, without patch on the left, with on the right):

 discard cc_src                          discard cc_src
 mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
 movi_i32 cc_op,$0x1c                    movi_i32 cc_op,$0x1c
 ext32u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,ne,$0x0           brcond_i64 rax,tmp12,ne,$0x0

"test $0x1, %dl + je":

 movi_i64 tmp1,$0x1                      movi_i64 tmp1,$0x1
 discard cc_src                          discard cc_src
 and_i64 cc_dst,rdx,tmp1                 and_i64 cc_dst,rdx,tmp1
 movi_i32 cc_op,$0x1a                    movi_i32 cc_op,$0x1a
 ext8u_i64 tmp0,cc_dst
 movi_i64 tmp12,$0x0                     movi_i64 tmp12,$0x0
 brcond_i64 tmp0,tmp12,eq,$0x0           brcond_i64 cc_dst,tmp12,eq,$0x0

In some cases TCG even outsmarts GCC. :)  Here the input code has
"and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer,
thanks to copy propagation, does the following:

 movi_i64 tmp12,$0x2                     movi_i64 tmp12,$0x2
 and_i64 rax,rax,tmp12                   and_i64 rax,rax,tmp12
 mov_i64 cc_dst,rax                      mov_i64 cc_dst,rax
 ext32s_i64 tmp0,rax                  -> nop
 mov_i64 rbx,tmp0                     -> mov_i64 rbx,cc_dst
 and_i64 cc_dst,rbx,rbx               -> nop

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
1 file changed
tree: 7dc66485bcdff64982cfd5a116592bf89b03647e
  1. audio/
  2. backends/
  3. block/
  4. bsd-user/
  5. default-configs/
  6. disas/
  7. docs/
  8. fpu/
  9. fsdev/
  10. gdb-xml/
  11. hw/
  12. include/
  13. ldscripts/
  14. libcacard/
  15. linux-headers/
  16. linux-user/
  17. net/
  18. pc-bios/
  19. qapi/
  20. qga/
  21. QMP/
  22. qobject/
  23. qom/
  24. roms/
  25. scripts/
  26. slirp/
  27. stubs/
  28. sysconfigs/
  29. target-alpha/
  30. target-arm/
  31. target-cris/
  32. target-i386/
  33. target-lm32/
  34. target-m68k/
  35. target-microblaze/
  36. target-mips/
  37. target-openrisc/
  38. target-ppc/
  39. target-s390x/
  40. target-sh4/
  41. target-sparc/
  42. target-unicore32/
  43. target-xtensa/
  44. tcg/
  45. tests/
  46. trace/
  47. ui/
  48. util/
  49. .exrc
  50. .gitignore
  51. .gitmodules
  52. .mailmap
  53. aio-posix.c
  54. aio-win32.c
  55. arch_init.c
  56. async.c
  57. balloon.c
  58. block-migration.c
  59. block.c
  60. blockdev-nbd.c
  61. blockdev.c
  62. blockjob.c
  63. bt-host.c
  64. bt-vhci.c
  65. Changelog
  66. cmd.c
  67. cmd.h
  68. CODING_STYLE
  69. configure
  70. COPYING
  71. COPYING.LIB
  72. coroutine-gthread.c
  73. coroutine-sigaltstack.c
  74. coroutine-ucontext.c
  75. coroutine-win32.c
  76. cpu-exec.c
  77. cpus.c
  78. cputlb.c
  79. device_tree.c
  80. disas.c
  81. dma-helpers.c
  82. dump-stub.c
  83. dump.c
  84. exec.c
  85. gdbstub.c
  86. HACKING
  87. hmp-commands.hx
  88. hmp.c
  89. hmp.h
  90. iohandler.c
  91. ioport.c
  92. kvm-all.c
  93. kvm-stub.c
  94. LICENSE
  95. main-loop.c
  96. MAINTAINERS
  97. Makefile
  98. Makefile.objs
  99. Makefile.target
  100. memory.c
  101. memory_mapping-stub.c
  102. memory_mapping.c
  103. migration-exec.c
  104. migration-fd.c
  105. migration-tcp.c
  106. migration-unix.c
  107. migration.c
  108. monitor.c
  109. nbd.c
  110. os-posix.c
  111. os-win32.c
  112. page_cache.c
  113. qapi-schema-test.json
  114. qapi-schema.json
  115. qdict-test-data.txt
  116. qemu-bridge-helper.c
  117. qemu-char.c
  118. qemu-coroutine-io.c
  119. qemu-coroutine-lock.c
  120. qemu-coroutine-sleep.c
  121. qemu-coroutine.c
  122. qemu-doc.texi
  123. qemu-img-cmds.hx
  124. qemu-img.c
  125. qemu-img.texi
  126. qemu-io.c
  127. qemu-log.c
  128. qemu-nbd.c
  129. qemu-nbd.texi
  130. qemu-options-wrapper.h
  131. qemu-options.h
  132. qemu-options.hx
  133. qemu-seccomp.c
  134. qemu-tech.texi
  135. qemu-timer.c
  136. qemu.sasl
  137. qmp-commands.hx
  138. qmp.c
  139. qtest.c
  140. readline.c
  141. README
  142. rules.mak
  143. savevm.c
  144. spice-qemu-char.c
  145. tcg-runtime.c
  146. tci.c
  147. thread-pool.c
  148. thunk.c
  149. TODO
  150. trace-events
  151. translate-all.c
  152. translate-all.h
  153. user-exec.c
  154. VERSION
  155. version.rc
  156. vl.c
  157. xen-all.c
  158. xen-mapcache.c
  159. xen-stub.c