Merge remote-tracking branch 'quintela/migration.next' into staging

# By Peter Lieven (9) and others
# Via Juan Quintela
* quintela/migration.next: (22 commits)
  Use qemu_put_buffer_async for guest memory pages
  Add qemu_put_buffer_async
  Use writev ops if available
  Store the data to send also in iovec
  Update bytes_xfer in qemu_put_byte
  Add socket_writev_buffer function
  Add QemuFileWritevBuffer QemuFileOps
  migration: use XBZRLE only after bulk stage
  migration: do not search dirty pages in bulk stage
  migration: do not sent zero pages in bulk stage
  migration: add an indicator for bulk state of ram migration
  migration: search for zero instead of dup pages
  bitops: unroll while loop in find_next_bit()
  buffer_is_zero: use vector optimizations if possible
  cutils: add a function to find non-zero content in a buffer
  move vector definitions to qemu-common.h
  savevm: Fix bugs in the VMSTATE_VBUFFER_MULTIPLY definition
  savevm: Add VMSTATE_STRUCT_VARRAY_POINTER_UINT32
  savevm: Add VMSTATE_FLOAT64 helpers
  savevm: Add VMSTATE_UINTTL_EQUAL helper
  ...
diff --git a/arch_init.c b/arch_init.c
index e8ade9e..4ef5a15 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -116,26 +116,6 @@
 #define RAM_SAVE_FLAG_CONTINUE 0x20
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 
-#ifdef __ALTIVEC__
-#include <altivec.h>
-#define VECTYPE        vector unsigned char
-#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
-#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
-/* altivec.h may redefine the bool macro as vector type.
- * Reset it to POSIX semantics. */
-#undef bool
-#define bool _Bool
-#elif defined __SSE2__
-#include <emmintrin.h>
-#define VECTYPE        __m128i
-#define SPLAT(p)       _mm_set1_epi8(*(p))
-#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
-#else
-#define VECTYPE        unsigned long
-#define SPLAT(p)       (*(p) * (~0UL / 255))
-#define ALL_EQ(v1, v2) ((v1) == (v2))
-#endif
-
 
 static struct defconfig_file {
     const char *filename;
@@ -166,19 +146,10 @@
     return 0;
 }
 
-static int is_dup_page(uint8_t *page)
+static inline bool is_zero_page(uint8_t *p)
 {
-    VECTYPE *p = (VECTYPE *)page;
-    VECTYPE val = SPLAT(page);
-    int i;
-
-    for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
-        if (!ALL_EQ(val, p[i])) {
-            return 0;
-        }
-    }
-
-    return 1;
+    return buffer_find_nonzero_offset(p, TARGET_PAGE_SIZE) ==
+        TARGET_PAGE_SIZE;
 }
 
 /* struct contains XBZRLE cache and a static page
@@ -212,6 +183,7 @@
 /* accounting for migration statistics */
 typedef struct AccountingInfo {
     uint64_t dup_pages;
+    uint64_t skipped_pages;
     uint64_t norm_pages;
     uint64_t iterations;
     uint64_t xbzrle_bytes;
@@ -237,6 +209,16 @@
     return acct_info.dup_pages;
 }
 
+uint64_t skipped_mig_bytes_transferred(void)
+{
+    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t skipped_mig_pages_transferred(void)
+{
+    return acct_info.skipped_pages;
+}
+
 uint64_t norm_mig_bytes_transferred(void)
 {
     return acct_info.norm_pages * TARGET_PAGE_SIZE;
@@ -348,6 +330,7 @@
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
+static bool ram_bulk_stage;
 
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
@@ -357,7 +340,13 @@
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
     unsigned long size = base + (int128_get64(mr->size) >> TARGET_PAGE_BITS);
 
-    unsigned long next = find_next_bit(migration_bitmap, size, nr);
+    unsigned long next;
+
+    if (ram_bulk_stage && nr > base) {
+        next = nr + 1;
+    } else {
+        next = find_next_bit(migration_bitmap, size, nr);
+    }
 
     if (next < size) {
         clear_bit(next, migration_bitmap);
@@ -455,6 +444,7 @@
             if (!block) {
                 block = QTAILQ_FIRST(&ram_list.blocks);
                 complete_round = true;
+                ram_bulk_stage = false;
             }
         } else {
             uint8_t *p;
@@ -465,13 +455,18 @@
 
             /* In doubt sent page as normal */
             bytes_sent = -1;
-            if (is_dup_page(p)) {
+            if (is_zero_page(p)) {
                 acct_info.dup_pages++;
-                bytes_sent = save_block_hdr(f, block, offset, cont,
-                                            RAM_SAVE_FLAG_COMPRESS);
-                qemu_put_byte(f, *p);
-                bytes_sent += 1;
-            } else if (migrate_use_xbzrle()) {
+                if (!ram_bulk_stage) {
+                    bytes_sent = save_block_hdr(f, block, offset, cont,
+                                                RAM_SAVE_FLAG_COMPRESS);
+                    qemu_put_byte(f, 0);
+                    bytes_sent++;
+                } else {
+                    acct_info.skipped_pages++;
+                    bytes_sent = 0;
+                }
+            } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
                 current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                               offset, cont, last_stage);
@@ -483,7 +478,7 @@
             /* XBZRLE overflow or normal page */
             if (bytes_sent == -1) {
                 bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
-                qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+                qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
                 bytes_sent += TARGET_PAGE_SIZE;
                 acct_info.norm_pages++;
             }
@@ -558,6 +553,7 @@
     last_sent_block = NULL;
     last_offset = 0;
     last_version = ram_list.version;
+    ram_bulk_stage = true;
 }
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
diff --git a/hmp.c b/hmp.c
index c12c495..dbe9b90 100644
--- a/hmp.c
+++ b/hmp.c
@@ -173,6 +173,8 @@
                        info->ram->total >> 10);
         monitor_printf(mon, "duplicate: %" PRIu64 " pages\n",
                        info->ram->duplicate);
+        monitor_printf(mon, "skipped: %" PRIu64 " pages\n",
+                       info->ram->skipped);
         monitor_printf(mon, "normal: %" PRIu64 " pages\n",
                        info->ram->normal);
         monitor_printf(mon, "normal bytes: %" PRIu64 " kbytes\n",
diff --git a/hw/hw.h b/hw/hw.h
index 1553e54..1fb9afa 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -52,16 +52,22 @@
 #if TARGET_LONG_BITS == 64
 #define VMSTATE_UINTTL_V(_f, _s, _v)                                  \
     VMSTATE_UINT64_V(_f, _s, _v)
+#define VMSTATE_UINTTL_EQUAL_V(_f, _s, _v)                            \
+    VMSTATE_UINT64_EQUAL_V(_f, _s, _v)
 #define VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, _v)                        \
     VMSTATE_UINT64_ARRAY_V(_f, _s, _n, _v)
 #else
 #define VMSTATE_UINTTL_V(_f, _s, _v)                                  \
     VMSTATE_UINT32_V(_f, _s, _v)
+#define VMSTATE_UINTTL_EQUAL_V(_f, _s, _v)                            \
+    VMSTATE_UINT32_EQUAL_V(_f, _s, _v)
 #define VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, _v)                        \
     VMSTATE_UINT32_ARRAY_V(_f, _s, _n, _v)
 #endif
 #define VMSTATE_UINTTL(_f, _s)                                        \
     VMSTATE_UINTTL_V(_f, _s, 0)
+#define VMSTATE_UINTTL_EQUAL(_f, _s)                                  \
+    VMSTATE_UINTTL_EQUAL_V(_f, _s, 0)
 #define VMSTATE_UINTTL_ARRAY(_f, _s, _n)                              \
     VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, 0)
 
diff --git a/include/migration/migration.h b/include/migration/migration.h
index bb617fd..e2acec6 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -96,6 +96,8 @@
 
 uint64_t dup_mig_bytes_transferred(void);
 uint64_t dup_mig_pages_transferred(void);
+uint64_t skipped_mig_bytes_transferred(void);
+uint64_t skipped_mig_pages_transferred(void);
 uint64_t norm_mig_bytes_transferred(void);
 uint64_t norm_mig_pages_transferred(void);
 uint64_t xbzrle_mig_bytes_transferred(void);
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index df81261..623c434 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -51,11 +51,18 @@
  */
 typedef int (QEMUFileGetFD)(void *opaque);
 
+/*
+ * This function writes an iovec to file.
+ */
+typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
+                                           int iovcnt);
+
 typedef struct QEMUFileOps {
     QEMUFilePutBufferFunc *put_buffer;
     QEMUFileGetBufferFunc *get_buffer;
     QEMUFileCloseFunc *close;
     QEMUFileGetFD *get_fd;
+    QEMUFileWritevBufferFunc *writev_buffer;
 } QEMUFileOps;
 
 QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
@@ -68,6 +75,11 @@
 int64_t qemu_ftell(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
+/*
+ * put_buffer without copying the buffer.
+ * The buffer should be available till it is sent asynchronously.
+ */
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size);
 
 static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
 {
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 6666d27..65918a9 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -149,6 +149,7 @@
 extern const VMStateInfo vmstate_info_uint16_equal;
 extern const VMStateInfo vmstate_info_int32_equal;
 extern const VMStateInfo vmstate_info_uint32_equal;
+extern const VMStateInfo vmstate_info_uint64_equal;
 extern const VMStateInfo vmstate_info_int32_le;
 
 extern const VMStateInfo vmstate_info_uint8;
@@ -156,6 +157,8 @@
 extern const VMStateInfo vmstate_info_uint32;
 extern const VMStateInfo vmstate_info_uint64;
 
+extern const VMStateInfo vmstate_info_float64;
+
 extern const VMStateInfo vmstate_info_timer;
 extern const VMStateInfo vmstate_info_buffer;
 extern const VMStateInfo vmstate_info_unused_buffer;
@@ -340,6 +343,16 @@
     .offset     = vmstate_offset_pointer(_state, _field, _type),     \
 }
 
+#define VMSTATE_STRUCT_VARRAY_POINTER_UINT32(_field, _state, _field_num, _vmsd, _type) { \
+    .name       = (stringify(_field)),                               \
+    .version_id = 0,                                                 \
+    .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\
+    .size       = sizeof(_type),                                     \
+    .vmsd       = &(_vmsd),                                          \
+    .flags      = VMS_POINTER | VMS_VARRAY_INT32 | VMS_STRUCT,       \
+    .offset     = vmstate_offset_pointer(_state, _field, _type),     \
+}
+
 #define VMSTATE_STRUCT_VARRAY_POINTER_UINT16(_field, _state, _field_num, _vmsd, _type) { \
     .name       = (stringify(_field)),                               \
     .version_id = 0,                                                 \
@@ -380,14 +393,14 @@
     .offset       = vmstate_offset_buffer(_state, _field) + _start,  \
 }
 
-#define VMSTATE_BUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \
+#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \
     .name         = (stringify(_field)),                             \
     .version_id   = (_version),                                      \
     .field_exists = (_test),                                         \
     .size_offset  = vmstate_offset_value(_state, _field_size, uint32_t),\
     .size         = (_multiply),                                      \
     .info         = &vmstate_info_buffer,                            \
-    .flags        = VMS_VBUFFER|VMS_MULTIPLY,                        \
+    .flags        = VMS_VBUFFER|VMS_POINTER|VMS_MULTIPLY,            \
     .offset       = offsetof(_state, _field),                        \
     .start        = (_start),                                        \
 }
@@ -518,8 +531,17 @@
 #define VMSTATE_INT32_EQUAL(_f, _s)                                   \
     VMSTATE_SINGLE(_f, _s, 0, vmstate_info_int32_equal, int32_t)
 
-#define VMSTATE_UINT32_EQUAL(_f, _s)                                   \
-    VMSTATE_SINGLE(_f, _s, 0, vmstate_info_uint32_equal, uint32_t)
+#define VMSTATE_UINT32_EQUAL_V(_f, _s, _v)                            \
+    VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32_equal, uint32_t)
+
+#define VMSTATE_UINT32_EQUAL(_f, _s)                                  \
+    VMSTATE_UINT32_EQUAL_V(_f, _s, 0)
+
+#define VMSTATE_UINT64_EQUAL_V(_f, _s, _v)                            \
+    VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64_equal, uint64_t)
+
+#define VMSTATE_UINT64_EQUAL(_f, _s)                                  \
+    VMSTATE_UINT64_EQUAL_V(_f, _s, 0)
 
 #define VMSTATE_INT32_LE(_f, _s)                                   \
     VMSTATE_SINGLE(_f, _s, 0, vmstate_info_int32_le, int32_t)
@@ -533,6 +555,13 @@
 #define VMSTATE_UINT32_TEST(_f, _s, _t)                                  \
     VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_info_uint32, uint32_t)
 
+
+#define VMSTATE_FLOAT64_V(_f, _s, _v)                                 \
+    VMSTATE_SINGLE(_f, _s, _v, vmstate_info_float64, float64)
+
+#define VMSTATE_FLOAT64(_f, _s)                                       \
+    VMSTATE_FLOAT64_V(_f, _s, 0)
+
 #define VMSTATE_TIMER_TEST(_f, _s, _test)                             \
     VMSTATE_POINTER_TEST(_f, _s, _test, vmstate_info_timer, QEMUTimer *)
 
@@ -599,6 +628,12 @@
 #define VMSTATE_INT64_ARRAY(_f, _s, _n)                               \
     VMSTATE_INT64_ARRAY_V(_f, _s, _n, 0)
 
+#define VMSTATE_FLOAT64_ARRAY_V(_f, _s, _n, _v)                       \
+    VMSTATE_ARRAY(_f, _s, _n, _v, vmstate_info_float64, float64)
+
+#define VMSTATE_FLOAT64_ARRAY(_f, _s, _n)                             \
+    VMSTATE_FLOAT64_ARRAY_V(_f, _s, _n, 0)
+
 #define VMSTATE_BUFFER_V(_f, _s, _v)                                  \
     VMSTATE_STATIC_BUFFER(_f, _s, _v, NULL, 0, sizeof(typeof_field(_s, _f)))
 
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2371132..31fff22 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -448,4 +448,35 @@
 
 void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 
+/* vector definitions */
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#define VECTYPE        vector unsigned char
+#define SPLAT(p)       vec_splat(vec_ld(0, p), 0)
+#define ALL_EQ(v1, v2) vec_all_eq(v1, v2)
+/* altivec.h may redefine the bool macro as vector type.
+ * Reset it to POSIX semantics. */
+#undef bool
+#define bool _Bool
+#elif defined __SSE2__
+#include <emmintrin.h>
+#define VECTYPE        __m128i
+#define SPLAT(p)       _mm_set1_epi8(*(p))
+#define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF)
+#else
+#define VECTYPE        unsigned long
+#define SPLAT(p)       (*(p) * (~0UL / 255))
+#define ALL_EQ(v1, v2) ((v1) == (v2))
+#endif
+
+#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
+static inline bool
+can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+size_t buffer_find_nonzero_offset(const void *buf, size_t len);
+
 #endif
diff --git a/migration.c b/migration.c
index 185d112..7fb2147 100644
--- a/migration.c
+++ b/migration.c
@@ -197,11 +197,11 @@
         info->ram->remaining = ram_bytes_remaining();
         info->ram->total = ram_bytes_total();
         info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
         info->ram->normal = norm_mig_pages_transferred();
         info->ram->normal_bytes = norm_mig_bytes_transferred();
         info->ram->dirty_pages_rate = s->dirty_pages_rate;
 
-
         if (blk_mig_active()) {
             info->has_disk = true;
             info->disk = g_malloc0(sizeof(*info->disk));
@@ -227,6 +227,7 @@
         info->ram->remaining = 0;
         info->ram->total = ram_bytes_total();
         info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
         info->ram->normal = norm_mig_pages_transferred();
         info->ram->normal_bytes = norm_mig_bytes_transferred();
         break;
diff --git a/qapi-schema.json b/qapi-schema.json
index af499bd..6494787 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -496,7 +496,9 @@
 #
 # @total: total amount of bytes involved in the migration process
 #
-# @duplicate: number of duplicate pages (since 1.2)
+# @duplicate: number of duplicate (zero) pages (since 1.2)
+#
+# @skipped: number of skipped zero pages (since 1.5)
 #
 # @normal : number of normal pages (since 1.2)
 #
@@ -509,8 +511,8 @@
 ##
 { 'type': 'MigrationStats',
   'data': {'transferred': 'int', 'remaining': 'int', 'total': 'int' ,
-           'duplicate': 'int', 'normal': 'int', 'normal-bytes': 'int',
-           'dirty-pages-rate' : 'int' } }
+           'duplicate': 'int', 'skipped': 'int', 'normal': 'int',
+           'normal-bytes': 'int', 'dirty-pages-rate' : 'int' } }
 
 ##
 # @XBZRLECacheStats
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 2051fcb..1e0e11e 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -666,7 +666,7 @@
 
 SQMP
 migrate-set-cache-size
----------------------
+----------------------
 
 Set cache size to be used by XBZRLE migration, the cache size will be rounded
 down to the nearest power of 2
@@ -689,7 +689,7 @@
 
 SQMP
 query-migrate-cache-size
----------------------
+------------------------
 
 Show cache size to be used by XBZRLE migration
 
@@ -2453,32 +2453,43 @@
      - Possible values: "active", "completed", "failed", "cancelled"
 - "total-time": total amount of ms since migration started.  If
                 migration has ended, it returns the total migration
-		 time (json-int)
+                time (json-int)
 - "downtime": only present when migration has finished correctly
               total amount in ms for downtime that happened (json-int)
 - "expected-downtime": only present while migration is active
                 total amount in ms for downtime that was calculated on
-		the last bitmap round (json-int)
+                the last bitmap round (json-int)
 - "ram": only present if "status" is "active", it is a json-object with the
-  following RAM information (in bytes):
-         - "transferred": amount transferred (json-int)
-         - "remaining": amount remaining (json-int)
-         - "total": total (json-int)
-         - "duplicate": number of duplicated pages (json-int)
-         - "normal" : number of normal pages transferred (json-int)
-         - "normal-bytes" : number of normal bytes transferred (json-int)
+  following RAM information:
+         - "transferred": amount transferred in bytes (json-int)
+         - "remaining": amount remaining to transfer in bytes (json-int)
+         - "total": total amount of memory in bytes (json-int)
+         - "duplicate": number of pages filled entirely with the same
+            byte (json-int)
+            These are sent over the wire much more efficiently.
+         - "skipped": number of skipped zero pages (json-int)
+         - "normal" : number of whole pages transfered.  I.e. they
+            were not sent as duplicate or xbzrle pages (json-int)
+         - "normal-bytes" : number of bytes transferred in whole
+            pages. This is just normal pages times size of one page,
+            but this way upper levels don't need to care about page
+            size (json-int)
 - "disk": only present if "status" is "active" and it is a block migration,
-  it is a json-object with the following disk information (in bytes):
-         - "transferred": amount transferred (json-int)
-         - "remaining": amount remaining (json-int)
-         - "total": total (json-int)
+  it is a json-object with the following disk information:
+         - "transferred": amount transferred in bytes (json-int)
+         - "remaining": amount remaining to transfer in bytes json-int)
+         - "total": total disk size in bytes (json-int)
 - "xbzrle-cache": only present if XBZRLE is active.
   It is a json-object with the following XBZRLE information:
-         - "cache-size": XBZRLE cache size
-         - "bytes": total XBZRLE bytes transferred
+         - "cache-size": XBZRLE cache size in bytes
+         - "bytes": number of bytes transferred for XBZRLE compressed pages
          - "pages": number of XBZRLE compressed pages
-         - "cache-miss": number of cache misses
-         - "overflow": number of XBZRLE overflows
+         - "cache-miss": number of XBRZRLE page cache misses
+         - "overflow": number of times XBZRLE overflows.  This means
+           that the XBZRLE encoding was bigger than just sent the
+           whole page, and then we sent the whole page instead (as as
+           normal page).
+
 Examples:
 
 1. Before the first migration
@@ -2589,11 +2600,11 @@
 
 SQMP
 migrate-set-capabilities
--------
+------------------------
 
 Enable/Disable migration capabilities
 
-- "xbzrle": xbzrle support
+- "xbzrle": XBZRLE support
 
 Arguments:
 
@@ -2612,7 +2623,7 @@
     },
 SQMP
 query-migrate-capabilities
--------
+--------------------------
 
 Query current migration capabilities
 
diff --git a/savevm.c b/savevm.c
index 35c8d1e..406caa9 100644
--- a/savevm.c
+++ b/savevm.c
@@ -39,6 +39,7 @@
 #include "qmp-commands.h"
 #include "trace.h"
 #include "qemu/bitops.h"
+#include "qemu/iov.h"
 
 #define SELF_ANNOUNCE_ROUNDS 5
 
@@ -113,6 +114,7 @@
 /* savevm/loadvm support */
 
 #define IO_BUF_SIZE 32768
+#define MAX_IOV_SIZE MIN(IOV_MAX, 64)
 
 struct QEMUFile {
     const QEMUFileOps *ops;
@@ -128,6 +130,9 @@
     int buf_size; /* 0 when writing */
     uint8_t buf[IO_BUF_SIZE];
 
+    struct iovec iov[MAX_IOV_SIZE];
+    unsigned int iovcnt;
+
     int last_error;
 };
 
@@ -171,6 +176,19 @@
     qemu_coroutine_yield();
 }
 
+static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt)
+{
+    QEMUFileSocket *s = opaque;
+    ssize_t len;
+    ssize_t size = iov_size(iov, iovcnt);
+
+    len = iov_send(s->fd, iov, iovcnt, 0, size);
+    if (len < size) {
+        len = -socket_error();
+    }
+    return len;
+}
+
 static int socket_get_fd(void *opaque)
 {
     QEMUFileSocket *s = opaque;
@@ -275,7 +293,7 @@
     QEMUFileStdio *s = opaque;
     int ret = 0;
 
-    if (s->file->ops->put_buffer) {
+    if (s->file->ops->put_buffer || s->file->ops->writev_buffer) {
         int fd = fileno(s->stdio_file);
         struct stat st;
 
@@ -387,6 +405,7 @@
 static const QEMUFileOps socket_write_ops = {
     .get_fd =     socket_get_fd,
     .put_buffer = socket_put_buffer,
+    .writev_buffer = socket_writev_buffer,
     .close =      socket_close
 };
 
@@ -497,22 +516,38 @@
     }
 }
 
-/** Flushes QEMUFile buffer
+/**
+ * Flushes QEMUFile buffer
  *
+ * If there is writev_buffer QEMUFileOps it uses it otherwise uses
+ * put_buffer ops.
  */
 static void qemu_fflush(QEMUFile *f)
 {
-    int ret = 0;
+    ssize_t ret = 0;
+    int i = 0;
 
-    if (!f->ops->put_buffer) {
+    if (!f->ops->writev_buffer && !f->ops->put_buffer) {
         return;
     }
-    if (f->is_write && f->buf_index > 0) {
-        ret = f->ops->put_buffer(f->opaque, f->buf, f->pos, f->buf_index);
-        if (ret >= 0) {
-            f->pos += f->buf_index;
+
+    if (f->is_write && f->iovcnt > 0) {
+        if (f->ops->writev_buffer) {
+            ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt);
+            if (ret >= 0) {
+                f->pos += ret;
+            }
+        } else {
+            for (i = 0; i < f->iovcnt && ret >= 0; i++) {
+                ret = f->ops->put_buffer(f->opaque, f->iov[i].iov_base, f->pos,
+                                         f->iov[i].iov_len);
+                if (ret >= 0) {
+                    f->pos += ret;
+                }
+            }
         }
         f->buf_index = 0;
+        f->iovcnt = 0;
     }
     if (ret < 0) {
         qemu_file_set_error(f, ret);
@@ -586,6 +621,40 @@
     return ret;
 }
 
+static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size)
+{
+    /* check for adjacent buffer and coalesce them */
+    if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base +
+        f->iov[f->iovcnt - 1].iov_len) {
+        f->iov[f->iovcnt - 1].iov_len += size;
+    } else {
+        f->iov[f->iovcnt].iov_base = (uint8_t *)buf;
+        f->iov[f->iovcnt++].iov_len = size;
+    }
+}
+
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size)
+{
+    if (f->last_error) {
+        return;
+    }
+
+    if (f->is_write == 0 && f->buf_index > 0) {
+        fprintf(stderr,
+                "Attempted to write to buffer while read buffer is not empty\n");
+        abort();
+    }
+
+    add_to_iovec(f, buf, size);
+
+    f->is_write = 1;
+    f->bytes_xfer += size;
+
+    if (f->buf_index >= IO_BUF_SIZE || f->iovcnt >= MAX_IOV_SIZE) {
+        qemu_fflush(f);
+    }
+}
+
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size)
 {
     int l;
@@ -607,15 +676,12 @@
         memcpy(f->buf + f->buf_index, buf, l);
         f->is_write = 1;
         f->buf_index += l;
-        f->bytes_xfer += l;
+        qemu_put_buffer_async(f, f->buf + (f->buf_index - l), l);
+        if (qemu_file_get_error(f)) {
+            break;
+        }
         buf += l;
         size -= l;
-        if (f->buf_index >= IO_BUF_SIZE) {
-            qemu_fflush(f);
-            if (qemu_file_get_error(f)) {
-                break;
-            }
-        }
     }
 }
 
@@ -633,7 +699,11 @@
 
     f->buf[f->buf_index++] = v;
     f->is_write = 1;
-    if (f->buf_index >= IO_BUF_SIZE) {
+    f->bytes_xfer++;
+
+    add_to_iovec(f, f->buf + (f->buf_index - 1), 1);
+
+    if (f->buf_index >= IO_BUF_SIZE || f->iovcnt >= MAX_IOV_SIZE) {
         qemu_fflush(f);
     }
 }
@@ -1072,6 +1142,27 @@
     .put  = put_uint64,
 };
 
+/* 64 bit unsigned int. See that the received value is the same than the one
+   in the field */
+
+static int get_uint64_equal(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+    uint64_t v2;
+    qemu_get_be64s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_uint64_equal = {
+    .name = "int64 equal",
+    .get  = get_uint64_equal,
+    .put  = put_uint64,
+};
+
 /* 8 bit int. See that the received value is the same than the one
    in the field */
 
@@ -1112,6 +1203,29 @@
     .put  = put_uint16,
 };
 
+/* floating point */
+
+static int get_float64(QEMUFile *f, void *pv, size_t size)
+{
+    float64 *v = pv;
+
+    *v = make_float64(qemu_get_be64(f));
+    return 0;
+}
+
+static void put_float64(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+
+    qemu_put_be64(f, float64_val(*v));
+}
+
+const VMStateInfo vmstate_info_float64 = {
+    .name = "float64",
+    .get  = get_float64,
+    .put  = put_float64,
+};
+
 /* timers  */
 
 static int get_timer(QEMUFile *f, void *pv, size_t size)
diff --git a/util/bitops.c b/util/bitops.c
index e72237a..227c38b 100644
--- a/util/bitops.c
+++ b/util/bitops.c
@@ -42,7 +42,23 @@
         size -= BITS_PER_LONG;
         result += BITS_PER_LONG;
     }
-    while (size & ~(BITS_PER_LONG-1)) {
+    while (size >= 4*BITS_PER_LONG) {
+        unsigned long d1, d2, d3;
+        tmp = *p;
+        d1 = *(p+1);
+        d2 = *(p+2);
+        d3 = *(p+3);
+        if (tmp) {
+            goto found_middle;
+        }
+        if (d1 | d2 | d3) {
+            break;
+        }
+        p += 4;
+        result += 4*BITS_PER_LONG;
+        size -= 4*BITS_PER_LONG;
+    }
+    while (size >= BITS_PER_LONG) {
         if ((tmp = *(p++))) {
             goto found_middle;
         }
diff --git a/util/cutils.c b/util/cutils.c
index 1439da4..5024253 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -143,6 +143,61 @@
 }
 
 /*
+ * Searches for an area with non-zero content in a buffer
+ *
+ * Attention! The len must be a multiple of
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * and addr must be a multiple of sizeof(VECTYPE) due to
+ * restriction of optimizations in this function.
+ *
+ * can_use_buffer_find_nonzero_offset() can be used to check
+ * these requirements.
+ *
+ * The return value is the offset of the non-zero area rounded
+ * down to a multiple of sizeof(VECTYPE) for the first
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR chunks and down to
+ * BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)
+ * afterwards.
+ *
+ * If the buffer is all zero the return value is equal to len.
+ */
+
+size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+{
+    const VECTYPE *p = buf;
+    const VECTYPE zero = (VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!ALL_EQ(p[i], zero)) {
+            return i * sizeof(VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        VECTYPE tmp0 = p[i + 0] | p[i + 1];
+        VECTYPE tmp1 = p[i + 2] | p[i + 3];
+        VECTYPE tmp2 = p[i + 4] | p[i + 5];
+        VECTYPE tmp3 = p[i + 6] | p[i + 7];
+        VECTYPE tmp01 = tmp0 | tmp1;
+        VECTYPE tmp23 = tmp2 | tmp3;
+        if (!ALL_EQ(tmp01 | tmp23, zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(VECTYPE);
+}
+
+/*
  * Checks if a buffer is all zeroes
  *
  * Attention! The len must be a multiple of 4 * sizeof(long) due to
@@ -160,6 +215,11 @@
     long d0, d1, d2, d3;
     const long * const data = buf;
 
+    /* use vector optimized zero check if possible */
+    if (can_use_buffer_find_nonzero_offset(buf, len)) {
+        return buffer_find_nonzero_offset(buf, len) == len;
+    }
+
     assert(len % (4 * sizeof(long)) == 0);
     len /= sizeof(long);