Merge remote-tracking branch 'remotes/awilliam/tags/vfio-update-20151110.0' into staging

VFIO updates 2015-11-10

 - Make Windows happy with vfio-pci devices exposed on conventional
   PCI buses on q35 by hiding PCIe capability (Alex Williamson)
 - Convert to g_new() where appropriate (Markus Armbruster)

# gpg: Signature made Tue 10 Nov 2015 19:46:41 GMT using RSA key ID 3BB08B22
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>"
# gpg:                 aka "Alex Williamson <alex@shazbot.org>"
# gpg:                 aka "Alex Williamson <alwillia@redhat.com>"
# gpg:                 aka "Alex Williamson <alex.l.williamson@gmail.com>"

* remotes/awilliam/tags/vfio-update-20151110.0:
  vfio: Use g_new() & friends where that makes obvious sense
  vfio/pci: Hide device PCIe capability on non-express buses for PCIe VMs

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/balloon.c b/balloon.c
index 5d69e8a..0f45d1b 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,17 @@
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
 static void *balloon_opaque;
+static bool balloon_inhibited;
+
+bool qemu_balloon_is_inhibited(void)
+{
+    return balloon_inhibited;
+}
+
+void qemu_balloon_inhibit(bool state)
+{
+    balloon_inhibited = state;
+}
 
 static bool have_balloon(Error **errp)
 {
diff --git a/docs/migration.txt b/docs/migration.txt
index f6df4be..fda8d61 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -291,3 +291,194 @@
 (that is what ide_drive_pio_state_needed() checks).  If DRQ_STAT is
 not enabled, the values on that fields are garbage and don't need to
 be sent.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two-way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
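+
+As a rough illustration, the destination side might obtain and use the return
+path along the following lines (a sketch using the helpers added by this
+series; 'mis' is the MigrationIncomingState and 'ping_value' is illustrative
+only):
+
+    /* Open the reverse channel over the same transport as the forward path */
+    mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
+    if (mis->to_src_file) {
+        /* e.g. answer a PING from the source; serialised by rp_mutex inside */
+        migrate_send_rp_pong(mis, ping_value);
+    }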
+
+  Source side
+     Forward path - written by migration thread
+     Return path  - opened by main thread, read by return-path thread
+
+  Destination side
+     Forward path - read by main thread
+     Return path  - opened by main thread, written by main thread AND postcopy
+                    thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge).  Its plus side is that there is an upper bound
+on the amount of migration traffic and the time it takes; the downside is that
+during the postcopy phase, a failure of *either* side or of the network
+connection causes the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability x-postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode.  Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on.  Issuing it after the end of a migration is harmless.
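+
+For example, a source-side monitor session might look like this (the
+destination address is a placeholder):
+
+  (qemu) migrate_set_capability x-postcopy-ram on
+  (qemu) migrate -d tcp:<dest-host>:<port>
+  (qemu) migrate_start_postcopy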
+
+Note: During the postcopy phase, the bandwidth limits set using
+migrate_set_speed are ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM,
+which may trigger faults that have to be resolved by the source; as such,
+the migration stream has to be able to respond with page data *during* the
+device load, and hence the device data has to be read from the stream
+completely before the device load begins, to free the stream up.  This is
+achieved by 'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+   Command: 'postcopy listen'
+   The device state
+      A series of sections, identical to the precopy stream's device state
+      stream, containing everything except postcopiable devices (i.e. RAM)
+   Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
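+
+A rough sketch of how the source could build and send that package, using the
+savevm helpers added by this series together with QEMU's QEMUSizedBuffer file
+helpers (qsb_create/qemu_bufopen/qemu_buf_get are assumed here, 'fwd_file'
+stands for the main outgoing migration stream, and error handling is omitted):
+
+    QEMUSizedBuffer *qsb = qsb_create(NULL, 0);
+    QEMUFile *fb = qemu_bufopen("w", qsb);      /* memory-backed QEMUFile */
+
+    qemu_savevm_send_postcopy_listen(fb);       /* 'postcopy listen' */
+    qemu_savevm_state_complete_precopy(fb);     /* non-postcopiable device state */
+    qemu_savevm_send_postcopy_run(fb);          /* 'postcopy run' */
+
+    /* Wrap the whole buffer as the data of one CMD_PACKAGED on the real
+     * stream; this fails if the package exceeds MAX_VM_CMD_PACKAGED_SIZE. */
+    qemu_savevm_send_packaged(fwd_file, qemu_buf_get(fb));
+    qemu_fclose(fb);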
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy);
+however, when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2   3     4 5                      6   7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+   All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
+recurses into qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
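+
+In outline, the destination's handler for CMD_PACKAGED does something like
+the following (a sketch only; 'package_len' and 'buffer' are illustrative,
+the buffer helpers are assumed, and error handling is omitted):
+
+    /* Length of the package precedes its data on the wire */
+    uint32_t package_len = qemu_get_be32(mis->from_src_file);
+    assert(package_len <= MAX_VM_CMD_PACKAGED_SIZE);
+
+    /* Slurp the whole package into memory ... */
+    uint8_t *buffer = g_malloc0(package_len);
+    qemu_get_buffer(mis->from_src_file, buffer, package_len);
+
+    /* ... and recurse into the normal loader over a memory-backed QEMUFile,
+     * so page requests can still be serviced on the real stream meanwhile. */
+    QEMUFile *packf = qemu_bufopen("r", qsb_create(buffer, package_len));
+    qemu_loadvm_state_main(packf, mis);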
+
+On receipt of 'postcopy listen' - 3 - (i.e. the 1st command in the package)
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package.  It loads normal
+background page data (b), but if during a device load a fault happens (5) the
+returned page (c) is loaded by the listen thread, allowing the main thread's
+device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+  Advise:  Set at the start of migration if postcopy is enabled, even
+           if it hasn't had the start command; here the destination
+           checks that its OS has the support needed for postcopy, and performs
+           setup to ensure the RAM mappings are suitable for later postcopy.
+           The destination will fail early in migration at this point if the
+           required OS support is not present.
+           (Triggered by reception of POSTCOPY_ADVISE command)
+
+  Discard: Entered on receipt of the first 'discard' command; prior to
+           the first Discard being performed, hugepages are switched off
+           (using madvise) to ensure that no new huge pages are created
+           during the postcopy phase, and to cause any huge pages that
+           have discards on them to be broken.
+
+  Listen:  The first command in the package, POSTCOPY_LISTEN, switches
+           the destination state to Listen, and starts a new thread
+           (the 'listen thread') which takes over the job of receiving
+           pages off the migration stream, while the main thread carries
+           on processing the blob.  With this thread able to process page
+           reception, the destination now 'sensitises' the RAM to detect
+           any access to missing pages (on Linux using the 'userfault'
+           system).
+
+  Running: POSTCOPY_RUN causes the destination to synchronise all
+           state and start the CPUs and IO devices running.  The main
+           thread now finishes processing the migration package and
+           carries on as it would for normal precopy migration
+           (although it can't do the cleanup it would do as it
+           finishes a normal migration).
+
+  End:     The listen thread can now quit and perform the cleanup of
+           migration state; the migration is now complete.
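+
+As a rough sketch of how a transition is made and checked on the destination
+(using the postcopy_state helpers added by this series; the exact checks in
+the real code may differ slightly):
+
+    /* On receipt of POSTCOPY_LISTEN: move to Listen and verify the source
+     * sent its commands in the expected order. */
+    PostcopyState old = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
+    if (old != POSTCOPY_INCOMING_ADVISE && old != POSTCOPY_INCOMING_DISCARD) {
+        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", old);
+    }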
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy: the 'migration bitmap'
+and the 'unsent map'.  The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that a page is 'dirty' -
+i.e. needs sending.  During the precopy phase this is updated as the CPU
+dirties pages; however, during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy.  It is a bitmap that
+has a bit cleared whenever a page is sent to the destination; during the
+transition to postcopy mode it is combined with the migration bitmap (as
+sketched below) to form the set of pages that:
+   a) Have been sent but then redirtied (which must be discarded)
+   b) Have not yet been sent - which also must be discarded to cause any
+      transparent huge pages built during precopy to be broken.
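+
+In terms of QEMU's bitmap helpers, that combination is (roughly) just a
+bitwise OR of the two maps, done per RAMBlock in the real code; 'unsentmap',
+'migration_bitmap' and 'nr_pages' are illustrative names here:
+
+    /* Afterwards a set bit in the unsent map means "discard on the
+     * destination": the page was either never sent, or was sent and has
+     * since been redirtied. */
+    bitmap_or(unsentmap, unsentmap, migration_bitmap, nr_pages);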
+
+Note that the contents of the unsent map are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy.  The migration
+bitmap is still valid and is used to ensure that no page is sent more than
+once.  Any request for a page that has already been sent is ignored.
+Duplicate requests such as this can happen as a page is sent at about the
+same time the destination accesses it.
+
diff --git a/exec.c b/exec.c
index a028961..b09f18b 100644
--- a/exec.c
+++ b/exec.c
@@ -1377,6 +1377,11 @@
     return NULL;
 }
 
+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+    return rb->idstr;
+}
+
 /* Called with iothread lock held.  */
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 {
@@ -1447,7 +1452,7 @@
 
     assert(block);
 
-    newsize = TARGET_PAGE_ALIGN(newsize);
+    newsize = HOST_PAGE_ALIGN(newsize);
 
     if (block->used_length == newsize) {
         return 0;
@@ -1591,7 +1596,7 @@
         return -1;
     }
 
-    size = TARGET_PAGE_ALIGN(size);
+    size = HOST_PAGE_ALIGN(size);
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->used_length = size;
@@ -1627,8 +1632,8 @@
     ram_addr_t addr;
     Error *local_err = NULL;
 
-    size = TARGET_PAGE_ALIGN(size);
-    max_size = TARGET_PAGE_ALIGN(max_size);
+    size = HOST_PAGE_ALIGN(size);
+    max_size = HOST_PAGE_ALIGN(max_size);
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->resized = resized;
@@ -1877,8 +1882,16 @@
     }
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
- * (typically a TLB entry) back to a ram offset.
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
  *
  * By the time this function returns, the returned pointer is not protected
  * by RCU anymore.  If the caller is not within an RCU critical section and
@@ -1886,18 +1899,22 @@
  * pointer, such as a reference to the region that includes the incoming
  * ram_addr_t.
  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr,
+                                   ram_addr_t *offset)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
-    MemoryRegion *mr;
 
     if (xen_enabled()) {
         rcu_read_lock();
         *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        mr = qemu_get_ram_block(*ram_addr)->mr;
+        block = qemu_get_ram_block(*ram_addr);
+        if (block) {
+            *offset = (host - block->host);
+        }
         rcu_read_unlock();
-        return mr;
+        return block;
     }
 
     rcu_read_lock();
@@ -1920,10 +1937,49 @@
     return NULL;
 
 found:
-    *ram_addr = block->offset + (host - block->host);
-    mr = block->mr;
+    *offset = (host - block->host);
+    if (round_offset) {
+        *offset &= TARGET_PAGE_MASK;
+    }
+    *ram_addr = block->offset + *offset;
     rcu_read_unlock();
-    return mr;
+    return block;
+}
+
+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+    RAMBlock *block;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        if (!strcmp(name, block->idstr)) {
+            return block;
+        }
+    }
+
+    return NULL;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+{
+    RAMBlock *block;
+    ram_addr_t offset; /* Not used */
+
+    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+
+    if (!block) {
+        return NULL;
+    }
+
+    return block->mr;
 }
 
 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
@@ -3502,6 +3558,16 @@
     }
     return 0;
 }
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_bits(void)
+{
+    return TARGET_PAGE_BITS;
+}
+
 #endif
 
 /*
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 3a4ae39..8939b98 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1008,6 +1008,21 @@
 ETEXI
 
     {
+        .name       = "migrate_start_postcopy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Switch migration to postcopy mode",
+        .mhandler.cmd = hmp_migrate_start_postcopy,
+    },
+
+STEXI
+@item migrate_start_postcopy
+@findex migrate_start_postcopy
+Switch in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+ETEXI
+
+    {
         .name       = "client_migrate_info",
         .args_type  = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?",
         .params     = "protocol hostname port tls-port cert-subject",
diff --git a/hmp.c b/hmp.c
index a15d00c..e1f854a 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1293,6 +1293,13 @@
     hmp_handle_error(mon, &err);
 }
 
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
+{
+    Error *err = NULL;
+    qmp_migrate_start_postcopy(&err);
+    hmp_handle_error(mon, &err);
+}
+
 void hmp_set_password(Monitor *mon, const QDict *qdict)
 {
     const char *protocol  = qdict_get_str(qdict, "protocol");
diff --git a/hmp.h b/hmp.h
index 81656c3..a8c5b5a 100644
--- a/hmp.h
+++ b/hmp.h
@@ -69,6 +69,7 @@
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
 void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0ed8527..37d071e 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1588,7 +1588,7 @@
 static SaveVMHandlers savevm_htab_handlers = {
     .save_live_setup = htab_save_setup,
     .save_live_iterate = htab_save_iterate,
-    .save_live_complete = htab_save_complete,
+    .save_live_complete_precopy = htab_save_complete,
     .load_state = htab_load,
 };
 
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index c419b17..9671635 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -37,9 +37,11 @@
 static void balloon_page(void *addr, int deflate)
 {
 #if defined(__linux__)
-    if (!kvm_enabled() || kvm_has_sync_mmu())
+    if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+                                         kvm_has_sync_mmu())) {
         qemu_madvise(addr, TARGET_PAGE_SIZE,
                 deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+    }
 #endif
 }
 
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9fb1d54..85aa403 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -64,8 +64,12 @@
 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
 /* This should not be used by devices.  */
 MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+RAMBlock *qemu_ram_block_by_name(const char *name);
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr, ram_addr_t *offset);
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
 void qemu_ram_unset_idstr(ram_addr_t addr);
+const char *qemu_ram_get_idstr(RAMBlock *rb);
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                             int len, int is_write);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index b07de10..d900b0d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -72,7 +72,6 @@
 
 void cpu_gen_init(void);
 bool cpu_restore_state(CPUState *cpu, uintptr_t searched_pc);
-void page_size_init(void);
 
 void QEMU_NORETURN cpu_resume_from_signal(CPUState *cpu, void *puc);
 void QEMU_NORETURN cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 3360ac5..7115154 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -22,8 +22,6 @@
 #ifndef CONFIG_USER_ONLY
 #include "hw/xen/xen.h"
 
-typedef struct RAMBlock RAMBlock;
-
 struct RAMBlock {
     struct rcu_head rcu;
     struct MemoryRegion *mr;
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 8334621..fd018b7 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -35,6 +35,7 @@
 #define QEMU_VM_SUBSECTION           0x05
 #define QEMU_VM_VMDESCRIPTION        0x06
 #define QEMU_VM_CONFIGURATION        0x07
+#define QEMU_VM_COMMAND              0x08
 #define QEMU_VM_SECTION_FOOTER       0x7e
 
 struct MigrationParams {
@@ -42,13 +43,67 @@
     bool shared;
 };
 
-typedef struct MigrationState MigrationState;
+/* Messages sent on the return path from destination to source */
+enum mig_rp_message_type {
+    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
+    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
+    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */
+
+    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
+    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
+
+    MIG_RP_MSG_MAX
+};
 
 typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
 
+/* The current postcopy state is read/set by postcopy_state_get/set
+ * which update it atomically.
+ * The state is updated as postcopy messages are received, and
+ * in general only one thread should be writing to the state at any one
+ * time, initially the main thread and then the listen thread;
+ * Corner cases are where either thread finishes early and/or errors.
+ * The state is checked as messages are received to ensure that
+ * the source is sending us messages in the correct order.
+ * The state is also used by the RAM reception code to know if it
+ * has to place pages atomically, and the cleanup code at the end of
+ * the main thread to know if it has to delay cleanup until the end
+ * of postcopy.
+ */
+typedef enum {
+    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
+    POSTCOPY_INCOMING_ADVISE,
+    POSTCOPY_INCOMING_DISCARD,
+    POSTCOPY_INCOMING_LISTENING,
+    POSTCOPY_INCOMING_RUNNING,
+    POSTCOPY_INCOMING_END
+} PostcopyState;
+
 /* State for the incoming migration */
 struct MigrationIncomingState {
-    QEMUFile *file;
+    QEMUFile *from_src_file;
+
+    /*
+     * Free at the start of the main state load, set as the main thread finishes
+     * loading state.
+     */
+    QemuEvent main_thread_load_event;
+
+    bool           have_fault_thread;
+    QemuThread     fault_thread;
+    QemuSemaphore  fault_thread_sem;
+
+    bool           have_listen_thread;
+    QemuThread     listen_thread;
+    QemuSemaphore  listen_thread_sem;
+
+    /* For the kernel to send us notifications */
+    int       userfault_fd;
+    /* To tell the fault_thread to quit */
+    int       userfault_quit_fd;
+    QEMUFile *to_src_file;
+    QemuMutex rp_mutex;    /* We send replies from multiple threads */
+    void     *postcopy_tmp_page;
 
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
@@ -58,6 +113,18 @@
 MigrationIncomingState *migration_incoming_state_new(QEMUFile *f);
 void migration_incoming_state_destroy(void);
 
+/*
+ * An outstanding page request, on the source, having been received
+ * and queued
+ */
+struct MigrationSrcPageRequest {
+    RAMBlock *rb;
+    hwaddr    offset;
+    hwaddr    len;
+
+    QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req;
+};
+
 struct MigrationState
 {
     int64_t bandwidth_limit;
@@ -70,6 +137,14 @@
 
     int state;
     MigrationParams params;
+
+    /* State related to return path */
+    struct {
+        QEMUFile     *from_dst_file;
+        QemuThread    rp_thread;
+        bool          error;
+    } rp_state;
+
     double mbps;
     int64_t total_time;
     int64_t downtime;
@@ -80,6 +155,18 @@
     int64_t xbzrle_cache_size;
     int64_t setup_time;
     int64_t dirty_sync_count;
+
+    /* Flag set once the migration has been asked to enter postcopy */
+    bool start_postcopy;
+
+    /* Flag set once the migration thread is running (and needs joining) */
+    bool migration_thread_running;
+
+    /* Queue of outstanding page requests from the destination */
+    QemuMutex src_page_req_mutex;
+    QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests;
+    /* The RAMBlock used in the last src_page_request */
+    RAMBlock *last_req_rb;
 };
 
 void process_incoming_migration(QEMUFile *f);
@@ -116,9 +203,12 @@
 
 void add_migration_state_change_notifier(Notifier *notify);
 void remove_migration_state_change_notifier(Notifier *notify);
+MigrationState *migrate_init(const MigrationParams *params);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
+/* True if outgoing migration has entered postcopy phase */
+bool migration_in_postcopy(MigrationState *);
 MigrationState *migrate_get_current(void);
 
 void migrate_compress_threads_create(void);
@@ -145,6 +235,13 @@
 double xbzrle_mig_cache_miss_rate(void);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
+/* For outgoing discard bitmap */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms);
+/* For incoming postcopy discard */
+int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
+                      uint64_t start, size_t length);
+int ram_postcopy_incoming_init(MigrationIncomingState *mis);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
@@ -160,6 +257,7 @@
  */
 void migrate_del_blocker(Error *reason);
 
+bool migrate_postcopy_ram(void);
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
@@ -179,6 +277,17 @@
 int migrate_decompress_threads(void);
 bool migrate_use_events(void);
 
+/* Sending on the return path - generic and then for each message type */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+                             enum mig_rp_message_type message_type,
+                             uint16_t len, void *data);
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+                          uint32_t value);
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+                          uint32_t value);
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
+                              ram_addr_t start, size_t len);
+
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data);
@@ -204,4 +313,12 @@
 void savevm_skip_configuration(void);
 int global_state_store(void);
 void global_state_store_running(void);
+
+void flush_page_queue(MigrationState *ms);
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+                         ram_addr_t start, ram_addr_t len);
+
+PostcopyState postcopy_state_get(void);
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state);
 #endif
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
new file mode 100644
index 0000000..b6a7491
--- /dev/null
+++ b/include/migration/postcopy-ram.h
@@ -0,0 +1,99 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Dave Gilbert  <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_POSTCOPY_RAM_H
+#define QEMU_POSTCOPY_RAM_H
+
+/* Return true if the host supports everything we need to do postcopy-ram */
+bool postcopy_ram_supported_by_host(void);
+
+/*
+ * Make all of RAM sensitive to accesses to areas that haven't yet been written
+ * and wire up anything necessary to deal with it.
+ */
+int postcopy_ram_enable_notify(MigrationIncomingState *mis);
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from ram.c's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages);
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis);
+
+/*
+ * Discard the contents of 'length' bytes from 'start'
+ * We can assume that if we've been called, postcopy_ram_supported_by_host
+ * returned true.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length);
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
+ * however, leaving it until after precopy means that most of the precopy
+ * data is still THP'd.
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis);
+
+/*
+ * Called at the start of each RAMBlock by the bitmap code.
+ * 'offset' is the bitmap offset of the named RAMBlock in the migration
+ * bitmap.
+ * Returns a new PDS
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 unsigned long offset,
+                                                 const char *name);
+
+/*
+ * Called by the bitmap code for each chunk to discard.
+ * May send a discard message, may just leave it queued to
+ * be sent later.
+ * @start,@length: a range of pages in the migration bitmap in the
+ *  RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+                                 unsigned long start, unsigned long length);
+
+/*
+ * Called at the end of each RAMBlock by the bitmap code.
+ * Sends any outstanding discard messages, frees the PDS.
+ */
+void postcopy_discard_send_finish(MigrationState *ms,
+                                  PostcopyDiscardState *pds);
+
+/*
+ * Place a page (from) at (host) efficiently
+ *    There are restrictions on how 'from' must be mapped; in general it is
+ *    best to use other postcopy_ routines to allocate.
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from);
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host);
+
+/*
+ * Allocate a page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * Returns: Pointer to allocated page
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis);
+
+#endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 29a338d..b5d08d2 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -89,6 +89,11 @@
                                uint64_t *bytes_sent);
 
 /*
+ * Return a QEMUFile for comms in the opposite direction
+ */
+typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
+
+/*
  * Stop any read or write (depending on flags) on the underlying
  * transport on the QEMUFile.
  * Existing blocking reads/writes must be woken
@@ -106,6 +111,7 @@
     QEMURamHookFunc *after_ram_iterate;
     QEMURamHookFunc *hook_ram_load;
     QEMURamSaveFunc *save_page;
+    QEMURetPathFunc *get_return_path;
     QEMUFileShutdownFunc *shut_down;
 } QEMUFileOps;
 
@@ -163,9 +169,11 @@
 void qemu_put_be64(QEMUFile *f, uint64_t v);
 size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset);
 size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size);
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size);
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
                                   int level);
 int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src);
+
 /*
  * Note that you can only peek continuous bytes from where the current pointer
  * is; you aren't guaranteed to be able to peak to +n bytes unless you've
@@ -194,7 +202,9 @@
 int qemu_file_get_error(QEMUFile *f);
 void qemu_file_set_error(QEMUFile *f, int ret);
 int qemu_file_shutdown(QEMUFile *f);
+QEMUFile *qemu_file_get_return_path(QEMUFile *f);
 void qemu_fflush(QEMUFile *f);
+void qemu_file_set_blocking(QEMUFile *f, bool block);
 
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index d173b56..7267e38 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -40,7 +40,8 @@
     SaveStateHandler *save_state;
 
     void (*cleanup)(void *opaque);
-    int (*save_live_complete)(QEMUFile *f, void *opaque);
+    int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
+    int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
 
     /* This runs both outside and inside the iothread lock.  */
     bool (*is_active)(void *opaque);
@@ -54,8 +55,9 @@
 
     /* This runs outside the iothread lock!  */
     int (*save_live_setup)(QEMUFile *f, void *opaque);
-    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
-
+    void (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size,
+                              uint64_t *non_postcopiable_pending,
+                              uint64_t *postcopiable_pending);
     LoadStateHandler *load_state;
 } SaveVMHandlers;
 
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540..405364f 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -499,5 +499,6 @@
 int parse_debug_env(const char *name, int max, int initial);
 
 const char *qemu_ether_ntoa(const MACAddr *mac);
+void page_size_init(void);
 
 #endif
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index ab2d5d9..861d84b 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -139,6 +139,8 @@
 
 #if defined(CONFIG_MADVISE)
 
+#include <sys/mman.h>
+
 #define QEMU_MADV_WILLNEED  MADV_WILLNEED
 #define QEMU_MADV_DONTNEED  MADV_DONTNEED
 #ifdef MADV_DONTFORK
@@ -171,6 +173,11 @@
 #else
 #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
 #endif
+#ifdef MADV_NOHUGEPAGE
+#define QEMU_MADV_NOHUGEPAGE MADV_NOHUGEPAGE
+#else
+#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
+#endif
 
 #elif defined(CONFIG_POSIX_MADVISE)
 
@@ -182,6 +189,7 @@
 #define QEMU_MADV_DODUMP QEMU_MADV_INVALID
 #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 
 #else /* no-op */
 
@@ -193,6 +201,7 @@
 #define QEMU_MADV_DODUMP QEMU_MADV_INVALID
 #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 
 #endif
 
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 2cdce18..6b1093d 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -44,6 +44,7 @@
 typedef struct MemoryRegionSection MemoryRegionSection;
 typedef struct MigrationIncomingState MigrationIncomingState;
 typedef struct MigrationParams MigrationParams;
+typedef struct MigrationState MigrationState;
 typedef struct Monitor Monitor;
 typedef struct MouseTransformInfo MouseTransformInfo;
 typedef struct MSIMessage MSIMessage;
@@ -66,6 +67,7 @@
 typedef struct PCMachineClass PCMachineClass;
 typedef struct PCMCIACardState PCMCIACardState;
 typedef struct PixelFormat PixelFormat;
+typedef struct PostcopyDiscardState PostcopyDiscardState;
 typedef struct PropertyInfo PropertyInfo;
 typedef struct Property Property;
 typedef struct QEMUBH QEMUBH;
@@ -79,6 +81,7 @@
 typedef struct QEMUTimerListGroup QEMUTimerListGroup;
 typedef struct QEMUTimer QEMUTimer;
 typedef struct Range Range;
+typedef struct RAMBlock RAMBlock;
 typedef struct SerialState SerialState;
 typedef struct SHPCDevice SHPCDevice;
 typedef struct SMBusDevice SMBusDevice;
diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
index 17fe300..3f976b4 100644
--- a/include/sysemu/balloon.h
+++ b/include/sysemu/balloon.h
@@ -22,5 +22,7 @@
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 			     QEMUBalloonStatus *stat_func, void *opaque);
 void qemu_remove_balloon_handler(void *opaque);
+bool qemu_balloon_is_inhibited(void);
+void qemu_balloon_inhibit(bool state);
 
 #endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 5cb0f05..f992494 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -70,6 +70,7 @@
 void qemu_devices_reset(void);
 void qemu_system_reset(bool report);
 void qemu_system_guest_panicked(void);
+size_t qemu_target_page_bits(void);
 
 void qemu_add_exit_notifier(Notifier *notify);
 void qemu_remove_exit_notifier(Notifier *notify);
@@ -83,14 +84,52 @@
 
 void qemu_announce_self(void);
 
+/* Subcommands for QEMU_VM_COMMAND */
+enum qemu_vm_cmd {
+    MIG_CMD_INVALID = 0,   /* Must be 0 */
+    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
+    MIG_CMD_PING,              /* Request a PONG on the RP */
+
+    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
+                                      warn we might want to do PC */
+    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
+                                      pages as it's running. */
+    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
+
+    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
+                                      were previously sent during
+                                      precopy but are dirty. */
+    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
+    MIG_CMD_MAX
+};
+
+#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
+
 bool qemu_savevm_state_blocked(Error **errp);
 void qemu_savevm_state_begin(QEMUFile *f,
                              const MigrationParams *params);
 void qemu_savevm_state_header(QEMUFile *f);
-int qemu_savevm_state_iterate(QEMUFile *f);
-void qemu_savevm_state_complete(QEMUFile *f);
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
 void qemu_savevm_state_cleanup(void);
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
+void qemu_savevm_state_complete_postcopy(QEMUFile *f);
+void qemu_savevm_state_complete_precopy(QEMUFile *f);
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+                               uint64_t *res_non_postcopiable,
+                               uint64_t *res_postcopiable);
+void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
+                              uint16_t len, uint8_t *data);
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
+void qemu_savevm_send_open_return_path(QEMUFile *f);
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb);
+void qemu_savevm_send_postcopy_advise(QEMUFile *f);
+void qemu_savevm_send_postcopy_listen(QEMUFile *f);
+void qemu_savevm_send_postcopy_run(QEMUFile *f);
+
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list);
+
 int qemu_loadvm_state(QEMUFile *f);
 
 typedef enum DisplayType
@@ -133,6 +172,7 @@
 extern bool boot_strict;
 extern uint8_t *boot_splash_filedata;
 extern size_t boot_splash_filedata_size;
+extern bool enable_mlock;
 extern uint8_t qemu_extra_params_fw[2];
 extern QEMUClockType rtc_clock;
 extern const char *mem_path;
diff --git a/kvm-all.c b/kvm-all.c
index 1bc1273..de3c8c4 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1461,7 +1461,6 @@
      * page size for the system though.
      */
     assert(TARGET_PAGE_SIZE <= getpagesize());
-    page_size_init();
 
     s->sigmask_len = 8;
 
diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h
new file mode 100644
index 0000000..9057d7a
--- /dev/null
+++ b/linux-headers/linux/userfaultfd.h
@@ -0,0 +1,167 @@
+/*
+ *  include/linux/userfaultfd.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ *			      UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS				\
+	((__u64)1 << _UFFDIO_REGISTER |		\
+	 (__u64)1 << _UFFDIO_UNREGISTER |	\
+	 (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS			\
+	((__u64)1 << _UFFDIO_WAKE |		\
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F.  UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER		(0x00)
+#define _UFFDIO_UNREGISTER		(0x01)
+#define _UFFDIO_WAKE			(0x02)
+#define _UFFDIO_COPY			(0x03)
+#define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_API			(0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API		_IOWR(UFFDIO, _UFFDIO_API,	\
+				      struct uffdio_api)
+#define UFFDIO_REGISTER		_IOWR(UFFDIO, _UFFDIO_REGISTER, \
+				      struct uffdio_register)
+#define UFFDIO_UNREGISTER	_IOR(UFFDIO, _UFFDIO_UNREGISTER,	\
+				     struct uffdio_range)
+#define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
+				     struct uffdio_range)
+#define UFFDIO_COPY		_IOWR(UFFDIO, _UFFDIO_COPY,	\
+				      struct uffdio_copy)
+#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
+				      struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+	__u8	event;
+
+	__u8	reserved1;
+	__u16	reserved2;
+	__u32	reserved3;
+
+	union {
+		struct {
+			__u64	flags;
+			__u64	address;
+		} pagefault;
+
+		struct {
+			/* unused reserved fields */
+			__u64	reserved1;
+			__u64	reserved2;
+			__u64	reserved3;
+		} reserved;
+	} arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT	0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK		0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+	/* userland asks for an API number and the features to enable */
+	__u64 api;
+	/*
+	 * Kernel answers below with the all available features for
+	 * the API, this notifies userland of which events and/or
+	 * which flags for each event are enabled in the current
+	 * kernel.
+	 *
+	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+	 * are to be considered implicitly always enabled in all kernels as
+	 * long as the uffdio_api.api requested matches UFFD_API.
+	 */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
+#define UFFD_FEATURE_EVENT_FORK			(1<<1)
+#endif
+	__u64 features;
+
+	__u64 ioctls;
+};
+
+struct uffdio_range {
+	__u64 start;
+	__u64 len;
+};
+
+struct uffdio_register {
+	struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+	__u64 mode;
+
+	/*
+	 * kernel answers which ioctl commands are available for the
+	 * range, keep at the end as the last 8 bytes aren't read.
+	 */
+	__u64 ioctls;
+};
+
+struct uffdio_copy {
+	__u64 dst;
+	__u64 src;
+	__u64 len;
+	/*
+	 * There will be a wrprotection flag later that allows to map
+	 * pages wrprotected on the fly. And such a flag will be
+	 * available if the wrprotection ioctl are implemented for the
+	 * range according to the uffdio_register.ioctls.
+	 */
+#define UFFDIO_COPY_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "copy" is written by the ioctl and must be at the end: the
+	 * copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 copy;
+};
+
+struct uffdio_zeropage {
+	struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "zeropage" is written by the ioctl and must be at the end:
+	 * the copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d929e96..0cac6d7 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,7 +1,7 @@
 common-obj-y += migration.o tcp.o
 common-obj-y += vmstate.o
 common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o
-common-obj-y += xbzrle.o
+common-obj-y += xbzrle.o postcopy-ram.o
 
 common-obj-$(CONFIG_RDMA) += rdma.o
 common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
diff --git a/migration/block.c b/migration/block.c
index cf9d9f8..310e2b3 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -748,7 +748,9 @@
     return 0;
 }
 
-static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                               uint64_t *non_postcopiable_pending,
+                               uint64_t *postcopiable_pending)
 {
     /* Estimate pending number of bytes to send */
     uint64_t pending;
@@ -767,7 +769,8 @@
     qemu_mutex_unlock_iothread();
 
     DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
-    return pending;
+    /* We don't do postcopy */
+    *non_postcopiable_pending += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
@@ -876,7 +879,7 @@
     .set_params = block_set_params,
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
-    .save_live_complete = block_save_complete,
+    .save_live_complete_precopy = block_save_complete,
     .save_live_pending = block_save_pending,
     .load_state = block_load,
     .cleanup = block_migration_cleanup,
diff --git a/migration/migration.c b/migration/migration.c
index f99d3ea..c5c977e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -21,15 +21,18 @@
 #include "sysemu/sysemu.h"
 #include "block/block.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/util.h"
 #include "qemu/sockets.h"
 #include "qemu/rcu.h"
 #include "migration/block.h"
+#include "migration/postcopy-ram.h"
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
-#include "qapi/util.h"
 #include "qapi-event.h"
 #include "qom/cpu.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
 
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
@@ -57,6 +60,13 @@
 
 static bool deferred_incoming;
 
+/*
+ * Current state of incoming postcopy; note this is not part of
+ * MigrationIncomingState since its state is used during cleanup
+ * at the end as MIS is being freed.
+ */
+static PostcopyState incoming_postcopy_state;
+
 /* When we add fault tolerance, we could have several
    migrations at once.  For now we don't need to add
    dynamic creation of migration */
@@ -64,6 +74,7 @@
 /* For outgoing */
 MigrationState *migrate_get_current(void)
 {
+    static bool once;
     static MigrationState current_migration = {
         .state = MIGRATION_STATUS_NONE,
         .bandwidth_limit = MAX_THROTTLE,
@@ -81,6 +92,10 @@
                 DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT,
     };
 
+    if (!once) {
+        qemu_mutex_init(&current_migration.src_page_req_mutex);
+        once = true;
+    }
     return &current_migration;
 }
 
@@ -95,14 +110,17 @@
 MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
 {
     mis_current = g_new0(MigrationIncomingState, 1);
-    mis_current->file = f;
+    mis_current->from_src_file = f;
     QLIST_INIT(&mis_current->loadvm_handlers);
+    qemu_mutex_init(&mis_current->rp_mutex);
+    qemu_event_init(&mis_current->main_thread_load_event, false);
 
     return mis_current;
 }
 
 void migration_incoming_state_destroy(void)
 {
+    qemu_event_destroy(&mis_current->main_thread_load_event);
     loadvm_free_handlers(mis_current);
     g_free(mis_current);
     mis_current = NULL;
@@ -248,6 +266,35 @@
     deferred_incoming = true;
 }
 
+/* Request a range of pages from the source VM at the given
+ * start address.
+ *   rbname: Name of the RAMBlock to request the page in, if NULL it's the same
+ *           as the last request (a name must have been given previously)
+ *   Start: Address offset within the RB
+ *   Len: Length in bytes required - must be a multiple of pagesize
+ */
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
+                               ram_addr_t start, size_t len)
+{
+    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 255 */
+    size_t msglen = 12; /* start + len */
+
+    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
+    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
+
+    if (rbname) {
+        int rbname_len = strlen(rbname);
+        assert(rbname_len < 256);
+
+        bufc[msglen++] = rbname_len;
+        memcpy(bufc + msglen, rbname, rbname_len);
+        msglen += rbname_len;
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
+    } else {
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
+    }
+}
+
 void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
     const char *p;
@@ -278,12 +325,37 @@
 {
     QEMUFile *f = opaque;
     Error *local_err = NULL;
+    MigrationIncomingState *mis;
+    PostcopyState ps;
     int ret;
 
-    migration_incoming_state_new(f);
+    mis = migration_incoming_state_new(f);
+    postcopy_state_set(POSTCOPY_INCOMING_NONE);
     migrate_generate_event(MIGRATION_STATUS_ACTIVE);
+
     ret = qemu_loadvm_state(f);
 
+    ps = postcopy_state_get();
+    trace_process_incoming_migration_co_end(ret, ps);
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        if (ps == POSTCOPY_INCOMING_ADVISE) {
+            /*
+             * Where a migration had postcopy enabled (and thus went to advise)
+             * but managed to complete within the precopy period, we can use
+             * the normal exit.
+             */
+            postcopy_ram_incoming_cleanup(mis);
+        } else if (ret >= 0) {
+            /*
+             * Postcopy was started, cleanup should happen at the end of the
+             * postcopy thread.
+             */
+            trace_process_incoming_migration_co_postcopy_end_main();
+            return;
+        }
+        /* Else if something went wrong then just fall out of the normal exit */
+    }
+
     qemu_fclose(f);
     free_xbzrle_decoded_buf();
     migration_incoming_state_destroy();
@@ -344,6 +416,50 @@
     qemu_coroutine_enter(co, f);
 }
 
+/*
+ * Send a message on the return channel back to the source
+ * of the migration.
+ */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+                             enum mig_rp_message_type message_type,
+                             uint16_t len, void *data)
+{
+    trace_migrate_send_rp_message((int)message_type, len);
+    qemu_mutex_lock(&mis->rp_mutex);
+    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
+    qemu_put_be16(mis->to_src_file, len);
+    qemu_put_buffer(mis->to_src_file, data, len);
+    qemu_fflush(mis->to_src_file);
+    qemu_mutex_unlock(&mis->rp_mutex);
+}
+
+/*
+ * Send a 'SHUT' message on the return channel with the given value
+ * to indicate that we've finished with the RP.  Non-0 value indicates
+ * error.
+ */
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t buf;
+
+    buf = cpu_to_be32(value);
+    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
+}
+
+/*
+ * Send a 'PONG' message on the return channel with the given value
+ * (normally in response to a 'PING')
+ */
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t buf;
+
+    buf = cpu_to_be32(value);
+    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
+}
+
 /* amount of nanoseconds we are willing to wait for migration to be down.
  * the choice of nanoseconds is because it is the maximum resolution that
  * get_clock() can achieve. It is an internal measure. All user-visible
@@ -399,6 +515,24 @@
     return params;
 }
 
+/*
+ * Return true if we're already in the middle of a migration
+ * (i.e. any of the active or setup states)
+ */
+static bool migration_is_setup_or_active(int state)
+{
+    switch (state) {
+    case MIGRATION_STATUS_ACTIVE:
+    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+    case MIGRATION_STATUS_SETUP:
+        return true;
+
+    default:
+        return false;
+
+    }
+}
+
 static void get_xbzrle_cache_stats(MigrationInfo *info)
 {
     if (migrate_use_xbzrle()) {
@@ -465,6 +599,39 @@
 
         get_xbzrle_cache_stats(info);
         break;
+    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+        /* Mostly the same as active; TODO add some postcopy stats */
+        info->has_status = true;
+        info->has_total_time = true;
+        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+            - s->total_time;
+        info->has_expected_downtime = true;
+        info->expected_downtime = s->expected_downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
+
+        info->has_ram = true;
+        info->ram = g_malloc0(sizeof(*info->ram));
+        info->ram->transferred = ram_bytes_transferred();
+        info->ram->remaining = ram_bytes_remaining();
+        info->ram->total = ram_bytes_total();
+        info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
+        info->ram->normal = norm_mig_pages_transferred();
+        info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
+
+        if (blk_mig_active()) {
+            info->has_disk = true;
+            info->disk = g_malloc0(sizeof(*info->disk));
+            info->disk->transferred = blk_mig_bytes_transferred();
+            info->disk->remaining = blk_mig_bytes_remaining();
+            info->disk->total = blk_mig_bytes_total();
+        }
+
+        get_xbzrle_cache_stats(info);
+        break;
     case MIGRATION_STATUS_COMPLETED:
         get_xbzrle_cache_stats(info);
 
@@ -506,8 +673,7 @@
     MigrationState *s = migrate_get_current();
     MigrationCapabilityStatusList *cap;
 
-    if (s->state == MIGRATION_STATUS_ACTIVE ||
-        s->state == MIGRATION_STATUS_SETUP) {
+    if (migration_is_setup_or_active(s->state)) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -515,6 +681,20 @@
     for (cap = params; cap; cap = cap->next) {
         s->enabled_capabilities[cap->value->capability] = cap->value->state;
     }
+
+    if (migrate_postcopy_ram()) {
+        if (migrate_use_compression()) {
+            /* The decompression threads asynchronously write into RAM
+             * rather than use the atomic copies needed to avoid
+             * userfaulting.  It should be possible to fix the decompression
+             * threads for compatibility in future.
+             */
+            error_report("Postcopy is not currently compatible with "
+                         "compression");
+            s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] =
+                false;
+        }
+    }
 }
 
 void qmp_migrate_set_parameters(bool has_compress_level,
@@ -583,6 +763,28 @@
     }
 }
 
+void qmp_migrate_start_postcopy(Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (!migrate_postcopy_ram()) {
+        error_setg(errp, "Enable postcopy with migrate_set_capability before"
+                         " the start of migration");
+        return;
+    }
+
+    if (s->state == MIGRATION_STATUS_NONE) {
+        error_setg(errp, "Postcopy must be started after migration has been"
+                         " started");
+        return;
+    }
+    /*
+     * we don't error if migration has finished since that would be racy
+     * with issuing this command.
+     */
+    atomic_set(&s->start_postcopy, true);
+}
+
 /* shared migration helpers */
 
 static void migrate_set_state(MigrationState *s, int old_state, int new_state)
@@ -600,10 +802,15 @@
     qemu_bh_delete(s->cleanup_bh);
     s->cleanup_bh = NULL;
 
+    flush_page_queue(s);
+
     if (s->file) {
         trace_migrate_fd_cleanup();
         qemu_mutex_unlock_iothread();
-        qemu_thread_join(&s->thread);
+        if (s->migration_thread_running) {
+            qemu_thread_join(&s->thread);
+            s->migration_thread_running = false;
+        }
         qemu_mutex_lock_iothread();
 
         migrate_compress_threads_join();
@@ -611,7 +818,8 @@
         s->file = NULL;
     }
 
-    assert(s->state != MIGRATION_STATUS_ACTIVE);
+    assert((s->state != MIGRATION_STATUS_ACTIVE) &&
+           (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));
 
     if (s->state == MIGRATION_STATUS_CANCELLING) {
         migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
@@ -635,10 +843,14 @@
     QEMUFile *f = migrate_get_current()->file;
     trace_migrate_fd_cancel();
 
+    if (s->rp_state.from_dst_file) {
+        /* shutdown the rp socket, causing the rp thread to shut down */
+        qemu_file_shutdown(s->rp_state.from_dst_file);
+    }
+
     do {
         old_state = s->state;
-        if (old_state != MIGRATION_STATUS_SETUP &&
-            old_state != MIGRATION_STATUS_ACTIVE) {
+        if (!migration_is_setup_or_active(old_state)) {
             break;
         }
         migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING);
@@ -682,7 +894,12 @@
             s->state == MIGRATION_STATUS_FAILED);
 }
 
-static MigrationState *migrate_init(const MigrationParams *params)
+bool migration_in_postcopy(MigrationState *s)
+{
+    return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+}
+
+MigrationState *migrate_init(const MigrationParams *params)
 {
     MigrationState *s = migrate_get_current();
     int64_t bandwidth_limit = s->bandwidth_limit;
@@ -719,6 +936,8 @@
     s->bandwidth_limit = bandwidth_limit;
     migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
 
+    QSIMPLEQ_INIT(&s->src_page_requests);
+
     s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     return s;
 }
@@ -770,8 +989,7 @@
     params.blk = has_blk && blk;
     params.shared = has_inc && inc;
 
-    if (s->state == MIGRATION_STATUS_ACTIVE ||
-        s->state == MIGRATION_STATUS_SETUP ||
+    if (migration_is_setup_or_active(s->state) ||
         s->state == MIGRATION_STATUS_CANCELLING) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
@@ -890,6 +1108,15 @@
     max_downtime = (uint64_t)value;
 }
 
+bool migrate_postcopy_ram(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM];
+}
+
 bool migrate_auto_converge(void)
 {
     MigrationState *s;
@@ -971,36 +1198,376 @@
     return s->xbzrle_cache_size;
 }
 
+/* migration thread support */
+/*
+ * Something bad happened to the RP stream, mark an error
+ * The caller shall print or trace something to indicate why
+ */
+static void mark_source_rp_bad(MigrationState *s)
+{
+    s->rp_state.error = true;
+}
+
+static struct rp_cmd_args {
+    ssize_t     len; /* -1 = variable */
+    const char *name;
+} rp_cmd_args[] = {
+    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
+    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
+    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
+    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
+    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
+    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
+};
+
+/*
+ * Process a request for pages received on the return path.
+ * We're allowed to send more than requested (e.g. to round to our page size)
+ * and we don't need to send pages that have already been sent.
+ */
+static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
+                                       ram_addr_t start, size_t len)
+{
+    long our_host_ps = getpagesize();
+
+    trace_migrate_handle_rp_req_pages(rbname, start, len);
+
+    /*
+     * Since we currently insist on matching page sizes, just sanity check
+     * we're being asked for whole host pages.
+     */
+    if (start & (our_host_ps-1) ||
+       (len & (our_host_ps-1))) {
+        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
+                     " len: %zd", __func__, start, len);
+        mark_source_rp_bad(ms);
+        return;
+    }
+
+    if (ram_save_queue_pages(ms, rbname, start, len)) {
+        mark_source_rp_bad(ms);
+    }
+}
+
+/*
+ * Handles messages sent on the return path towards the source VM
+ *
+ */
+static void *source_return_path_thread(void *opaque)
+{
+    MigrationState *ms = opaque;
+    QEMUFile *rp = ms->rp_state.from_dst_file;
+    uint16_t header_len, header_type;
+    const int max_len = 512;
+    uint8_t buf[max_len];
+    uint32_t tmp32, sibling_error;
+    ram_addr_t start = 0; /* =0 to silence warning */
+    size_t  len = 0, expected_len;
+    int res;
+
+    trace_source_return_path_thread_entry();
+    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
+           migration_is_setup_or_active(ms->state)) {
+        trace_source_return_path_thread_loop_top();
+        header_type = qemu_get_be16(rp);
+        header_len = qemu_get_be16(rp);
+
+        if (header_type >= MIG_RP_MSG_MAX ||
+            header_type == MIG_RP_MSG_INVALID) {
+            error_report("RP: Received invalid message 0x%04x length 0x%04x",
+                    header_type, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        if ((rp_cmd_args[header_type].len != -1 &&
+            header_len != rp_cmd_args[header_type].len) ||
+            header_len > max_len) {
+            error_report("RP: Received '%s' message (0x%04x) with"
+                    "incorrect length %d expecting %zu",
+                    rp_cmd_args[header_type].name, header_type, header_len,
+                    (size_t)rp_cmd_args[header_type].len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* We know we've got a valid header by this point */
+        res = qemu_get_buffer(rp, buf, header_len);
+        if (res != header_len) {
+            error_report("RP: Failed reading data for message 0x%04x"
+                         " read %d expected %d",
+                         header_type, res, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* OK, we have the message and the data */
+        switch (header_type) {
+        case MIG_RP_MSG_SHUT:
+            sibling_error = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_shut(sibling_error);
+            if (sibling_error) {
+                error_report("RP: Sibling indicated error %d", sibling_error);
+                mark_source_rp_bad(ms);
+            }
+            /*
+             * We'll let the main thread deal with closing the RP;
+             * we could do a shutdown(2) on it, but we're the only user
+             * anyway, so there's nothing gained.
+             */
+            goto out;
+
+        case MIG_RP_MSG_PONG:
+            tmp32 = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_pong(tmp32);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES:
+            start = be64_to_cpup((uint64_t *)buf);
+            len = be32_to_cpup((uint32_t *)(buf + 8));
+            migrate_handle_rp_req_pages(ms, NULL, start, len);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES_ID:
+            expected_len = 12 + 1; /* header + termination */
+
+            if (header_len >= expected_len) {
+                start = be64_to_cpup((uint64_t *)buf);
+                len = be32_to_cpup((uint32_t *)(buf + 8));
+                /* Now we expect an idstr */
+                tmp32 = buf[12]; /* Length of the following idstr */
+                buf[13 + tmp32] = '\0';
+                expected_len += tmp32;
+            }
+            if (header_len != expected_len) {
+                error_report("RP: Req_Page_id with length %d expecting %zd",
+                        header_len, expected_len);
+                mark_source_rp_bad(ms);
+                goto out;
+            }
+            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
+            break;
+
+        default:
+            break;
+        }
+    }
+    if (rp && qemu_file_get_error(rp)) {
+        trace_source_return_path_thread_bad_end();
+        mark_source_rp_bad(ms);
+    }
+
+    trace_source_return_path_thread_end();
+out:
+    ms->rp_state.from_dst_file = NULL;
+    qemu_fclose(rp);
+    return NULL;
+}
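
The sender for these messages lives in the savevm code rather than in this hunk. Purely to illustrate the framing the loop above parses (big-endian 16-bit type, big-endian 16-bit payload length, then the payload), a destination-side REQ_PAGES request might be built roughly as follows; the helper name and its arguments are illustrative, only the qemu_put_be*/qemu_put_buffer/qemu_fflush calls are existing APIs:

    /* Sketch only: emit one MIG_RP_MSG_REQ_PAGES message on the return path;
     * 'f' is the destination's QEMUFile back to the source. */
    static void sketch_send_rp_req_pages(QEMUFile *f, uint64_t start,
                                         uint32_t len)
    {
        qemu_put_be16(f, MIG_RP_MSG_REQ_PAGES); /* header_type             */
        qemu_put_be16(f, 12);                   /* header_len, per table   */
        qemu_put_be64(f, start);                /* read back at buf[0..7]  */
        qemu_put_be32(f, len);                  /* read back at buf[8..11] */
        qemu_fflush(f);
    }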
+
+static int open_return_path_on_source(MigrationState *ms)
+{
+
+    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->file);
+    if (!ms->rp_state.from_dst_file) {
+        return -1;
+    }
+
+    trace_open_return_path_on_source();
+    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
+                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
+
+    trace_open_return_path_on_source_continue();
+
+    return 0;
+}
+
+/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
+static int await_return_path_close_on_source(MigrationState *ms)
+{
+    /*
+     * If this is a normal exit then the destination will send a SHUT and the
+     * rp_thread will exit, however if there's an error we need to cause
+     * it to exit.
+     */
+    if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) {
+        /*
+         * shutdown(2), if we have it, will cause it to unblock if it's stuck
+         * waiting for the destination.
+         */
+        qemu_file_shutdown(ms->rp_state.from_dst_file);
+        mark_source_rp_bad(ms);
+    }
+    trace_await_return_path_close_on_source_joining();
+    qemu_thread_join(&ms->rp_state.rp_thread);
+    trace_await_return_path_close_on_source_close();
+    return ms->rp_state.error;
+}
+
+/*
+ * Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ */
+static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+{
+    int ret;
+    const QEMUSizedBuffer *qsb;
+    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    migrate_set_state(ms, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+    trace_postcopy_start();
+    qemu_mutex_lock_iothread();
+    trace_postcopy_start_set_run();
+
+    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+    *old_vm_running = runstate_is_running();
+    global_state_store();
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /*
+     * In 'finish migrate' state and with the io-lock held, everything
+     * should be quiet, but we've potentially still got dirty pages and we
+     * need to tell the destination to throw away any pages it's already
+     * received that are dirty
+     */
+    if (ram_postcopy_send_discard_bitmap(ms)) {
+        error_report("postcopy send discard bitmap failed");
+        goto fail;
+    }
+
+    /*
+     * Send the rest of the state - note that devices doing postcopy
+     * will notice we're in POSTCOPY_ACTIVE and not actually
+     * wrap their state up here
+     */
+    qemu_file_set_rate_limit(ms->file, INT64_MAX);
+    /* Ping just for debugging, helps line traces up */
+    qemu_savevm_send_ping(ms->file, 2);
+
+    /*
+     * While loading the device state we may trigger page transfer
+     * requests and the fd must be free to process those, and thus
+     * the destination must read the whole device state off the fd before
+     * it starts processing it.  Unfortunately the ad-hoc migration format
+     * doesn't allow the destination to know the size to read without fully
+     * parsing it through each devices load-state code (especially the open
+     * coded devices that use get/put).
+     * So we wrap the device state up in a package with a length at the start;
+     * to do this we use a qemu_buf to hold the whole of the device state.
+     */
+    QEMUFile *fb = qemu_bufopen("w", NULL);
+    if (!fb) {
+        error_report("Failed to create buffered file");
+        goto fail;
+    }
+
+    /*
+     * Make sure the receiver can get incoming pages before we send the rest
+     * of the state
+     */
+    qemu_savevm_send_postcopy_listen(fb);
+
+    qemu_savevm_state_complete_precopy(fb);
+    qemu_savevm_send_ping(fb, 3);
+
+    qemu_savevm_send_postcopy_run(fb);
+
+    /* End of the data going into the package */
+    qsb = qemu_buf_get(fb);
+
+    /* Now send that blob */
+    if (qemu_savevm_send_packaged(ms->file, qsb)) {
+        goto fail_closefb;
+    }
+    qemu_fclose(fb);
+    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
+
+    qemu_mutex_unlock_iothread();
+
+    /*
+     * Although this ping is just for debug, it could potentially be
+     * used for getting a better measurement of downtime at the source.
+     */
+    qemu_savevm_send_ping(ms->file, 4);
+
+    ret = qemu_file_get_error(ms->file);
+    if (ret) {
+        error_report("postcopy_start: Migration stream errored");
+        migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                              MIGRATION_STATUS_FAILED);
+    }
+
+    return ret;
+
+fail_closefb:
+    qemu_fclose(fb);
+fail:
+    migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                          MIGRATION_STATUS_FAILED);
+    qemu_mutex_unlock_iothread();
+    return -1;
+}
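
On the destination, the matching handler (in the savevm code, not this patch) only has to read the length that qemu_savevm_send_packaged() put on the front, pull the whole blob off the stream, and replay it through a buffer-backed QEMUFile. A rough sketch of that read side, assuming a 32-bit big-endian length prefix and the qsb_create() helper from the QEMUSizedBuffer code; error handling is omitted:

    /* Sketch: unpack a length-prefixed device-state package (assumptions
     * as above; not the actual handler). */
    uint32_t length = qemu_get_be32(mis->from_src_file);
    uint8_t *buffer = g_malloc(length);

    qemu_get_buffer(mis->from_src_file, buffer, length);
    QEMUSizedBuffer *qsb = qsb_create(buffer, length);
    QEMUFile *packf = qemu_bufopen("r", qsb);
    /* ... run the normal vmstate load loop over packf, then ... */
    qemu_fclose(packf);
    g_free(buffer);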
+
 /**
  * migration_completion: Used by migration_thread when there's not much left.
  *   The caller 'breaks' the loop when this returns.
  *
  * @s: Current migration state
+ * @current_active_state: The migration state we expect to be in
  * @*old_vm_running: Pointer to old_vm_running flag
  * @*start_time: Pointer to time to update
  */
-static void migration_completion(MigrationState *s, bool *old_vm_running,
+static void migration_completion(MigrationState *s, int current_active_state,
+                                 bool *old_vm_running,
                                  int64_t *start_time)
 {
     int ret;
 
-    qemu_mutex_lock_iothread();
-    *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-    *old_vm_running = runstate_is_running();
+    if (s->state == MIGRATION_STATUS_ACTIVE) {
+        qemu_mutex_lock_iothread();
+        *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+        *old_vm_running = runstate_is_running();
+        ret = global_state_store();
 
-    ret = global_state_store();
-    if (!ret) {
-        ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-        if (ret >= 0) {
-            qemu_file_set_rate_limit(s->file, INT64_MAX);
-            qemu_savevm_state_complete(s->file);
+        if (!ret) {
+            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+            if (ret >= 0) {
+                qemu_file_set_rate_limit(s->file, INT64_MAX);
+                qemu_savevm_state_complete_precopy(s->file);
+            }
         }
-    }
-    qemu_mutex_unlock_iothread();
+        qemu_mutex_unlock_iothread();
 
-    if (ret < 0) {
-        goto fail;
+        if (ret < 0) {
+            goto fail;
+        }
+    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+        trace_migration_completion_postcopy_end();
+
+        qemu_savevm_state_complete_postcopy(s->file);
+        trace_migration_completion_postcopy_end_after_complete();
+    }
+
+    /*
+     * If rp was opened we must clean up the thread before
+     * cleaning everything else up (since if there are no failures
+     * it will wait for the destination to send its status in
+     * a SHUT command).
+     * Postcopy opens rp if enabled (even if it's not activated)
+     */
+    if (migrate_postcopy_ram()) {
+        int rp_error;
+        trace_migration_completion_postcopy_end_before_rp();
+        rp_error = await_return_path_close_on_source(s);
+        trace_migration_completion_postcopy_end_after_rp(rp_error);
+        if (rp_error) {
+            goto fail;
+        }
     }
 
     if (qemu_file_get_error(s->file)) {
@@ -1008,18 +1575,21 @@
         goto fail;
     }
 
-    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COMPLETED);
+    migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED);
     return;
 
 fail:
-    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED);
+    migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
 }
 
-/* migration thread support */
-
+/*
+ * Master migration thread on the source VM.
+ * It drives the migration and pumps the data down the outgoing channel.
+ */
 static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
+    /* Used by the bandwidth calcs, updated later */
     int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     int64_t initial_bytes = 0;
@@ -1027,34 +1597,79 @@
     int64_t start_time = initial_time;
     int64_t end_time;
     bool old_vm_running = false;
+    bool entered_postcopy = false;
+    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+    enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
 
     rcu_register_thread();
 
     qemu_savevm_state_header(s->file);
+
+    if (migrate_postcopy_ram()) {
+        /* Now tell the dest that it should open its end so it can reply */
+        qemu_savevm_send_open_return_path(s->file);
+
+        /* And do a ping that will make stuff easier to debug */
+        qemu_savevm_send_ping(s->file, 1);
+
+        /*
+         * Tell the destination that we *might* want to do postcopy later;
+         * if the other end can't do postcopy it should fail now, nice and
+         * early.
+         */
+        qemu_savevm_send_postcopy_advise(s->file);
+    }
+
     qemu_savevm_state_begin(s->file, &s->params);
 
     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+    current_active_state = MIGRATION_STATUS_ACTIVE;
     migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE);
 
-    while (s->state == MIGRATION_STATUS_ACTIVE) {
+    trace_migration_thread_setup_complete();
+
+    while (s->state == MIGRATION_STATUS_ACTIVE ||
+           s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
         int64_t current_time;
         uint64_t pending_size;
 
         if (!qemu_file_rate_limit(s->file)) {
-            pending_size = qemu_savevm_state_pending(s->file, max_size);
-            trace_migrate_pending(pending_size, max_size);
+            uint64_t pend_post, pend_nonpost;
+
+            qemu_savevm_state_pending(s->file, max_size, &pend_nonpost,
+                                      &pend_post);
+            pending_size = pend_nonpost + pend_post;
+            trace_migrate_pending(pending_size, max_size,
+                                  pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
-                qemu_savevm_state_iterate(s->file);
+                /* Still a significant amount to transfer */
+
+                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+                if (migrate_postcopy_ram() &&
+                    s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+                    pend_nonpost <= max_size &&
+                    atomic_read(&s->start_postcopy)) {
+
+                    if (!postcopy_start(s, &old_vm_running)) {
+                        current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+                        entered_postcopy = true;
+                    }
+
+                    continue;
+                }
+                /* Just another iteration step */
+                qemu_savevm_state_iterate(s->file, entered_postcopy);
             } else {
                 trace_migration_thread_low_pending(pending_size);
-                migration_completion(s, &old_vm_running, &start_time);
+                migration_completion(s, current_active_state,
+                                     &old_vm_running, &start_time);
                 break;
             }
         }
 
         if (qemu_file_get_error(s->file)) {
-            migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
-                              MIGRATION_STATUS_FAILED);
+            migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
+            trace_migration_thread_file_err();
             break;
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1085,6 +1700,7 @@
         }
     }
 
+    trace_migration_thread_after_loop();
     /* If we enabled cpu throttling for auto-converge, turn it off. */
     cpu_throttle_stop();
     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1094,14 +1710,16 @@
     if (s->state == MIGRATION_STATUS_COMPLETED) {
         uint64_t transferred_bytes = qemu_ftell(s->file);
         s->total_time = end_time - s->total_time;
-        s->downtime = end_time - start_time;
+        if (!entered_postcopy) {
+            s->downtime = end_time - start_time;
+        }
         if (s->total_time) {
             s->mbps = (((double) transferred_bytes * 8.0) /
                        ((double) s->total_time)) / 1000;
         }
         runstate_set(RUN_STATE_POSTMIGRATE);
     } else {
-        if (old_vm_running) {
+        if (old_vm_running && !entered_postcopy) {
             vm_start();
         }
     }
@@ -1124,7 +1742,34 @@
     /* Notify before starting migration thread */
     notifier_list_notify(&migration_state_notifiers, s);
 
+    /*
+     * Open the return path; currently for postcopy but other things might
+     * also want it.
+     */
+    if (migrate_postcopy_ram()) {
+        if (open_return_path_on_source(s)) {
+            error_report("Unable to open return-path for postcopy");
+            migrate_set_state(s, MIGRATION_STATUS_SETUP,
+                              MIGRATION_STATUS_FAILED);
+            migrate_fd_cleanup(s);
+            return;
+        }
+    }
+
     migrate_compress_threads_create();
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
+    s->migration_thread_running = true;
 }
+
+PostcopyState  postcopy_state_get(void)
+{
+    return atomic_mb_read(&incoming_postcopy_state);
+}
+
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state)
+{
+    return atomic_xchg(&incoming_postcopy_state, new_state);
+}
+
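These two accessors are the whole interface to the incoming-side postcopy state machine: postcopy_state_set() advances the state and hands back the previous value, so a caller can check that the source issued its commands in the expected order. A minimal sketch of that pattern; POSTCOPY_INCOMING_ADVISE is assumed here for illustration, only DISCARD and END appear elsewhere in this patch:

    /* Sketch: advance the state and sanity-check the previous step. */
    PostcopyState old_state = postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    if (old_state != POSTCOPY_INCOMING_ADVISE) {
        error_report("Unexpected postcopy state %d", old_state);
        /* treat as a stream error and fail the incoming migration */
    }
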
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
new file mode 100644
index 0000000..22d6b18
--- /dev/null
+++ b/migration/postcopy-ram.c
@@ -0,0 +1,767 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Dave Gilbert  <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * Postcopy is a migration technique where the execution flips from the
+ * source to the destination before all the data has been copied.
+ */
+
+#include <glib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/balloon.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+/* Arbitrary limit on size of each discard command,
+ * keeps them around ~200 bytes
+ */
+#define MAX_DISCARDS_PER_COMMAND 12
+
+struct PostcopyDiscardState {
+    const char *ramblock_name;
+    uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
+    uint16_t cur_entry;
+    /*
+     * Start and length of a discard range (bytes)
+     */
+    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
+    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
+    unsigned int nsentwords;
+    unsigned int nsentcmds;
+};
+
+/* Postcopy needs to detect accesses to pages that haven't yet been copied
+ * across, and efficiently map new pages in; the techniques for doing this
+ * are target OS specific.
+ */
+#if defined(__linux__)
+
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <asm/types.h> /* for __u64 */
+#endif
+
+#if defined(__linux__) && defined(__NR_userfaultfd)
+#include <linux/userfaultfd.h>
+
+static bool ufd_version_check(int ufd)
+{
+    struct uffdio_api api_struct;
+    uint64_t ioctl_mask;
+
+    api_struct.api = UFFD_API;
+    api_struct.features = 0;
+    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
+        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
+                     strerror(errno));
+        return false;
+    }
+
+    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
+                 (__u64)1 << _UFFDIO_UNREGISTER;
+    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+        error_report("Missing userfault features: %" PRIx64,
+                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Note: This has the side effect of munlock'ing all of RAM; that's
+ * normally fine since, if the postcopy succeeds, mlock is turned back on
+ * at the end.
+ */
+bool postcopy_ram_supported_by_host(void)
+{
+    long pagesize = getpagesize();
+    int ufd = -1;
+    bool ret = false; /* Error unless we change it */
+    void *testarea = NULL;
+    struct uffdio_register reg_struct;
+    struct uffdio_range range_struct;
+    uint64_t feature_mask;
+
+    if ((1ul << qemu_target_page_bits()) > pagesize) {
+        error_report("Target page size bigger than host page size");
+        goto out;
+    }
+
+    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    if (ufd == -1) {
+        error_report("%s: userfaultfd not available: %s", __func__,
+                     strerror(errno));
+        goto out;
+    }
+
+    /* Version and features check */
+    if (!ufd_version_check(ufd)) {
+        goto out;
+    }
+
+    /*
+     * userfault and mlock don't go together; we'll put it back later if
+     * it was enabled.
+     */
+    if (munlockall()) {
+        error_report("%s: munlockall: %s", __func__,  strerror(errno));
+        return -1;
+    }
+
+    /*
+     *  We need to check that the ops we need are supported on anon memory
+     *  To do that we need to register a chunk and see the flags that
+     *  are returned.
+     */
+    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                                    MAP_ANONYMOUS, -1, 0);
+    if (testarea == MAP_FAILED) {
+        error_report("%s: Failed to map test area: %s", __func__,
+                     strerror(errno));
+        goto out;
+    }
+    g_assert(((size_t)testarea & (pagesize-1)) == 0);
+
+    reg_struct.range.start = (uintptr_t)testarea;
+    reg_struct.range.len = pagesize;
+    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    range_struct.start = (uintptr_t)testarea;
+    range_struct.len = pagesize;
+    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s userfault unregister: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    feature_mask = (__u64)1 << _UFFDIO_WAKE |
+                   (__u64)1 << _UFFDIO_COPY |
+                   (__u64)1 << _UFFDIO_ZEROPAGE;
+    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
+        error_report("Missing userfault map features: %" PRIx64,
+                     (uint64_t)(~reg_struct.ioctls & feature_mask));
+        goto out;
+    }
+
+    /* Success! */
+    ret = true;
+out:
+    if (testarea) {
+        munmap(testarea, pagesize);
+    }
+    if (ufd != -1) {
+        close(ufd);
+    }
+    return ret;
+}
+
+/**
+ * postcopy_ram_discard_range: Discard a range of memory.
+ * We can assume that postcopy_ram_supported_by_host() returned true if
+ * we've been called.
+ *
+ * @mis: Current incoming migration state.
+ * @start, @length: range of memory to discard.
+ *
+ * returns: 0 on success.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length)
+{
+    trace_postcopy_ram_discard_range(start, length);
+    if (madvise(start, length, MADV_DONTNEED)) {
+        error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Set up an area of RAM so that it *can* be used for postcopy later; this
+ * must be done right at the start prior to pre-copy.
+ * opaque should be the MIS.
+ */
+static int init_range(const char *block_name, void *host_addr,
+                      ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+
+    trace_postcopy_init_range(block_name, host_addr, offset, length);
+
+    /*
+     * We need the whole of RAM to be truly empty for postcopy, so things
+     * like ROMs and any data tables built during init must be zero'd
+     * - we're going to get the copy from the source anyway.
+     * (Precopy will just overwrite this data, so doesn't need the discard)
+     */
+    if (postcopy_ram_discard_range(mis, host_addr, length)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of migration, undo the effects of init_range
+ * opaque should be the MIS.
+ */
+static int cleanup_range(const char *block_name, void *host_addr,
+                        ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffdio_range range_struct;
+    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
+
+    /*
+     * We turned off hugepages for the precopy stage with postcopy enabled;
+     * we can turn them back on now.
+     */
+    if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
+        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    /*
+     * We can also turn off userfault now since we should have all the
+     * pages.   It can be useful to leave it on to debug postcopy
+     * if you're not sure it's always getting every page.
+     */
+    range_struct.start = (uintptr_t)host_addr;
+    range_struct.len = length;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s: userfault unregister %s", __func__, strerror(errno));
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * Called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    if (qemu_ram_foreach_block(init_range, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    trace_postcopy_ram_incoming_cleanup_entry();
+
+    if (mis->have_fault_thread) {
+        uint64_t tmp64;
+
+        if (qemu_ram_foreach_block(cleanup_range, mis)) {
+            return -1;
+        }
+        /*
+         * Tell the fault_thread to exit; it's an eventfd that should
+         * currently be at 0, and we're going to increment it to 1
+         */
+        tmp64 = 1;
+        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
+            trace_postcopy_ram_incoming_cleanup_join();
+            qemu_thread_join(&mis->fault_thread);
+        } else {
+            /* Not much we can do here, but may as well report it */
+            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
+                         strerror(errno));
+        }
+        trace_postcopy_ram_incoming_cleanup_closeuf();
+        close(mis->userfault_fd);
+        close(mis->userfault_quit_fd);
+        mis->have_fault_thread = false;
+    }
+
+    qemu_balloon_inhibit(false);
+
+    if (enable_mlock) {
+        if (os_mlock() < 0) {
+            error_report("mlock: %s", strerror(errno));
+            /*
+             * It doesn't feel right to fail at this point, we have a valid
+             * VM state.
+             */
+        }
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_END);
+    migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
+
+    if (mis->postcopy_tmp_page) {
+        munmap(mis->postcopy_tmp_page, getpagesize());
+        mis->postcopy_tmp_page = NULL;
+    }
+    trace_postcopy_ram_incoming_cleanup_exit();
+    return 0;
+}
+
+/*
+ * Disable huge pages on an area
+ */
+static int nhp_range(const char *block_name, void *host_addr,
+                    ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    trace_postcopy_nhp_range(block_name, host_addr, offset, length);
+
+    /*
+     * Before we do discards we need to ensure those discards really
+     * do delete areas of the page, even if THP thinks a hugepage would
+     * be a good idea, so force hugepages off.
+     */
+    if (qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE)) {
+        error_report("%s: NOHUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
+ * however, leaving it until after precopy means that most of the precopy
+ * data is still THP'd
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+    if (qemu_ram_foreach_block(nhp_range, mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
+
+    return 0;
+}
+
+/*
+ * Mark the given area of RAM as requiring notification of accesses to
+ * unwritten areas.  Used as a callback on qemu_ram_foreach_block.
+ *   host_addr: Base of area to mark
+ *   offset: Offset in the whole ram arena
+ *   length: Length of the section
+ *   opaque: MigrationIncomingState pointer
+ * Returns 0 on success
+ */
+static int ram_block_enable_notify(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffdio_register reg_struct;
+
+    reg_struct.range.start = (uintptr_t)host_addr;
+    reg_struct.range.len = length;
+    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    /* Now tell our userfault_fd that it's responsible for this area */
+    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Handle faults detected by the USERFAULT markings
+ */
+static void *postcopy_ram_fault_thread(void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffd_msg msg;
+    int ret;
+    size_t hostpagesize = getpagesize();
+    RAMBlock *rb = NULL;
+    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
+
+    trace_postcopy_ram_fault_thread_entry();
+    qemu_sem_post(&mis->fault_thread_sem);
+
+    while (true) {
+        ram_addr_t rb_offset;
+        ram_addr_t in_raspace;
+        struct pollfd pfd[2];
+
+        /*
+         * We're mainly waiting for the kernel to give us a faulting HVA;
+         * however, we can be told to quit via userfault_quit_fd, which is
+         * an eventfd
+         */
+        pfd[0].fd = mis->userfault_fd;
+        pfd[0].events = POLLIN;
+        pfd[0].revents = 0;
+        pfd[1].fd = mis->userfault_quit_fd;
+        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+        pfd[1].revents = 0;
+
+        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+            error_report("%s: userfault poll: %s", __func__, strerror(errno));
+            break;
+        }
+
+        if (pfd[1].revents) {
+            trace_postcopy_ram_fault_thread_quit();
+            break;
+        }
+
+        ret = read(mis->userfault_fd, &msg, sizeof(msg));
+        if (ret != sizeof(msg)) {
+            if (errno == EAGAIN) {
+                /*
+                 * if a wake up happens on the other thread just after
+                 * the poll, there is nothing to read.
+                 */
+                continue;
+            }
+            if (ret < 0) {
+                error_report("%s: Failed to read full userfault message: %s",
+                             __func__, strerror(errno));
+                break;
+            } else {
+                error_report("%s: Read %d bytes from userfaultfd expected %zd",
+                             __func__, ret, sizeof(msg));
+                break; /* Lost alignment, don't know what we'd read next */
+            }
+        }
+        if (msg.event != UFFD_EVENT_PAGEFAULT) {
+            error_report("%s: Read unexpected event %ud from userfaultfd",
+                         __func__, msg.event);
+            continue; /* It's not a page fault, shouldn't happen */
+        }
+
+        rb = qemu_ram_block_from_host(
+                 (void *)(uintptr_t)msg.arg.pagefault.address,
+                 true, &in_raspace, &rb_offset);
+        if (!rb) {
+            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                         PRIx64, (uint64_t)msg.arg.pagefault.address);
+            break;
+        }
+
+        rb_offset &= ~(hostpagesize - 1);
+        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+                                                qemu_ram_get_idstr(rb),
+                                                rb_offset);
+
+        /*
+         * Send the request to the source - we want to request one
+         * of our host page sizes (which is >= TPS)
+         */
+        if (rb != last_rb) {
+            last_rb = rb;
+            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                     rb_offset, hostpagesize);
+        } else {
+            /* Save some space */
+            migrate_send_rp_req_pages(mis, NULL,
+                                     rb_offset, hostpagesize);
+        }
+    }
+    trace_postcopy_ram_fault_thread_exit();
+    return NULL;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+    /* Open the fd for the kernel to give us userfaults */
+    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+    if (mis->userfault_fd == -1) {
+        error_report("%s: Failed to open userfault fd: %s", __func__,
+                     strerror(errno));
+        return -1;
+    }
+
+    /*
+     * Although the host check already tested the API, we need to
+     * do the check again as an ABI handshake on the new fd.
+     */
+    if (!ufd_version_check(mis->userfault_fd)) {
+        return -1;
+    }
+
+    /* Now an eventfd we use to tell the fault-thread to quit */
+    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
+    if (mis->userfault_quit_fd == -1) {
+        error_report("%s: Opening userfault_quit_fd: %s", __func__,
+                     strerror(errno));
+        close(mis->userfault_fd);
+        return -1;
+    }
+
+    qemu_sem_init(&mis->fault_thread_sem, 0);
+    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
+                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->fault_thread_sem);
+    qemu_sem_destroy(&mis->fault_thread_sem);
+    mis->have_fault_thread = true;
+
+    /* Mark so that we get notified of accesses to unwritten areas */
+    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+        return -1;
+    }
+
+    /*
+     * Ballooning can mark pages as absent while we're postcopying
+     * that would cause false userfaults.
+     */
+    qemu_balloon_inhibit(true);
+
+    trace_postcopy_ram_enable_notify();
+
+    return 0;
+}
+
+/*
+ * Place a host page (from) at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+    struct uffdio_copy copy_struct;
+
+    copy_struct.dst = (uint64_t)(uintptr_t)host;
+    copy_struct.src = (uint64_t)(uintptr_t)from;
+    copy_struct.len = getpagesize();
+    copy_struct.mode = 0;
+
+    /* The copy also acks to the kernel, waking the stalled thread up.
+     * TODO: We can inhibit that ack and only do it if it was requested
+     * which would be slightly cheaper, but we'd have to be careful
+     * of the order of updating our page state.
+     */
+    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
+        int e = errno;
+        error_report("%s: %s copy host: %p from: %p",
+                     __func__, strerror(e), host, from);
+
+        return -e;
+    }
+
+    trace_postcopy_place_page(host);
+    return 0;
+}
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+    struct uffdio_zeropage zero_struct;
+
+    zero_struct.range.start = (uint64_t)(uintptr_t)host;
+    zero_struct.range.len = getpagesize();
+    zero_struct.mode = 0;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
+        int e = errno;
+        error_report("%s: %s zero host: %p",
+                     __func__, strerror(e), host);
+
+        return -e;
+    }
+
+    trace_postcopy_place_page_zero(host);
+    return 0;
+}
+
+/*
+ * Returns a target page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * The same address is used repeatedly; postcopy_place_page just takes the
+ * backing page away.
+ * Returns: Pointer to allocated page
+ *
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+    if (!mis->postcopy_tmp_page) {
+        mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
+                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                             MAP_ANONYMOUS, -1, 0);
+        if (mis->postcopy_tmp_page == MAP_FAILED) {
+            mis->postcopy_tmp_page = NULL;
+            error_report("%s: %s", __func__, strerror(errno));
+            return NULL;
+        }
+    }
+
+    return mis->postcopy_tmp_page;
+}
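
Tying the last three helpers together: when the requested page data arrives on the incoming stream, the RAM load path (outside this file) stages it in the temporary page and then places it atomically, which is also what wakes the faulting thread. A sketch of that consumer; the helper itself and the 'host' argument (the faulting host address, already resolved from its RAMBlock) are illustrative:

    /* Sketch only: stage one incoming host page and place it. */
    static int sketch_load_postcopy_page(QEMUFile *f,
                                         MigrationIncomingState *mis,
                                         void *host)
    {
        void *tmp = postcopy_get_tmp_page(mis);     /* reused scratch page */

        if (!tmp) {
            return -1;
        }
        qemu_get_buffer(f, tmp, getpagesize());     /* page data off the stream */
        return postcopy_place_page(mis, host, tmp); /* atomic UFFDIO_COPY */
    }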
+
+#else
+/* No target OS support, stubs just fail */
+bool postcopy_ram_supported_by_host(void)
+{
+    error_report("%s: No OS support", __func__);
+    return false;
+}
+
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    error_report("postcopy_ram_incoming_init: No OS support");
+    return -1;
+}
+
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+    assert(0);
+    return -1;
+}
+
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+    assert(0);
+    return NULL;
+}
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/**
+ * postcopy_discard_send_init: Called at the start of each RAMBlock before
+ *   asking to discard individual ranges.
+ *
+ * @ms: The current migration state.
+ * @offset: the bitmap offset of the named RAMBlock in the migration
+ *   bitmap.
+ * @name: RAMBlock that discards will operate on.
+ *
+ * returns: a new PDS.
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 unsigned long offset,
+                                                 const char *name)
+{
+    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
+
+    res->ramblock_name = name;
+    res->offset = offset;
+
+    return res;
+}
+
+/**
+ * postcopy_discard_send_range: Called by the bitmap code for each chunk to
+ *   discard. May send a discard message, may just leave it queued to
+ *   be sent later.
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ * @start,@length: a range of pages in the migration bitmap in the
+ *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+                                unsigned long start, unsigned long length)
+{
+    size_t tp_bits = qemu_target_page_bits();
+    /* Convert to byte offsets within the RAM block */
+    pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
+    pds->length_list[pds->cur_entry] = length << tp_bits;
+    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
+    pds->cur_entry++;
+    pds->nsentwords++;
+
+    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+        /* Full set, ship it! */
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                              pds->cur_entry,
+                                              pds->start_list,
+                                              pds->length_list);
+        pds->nsentcmds++;
+        pds->cur_entry = 0;
+    }
+}
+
+/**
+ * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
+ * bitmap code. Sends any outstanding discard messages, frees the PDS
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ */
+void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+{
+    /* Anything unsent? */
+    if (pds->cur_entry) {
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                              pds->cur_entry,
+                                              pds->start_list,
+                                              pds->length_list);
+        pds->nsentcmds++;
+    }
+
+    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
+                                       pds->nsentcmds);
+
+    g_free(pds);
+}
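
The three discard helpers above are meant to be driven per-RAMBlock by the bitmap walk in ram.c; a minimal sketch of the expected calling sequence, with the block name, bitmap offset and page run chosen purely for illustration:

    /* Sketch: send discards for one RAMBlock. */
    PostcopyDiscardState *pds =
        postcopy_discard_send_init(ms, first_bit_of_block, "pc.ram");

    /* One call per run of pages to discard; start/length are in target
     * pages, relative to the RAMBlock's place in the migration bitmap. */
    postcopy_discard_send_range(ms, pds, run_start, run_length);

    postcopy_discard_send_finish(ms, pds); /* flush anything queued, free pds */
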
diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c
index 809bf07..c503b02 100644
--- a/migration/qemu-file-unix.c
+++ b/migration/qemu-file-unix.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
+#include "qemu/error-report.h"
 #include "qemu/iov.h"
 #include "qemu/sockets.h"
 #include "qemu/coroutine.h"
@@ -39,12 +40,43 @@
     QEMUFileSocket *s = opaque;
     ssize_t len;
     ssize_t size = iov_size(iov, iovcnt);
+    ssize_t offset = 0;
+    int     err;
 
-    len = iov_send(s->fd, iov, iovcnt, 0, size);
-    if (len < size) {
-        len = -socket_error();
-    }
-    return len;
+    while (size > 0) {
+        len = iov_send(s->fd, iov, iovcnt, offset, size);
+
+        if (len > 0) {
+            size -= len;
+            offset += len;
+        }
+
+        if (size > 0) {
+            err = socket_error();
+
+            if (err != EAGAIN && err != EWOULDBLOCK) {
+                error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)",
+                             err, (size_t)size, (size_t)len);
+                /*
+                 * If I've already sent some but only just got the error, I
+                 * could return the amount validly sent so far and wait for the
+                 * next call to report the error, but I'd rather flag the error
+                 * immediately.
+                 */
+                return -err;
+            }
+
+            /* Emulate blocking */
+            GPollFD pfd;
+
+            pfd.fd = s->fd;
+            pfd.events = G_IO_OUT | G_IO_ERR;
+            pfd.revents = 0;
+            g_poll(&pfd, 1 /* 1 fd */, -1 /* no timeout */);
+        }
+    }
+
+    return offset;
 }
 
 static int socket_get_fd(void *opaque)
@@ -97,6 +129,56 @@
     }
 }
 
+static int socket_return_close(void *opaque)
+{
+    QEMUFileSocket *s = opaque;
+    /*
+     * Note: We don't close the socket, that should be done by the forward
+     * path.
+     */
+    g_free(s);
+    return 0;
+}
+
+static const QEMUFileOps socket_return_read_ops = {
+    .get_fd          = socket_get_fd,
+    .get_buffer      = socket_get_buffer,
+    .close           = socket_return_close,
+    .shut_down       = socket_shutdown,
+};
+
+static const QEMUFileOps socket_return_write_ops = {
+    .get_fd          = socket_get_fd,
+    .writev_buffer   = socket_writev_buffer,
+    .close           = socket_return_close,
+    .shut_down       = socket_shutdown,
+};
+
+/*
+ * Give back a QEMUFile* for the same socket, but carrying data in the
+ * opposite direction.
+ */
+static QEMUFile *socket_get_return_path(void *opaque)
+{
+    QEMUFileSocket *forward = opaque;
+    QEMUFileSocket *reverse;
+
+    if (qemu_file_get_error(forward->file)) {
+        /* If the forward file is in error, don't try and open a return */
+        return NULL;
+    }
+
+    reverse = g_malloc0(sizeof(QEMUFileSocket));
+    reverse->fd = forward->fd;
+    /* I don't think there's a better way to tell which direction 'this' is */
+    if (forward->file->ops->get_buffer != NULL) {
+        /* being called from the read side, so we need to be able to write */
+        return qemu_fopen_ops(reverse, &socket_return_write_ops);
+    } else {
+        return qemu_fopen_ops(reverse, &socket_return_read_ops);
+    }
+}
+
 static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
                                   int64_t pos)
 {
@@ -206,18 +288,19 @@
 }
 
 static const QEMUFileOps socket_read_ops = {
-    .get_fd     = socket_get_fd,
-    .get_buffer = socket_get_buffer,
-    .close      = socket_close,
-    .shut_down  = socket_shutdown
-
+    .get_fd          = socket_get_fd,
+    .get_buffer      = socket_get_buffer,
+    .close           = socket_close,
+    .shut_down       = socket_shutdown,
+    .get_return_path = socket_get_return_path
 };
 
 static const QEMUFileOps socket_write_ops = {
-    .get_fd        = socket_get_fd,
-    .writev_buffer = socket_writev_buffer,
-    .close         = socket_close,
-    .shut_down     = socket_shutdown
+    .get_fd          = socket_get_fd,
+    .writev_buffer   = socket_writev_buffer,
+    .close           = socket_close,
+    .shut_down       = socket_shutdown,
+    .get_return_path = socket_get_return_path
 };
 
 QEMUFile *qemu_fopen_socket(int fd, const char *mode)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index df49023..0bbd257 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -44,6 +44,18 @@
     return f->ops->shut_down(f->opaque, true, true);
 }
 
+/*
+ * Result: QEMUFile* for a 'return path' for comms in the opposite direction
+ *         NULL if not available
+ */
+QEMUFile *qemu_file_get_return_path(QEMUFile *f)
+{
+    if (!f->ops->get_return_path) {
+        return NULL;
+    }
+    return f->ops->get_return_path(f->opaque);
+}
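
The source-side caller is open_return_path_on_source() in migration.c above; the destination can do the same off its incoming file to get a channel back to the source. A sketch of that, where the to_src_file field on MigrationIncomingState is an assumption of this illustration:

    /* Sketch: destination opening its half of the return path. */
    mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
    if (!mis->to_src_file) {
        error_report("Source does not support a return path");
        /* postcopy can't work without one */
    }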
+
 bool qemu_file_mode_is_not_valid(const char *mode)
 {
     if (mode == NULL ||
@@ -434,6 +446,43 @@
 }
 
 /*
+ * Read 'size' bytes of data from the file.
+ * 'size' can be larger than the internal buffer.
+ *
+ * The data:
+ *   may be held on an internal buffer (in which case *buf is updated
+ *     to point to it) that is valid until the next qemu_file operation.
+ * OR
+ *   will be copied to the *buf that was passed in.
+ *
+ * The code tries to avoid the copy if possible.
+ *
+ * It will return size bytes unless there was an error, in which case it will
+ * return as many as it managed to read (assuming a blocking fd, which
+ * all current QEMUFiles use)
+ *
+ * Note: Since **buf may get changed, the caller should take care to
+ *       keep a pointer to the original buffer if it needs to deallocate it.
+ */
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
+{
+    if (size < IO_BUF_SIZE) {
+        size_t res;
+        uint8_t *src;
+
+        res = qemu_peek_buffer(f, &src, size, 0);
+
+        if (res == size) {
+            qemu_file_skip(f, res);
+            *buf = src;
+            return res;
+        }
+    }
+
+    return qemu_get_buffer(f, *buf, size);
+}
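
Because *buf may be redirected into the file's internal buffer, the caller has to hold on to its original pointer for freeing; a short usage sketch, where use_page() stands in for whatever consumes the data:

    /* Sketch: typical caller pattern for qemu_get_buffer_in_place(). */
    uint8_t *page = g_malloc(size);
    uint8_t *ptr = page;           /* may be redirected into f's own buffer */

    if (qemu_get_buffer_in_place(f, &ptr, size) == size) {
        use_page(ptr);             /* placeholder consumer */
    }
    g_free(page);                  /* always free the original allocation */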
+
+/*
  * Peeks a single byte from the buffer; this isn't guaranteed to work if
  * offset leaves a gap after the previous read/peeked data.
  */
@@ -611,3 +660,18 @@
 
     return res == len ? res : 0;
 }
+
+/*
+ * Set the blocking state of the QEMUFile.
+ * Note: On some transports the OS only keeps a single blocking state for
+ *       both directions, and thus changing the blocking on the main
+ *       QEMUFile can also affect the return path.
+ */
+void qemu_file_set_blocking(QEMUFile *f, bool block)
+{
+    if (block) {
+        qemu_set_block(qemu_get_fd(f));
+    } else {
+        qemu_set_nonblock(qemu_get_fd(f));
+    }
+}
diff --git a/migration/ram.c b/migration/ram.c
index df3df9e..62cf42b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -32,6 +32,7 @@
 #include "qemu/timer.h"
 #include "qemu/main-loop.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "exec/address-spaces.h"
 #include "migration/page_cache.h"
 #include "qemu/error-report.h"
@@ -237,7 +238,14 @@
 
 static struct BitmapRcu {
     struct rcu_head rcu;
+    /* Main migration bitmap */
     unsigned long *bmap;
+    /* Bitmap of pages that haven't been sent even once;
+     * only maintained and used in postcopy at the moment,
+     * where it's used to send the dirtymap at the start
+     * of the postcopy phase
+     */
+    unsigned long *unsentmap;
 } *migration_bitmap_rcu;
 
 struct CompressParam {
@@ -531,10 +539,18 @@
     return 1;
 }
 
-/* Called with rcu_read_lock() to protect migration_bitmap */
+/* Called with rcu_read_lock() to protect migration_bitmap
+ * rb: The RAMBlock  to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * ram_addr_abs: Pointer into which to store the address of the dirty page
+ *               within the global ram_addr space
+ *
+ * Returns: byte offset within memory region of the start of a dirty page
+ */
 static inline
-ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
-                                                 ram_addr_t start)
+ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
+                                       ram_addr_t start,
+                                       ram_addr_t *ram_addr_abs)
 {
     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
@@ -551,14 +567,24 @@
         next = find_next_bit(bitmap, size, nr);
     }
 
-    if (next < size) {
-        clear_bit(next, bitmap);
-        migration_dirty_pages--;
-    }
+    *ram_addr_abs = next << TARGET_PAGE_BITS;
     return (next - base) << TARGET_PAGE_BITS;
 }
 
-/* Called with rcu_read_lock() to protect migration_bitmap */
+static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
+{
+    bool ret;
+    int nr = addr >> TARGET_PAGE_BITS;
+    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+    ret = test_and_clear_bit(nr, bitmap);
+
+    if (ret) {
+        migration_dirty_pages--;
+    }
+    return ret;
+}
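
Splitting 'find' from 'clear' means the caller now owns the reset step; a sketch of how the two halves combine (the real caller is the page-saving loop later in this file, and 'pss', 'again' and 'dirty_ram_abs' are its locals):

    /* Sketch: find a dirty page, then clear its bit just before sending. */
    if (find_dirty_block(f, &pss, &again, &dirty_ram_abs)) {
        if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
            /* still dirty when we got here: send this page */
        }
        /* else another path (e.g. a queued request) already sent it */
    }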
+
 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 {
     unsigned long *bitmap;
@@ -951,12 +977,14 @@
  * @f: Current migration stream.
  * @pss: Data about the state of the current dirty page scan.
  * @*again: Set to false if the search has scanned the whole of RAM
+ * *ram_addr_abs: Pointer into which to store the address of the dirty page
+ *               within the global ram_addr space
  */
 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
-                             bool *again)
+                             bool *again, ram_addr_t *ram_addr_abs)
 {
-    pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
-                                                       pss->offset);
+    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
+                                              ram_addr_abs);
     if (pss->complete_round && pss->block == last_seen_block &&
         pss->offset >= last_offset) {
         /*
@@ -995,6 +1023,276 @@
     }
 }
 
+/*
+ * Helper for 'get_queued_page' - gets a page off the queue
+ *      ms:      MigrationState in
+ * *offset:      Used to return the offset within the RAMBlock
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns:      block (or NULL if none available)
+ */
+static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
+                              ram_addr_t *ram_addr_abs)
+{
+    RAMBlock *block = NULL;
+
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+        struct MigrationSrcPageRequest *entry =
+                                QSIMPLEQ_FIRST(&ms->src_page_requests);
+        block = entry->rb;
+        *offset = entry->offset;
+        *ram_addr_abs = (entry->offset + entry->rb->offset) &
+                        TARGET_PAGE_MASK;
+
+        if (entry->len > TARGET_PAGE_SIZE) {
+            entry->len -= TARGET_PAGE_SIZE;
+            entry->offset += TARGET_PAGE_SIZE;
+        } else {
+            memory_region_unref(block->mr);
+            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+            g_free(entry);
+        }
+    }
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+
+    return block;
+}
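+
+/*
+ * Illustrative note: a queued request covering, say, three target pages is
+ * consumed one page per call; the entry's offset advances and its len
+ * shrinks by TARGET_PAGE_SIZE each time, and only on the final page is the
+ * entry dequeued, its RAMBlock reference dropped and the entry freed.
+ */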
+
+/*
+ * Unqueue a page from the queue fed by postcopy page requests; skips pages
+ * that are already sent (!dirty)
+ *
+ *      ms:      MigrationState in
+ *     pss:      PageSearchStatus structure updated with found block/offset
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns:      true if a queued page is found
+ */
+static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
+                            ram_addr_t *ram_addr_abs)
+{
+    RAMBlock  *block;
+    ram_addr_t offset;
+    bool dirty;
+
+    do {
+        block = unqueue_page(ms, &offset, ram_addr_abs);
+        /*
+         * We're sending this page, and since it's postcopy nothing else
+         * will dirty it, and we must make sure it doesn't get sent again
+         * even if this queue request was received after the background
+         * search already sent it.
+         */
+        if (block) {
+            unsigned long *bitmap;
+            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
+            if (!dirty) {
+                trace_get_queued_page_not_dirty(
+                    block->idstr, (uint64_t)offset,
+                    (uint64_t)*ram_addr_abs,
+                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
+                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
+            } else {
+                trace_get_queued_page(block->idstr,
+                                      (uint64_t)offset,
+                                      (uint64_t)*ram_addr_abs);
+            }
+        }
+
+    } while (block && !dirty);
+
+    if (block) {
+        /*
+         * As soon as we start servicing pages out of order, we have to
+         * kill the bulk stage, since the bulk stage assumes that every
+         * page is dirty (see migration_bitmap_find_dirty), which is no
+         * longer true.
+         */
+        ram_bulk_stage = false;
+
+        /*
+         * We want the background search to continue from the queued page
+         * since the guest is likely to want other pages near to the page
+         * it just requested.
+         */
+        pss->block = block;
+        pss->offset = offset;
+    }
+
+    return !!block;
+}
+
+/**
+ * flush_page_queue: Flush any remaining pages in the ram request queue;
+ *    it should be empty at the end anyway, but in error cases there may be
+ *    some left.
+ *
+ * ms: MigrationState
+ */
+void flush_page_queue(MigrationState *ms)
+{
+    struct MigrationSrcPageRequest *mspr, *next_mspr;
+    /* This queue should generally be empty - but in the case of a failed
+     * migration it might have some droppings in it.
+     */
+    rcu_read_lock();
+    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
+        memory_region_unref(mspr->rb->mr);
+        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+        g_free(mspr);
+    }
+    rcu_read_unlock();
+}
+
+/**
+ * Queue the pages for transmission, e.g. a request from postcopy destination
+ *   ms: MigrationState in which the queue is held
+ *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
+ *   start: Offset from the start of the RAMBlock
+ *   len: Length (in bytes) to send
+ *   Return: 0 on success
+ */
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+                         ram_addr_t start, ram_addr_t len)
+{
+    RAMBlock *ramblock;
+
+    rcu_read_lock();
+    if (!rbname) {
+        /* Reuse last RAMBlock */
+        ramblock = ms->last_req_rb;
+
+        if (!ramblock) {
+            /*
+             * Shouldn't happen, we can't reuse the last RAMBlock if
+             * it's the 1st request.
+             */
+            error_report("ram_save_queue_pages no previous block");
+            goto err;
+        }
+    } else {
+        ramblock = qemu_ram_block_by_name(rbname);
+
+        if (!ramblock) {
+            /* We shouldn't be asked for a non-existent RAMBlock */
+            error_report("ram_save_queue_pages no block '%s'", rbname);
+            goto err;
+        }
+        ms->last_req_rb = ramblock;
+    }
+    trace_ram_save_queue_pages(ramblock->idstr, start, len);
+    if (start+len > ramblock->used_length) {
+        error_report("%s request overrun start=%zx len=%zx blocklen=%zx",
+                     __func__, start, len, ramblock->used_length);
+        goto err;
+    }
+
+    struct MigrationSrcPageRequest *new_entry =
+        g_malloc0(sizeof(struct MigrationSrcPageRequest));
+    new_entry->rb = ramblock;
+    new_entry->offset = start;
+    new_entry->len = len;
+
+    memory_region_ref(ramblock->mr);
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+    rcu_read_unlock();
+
+    return 0;
+
+err:
+    rcu_read_unlock();
+    return -1;
+}
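+
+/*
+ * Note (editorial): this is called on the source in response to page
+ * requests from the postcopy destination; passing a NULL rbname lets
+ * consecutive requests against the same RAMBlock skip the name lookup.
+ */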
+
+/**
+ * ram_save_target_page: Save one target page
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page;
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ *
+ * Returns: Number of pages written.
+ */
+static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
+                                RAMBlock *block, ram_addr_t offset,
+                                bool last_stage,
+                                uint64_t *bytes_transferred,
+                                ram_addr_t dirty_ram_abs)
+{
+    int res = 0;
+
+    /* Check if the page is dirty and if it is, send it */
+    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
+        unsigned long *unsentmap;
+        if (compression_switch && migrate_use_compression()) {
+            res = ram_save_compressed_page(f, block, offset,
+                                           last_stage,
+                                           bytes_transferred);
+        } else {
+            res = ram_save_page(f, block, offset, last_stage,
+                                bytes_transferred);
+        }
+
+        if (res < 0) {
+            return res;
+        }
+        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+        if (unsentmap) {
+            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
+        }
+    }
+
+    return res;
+}
+
+/**
+ * ram_save_host_page: Starting at *offset send pages up to the end
+ *                     of the current host page.  It's valid for the initial
+ *                     offset to point into the middle of a host page,
+ *                     in which case the remainder of the hostpage is sent.
+ *                     Only dirty target pages are sent.
+ *
+ * Returns: Number of pages written.
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page; updated to last target page
+ *          sent
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ */
+static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
+                              ram_addr_t *offset, bool last_stage,
+                              uint64_t *bytes_transferred,
+                              ram_addr_t dirty_ram_abs)
+{
+    int tmppages, pages = 0;
+    do {
+        tmppages = ram_save_target_page(ms, f, block, *offset, last_stage,
+                                        bytes_transferred, dirty_ram_abs);
+        if (tmppages < 0) {
+            return tmppages;
+        }
+
+        pages += tmppages;
+        *offset += TARGET_PAGE_SIZE;
+        dirty_ram_abs += TARGET_PAGE_SIZE;
+    } while (*offset & (qemu_host_page_size - 1));
+
+    /* The offset we leave with is the last one we looked at */
+    *offset -= TARGET_PAGE_SIZE;
+    return pages;
+}
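+
+/*
+ * Illustrative example: with 64k host pages and 4k target pages, a call
+ * entering with *offset in the middle of a host page walks the remaining
+ * target pages up to the 64k boundary, sending only those still dirty,
+ * and leaves *offset pointing at the last target page it examined.
+ */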
+
 /**
  * ram_find_and_save_block: Finds a dirty page and sends it to f
  *
@@ -1006,14 +1304,20 @@
  * @f: QEMUFile where to send the data
  * @last_stage: if we are at the completion stage
  * @bytes_transferred: increase it with the number of transferred bytes
+ *
+ * On systems where host-page-size > target-page-size it will send all the
+ * pages in a host page that are dirty.
  */
 
 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                    uint64_t *bytes_transferred)
 {
     PageSearchStatus pss;
+    MigrationState *ms = migrate_get_current();
     int pages = 0;
     bool again, found;
+    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
+                                 ram_addr_t space */
 
     pss.block = last_seen_block;
     pss.offset = last_offset;
@@ -1024,22 +1328,18 @@
     }
 
     do {
-        found = find_dirty_block(f, &pss, &again);
+        again = true;
+        found = get_queued_page(ms, &pss, &dirty_ram_abs);
+
+        if (!found) {
+            /* priority queue empty, so just search for something dirty */
+            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
+        }
 
         if (found) {
-            if (compression_switch && migrate_use_compression()) {
-                pages = ram_save_compressed_page(f, pss.block, pss.offset,
-                                                 last_stage,
-                                                 bytes_transferred);
-            } else {
-                pages = ram_save_page(f, pss.block, pss.offset, last_stage,
-                                      bytes_transferred);
-            }
-
-            /* if page is unmodified, continue to the next */
-            if (pages > 0) {
-                last_sent_block = pss.block;
-            }
+            pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
+                                       last_stage, bytes_transferred,
+                                       dirty_ram_abs);
         }
     } while (!pages && again);
 
@@ -1097,6 +1397,7 @@
 static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
     g_free(bmap->bmap);
+    g_free(bmap->unsentmap);
     g_free(bmap);
 }
 
@@ -1153,6 +1454,13 @@
         qemu_mutex_lock(&migration_bitmap_mutex);
         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
         bitmap_set(bitmap->bmap, old, new - old);
+
+        /* We don't have a way to safely extend the unsentmap
+         * with RCU; so mark it as missing, and entry to postcopy
+         * will fail.
+         */
+        bitmap->unsentmap = NULL;
+
         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
         qemu_mutex_unlock(&migration_bitmap_mutex);
         migration_dirty_pages += new - old;
@@ -1160,6 +1468,394 @@
     }
 }
 
+/*
+ * 'expected' is the value you expect the bitmap mostly to be full
+ * of; it won't bother printing lines that are all this value.
+ * If 'todump' is null the migration bitmap is dumped.
+ */
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
+{
+    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    int64_t cur;
+    int64_t linelen = 128;
+    char linebuf[129];
+
+    if (!todump) {
+        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    }
+
+    for (cur = 0; cur < ram_pages; cur += linelen) {
+        int64_t curb;
+        bool found = false;
+        /*
+         * Last line; catch the case where the line length
+         * is longer than remaining ram
+         */
+        if (cur + linelen > ram_pages) {
+            linelen = ram_pages - cur;
+        }
+        for (curb = 0; curb < linelen; curb++) {
+            bool thisbit = test_bit(cur + curb, todump);
+            linebuf[curb] = thisbit ? '1' : '.';
+            found = found || (thisbit != expected);
+        }
+        if (found) {
+            linebuf[curb] = '\0';
+            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
+        }
+    }
+}
+
+/* **** functions for postcopy ***** */
+
+/*
+ * Callback from postcopy_each_ram_send_discard for each RAMBlock
+ * Note: At this point the 'unsentmap' is the processed bitmap combined
+ *       with the dirtymap; so a '1' means it's either dirty or unsent.
+ * start,length: Indexes into the bitmap for the first bit
+ *            representing the named block and length in target-pages
+ */
+static int postcopy_send_discard_bm_ram(MigrationState *ms,
+                                        PostcopyDiscardState *pds,
+                                        unsigned long start,
+                                        unsigned long length)
+{
+    unsigned long end = start + length; /* one after the end */
+    unsigned long current;
+    unsigned long *unsentmap;
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    for (current = start; current < end; ) {
+        unsigned long one = find_next_bit(unsentmap, end, current);
+
+        if (one < end) {
+            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
+            unsigned long discard_length;
+
+            if (zero >= end) {
+                discard_length = end - one;
+            } else {
+                discard_length = zero - one;
+            }
+            postcopy_discard_send_range(ms, pds, one, discard_length);
+            current = one + discard_length;
+        } else {
+            current = one;
+        }
+    }
+
+    return 0;
+}
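+
+/*
+ * Illustrative example: if the unsentmap bits in [start, end) look like
+ * ..0011100.., find_next_bit locates the first set bit, find_next_zero_bit
+ * the end of the run, and a single discard covering those three pages is
+ * sent for the range.
+ */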
+
+/*
+ * Utility for the outgoing postcopy code.
+ *   Calls postcopy_send_discard_bm_ram for each RAMBlock
+ *   passing it bitmap indexes and name.
+ * Returns: 0 on success
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ *  which would mean postcopy code would have to deal with target page)
+ */
+static int postcopy_each_ram_send_discard(MigrationState *ms)
+{
+    struct RAMBlock *block;
+    int ret;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
+                                                               first,
+                                                               block->idstr);
+
+        /*
+         * Postcopy sends chunks of bitmap over the wire, but it
+         * just needs indexes at this point, avoids it having
+         * target page specific code.
+         */
+        ret = postcopy_send_discard_bm_ram(ms, pds, first,
+                                    block->used_length >> TARGET_PAGE_BITS);
+        postcopy_discard_send_finish(ms, pds);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Helper for postcopy_chunk_hostpages; it's called twice to clean up
+ *   the two bitmaps, which are similar but one is inverted.
+ *
+ * We search for runs of target-pages that don't start or end on a
+ * host page boundary;
+ * unsent_pass=true: Cleans up partially unsent host pages by searching
+ *                 the unsentmap
+ * unsent_pass=false: Cleans up partially dirty host pages by searching
+ *                 the main migration bitmap
+ *
+ */
+static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
+                                          RAMBlock *block,
+                                          PostcopyDiscardState *pds)
+{
+    unsigned long *bitmap;
+    unsigned long *unsentmap;
+    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
+    unsigned long first = block->offset >> TARGET_PAGE_BITS;
+    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
+    unsigned long last = first + (len - 1);
+    unsigned long run_start;
+
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+
+    if (unsent_pass) {
+        /* Find a sent page */
+        run_start = find_next_zero_bit(unsentmap, last + 1, first);
+    } else {
+        /* Find a dirty page */
+        run_start = find_next_bit(bitmap, last + 1, first);
+    }
+
+    while (run_start <= last) {
+        bool do_fixup = false;
+        unsigned long fixup_start_addr;
+        unsigned long host_offset;
+
+        /*
+         * If the start of this run of pages is in the middle of a host
+         * page, then we need to fixup this host page.
+         */
+        host_offset = run_start % host_ratio;
+        if (host_offset) {
+            do_fixup = true;
+            run_start -= host_offset;
+            fixup_start_addr = run_start;
+            /* For the next pass */
+            run_start = run_start + host_ratio;
+        } else {
+            /* Find the end of this run */
+            unsigned long run_end;
+            if (unsent_pass) {
+                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
+            } else {
+                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
+            }
+            /*
+             * If the end isn't at the start of a host page, then the
+             * run doesn't finish at the end of a host page
+             * and we need to discard.
+             */
+            host_offset = run_end % host_ratio;
+            if (host_offset) {
+                do_fixup = true;
+                fixup_start_addr = run_end - host_offset;
+                /*
+                 * This host page has gone, the next loop iteration starts
+                 * from after the fixup
+                 */
+                run_start = fixup_start_addr + host_ratio;
+            } else {
+                /*
+                 * No discards on this iteration, next loop starts from
+                 * next sent/dirty page
+                 */
+                run_start = run_end + 1;
+            }
+        }
+
+        if (do_fixup) {
+            unsigned long page;
+
+            /* Tell the destination to discard this page */
+            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
+                /* For the unsent_pass we:
+                 *     discard partially sent pages
+                 * For the !unsent_pass (dirty) we:
+                 *     discard partially dirty pages that were sent
+                 *     (any partially sent pages were already discarded
+                 *     by the previous unsent_pass)
+                 */
+                postcopy_discard_send_range(ms, pds, fixup_start_addr,
+                                            host_ratio);
+            }
+
+            /* Clean up the bitmap */
+            for (page = fixup_start_addr;
+                 page < fixup_start_addr + host_ratio; page++) {
+                /* All pages in this host page are now not sent */
+                set_bit(page, unsentmap);
+
+                /*
+                 * Remark them as dirty, updating the count for any pages
+                 * that weren't previously dirty.
+                 */
+                migration_dirty_pages += !test_and_set_bit(page, bitmap);
+            }
+        }
+
+        if (unsent_pass) {
+            /* Find the next sent page for the next iteration */
+            run_start = find_next_zero_bit(unsentmap, last + 1,
+                                           run_start);
+        } else {
+            /* Find the next dirty page for the next iteration */
+            run_start = find_next_bit(bitmap, last + 1, run_start);
+        }
+    }
+}
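+
+/*
+ * Worked example (illustrative, assuming a 4:1 host/target page ratio): a
+ * dirty run starting at target-page index 6 begins mid host page (6 % 4 ==
+ * 2), so the host page covering pages 4..7 is discarded on the destination
+ * where needed, its pages marked unsent and re-dirtied, and the scan
+ * resumes at index 8.
+ */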
+
+/*
+ * Utility for the outgoing postcopy code.
+ *
+ * Discard any partially sent host-page size chunks, mark any partially
+ * dirty host-page size chunks as all dirty.
+ *
+ * Returns: 0 on success
+ */
+static int postcopy_chunk_hostpages(MigrationState *ms)
+{
+    struct RAMBlock *block;
+
+    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
+        /* Easy case - TPS==HPS - nothing to be done */
+        return 0;
+    }
+
+    /* Easiest way to make sure we don't resume in the middle of a host-page */
+    last_seen_block = NULL;
+    last_sent_block = NULL;
+    last_offset     = 0;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+
+        PostcopyDiscardState *pds =
+                         postcopy_discard_send_init(ms, first, block->idstr);
+
+        /* First pass: Discard all partially sent host pages */
+        postcopy_chunk_hostpages_pass(ms, true, block, pds);
+        /*
+         * Second pass: Ensure that all partially dirty host pages are made
+         * fully dirty.
+         */
+        postcopy_chunk_hostpages_pass(ms, false, block, pds);
+
+        postcopy_discard_send_finish(ms, pds);
+    } /* ram_list loop */
+
+    return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target;
+ * these are pages that:
+ *     a) Have been previously transmitted but are now dirty again
+ *     b) Have never been transmitted; this ensures that any pages on the
+ *        destination that have been mapped by background tasks get
+ *        discarded (transparent huge pages is the specific concern)
+ * Hopefully this is pretty sparse
+ */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+{
+    int ret;
+    unsigned long *bitmap, *unsentmap;
+
+    rcu_read_lock();
+
+    /* This should be our last sync, the src is now paused */
+    migration_bitmap_sync();
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    if (!unsentmap) {
+        /* We don't have a safe way to resize the unsentmap, so
+         * if the bitmap was resized it will be NULL at this
+         * point.
+         */
+        error_report("migration ram resized during precopy phase");
+        rcu_read_unlock();
+        return -EINVAL;
+    }
+
+    /* Deal with TPS != HPS */
+    ret = postcopy_chunk_hostpages(ms);
+    if (ret) {
+        rcu_read_unlock();
+        return ret;
+    }
+
+    /*
+     * Update the unsentmap to be unsentmap = unsentmap | dirty
+     */
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    bitmap_or(unsentmap, unsentmap, bitmap,
+               last_ram_offset() >> TARGET_PAGE_BITS);
+
+
+    trace_ram_postcopy_send_discard_bitmap();
+#ifdef DEBUG_POSTCOPY
+    ram_debug_dump_bitmap(unsentmap, true);
+#endif
+
+    ret = postcopy_each_ram_send_discard(ms);
+    rcu_read_unlock();
+
+    return ret;
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * start, length describe a byte address range within the RAMBlock
+ *
+ * Returns 0 on success.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+                      const char *block_name,
+                      uint64_t start, size_t length)
+{
+    int ret = -1;
+
+    rcu_read_lock();
+    RAMBlock *rb = qemu_ram_block_by_name(block_name);
+
+    if (!rb) {
+        error_report("ram_discard_range: Failed to find block '%s'",
+                     block_name);
+        goto err;
+    }
+
+    uint8_t *host_startaddr = rb->host + start;
+
+    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
+        error_report("ram_discard_range: Unaligned start address: %p",
+                     host_startaddr);
+        goto err;
+    }
+
+    if ((start + length) <= rb->used_length) {
+        uint8_t *host_endaddr = host_startaddr + length;
+        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
+            error_report("ram_discard_range: Unaligned end address: %p",
+                         host_endaddr);
+            goto err;
+        }
+        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
+    } else {
+        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%zu/%zu)",
+                     block_name, start, length, rb->used_length);
+    }
+
+err:
+    rcu_read_unlock();
+
+    return ret;
+}
+
+
 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.  When rcu-reclaims in the code
  * start to become numerous it will be necessary to reduce the
@@ -1214,10 +1910,15 @@
     reset_ram_globals();
 
     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
-    migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
+    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
 
+    if (migrate_postcopy_ram()) {
+        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
+        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+    }
+
     /*
      * Count the total number of pages used by ram blocks not including any
      * gaps due to alignment or unplugs.
@@ -1317,7 +2018,9 @@
 {
     rcu_read_lock();
 
-    migration_bitmap_sync();
+    if (!migration_in_postcopy(migrate_get_current())) {
+        migration_bitmap_sync();
+    }
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
 
@@ -1344,13 +2047,16 @@
     return 0;
 }
 
-static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                             uint64_t *non_postcopiable_pending,
+                             uint64_t *postcopiable_pending)
 {
     uint64_t remaining_size;
 
     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
 
-    if (remaining_size < max_size) {
+    if (!migration_in_postcopy(migrate_get_current()) &&
+        remaining_size < max_size) {
         qemu_mutex_lock_iothread();
         rcu_read_lock();
         migration_bitmap_sync();
@@ -1358,7 +2064,9 @@
         qemu_mutex_unlock_iothread();
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
-    return remaining_size;
+
+    /* We can do postcopy, and all the data is postcopiable */
+    *postcopiable_pending += remaining_size;
 }
 
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
@@ -1399,6 +2107,14 @@
 /* Must be called from within a rcu critical section.
  * Returns a pointer from within the RCU-protected ram_list.
  */
+/*
+ * Read a RAMBlock ID from the stream f, find the host address of the
+ * start of that block and add on 'offset'
+ *
+ * f: Stream to read from
+ * offset: Offset within the block
+ * flags: Page flags (mostly to see if it's a continuation of previous block)
+ */
 static inline void *host_from_stream_offset(QEMUFile *f,
                                             ram_addr_t offset,
                                             int flags)
@@ -1420,14 +2136,12 @@
     qemu_get_buffer(f, (uint8_t *)id, len);
     id[len] = 0;
 
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (!strncmp(id, block->idstr, sizeof(id)) &&
-            block->max_length > offset) {
-            return block->host + offset;
-        }
+    block = qemu_ram_block_by_name(id);
+    if (block && block->max_length > offset) {
+        return block->host + offset;
     }
 
-    error_report("Can't find block %s!", id);
+    error_report("Can't find block %s", id);
     return NULL;
 }
 
@@ -1535,11 +2249,148 @@
     }
 }
 
+/*
+ * Allocate data structures etc needed by incoming migration with postcopy-ram
+ * postcopy-ram's similarly named postcopy_ram_incoming_init does the work
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    return postcopy_ram_incoming_init(mis, ram_pages);
+}
+
+/*
+ * Called in postcopy mode by ram_load().
+ * rcu_read_lock is taken prior to this being called.
+ */
+static int ram_load_postcopy(QEMUFile *f)
+{
+    int flags = 0, ret = 0;
+    bool place_needed = false;
+    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    /* Temporary page that is later 'placed' */
+    void *postcopy_host_page = postcopy_get_tmp_page(mis);
+    void *last_host = NULL;
+
+    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+        ram_addr_t addr;
+        void *host = NULL;
+        void *page_buffer = NULL;
+        void *place_source = NULL;
+        uint8_t ch;
+        bool all_zero = false;
+
+        addr = qemu_get_be64(f);
+        flags = addr & ~TARGET_PAGE_MASK;
+        addr &= TARGET_PAGE_MASK;
+
+        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+        place_needed = false;
+        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
+            host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+                ret = -EINVAL;
+                break;
+            }
+            page_buffer = host;
+            /*
+             * Postcopy requires that we place whole host pages atomically.
+             * To make it atomic, the data is read into a temporary page
+             * that's moved into place later.
+             * The migration protocol uses (possibly smaller) target pages,
+             * however the source ensures it always sends all the components
+             * of a host page in order.
+             */
+            page_buffer = postcopy_host_page +
+                          ((uintptr_t)host & ~qemu_host_page_mask);
+            /* If all TPs in a host page are zero the place can be optimised;
+             * assume so at the 1st TP and let non-zero data clear the flag */
+            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
+                all_zero = true;
+            } else {
+                /* not the 1st TP within the HP */
+                if (host != (last_host + TARGET_PAGE_SIZE)) {
+                    error_report("Non-sequential target page %p/%p",
+                                  host, last_host);
+                    ret = -EINVAL;
+                    break;
+                }
+            }
+
+            /*
+             * If it's the last part of a host page then we place the host
+             * page
+             */
+            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
+                                     ~qemu_host_page_mask) == 0;
+            place_source = postcopy_host_page;
+        }
+        last_host = host;
+
+        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        case RAM_SAVE_FLAG_COMPRESS:
+            ch = qemu_get_byte(f);
+            memset(page_buffer, ch, TARGET_PAGE_SIZE);
+            if (ch) {
+                all_zero = false;
+            }
+            break;
+
+        case RAM_SAVE_FLAG_PAGE:
+            all_zero = false;
+            if (!place_needed || !matching_page_sizes) {
+                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
+            } else {
+                /* Avoids the qemu_file copy during postcopy, since the
+                 * data will be copied again when the page is placed; we
+                 * can only do it when we read the page in one go
+                 * (matching page sizes)
+                 */
+                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
+                                         TARGET_PAGE_SIZE);
+            }
+            break;
+        case RAM_SAVE_FLAG_EOS:
+            /* normal exit */
+            break;
+        default:
+            error_report("Unknown combination of migration flags: %#x"
+                         " (postcopy mode)", flags);
+            ret = -EINVAL;
+        }
+
+        if (place_needed) {
+            /* This gets called at the last target page in the host page */
+            if (all_zero) {
+                ret = postcopy_place_page_zero(mis,
+                                               host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size);
+            } else {
+                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size,
+                                               place_source);
+            }
+        }
+        if (!ret) {
+            ret = qemu_file_get_error(f);
+        }
+    }
+
+    return ret;
+}
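+
+/*
+ * Illustrative note: with, say, four target pages per host page, the first
+ * three arrivals are accumulated into postcopy_host_page (or noted as zero),
+ * and only when the last target page of the host page arrives is the whole
+ * host page placed atomically, either from the buffer or as a zero page.
+ */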
+
 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
     int flags = 0, ret = 0;
     static uint64_t seq_iter;
     int len = 0;
+    /*
+     * If system is running in postcopy mode, page inserts to host memory must
+     * be atomic
+     */
+    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
 
     seq_iter++;
 
@@ -1553,15 +2404,30 @@
      * critical section.
      */
     rcu_read_lock();
-    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+
+    if (postcopy_running) {
+        ret = ram_load_postcopy(f);
+    }
+
+    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
         ram_addr_t addr, total_ram_bytes;
-        void *host;
+        void *host = NULL;
         uint8_t ch;
 
         addr = qemu_get_be64(f);
         flags = addr & ~TARGET_PAGE_MASK;
         addr &= TARGET_PAGE_MASK;
 
+        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
+                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
+            host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+                ret = -EINVAL;
+                break;
+            }
+        }
+
         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
         case RAM_SAVE_FLAG_MEM_SIZE:
             /* Synchronize RAM block list */
@@ -1576,23 +2442,20 @@
                 id[len] = 0;
                 length = qemu_get_be64(f);
 
-                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-                    if (!strncmp(id, block->idstr, sizeof(id))) {
-                        if (length != block->used_length) {
-                            Error *local_err = NULL;
+                block = qemu_ram_block_by_name(id);
+                if (block) {
+                    if (length != block->used_length) {
+                        Error *local_err = NULL;
 
-                            ret = qemu_ram_resize(block->offset, length, &local_err);
-                            if (local_err) {
-                                error_report_err(local_err);
-                            }
+                        ret = qemu_ram_resize(block->offset, length,
+                                              &local_err);
+                        if (local_err) {
+                            error_report_err(local_err);
                         }
-                        ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
-                                              block->idstr);
-                        break;
                     }
-                }
-
-                if (!block) {
+                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
+                                          block->idstr);
+                } else {
                     error_report("Unknown ramblock \"%s\", cannot "
                                  "accept migration", id);
                     ret = -EINVAL;
@@ -1601,33 +2464,17 @@
                 total_ram_bytes -= length;
             }
             break;
+
         case RAM_SAVE_FLAG_COMPRESS:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             ch = qemu_get_byte(f);
             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
             break;
+
         case RAM_SAVE_FLAG_PAGE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
             break;
-        case RAM_SAVE_FLAG_COMPRESS_PAGE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
 
+        case RAM_SAVE_FLAG_COMPRESS_PAGE:
             len = qemu_get_be32(f);
             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                 error_report("Invalid compressed data length: %d", len);
@@ -1637,13 +2484,8 @@
             qemu_get_buffer(f, compressed_data_buf, len);
             decompress_data_with_multi_threads(compressed_data_buf, host, len);
             break;
+
         case RAM_SAVE_FLAG_XBZRLE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             if (load_xbzrle(f, addr, host) < 0) {
                 error_report("Failed to decompress XBZRLE page at "
                              RAM_ADDR_FMT, addr);
@@ -1677,7 +2519,8 @@
 static SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
-    .save_live_complete = ram_save_complete,
+    .save_live_complete_postcopy = ram_save_complete,
+    .save_live_complete_precopy = ram_save_complete,
     .save_live_pending = ram_save_pending,
     .load_state = ram_load,
     .cleanup = ram_migration_cleanup,
diff --git a/migration/savevm.c b/migration/savevm.c
index e05158d..be52314 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
 #include "qemu/timer.h"
 #include "audio/audio.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
@@ -45,6 +46,7 @@
 #include "exec/memory.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "qemu/bitops.h"
 #include "qemu/iov.h"
 #include "block/snapshot.h"
 #include "block/qapi.h"
@@ -57,8 +59,26 @@
 #define ARP_PTYPE_IP 0x0800
 #define ARP_OP_REQUEST_REV 0x3
 
+const unsigned int postcopy_ram_discard_version = 0;
+
 static bool skip_section_footers;
 
+static struct mig_cmd_args {
+    ssize_t     len; /* -1 = variable */
+    const char *name;
+} mig_cmd_args[] = {
+    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
+    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
+    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
+    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = 16, .name = "POSTCOPY_ADVISE" },
+    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
+    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
+    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
+                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
+    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
+    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
+};
+
 static int announce_self_create(uint8_t *buf,
                                 uint8_t *mac_addr)
 {
@@ -694,6 +714,156 @@
     }
 }
 
+/**
+ * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
+ *                           command and associated data.
+ *
+ * @f: File to send command on
+ * @command: Command type to send
+ * @len: Length of associated data
+ * @data: Data associated with command.
+ */
+void qemu_savevm_command_send(QEMUFile *f,
+                              enum qemu_vm_cmd command,
+                              uint16_t len,
+                              uint8_t *data)
+{
+    trace_savevm_command_send(command, len);
+    qemu_put_byte(f, QEMU_VM_COMMAND);
+    qemu_put_be16(f, (uint16_t)command);
+    qemu_put_be16(f, len);
+    qemu_put_buffer(f, data, len);
+    qemu_fflush(f);
+}
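+
+/*
+ * On the wire (illustrative): a command is a QEMU_VM_COMMAND byte, a be16
+ * command number, a be16 data length, then 'len' bytes of data; e.g. a
+ * PING carries a 4-byte be32 value.
+ */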
+
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
+{
+    uint32_t buf;
+
+    trace_savevm_send_ping(value);
+    buf = cpu_to_be32(value);
+    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
+}
+
+void qemu_savevm_send_open_return_path(QEMUFile *f)
+{
+    trace_savevm_send_open_return_path();
+    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
+}
+
+/* We have a buffer of data to send; we don't want that all to be loaded
+ * by the command itself, so the command contains just the length of the
+ * extra buffer that we then send straight after it.
+ * TODO: There must be a better way to organise this
+ *
+ * Returns:
+ *    0 on success
+ *    -ve on error
+ */
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+{
+    size_t cur_iov;
+    size_t len = qsb_get_length(qsb);
+    uint32_t tmp;
+
+    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("%s: Unreasonably large packaged state: %zu",
+                     __func__, len);
+        return -1;
+    }
+
+    tmp = cpu_to_be32(len);
+
+    trace_qemu_savevm_send_packaged();
+    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
+
+    /* all the data follows (concatenating the iov's) */
+    for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) {
+        /* The iov entries are partially filled */
+        size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len);
+        len -= towrite;
+
+        if (!towrite) {
+            break;
+        }
+
+        qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite);
+    }
+
+    return 0;
+}
+
+/* Send prior to any postcopy transfer */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+    uint64_t tmp[2];
+    tmp[0] = cpu_to_be64(getpagesize());
+    tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
+
+    trace_qemu_savevm_send_postcopy_advise();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
+}
+
+/* Sent prior to starting the destination running in postcopy; tells it to
+ * discard pages that have already been sent but redirtied on the source.
+ * CMD_POSTCOPY_RAM_DISCARD consists of:
+ *      byte   version (0)
+ *      byte   Length of name field (not including 0)
+ *  n x byte   RAM block name
+ *      byte   0 terminator (just for safety)
+ *  n x        Byte ranges within the named RAMBlock
+ *      be64   Start of the range
+ *      be64   Length
+ *
+ *  name:  RAMBlock name that these entries are part of
+ *  len: Number of page entries
+ *  start_list: 'len' addresses
+ *  length_list: 'len' lengths
+ *
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list)
+{
+    uint8_t *buf;
+    uint16_t tmplen;
+    uint16_t t;
+    size_t name_len = strlen(name);
+
+    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
+    assert(name_len < 256);
+    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
+    buf[0] = postcopy_ram_discard_version;
+    buf[1] = name_len;
+    memcpy(buf + 2, name, name_len);
+    tmplen = 2 + name_len;
+    buf[tmplen++] = '\0';
+
+    for (t = 0; t < len; t++) {
+        cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]);
+        tmplen += 8;
+        cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]);
+        tmplen += 8;
+    }
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+    g_free(buf);
+}
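+
+/*
+ * Example payload (illustrative): one 2MB range at the start of a block
+ * named "pc.ram" would be encoded as
+ *   0x00 0x06 'p' 'c' '.' 'r' 'a' 'm' 0x00 be64(0x0) be64(0x200000)
+ * i.e. version, name length, name, nil terminator, then start/length pairs.
+ */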
+
+/* Get the destination into a state where it can receive postcopy data. */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_listen();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_run();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -713,6 +883,12 @@
     trace_savevm_state_header();
     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+    if (!savevm_state.skip_configuration) {
+        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+    }
+
 }
 
 void qemu_savevm_state_begin(QEMUFile *f,
@@ -729,11 +905,6 @@
         se->ops->set_params(params, se->opaque);
     }
 
-    if (!savevm_state.skip_configuration) {
-        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
-        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
-    }
-
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_setup) {
             continue;
@@ -760,7 +931,7 @@
  *   0 : We haven't finished, caller have to go again
  *   1 : We have finished, we can go to complete phase
  */
-int qemu_savevm_state_iterate(QEMUFile *f)
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
 {
     SaveStateEntry *se;
     int ret = 1;
@@ -775,6 +946,15 @@
                 continue;
             }
         }
+        /*
+         * In the postcopy phase, any device that doesn't know how to
+         * do postcopy should have saved its state in the _complete
+         * call that's already run; it might get confused if we call
+         * iterate afterwards.
+         */
+        if (postcopy && !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
         if (qemu_file_rate_limit(f)) {
             return 0;
         }
@@ -803,22 +983,65 @@
 static bool should_send_vmdesc(void)
 {
     MachineState *machine = MACHINE(qdev_get_machine());
-    return !machine->suppress_vmdesc;
+    bool in_postcopy = migration_in_postcopy(migrate_get_current());
+    return !machine->suppress_vmdesc && !in_postcopy;
 }
 
-void qemu_savevm_state_complete(QEMUFile *f)
+/*
+ * Calls the save_live_complete_postcopy methods
+ * causing the last few pages to be sent immediately and doing any associated
+ * cleanup.
+ * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
+ * all the other devices, but that happens at the point we switch to postcopy.
+ */
+void qemu_savevm_state_complete_postcopy(QEMUFile *f)
+{
+    SaveStateEntry *se;
+    int ret;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        trace_savevm_section_start(se->idstr, se->section_id);
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_END);
+        qemu_put_be32(f, se->section_id);
+
+        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
+        save_section_footer(f, se);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            return;
+        }
+    }
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+}
+
+void qemu_savevm_state_complete_precopy(QEMUFile *f)
 {
     QJSON *vmdesc;
     int vmdesc_len;
     SaveStateEntry *se;
     int ret;
+    bool in_postcopy = migration_in_postcopy(migrate_get_current());
 
-    trace_savevm_state_complete();
+    trace_savevm_state_complete_precopy();
 
     cpu_synchronize_all_states();
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-        if (!se->ops || !se->ops->save_live_complete) {
+        if (!se->ops ||
+            (in_postcopy && se->ops->save_live_complete_postcopy) ||
+            !se->ops->save_live_complete_precopy) {
             continue;
         }
         if (se->ops && se->ops->is_active) {
@@ -830,7 +1053,7 @@
 
         save_section_header(f, se, QEMU_VM_SECTION_END);
 
-        ret = se->ops->save_live_complete(f, se->opaque);
+        ret = se->ops->save_live_complete_precopy(f, se->opaque);
         trace_savevm_section_end(se->idstr, se->section_id, ret);
         save_section_footer(f, se);
         if (ret < 0) {
@@ -867,7 +1090,10 @@
         save_section_footer(f, se);
     }
 
-    qemu_put_byte(f, QEMU_VM_EOF);
+    if (!in_postcopy) {
+        /* Postcopy stream will still be going */
+        qemu_put_byte(f, QEMU_VM_EOF);
+    }
 
     json_end_array(vmdesc);
     qjson_finish(vmdesc);
@@ -883,10 +1109,19 @@
     qemu_fflush(f);
 }
 
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+/* Give an estimate of the amount left to be transferred;
+ * the result is split into the amount for units that can and
+ * for units that can't do postcopy.
+ */
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+                               uint64_t *res_non_postcopiable,
+                               uint64_t *res_postcopiable)
 {
     SaveStateEntry *se;
-    uint64_t ret = 0;
+
+    *res_non_postcopiable = 0;
+    *res_postcopiable = 0;
+
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_pending) {
@@ -897,9 +1132,9 @@
                 continue;
             }
         }
-        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+        se->ops->save_live_pending(f, se->opaque, max_size,
+                                   res_non_postcopiable, res_postcopiable);
     }
-    return ret;
 }
 
 void qemu_savevm_state_cleanup(void)
@@ -921,6 +1156,8 @@
         .blk = 0,
         .shared = 0
     };
+    MigrationState *ms = migrate_init(&params);
+    ms->file = f;
 
     if (qemu_savevm_state_blocked(errp)) {
         return -EINVAL;
@@ -932,18 +1169,18 @@
     qemu_mutex_lock_iothread();
 
     while (qemu_file_get_error(f) == 0) {
-        if (qemu_savevm_state_iterate(f) > 0) {
+        if (qemu_savevm_state_iterate(f, false) > 0) {
             break;
         }
     }
 
     ret = qemu_file_get_error(f);
     if (ret == 0) {
-        qemu_savevm_state_complete(f);
+        qemu_savevm_state_complete_precopy(f);
         ret = qemu_file_get_error(f);
     }
+    qemu_savevm_state_cleanup();
     if (ret != 0) {
-        qemu_savevm_state_cleanup();
         error_setg_errno(errp, -ret, "Error while writing VM state");
     }
     return ret;
@@ -1001,6 +1238,420 @@
     return NULL;
 }
 
+enum LoadVMExitCodes {
+    /* Allow a command to quit all layers of nested loadvm loops */
+    LOADVM_QUIT     =  1,
+};
+
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
+
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+    uint64_t remote_hps, remote_tps;
+
+    trace_loadvm_postcopy_handle_advise();
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (!postcopy_ram_supported_by_host()) {
+        return -1;
+    }
+
+    remote_hps = qemu_get_be64(mis->from_src_file);
+    if (remote_hps != getpagesize())  {
+        /*
+         * Some combinations of mismatch are probably possible but it gets
+         * a bit more complicated.  In particular we need to place whole
+         * host pages on the dest at once, and we need to ensure that we
+         * handle dirtying to make sure we never end up sending part of
+         * a hostpage on its own.
+         */
+        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
+                     (int)remote_hps, getpagesize());
+        return -1;
+    }
+
+    remote_tps = qemu_get_be64(mis->from_src_file);
+    if (remote_tps != (1ul << qemu_target_page_bits())) {
+        /*
+         * Again, some differences could be dealt with, but for now keep it
+         * simple.
+         */
+        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
+                     (int)remote_tps, 1 << qemu_target_page_bits());
+        return -1;
+    }
+
+    if (ram_postcopy_incoming_init(mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+
+    return 0;
+}
+
+/* After postcopy we will be told to throw some pages away since they're
+ * dirty and will have to be demand fetched.  Must happen before CPU is
+ * started.
+ * There can be 0..many of these messages, each encoding multiple pages.
+ */
+static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
+                                              uint16_t len)
+{
+    int tmp;
+    char ramid[256];
+    PostcopyState ps = postcopy_state_get();
+
+    trace_loadvm_postcopy_ram_handle_discard();
+
+    switch (ps) {
+    case POSTCOPY_INCOMING_ADVISE:
+        /* 1st discard */
+        tmp = postcopy_ram_prepare_discard(mis);
+        if (tmp) {
+            return tmp;
+        }
+        break;
+
+    case POSTCOPY_INCOMING_DISCARD:
+        /* Expected state */
+        break;
+
+    default:
+        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
+                     ps);
+        return -1;
+    }
+    /* We're expecting a
+     *    Version (0)
+     *    a RAM ID string (length byte, name, 0 term)
+     *    then at least one 16-byte chunk
+     */
+    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != postcopy_ram_discard_version) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
+        return -1;
+    }
+
+    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
+        return -1;
+    }
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != 0) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
+        return -1;
+    }
+
+    len -= 3 + strlen(ramid);
+    if (len % 16) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
+    while (len) {
+        uint64_t start_addr, block_length;
+        start_addr = qemu_get_be64(mis->from_src_file);
+        block_length = qemu_get_be64(mis->from_src_file);
+
+        len -= 16;
+        int ret = ram_discard_range(mis, ramid, start_addr,
+                                    block_length);
+        if (ret) {
+            return ret;
+        }
+    }
+    trace_loadvm_postcopy_ram_handle_discard_end();
+
+    return 0;
+}
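+
+/*
+ * Note (editorial): this is the receiving side of
+ * qemu_savevm_send_postcopy_ram_discard(); its minimum-length check
+ * corresponds to the 1 byte version, 1 byte name length, at least a 1 byte
+ * name, the nil terminator, and one 16-byte (start, length) pair.
+ */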
+
+/*
+ * Triggered by a postcopy_listen command; this thread takes over reading
+ * the input stream, leaving the main thread free to carry on loading the rest
+ * of the device state (from RAM).
+ * (TODO: This could do with being in a postcopy file - but then again it's
+ * just another input loop, not that postcopy specific)
+ */
+static void *postcopy_ram_listen_thread(void *opaque)
+{
+    QEMUFile *f = opaque;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    int load_res;
+
+    qemu_sem_post(&mis->listen_thread_sem);
+    trace_postcopy_ram_listen_thread_start();
+
+    /*
+     * Because we're a thread and not a coroutine we can't yield
+     * in qemu_file, and thus we must be blocking now.
+     */
+    qemu_file_set_blocking(f, true);
+    load_res = qemu_loadvm_state_main(f, mis);
+    /* And non-blocking again so we don't block in any cleanup */
+    qemu_file_set_blocking(f, false);
+
+    trace_postcopy_ram_listen_thread_exit();
+    if (load_res < 0) {
+        error_report("%s: loadvm failed: %d", __func__, load_res);
+        qemu_file_set_error(f, load_res);
+    } else {
+        /*
+         * This looks good, but it's possible that the device loading in the
+         * main thread hasn't finished yet, and so we might not be in 'RUN'
+         * state yet; wait for the end of the main thread.
+         */
+        qemu_event_wait(&mis->main_thread_load_event);
+    }
+    postcopy_ram_incoming_cleanup(mis);
+    /*
+     * If everything has worked fine, then the main thread has waited
+     * for us to start, and we're the last use of the mis.
+     * (If something broke then qemu will have to exit anyway since it's
+     * got a bad migration state).
+     */
+    migration_incoming_state_destroy();
+
+    if (load_res < 0) {
+        /*
+         * If something went wrong then we have a bad state so exit;
+         * depending how far we got it might be possible at this point
+         * to leave the guest running and fire MCEs for pages that never
+         * arrived as a desperate recovery step.
+         */
+        exit(EXIT_FAILURE);
+    }
+
+    return NULL;
+}
+
+/* After this message we must be able to immediately receive postcopy data */
+static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
+    trace_loadvm_postcopy_handle_listen();
+    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
+        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+    if (ps == POSTCOPY_INCOMING_ADVISE) {
+        /*
+         * A rare case, we entered listen without having to do any discards,
+         * so do the setup that's normally done at the time of the 1st discard.
+         */
+        if (postcopy_ram_prepare_discard(mis)) {
+            return -1;
+        }
+    }
+
+    /*
+     * Sensitise RAM - we can now generate requests for blocks that don't
+     * exist.  However, at this point the CPU shouldn't be running, and the
+     * IO shouldn't be doing anything yet, so don't actually expect requests.
+     */
+    if (postcopy_ram_enable_notify(mis)) {
+        return -1;
+    }
+
+    if (mis->have_listen_thread) {
+        error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
+        return -1;
+    }
+
+    mis->have_listen_thread = true;
+    /* Start up the listening thread and wait for it to signal ready */
+    qemu_sem_init(&mis->listen_thread_sem, 0);
+    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
+                       postcopy_ram_listen_thread, mis->from_src_file,
+                       QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->listen_thread_sem);
+    qemu_sem_destroy(&mis->listen_thread_sem);
+
+    return 0;
+}
+
+/* After all discards we can start running and asking for pages */
+static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
+    Error *local_err = NULL;
+
+    trace_loadvm_postcopy_handle_run();
+    if (ps != POSTCOPY_INCOMING_LISTENING) {
+        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    /* TODO: we should move all of this lot into postcopy_ram.c or shared code
+     * in migration.c
+     */
+    qemu_announce_self();
+
+    /* Make sure all file formats flush their mutable metadata */
+    bdrv_invalidate_cache_all(&local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    trace_loadvm_postcopy_handle_run_cpu_sync();
+    cpu_synchronize_all_post_init();
+
+    trace_loadvm_postcopy_handle_run_vmstart();
+
+    if (autostart) {
+        /* Hold onto your hats, starting the CPU */
+        vm_start();
+    } else {
+        /* leave it paused and let management decide when to start the CPU */
+        runstate_set(RUN_STATE_PAUSED);
+    }
+
+    /* We need to finish reading the stream from the package
+     * and also stop reading anything more from the stream that loaded the
+     * package (since it's now being read by the listener thread).
+     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
+     */
+    return LOADVM_QUIT;
+}
+
+/**
+ * Immediately following this command is a blob of data containing an embedded
+ * chunk of migration stream; read it and load it.
+ *
+ * @mis: Incoming state
+ *
+ * The length of the packaged data is read from the stream as a 32-bit
+ * big-endian value, followed by that many bytes of embedded migration stream.
+ *
+ * Returns: Negative values on error
+ *
+ */
+static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
+{
+    int ret;
+    uint8_t *buffer;
+    uint32_t length;
+    QEMUSizedBuffer *qsb;
+
+    length = qemu_get_be32(mis->from_src_file);
+    trace_loadvm_handle_cmd_packaged(length);
+
+    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("Unreasonably large packaged state: %u", length);
+        return -1;
+    }
+    buffer = g_malloc0(length);
+    ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length);
+    if (ret != length) {
+        g_free(buffer);
+        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d\n",
+                ret, length);
+        return (ret < 0) ? ret : -EAGAIN;
+    }
+    trace_loadvm_handle_cmd_packaged_received(ret);
+
+    /* Set up a dummy QEMUFile that actually reads from the buffer */
+    qsb = qsb_create(buffer, length);
+    g_free(buffer); /* Because qsb_create copies */
+    if (!qsb) {
+        error_report("Unable to create qsb");
+        return -ENOMEM;
+    }
+    QEMUFile *packf = qemu_bufopen("r", qsb);
+
+    ret = qemu_loadvm_state_main(packf, mis);
+    trace_loadvm_handle_cmd_packaged_main(ret);
+    qemu_fclose(packf);
+    qsb_free(qsb);
+
+    return ret;
+}
+
+/*
+ * Process an incoming 'QEMU_VM_COMMAND'
+ * Returns:
+ *   0           just a normal return
+ *   LOADVM_QUIT All good, but exit the loop
+ *   <0          Error
+ */
+static int loadvm_process_command(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    uint16_t cmd;
+    uint16_t len;
+    uint32_t tmp32;
+
+    cmd = qemu_get_be16(f);
+    len = qemu_get_be16(f);
+
+    trace_loadvm_process_command(cmd, len);
+    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
+        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
+        return -EINVAL;
+    }
+
+    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
+        error_report("%s received with bad length - expecting %zu, got %d",
+                     mig_cmd_args[cmd].name,
+                     (size_t)mig_cmd_args[cmd].len, len);
+        return -ERANGE;
+    }
+
+    switch (cmd) {
+    case MIG_CMD_OPEN_RETURN_PATH:
+        if (mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
+            /* Not really a problem, so don't give up */
+            return 0;
+        }
+        mis->to_src_file = qemu_file_get_return_path(f);
+        if (!mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH failed");
+            return -1;
+        }
+        break;
+
+    case MIG_CMD_PING:
+        tmp32 = qemu_get_be32(f);
+        trace_loadvm_process_command_ping(tmp32);
+        if (!mis->to_src_file) {
+            error_report("CMD_PING (0x%x) received with no return path",
+                         tmp32);
+            return -1;
+        }
+        migrate_send_rp_pong(mis, tmp32);
+        break;
+
+    case MIG_CMD_PACKAGED:
+        return loadvm_handle_cmd_packaged(mis);
+
+    case MIG_CMD_POSTCOPY_ADVISE:
+        return loadvm_postcopy_handle_advise(mis);
+
+    case MIG_CMD_POSTCOPY_LISTEN:
+        return loadvm_postcopy_handle_listen(mis);
+
+    case MIG_CMD_POSTCOPY_RUN:
+        return loadvm_postcopy_handle_run(mis);
+
+    case MIG_CMD_POSTCOPY_RAM_DISCARD:
+        return loadvm_postcopy_ram_handle_discard(mis, len);
+    }
+
+    return 0;
+}
+
 struct LoadStateEntry {
     QLIST_ENTRY(LoadStateEntry) entry;
     SaveStateEntry *se;
@@ -1053,14 +1704,113 @@
     }
 }
 
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
+{
+    uint8_t section_type;
+    int ret;
+
+    while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
+        uint32_t instance_id, version_id, section_id;
+        SaveStateEntry *se;
+        LoadStateEntry *le;
+        char idstr[256];
+
+        trace_qemu_loadvm_state_section(section_type);
+        switch (section_type) {
+        case QEMU_VM_SECTION_START:
+        case QEMU_VM_SECTION_FULL:
+            /* Read section start */
+            section_id = qemu_get_be32(f);
+            if (!qemu_get_counted_string(f, idstr)) {
+                error_report("Unable to read ID string for section %u",
+                            section_id);
+                return -EINVAL;
+            }
+            instance_id = qemu_get_be32(f);
+            version_id = qemu_get_be32(f);
+
+            trace_qemu_loadvm_state_section_startfull(section_id, idstr,
+                                                      instance_id, version_id);
+            /* Find savevm section */
+            se = find_se(idstr, instance_id);
+            if (se == NULL) {
+                error_report("Unknown savevm section or instance '%s' %d",
+                             idstr, instance_id);
+                return -EINVAL;
+            }
+
+            /* Validate version */
+            if (version_id > se->version_id) {
+                error_report("savevm: unsupported version %d for '%s' v%d",
+                             version_id, idstr, se->version_id);
+                return -EINVAL;
+            }
+
+            /* Add entry */
+            le = g_malloc0(sizeof(*le));
+
+            le->se = se;
+            le->section_id = section_id;
+            le->version_id = version_id;
+            QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry);
+
+            ret = vmstate_load(f, le->se, le->version_id);
+            if (ret < 0) {
+                error_report("error while loading state for instance 0x%x of"
+                             " device '%s'", instance_id, idstr);
+                return ret;
+            }
+            if (!check_section_footer(f, le)) {
+                return -EINVAL;
+            }
+            break;
+        case QEMU_VM_SECTION_PART:
+        case QEMU_VM_SECTION_END:
+            section_id = qemu_get_be32(f);
+
+            trace_qemu_loadvm_state_section_partend(section_id);
+            QLIST_FOREACH(le, &mis->loadvm_handlers, entry) {
+                if (le->section_id == section_id) {
+                    break;
+                }
+            }
+            if (le == NULL) {
+                error_report("Unknown savevm section %d", section_id);
+                return -EINVAL;
+            }
+
+            ret = vmstate_load(f, le->se, le->version_id);
+            if (ret < 0) {
+                error_report("error while loading state section id %d(%s)",
+                             section_id, le->se->idstr);
+                return ret;
+            }
+            if (!check_section_footer(f, le)) {
+                return -EINVAL;
+            }
+            break;
+        case QEMU_VM_COMMAND:
+            ret = loadvm_process_command(f);
+            trace_qemu_loadvm_state_section_command(ret);
+            if ((ret < 0) || (ret & LOADVM_QUIT)) {
+                return ret;
+            }
+            break;
+        default:
+            error_report("Unknown savevm section type %d", section_type);
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
 int qemu_loadvm_state(QEMUFile *f)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     Error *local_err = NULL;
-    uint8_t section_type;
     unsigned int v;
     int ret;
-    int file_error_after_eof = -1;
 
     if (qemu_savevm_state_blocked(&local_err)) {
         error_report_err(local_err);
@@ -1095,99 +1845,19 @@
         }
     }
 
-    while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
-        uint32_t instance_id, version_id, section_id;
-        SaveStateEntry *se;
-        LoadStateEntry *le;
-        char idstr[256];
+    ret = qemu_loadvm_state_main(f, mis);
+    qemu_event_set(&mis->main_thread_load_event);
 
-        trace_qemu_loadvm_state_section(section_type);
-        switch (section_type) {
-        case QEMU_VM_SECTION_START:
-        case QEMU_VM_SECTION_FULL:
-            /* Read section start */
-            section_id = qemu_get_be32(f);
-            if (!qemu_get_counted_string(f, idstr)) {
-                error_report("Unable to read ID string for section %u",
-                            section_id);
-                return -EINVAL;
-            }
-            instance_id = qemu_get_be32(f);
-            version_id = qemu_get_be32(f);
+    trace_qemu_loadvm_state_post_main(ret);
 
-            trace_qemu_loadvm_state_section_startfull(section_id, idstr,
-                                                      instance_id, version_id);
-            /* Find savevm section */
-            se = find_se(idstr, instance_id);
-            if (se == NULL) {
-                error_report("Unknown savevm section or instance '%s' %d",
-                             idstr, instance_id);
-                ret = -EINVAL;
-                goto out;
-            }
-
-            /* Validate version */
-            if (version_id > se->version_id) {
-                error_report("savevm: unsupported version %d for '%s' v%d",
-                             version_id, idstr, se->version_id);
-                ret = -EINVAL;
-                goto out;
-            }
-
-            /* Add entry */
-            le = g_malloc0(sizeof(*le));
-
-            le->se = se;
-            le->section_id = section_id;
-            le->version_id = version_id;
-            QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry);
-
-            ret = vmstate_load(f, le->se, le->version_id);
-            if (ret < 0) {
-                error_report("error while loading state for instance 0x%x of"
-                             " device '%s'", instance_id, idstr);
-                goto out;
-            }
-            if (!check_section_footer(f, le)) {
-                ret = -EINVAL;
-                goto out;
-            }
-            break;
-        case QEMU_VM_SECTION_PART:
-        case QEMU_VM_SECTION_END:
-            section_id = qemu_get_be32(f);
-
-            trace_qemu_loadvm_state_section_partend(section_id);
-            QLIST_FOREACH(le, &mis->loadvm_handlers, entry) {
-                if (le->section_id == section_id) {
-                    break;
-                }
-            }
-            if (le == NULL) {
-                error_report("Unknown savevm section %d", section_id);
-                ret = -EINVAL;
-                goto out;
-            }
-
-            ret = vmstate_load(f, le->se, le->version_id);
-            if (ret < 0) {
-                error_report("error while loading state section id %d(%s)",
-                             section_id, le->se->idstr);
-                goto out;
-            }
-            if (!check_section_footer(f, le)) {
-                ret = -EINVAL;
-                goto out;
-            }
-            break;
-        default:
-            error_report("Unknown savevm section type %d", section_type);
-            ret = -EINVAL;
-            goto out;
-        }
+    if (mis->have_listen_thread) {
+        /* Listen thread still going, can't clean up yet */
+        return ret;
     }
 
-    file_error_after_eof = qemu_file_get_error(f);
+    if (ret == 0) {
+        ret = qemu_file_get_error(f);
+    }
 
     /*
      * Try to read in the VMDESC section as well, so that dumping tools that
@@ -1199,10 +1869,10 @@
      * We also mustn't read data that isn't there; some transports (RDMA)
      * will stall waiting for that data when the source has already closed.
      */
-    if (should_send_vmdesc()) {
+    if (ret == 0 && should_send_vmdesc()) {
         uint8_t *buf;
         uint32_t size;
-        section_type = qemu_get_byte(f);
+        uint8_t section_type = qemu_get_byte(f);
 
         if (section_type != QEMU_VM_VMDESCRIPTION) {
             error_report("Expected vmdescription section, but got %d",
@@ -1226,14 +1896,6 @@
 
     cpu_synchronize_all_post_init();
 
-    ret = 0;
-
-out:
-    if (ret == 0) {
-        /* We may not have a VMDESC section, so ignore relative errors */
-        ret = file_error_after_eof;
-    }
-
     return ret;
 }
 
diff --git a/qapi-schema.json b/qapi-schema.json
index e18f14c..8c3a42a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -430,6 +430,8 @@
 #
 # @active: in the process of doing migration.
 #
+# @postcopy-active: like active, but now in postcopy mode. (since 2.5)
+#
 # @completed: migration is finished.
 #
 # @failed: some error occurred during migration process.
@@ -439,7 +441,7 @@
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed' ] }
 
 ##
 # @MigrationInfo
@@ -540,11 +542,15 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #          to speed up convergence of RAM migration. (since 1.6)
 #
+# @x-postcopy-ram: Start executing on the migration target before all of RAM has
+#          been migrated, pulling the remaining pages along as needed. NOTE: If
+#          the migration fails during the postcopy phase, the guest cannot be
+#          recovered on either side.  (since 2.5)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
-           'compress', 'events'] }
+           'compress', 'events', 'x-postcopy-ram'] }
 
 ##
 # @MigrationCapabilityStatus
@@ -698,6 +704,14 @@
             '*tls-port': 'int', '*cert-subject': 'str' } }
 
 ##
+# @migrate-start-postcopy
+#
+# Switch migration to postcopy mode
+#
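+# Notes: The 'x-postcopy-ram' migration capability must be enabled before
+#        the migration is started for this command to have any effect.
+#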
+# Since: 2.5
+##
+{ 'command': 'migrate-start-postcopy' }
+
+##
 # @MouseInfo:
 #
 # Information about a mouse device.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index d7cf0ff..7f85d40 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -718,6 +718,25 @@
 
 EQMP
     {
+        .name       = "migrate-start-postcopy",
+        .args_type  = "",
+        .mhandler.cmd_new = qmp_marshal_migrate_start_postcopy,
+    },
+
+SQMP
+migrate-start-postcopy
+----------------------
+
+Switch an in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+
+Example:
+-> { "execute": "migrate-start-postcopy" }
+<- { "return": {} }
+
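+The 'x-postcopy-ram' capability must have been enabled before the migration
+was started; a typical sequence looks like (the destination URI shown here is
+illustrative only):
+
+-> { "execute": "migrate-set-capabilities",
+     "arguments": { "capabilities": [
+         { "capability": "x-postcopy-ram", "state": true } ] } }
+<- { "return": {} }
+-> { "execute": "migrate", "arguments": { "uri": "tcp:dest:4444" } }
+<- { "return": {} }
+-> { "execute": "migrate-start-postcopy" }
+<- { "return": {} }
+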
+EQMP
+
+    {
         .name       = "query-migrate-cache-size",
         .args_type  = "",
         .mhandler.cmd_new = qmp_marshal_query_migrate_cache_size,
diff --git a/qtest.c b/qtest.c
index 8e10340..05cefd2 100644
--- a/qtest.c
+++ b/qtest.c
@@ -657,7 +657,6 @@
 
     inbuf = g_string_new("");
     qtest_chr = chr;
-    page_size_init();
 }
 
 bool qtest_driver(void)
diff --git a/trace-events b/trace-events
index ea2d32e..ef6bc41 100644
--- a/trace-events
+++ b/trace-events
@@ -1202,16 +1202,43 @@
 
 # migration/savevm.c
 qemu_loadvm_state_section(unsigned int section_type) "%d"
+qemu_loadvm_state_section_command(int ret) "%d"
 qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
+qemu_loadvm_state_main(void) ""
+qemu_loadvm_state_main_quit_parent(void) ""
+qemu_loadvm_state_post_main(int ret) "%d"
 qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
+qemu_savevm_send_packaged(void) ""
+loadvm_handle_cmd_packaged(unsigned int length) "%u"
+loadvm_handle_cmd_packaged_main(int ret) "%d"
+loadvm_handle_cmd_packaged_received(int ret) "%d"
+loadvm_postcopy_handle_advise(void) ""
+loadvm_postcopy_handle_listen(void) ""
+loadvm_postcopy_handle_run(void) ""
+loadvm_postcopy_handle_run_cpu_sync(void) ""
+loadvm_postcopy_handle_run_vmstart(void) ""
+loadvm_postcopy_ram_handle_discard(void) ""
+loadvm_postcopy_ram_handle_discard_end(void) ""
+loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %u"
+loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
+loadvm_process_command_ping(uint32_t val) "%x"
+postcopy_ram_listen_thread_exit(void) ""
+postcopy_ram_listen_thread_start(void) ""
+qemu_savevm_send_postcopy_advise(void) ""
+qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %u"
+savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d"
 savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
 savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
 savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
+savevm_send_open_return_path(void) ""
+savevm_send_ping(uint32_t val) "%x"
+savevm_send_postcopy_listen(void) ""
+savevm_send_postcopy_run(void) ""
 savevm_state_begin(void) ""
 savevm_state_header(void) ""
 savevm_state_iterate(void) ""
-savevm_state_complete(void) ""
 savevm_state_cleanup(void) ""
+savevm_state_complete_precopy(void) ""
 vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
 vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
 qemu_announce_self_iter(const char *mac) "%s"
@@ -1229,9 +1256,14 @@
 qemu_file_fclose(void) ""
 
 # migration/ram.c
+get_queued_page(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr) "%s/%" PRIx64 " ram_addr=%" PRIx64
+get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr, int sent) "%s/%" PRIx64 " ram_addr=%" PRIx64 " (sent=%d)"
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
 migration_throttle(void) ""
+ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
+ram_postcopy_send_discard_bitmap(void) ""
+ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
 
 # hw/display/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
@@ -1421,17 +1453,40 @@
 flic_reset_failed(int err) "flic: reset failed %d"
 
 # migration.c
+await_return_path_close_on_source_close(void) ""
+await_return_path_close_on_source_joining(void) ""
 migrate_set_state(int new_state) "new state %d"
 migrate_fd_cleanup(void) ""
 migrate_fd_error(void) ""
 migrate_fd_cancel(void) ""
-migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
-migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
-migrate_state_too_big(void) ""
+migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx"
+migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
+migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
+migration_completion_file_err(void) ""
+migration_completion_postcopy_end(void) ""
+migration_completion_postcopy_end_after_complete(void) ""
+migration_completion_postcopy_end_before_rp(void) ""
+migration_completion_postcopy_end_after_rp(int rp_error) "%d"
+migration_thread_after_loop(void) ""
+migration_thread_file_err(void) ""
+migration_thread_setup_complete(void) ""
+open_return_path_on_source(void) ""
+open_return_path_on_source_continue(void) ""
+postcopy_start(void) ""
+postcopy_start_set_run(void) ""
+source_return_path_thread_bad_end(void) ""
+source_return_path_thread_end(void) ""
+source_return_path_thread_entry(void) ""
+source_return_path_thread_loop_top(void) ""
+source_return_path_thread_pong(uint32_t val) "%x"
+source_return_path_thread_shut(uint32_t val) "%x"
 migrate_global_state_post_load(const char *state) "loaded state: %s"
 migrate_global_state_pre_save(const char *state) "saved state: %s"
-migration_completion_file_err(void) ""
 migration_thread_low_pending(uint64_t pending) "%" PRIu64
+migrate_state_too_big(void) ""
+migrate_transferred(uint64_t transferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRIu64
+process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
+process_incoming_migration_co_postcopy_end_main(void) ""
 
 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
@@ -1497,6 +1552,25 @@
 rdma_start_outgoing_migration_after_rdma_connect(void) ""
 rdma_start_outgoing_migration_after_rdma_source_init(void) ""
 
+# migration/postcopy-ram.c
+postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
+postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
+postcopy_ram_discard_range(void *start, size_t length) "%p,+%zx"
+postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_place_page(void *host_addr) "host=%p"
+postcopy_place_page_zero(void *host_addr) "host=%p"
+postcopy_ram_enable_notify(void) ""
+postcopy_ram_fault_thread_entry(void) ""
+postcopy_ram_fault_thread_exit(void) ""
+postcopy_ram_fault_thread_quit(void) ""
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_incoming_cleanup_closeuf(void) ""
+postcopy_ram_incoming_cleanup_entry(void) ""
+postcopy_ram_incoming_cleanup_exit(void) ""
+postcopy_ram_incoming_cleanup_join(void) ""
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
diff --git a/vl.c b/vl.c
index 21e8876..7d993a5 100644
--- a/vl.c
+++ b/vl.c
@@ -4285,6 +4285,7 @@
         exit(1);
     }
 
+    page_size_init();
     socket_init();
 
     if (qemu_opts_foreach(qemu_find_opts("object"),