Merge "Fix x86 KVM snapshot crash on save, and inability to load snapshots."
diff --git a/docs/QEMU-MEMORY-MANAGEMENT.TXT b/docs/QEMU-MEMORY-MANAGEMENT.TXT
new file mode 100644
index 0000000..2f6e38a
--- /dev/null
+++ b/docs/QEMU-MEMORY-MANAGEMENT.TXT
@@ -0,0 +1,223 @@
+An overview of memory management in QEMU:
+
+I. RAM Management:
+==================
+
+I.1. RAM Address space:
+-----------------------
+
+All pages of virtual RAM used by QEMU at runtime are allocated from
+contiguous blocks in a specific abstract "RAM address space".
+|ram_addr_t| is the type of block addresses in this space.
+
+A single block of contiguous RAM is allocated with 'qemu_ram_alloc()', which
+takes a size in bytes, and allocates the pages through mmap() in the QEMU
+host process. It also sets up the corresponding KVM / Xen / HAX mappings,
+depending on each accelerator's specific needs.
+
+Each block has a name, which is used for snapshot support.
+
+'qemu_ram_alloc_from_ptr()' can also be used to allocate a new RAM
+block, by passing its content explicitly (can be useful for pages of
+ROM).
+
+'qemu_get_ram_ptr()' will translate a 'ram_addr_t' into the corresponding
+address in the QEMU host process. 'qemu_ram_addr_from_host()' does the
+opposite (i.e. translates a host address into a ram_addr_t if possible,
+or returns an error).
+
+Note that ram_addr_t addresses are an internal implementation detail of
+QEMU, i.e. the virtual CPU never sees their values directly; it relies
+instead on addresses in its virtual physical address space, described
+in section II. below.
+
+As an example, when emulating an Android/x86 virtual device, the following
+RAM space is being used:
+
+ 0x0000_0000 ... 0x1000_0000 "pc.ram"
+ 0x1000_0000 ... 0x1002_0000 "bios.bin"
+ 0x1002_0000 ... 0x1004_0000 "pc.rom"
+
+
+I.2. RAM Dirty tracking:
+------------------------
+
+QEMU also associates with each RAM page an 8-bit 'dirty' bitmap. The
+main idea is that whenever a page is written to, the value 0xff is
+written to the page's 'dirty' bitmap. Various clients can later inspect
+some of the flags and clear them. For example:
+
+ VGA_DIRTY_FLAG (0x1) is typically used by framebuffer drivers to detect
+ which pages of video RAM were touched since the latest VSYNC. The driver
+ typically copies the pixel values to the real QEMU output, then clears
+ the bits. This is very useful to avoid needless copies if nothing
+ changed in the framebuffer.
+
+ MIGRATION_DIRTY_FLAG (0x8) is used to track modified RAM pages during
+ live migration (i.e. moving a QEMU virtual machine from one host to
+ another).
+
+ CODE_DIRTY_FLAG (0x2) is a bit more special, and is used to support
+ self-modifying code properly. More on this later.
+
+
+II. The physical address space:
+===============================
+
+Represents the address space that the virtual CPU can read from / write to.
+|hwaddr| is the type of addresses in this space, which is decomposed
+into 'pages'. Each page in the address space is either unassigned, or
+mapped to a specific kind of memory region.
+
+See |phys_page_find()| and |phys_page_find_alloc()| in translate-all.c for
+the implementation details.
+
+
+II.1. Memory region types:
+--------------------------
+
+There are several memory region types:
+
+ - Regions of RAM pages.
+ - Regions of ROM pages (similar to RAM, but cannot be written to).
+ - Regions of I/O pages, used to communicate with virtual hardware.
+
+Virtual devices can register a new I/O region type by calling
+|cpu_register_io_memory()|. This function allows them to provide
+callbacks that will be invoked every time the virtual CPU reads from
+or writes to any page of the corresponding type.
+
+The memory region type of a given page is encoded using PAGE_BITS bits
+in the following format:
+
+ +-------------------------------+
+ | mem_type_index | flags |
+ +-------------------------------+
+
+Where |mem_type_index| is a unique value identifying a given memory
+region type, and |flags| is a 3-bit bitmap used to store flags that are
+only relevant for I/O pages.
+
+The following memory region type values are important:
+
+ IO_MEM_RAM (mem_type_index=0, flags=0):
+ Used for regular RAM pages, always all zero on purpose.
+
+ IO_MEM_ROM (mem_type_index=1, flags=0):
+ Used for ROM pages.
+
+ IO_MEM_UNASSIGNED (mem_type_index=2, flags=0):
+ Used to identify unassigned pages of the physical address space.
+
+ IO_MEM_NOTDIRTY (mem_type_index=3, flags=0):
+ Used to implement tracking of dirty RAM pages. This is essentially
+ used for RAM pages that have not been written to yet.
+
+Any mem_type_index value of 4 or higher corresponds to a device-specific
+I/O memory region type (i.e. with custom read/write callbacks and a
+corresponding 'opaque' value), and can also use the following bits
+in |flags|:
+
+ IO_MEM_ROMD (0x1):
+ Used for ROM-like I/O pages, i.e. they are backed by a page from
+ the RAM address space, but writing to them triggers a device-specific
+ write callback (instead of being ignored or faulting the CPU).
+
+ IO_MEM_SUBPAGE (0x02):
+ Used to indicate that not all addresses in this page map to the same
+ I/O region type / callbacks.
+
+ IO_MEM_SUBWIDTH (0x04):
+ Probably obsolete. Set to indicate that the corresponding I/O region
+ type doesn't support reading/writing values of all possible sizes
+ (1, 2 and 4 bytes). This seems to be never used by the current code.
+
+Note that cpu_register_io_memory() returns a new memory region type value.
+
+II.2. Physical address map:
+---------------------------
+
+QEMU maintains for each assigned page in the physical address space
+two values:
+
+ |phys_offset|, a combination of ram address and memory region type.
+
+ |region_offset|, an optional offset into the region backing the
+ page. This is only useful for I/O pages.
+
+The |phys_offset| value has several interesting encodings which require
+further clarification:
+
+ - Generally speaking, a phys_offset value is decomposed into
+ the following bit fields:
+
+ +-----------------------------------------------------+
+ | high_addr | mem_type |
+ +-----------------------------------------------------+
+
+ where |mem_type| is a PAGE_BITS memory region type as described
+ previously, and |high_addr| may contain the high bits of a
+ ram_addr_t address for RAM-backed pages.
+
+More specifically:
+
+ - Unassigned pages always have the special value IO_MEM_UNASSIGNED
+ (high_addr=0, mem_type=IO_MEM_UNASSIGNED)
+
+ - RAM pages have mem_type=0 (i.e. IO_MEM_RAM) while high_addr holds
+ the high bits of the corresponding ram_addr_t. Hence, a simple call to
+ qemu_get_ram_ptr(phys_offset) will return the corresponding
+ address in host QEMU memory.
+
+ This is the reason why IO_MEM_RAM is always 0:
+
+ RAM page phys_offset value:
+ +-----------------------------------------------------+
+ | high_addr | 0 |
+ +-----------------------------------------------------+
+
+
+ - ROM pages are like RAM pages, but have mem_type=IO_MEM_ROM.
+ QEMU ensures that writing to such a page is a no-op, except that on
+ some target architectures, like Sparc, it may cause a CPU fault.
+
+ ROM page phys_offset value:
+ +-----------------------------------------------------+
+ | high_addr | IO_MEM_ROM |
+ +-----------------------------------------------------+
+
+ - Dirty RAM page tracking is implemented by using special
+ phys_offset values with mem_type=IO_MEM_NOTDIRTY. Note that these
+ values do not appear directly in the physical page map, but in
+ the CPU TLB cache (explained later).
+
+ non-dirty RAM page phys_offset value (CPU TLB cache only):
+ +-----------------------------------------------------+
+ | high_addr | IO_MEM_NOTDIRTY |
+ +-----------------------------------------------------+
+
+ - Other pages are I/O pages, and their high_addr value will
+ be 0 / ignored:
+
+ I/O page phys_offset value:
+ +----------------------------------------------------------+
+ | 0 | mem_type_index | flags |
+ +----------------------------------------------------------+
+
+ Note that when reading from or writing to I/O pages, the lowest
+ PAGE_BITS bits of the corresponding hwaddr value will be added
+ to the page's |region_offset| value. This new address is passed
+ to the read/write callback as the 'i/o address' for the operation.
+
+ - As a special exception, if the I/O page's IO_MEM_ROMD flag is
+ set, then high_addr is not 0, but the high bits of the corresponding
+ ram_addr_t backing the page's contents on reads. On write operations
+ though, the I/O region type's write callback will be called instead.
+
+ ROMD I/O page phys_offset value:
+ +----------------------------------------------------------+
+ | high_addr | mem_type_index | flags |
+ +----------------------------------------------------------+
+
+ Note that |region_offset| is ignored when reading from such pages,
+ it's only used when writing to the I/O page.
diff --git a/exec.c b/exec.c
index a926760..7b4dacd 100644
--- a/exec.c
+++ b/exec.c
@@ -1143,6 +1143,15 @@
}
new_block->length = size;
+ if (dev) {
+ char *id = qdev_get_dev_path(dev);
+ if (id) {
+ snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
+ g_free(id);
+ }
+ }
+ pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
+
/* Keep the list sorted from biggest to smallest block. */
QTAILQ_FOREACH(block, &ram_list.blocks, next) {
if (block->length < new_block->length) {
@@ -1168,7 +1177,7 @@
//qemu_ram_setup_dump(new_block->host, size);
//qemu_madvise(new_block->host, size, QEMU_MADV_HUGEPAGE);
//qemu_madvise(new_block->host, size, QEMU_MADV_DONTFORK);
-
+
if (kvm_enabled())
kvm_setup_guest_memory(new_block->host, size);
diff --git a/kvm-all.c b/kvm-all.c
index 8bdd110..917cfcd 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -86,15 +86,13 @@
static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
hwaddr start_addr,
- hwaddr end_addr)
+ ram_addr_t size)
{
int i;
for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
KVMSlot *mem = &s->slots[i];
-
- if (start_addr == mem->start_addr &&
- end_addr == mem->start_addr + mem->memory_size) {
+ if (start_addr == mem->start_addr && size == mem->memory_size) {
return mem;
}
}
@@ -107,7 +105,7 @@
*/
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
hwaddr start_addr,
- hwaddr end_addr)
+ ram_addr_t size)
{
KVMSlot *found = NULL;
int i;
@@ -115,15 +113,29 @@
for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
KVMSlot *mem = &s->slots[i];
- if (mem->memory_size == 0 ||
- (found && found->start_addr < mem->start_addr)) {
+ // Skip empty slots.
+ if (!mem->memory_size)
+ continue;
+
+ // Skip non-overlapping slots, conditions are:
+ // start_addr + size <= mem->start_addr ||
+ // start_addr >= mem->start_addr + mem->memory_size
+ //
+ // However, we want to avoid wrapping errors, so avoid
+ // additions and only compare positive values.
+ if (start_addr <= mem->start_addr) {
+ if (mem->start_addr - start_addr >= size) {
+ continue;
+ }
+ } else if (start_addr - mem->start_addr >= mem->memory_size) {
continue;
}
- if (end_addr > mem->start_addr &&
- start_addr < mem->start_addr + mem->memory_size) {
- found = mem;
+ if (found && found->start_addr < mem->start_addr) {
+ continue;
}
+
+ found = mem;
}
return found;
@@ -224,7 +236,7 @@
ram_addr_t size, int flags, int mask)
{
KVMState *s = kvm_state;
- KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
+ KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, size);
int old_flags;
if (mem == NULL) {
@@ -307,19 +319,19 @@
d.dirty_bitmap = NULL;
while (start_addr < end_addr) {
- mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
+ ram_addr_t start_size = (ram_addr_t)(end_addr - start_addr);
+
+ mem = kvm_lookup_overlapping_slot(s, start_addr, start_size);
if (mem == NULL) {
break;
}
size = ((mem->memory_size >> TARGET_PAGE_BITS) + 7) / 8;
- if (!d.dirty_bitmap) {
- d.dirty_bitmap = g_malloc(size);
- } else if (size > allocated_size) {
+ if (size > allocated_size) {
d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
+ allocated_size = size;
}
- allocated_size = size;
- memset(d.dirty_bitmap, 0, allocated_size);
+ memset(d.dirty_bitmap, 0, size);
d.slot = mem->slot;
@@ -330,7 +342,7 @@
}
for (phys_addr = mem->start_addr, addr = mem->phys_offset;
- phys_addr < mem->start_addr + mem->memory_size;
+ phys_addr - mem->start_addr < mem->memory_size;
phys_addr += TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
unsigned long *bitmap = (unsigned long *)d.dirty_bitmap;
unsigned nr = (phys_addr - mem->start_addr) >> TARGET_PAGE_BITS;
@@ -342,6 +354,11 @@
}
}
start_addr = phys_addr;
+ if (!start_addr) {
+ // Handle wrap-around, which happens when a slot is mapped
+ // at the end of the physical address space.
+ break;
+ }
}
g_free(d.dirty_bitmap);
@@ -683,8 +700,7 @@
if (start_addr & ~TARGET_PAGE_MASK) {
if (flags >= IO_MEM_UNASSIGNED) {
- if (!kvm_lookup_overlapping_slot(s, start_addr,
- start_addr + size)) {
+ if (!kvm_lookup_overlapping_slot(s, start_addr, size)) {
return;
}
fprintf(stderr, "Unaligned split of a KVM memory slot\n");
@@ -698,7 +714,7 @@
phys_offset &= ~IO_MEM_ROM;
while (1) {
- mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
+ mem = kvm_lookup_overlapping_slot(s, start_addr, size);
if (!mem) {
break;
}