Merge remote branch 'qemu-kvm/uq/master' into staging
diff --git a/cpu-exec.c b/cpu-exec.c
index 7d5b96a..184bdde 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -230,11 +230,13 @@
     env = env1;
 
 #if defined(TARGET_I386)
-    /* put eflags in CPU temporary format */
-    CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
-    DF = 1 - (2 * ((env->eflags >> 10) & 1));
-    CC_OP = CC_OP_EFLAGS;
-    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
+    if (!kvm_enabled()) {
+        /* put eflags in CPU temporary format */
+        CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
+        DF = 1 - (2 * ((env->eflags >> 10) & 1));
+        CC_OP = CC_OP_EFLAGS;
+        env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
+    }
 #elif defined(TARGET_SPARC)
 #elif defined(TARGET_M68K)
     env->cc_op = CC_OP_FLAGS;
diff --git a/kvm-all.c b/kvm-all.c
index 79345b2..1a02076 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -21,6 +21,7 @@
 #include <linux/kvm.h>
 
 #include "qemu-common.h"
+#include "qemu-barrier.h"
 #include "sysemu.h"
 #include "hw/hw.h"
 #include "gdbstub.h"
@@ -730,7 +731,7 @@
             ent = &ring->coalesced_mmio[ring->first];
 
             cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
-            /* FIXME smp_wmb() */
+            smp_wmb();
             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
         }
     }
@@ -753,11 +754,13 @@
     dprintf("kvm_cpu_exec()\n");
 
     do {
+#ifndef CONFIG_IOTHREAD
         if (env->exit_request) {
             dprintf("interrupt exit requested\n");
             ret = 0;
             break;
         }
+#endif
 
         if (env->kvm_vcpu_dirty) {
             kvm_arch_put_registers(env);
@@ -771,6 +774,7 @@
         kvm_arch_post_run(env, run);
 
         if (ret == -EINTR || ret == -EAGAIN) {
+            cpu_exit(env);
             dprintf("io window exit\n");
             ret = 0;
             break;
@@ -1116,3 +1120,21 @@
 {
 }
 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
+
+/* Apply sigset (may be NULL) via KVM_SET_SIGNAL_MASK; returns ioctl result. */
+int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
+{
+    struct kvm_signal_mask *sigmask;
+    int r;
+
+    if (!sigset)
+        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
+    /* Non-NULL mask: copy it into a heap-allocated kvm_signal_mask. */
+    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));
+    /* NOTE(review): len is presumably the kernel's sigset size; confirm. */
+    sigmask->len = 8;
+    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
+    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
+    qemu_free(sigmask); /* pairs with qemu_malloc() above */
+    return r;
+}
diff --git a/kvm.h b/kvm.h
index e24bbde..9a9cdd5 100644
--- a/kvm.h
+++ b/kvm.h
@@ -53,6 +53,7 @@
                           target_ulong len, int type);
 void kvm_remove_all_breakpoints(CPUState *current_env);
 int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap);
+int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset);
 
 int kvm_pit_in_kernel(void);
 int kvm_irqchip_in_kernel(void);
diff --git a/osdep.c b/osdep.c
index 9059f01..9e4b17b 100644
--- a/osdep.c
+++ b/osdep.c
@@ -37,6 +37,10 @@
 #include <sys/statvfs.h>
 #endif
 
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
 #ifdef _WIN32
 #include <windows.h>
 #elif defined(CONFIG_BSD)
@@ -281,6 +285,34 @@
 
 #ifndef _WIN32
 /*
+ * Creates an eventfd that looks like a pipe and has FD_CLOEXEC set.
+ */
+int qemu_eventfd(int fds[2])
+{
+#ifdef CONFIG_EVENTFD
+    int ret = eventfd(0, 0);
+
+    if (ret >= 0) {
+        fds[0] = ret;
+        qemu_set_cloexec(ret);
+        if ((fds[1] = dup(ret)) == -1) {
+            int saved_errno = errno; /* close() may overwrite errno */
+            close(ret);
+            errno = saved_errno;
+            return -1;
+        }
+        qemu_set_cloexec(fds[1]);
+        return 0;
+    }
+    if (errno != ENOSYS) { /* fall back to a pipe only on ENOSYS */
+        return -1;
+    }
+#endif
+
+    return qemu_pipe(fds);
+}
+
+/*
  * Creates a pipe with FD_CLOEXEC set on both file descriptors
  */
 int qemu_pipe(int pipefd[2])
diff --git a/qemu-barrier.h b/qemu-barrier.h
new file mode 100644
index 0000000..3bd1075
--- /dev/null
+++ b/qemu-barrier.h
@@ -0,0 +1,7 @@
+#ifndef __QEMU_BARRIER_H
+#define __QEMU_BARRIER_H 1
+
+/* FIXME: arch dependent, x86 version */
+#define smp_wmb()   asm volatile("" ::: "memory")
+
+#endif
diff --git a/qemu-common.h b/qemu-common.h
index fc32d8d..805be1a 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -172,6 +172,7 @@
 void qemu_set_cloexec(int fd);
 
 #ifndef _WIN32
+int qemu_eventfd(int pipefd[2]);
 int qemu_pipe(int pipefd[2]);
 #endif
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ac8d985..6b741ba 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -159,9 +159,6 @@
 #ifdef KVM_CAP_PV_MMU
         { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
 #endif
-#ifdef KVM_CAP_CR3_CACHE
-        { KVM_CAP_CR3_CACHE, KVM_FEATURE_CR3_CACHE },
-#endif
         { -1, -1 }
 };
 
diff --git a/vl.c b/vl.c
index 874246b..db7a178 100644
--- a/vl.c
+++ b/vl.c
@@ -270,6 +270,12 @@
 static QEMUBootSetHandler *boot_set_handler;
 static void *boot_set_opaque;
 
+#ifdef SIGRTMIN
+#define SIG_IPI (SIGRTMIN+4) /* signal sent to CPU threads to kick them */
+#else
+#define SIG_IPI SIGUSR1 /* used when SIGRTMIN is not defined */
+#endif
+
 static int default_serial = 1;
 static int default_parallel = 1;
 static int default_virtcon = 1;
@@ -3170,14 +3176,15 @@
 
 static void qemu_event_increment(void)
 {
-    static const char byte = 0;
+    /* Write 8 bytes to be compatible with eventfd.  */
+    static uint64_t val = 1;
     ssize_t ret;
 
     if (io_thread_fd == -1)
         return;
 
     do {
-        ret = write(io_thread_fd, &byte, sizeof(byte));
+        ret = write(io_thread_fd, &val, sizeof(val));
     } while (ret < 0 && errno == EINTR);
 
     /* EAGAIN is fine, a read must be pending.  */
@@ -3194,7 +3201,7 @@
     ssize_t len;
     char buffer[512];
 
-    /* Drain the notify pipe */
+    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read.  */
     do {
         len = read(fd, buffer, sizeof(buffer));
     } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
@@ -3205,7 +3212,7 @@
     int err;
     int fds[2];
 
-    err = qemu_pipe(fds);
+    err = qemu_eventfd(fds);
     if (err == -1)
         return -errno;
 
@@ -3338,9 +3345,11 @@
 static QemuCond qemu_system_cond;
 static QemuCond qemu_pause_cond;
 
-static void block_io_signals(void);
+static void tcg_block_io_signals(void);
+static void kvm_block_io_signals(CPUState *env);
 static void unblock_io_signals(void);
 static int tcg_has_work(void);
+static int cpu_has_work(CPUState *env);
 
 static int qemu_init_main_loop(void)
 {
@@ -3361,6 +3370,15 @@
     return 0;
 }
 
+static void qemu_wait_io_event_common(CPUState *env)
+{
+    if (env->stop) { /* a stop of this CPU was requested */
+        env->stop = 0;
+        env->stopped = 1;
+        qemu_cond_signal(&qemu_pause_cond); /* wake the pause waiter */
+    }
+}
+
 static void qemu_wait_io_event(CPUState *env)
 {
     while (!tcg_has_work())
@@ -3377,24 +3395,54 @@
     qemu_mutex_unlock(&qemu_fair_mutex);
 
     qemu_mutex_lock(&qemu_global_mutex);
-    if (env->stop) {
-        env->stop = 0;
-        env->stopped = 1;
-        qemu_cond_signal(&qemu_pause_cond);
+    qemu_wait_io_event_common(env);
+}
+
+static void qemu_kvm_eat_signal(CPUState *env, int timeout)
+{
+    struct timespec ts;
+    int r, e;
+    siginfo_t siginfo;
+    sigset_t waitset;
+    /* timeout is in milliseconds; split it into seconds and nanoseconds */
+    ts.tv_sec = timeout / 1000;
+    ts.tv_nsec = (timeout % 1000) * 1000000;
+
+    sigemptyset(&waitset);
+    sigaddset(&waitset, SIG_IPI);
+    /* wait for SIG_IPI with the global mutex released; keep errno in e */
+    qemu_mutex_unlock(&qemu_global_mutex);
+    r = sigtimedwait(&waitset, &siginfo, &ts);
+    e = errno;
+    qemu_mutex_lock(&qemu_global_mutex);
+    /* EAGAIN and EINTR are tolerated; any other failure is fatal */
+    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+        exit(1);
     }
 }
 
+static void qemu_kvm_wait_io_event(CPUState *env)
+{
+    while (!cpu_has_work(env))
+        qemu_cond_timedwait(env->halt_cond, &qemu_global_mutex, 1000);
+    /* consume a pending SIG_IPI without waiting (timeout 0) */
+    qemu_kvm_eat_signal(env, 0);
+    qemu_wait_io_event_common(env);
+}
+
 static int qemu_cpu_exec(CPUState *env);
 
 static void *kvm_cpu_thread_fn(void *arg)
 {
     CPUState *env = arg;
 
-    block_io_signals();
     qemu_thread_self(env->thread);
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
+    kvm_block_io_signals(env);
+
     /* signal CPU creation */
     qemu_mutex_lock(&qemu_global_mutex);
     env->created = 1;
@@ -3407,7 +3455,7 @@
     while (1) {
         if (cpu_can_run(env))
             qemu_cpu_exec(env);
-        qemu_wait_io_event(env);
+        qemu_kvm_wait_io_event(env);
     }
 
     return NULL;
@@ -3419,7 +3467,7 @@
 {
     CPUState *env = arg;
 
-    block_io_signals();
+    tcg_block_io_signals();
     qemu_thread_self(env->thread);
 
     /* signal CPU creation */
@@ -3445,7 +3493,7 @@
     CPUState *env = _env;
     qemu_cond_broadcast(env->halt_cond);
     if (kvm_enabled())
-        qemu_thread_signal(env->thread, SIGUSR1);
+        qemu_thread_signal(env->thread, SIG_IPI);
 }
 
 int qemu_cpu_self(void *_env)
@@ -3464,7 +3512,7 @@
         cpu_exit(cpu_single_env);
 }
 
-static void block_io_signals(void)
+static void tcg_block_io_signals(void)
 {
     sigset_t set;
     struct sigaction sigact;
@@ -3473,15 +3521,48 @@
     sigaddset(&set, SIGUSR2);
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
+    sigaddset(&set, SIGCHLD);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
     sigemptyset(&set);
-    sigaddset(&set, SIGUSR1);
+    sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     memset(&sigact, 0, sizeof(sigact));
     sigact.sa_handler = cpu_signal;
-    sigaction(SIGUSR1, &sigact, NULL);
+    sigaction(SIG_IPI, &sigact, NULL);
+}
+
+static void dummy_signal(int sig) /* no-op handler installed for SIG_IPI */
+{
+}
+
+/* Block I/O signals in this vcpu thread; hand KVM that mask minus SIG_IPI. */
+static void kvm_block_io_signals(CPUState *env)
+{
+    int r;
+    sigset_t set;
+    struct sigaction sigact;
+
+    sigemptyset(&set);
+    sigaddset(&set, SIGUSR2);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
+    sigaddset(&set, SIGCHLD);
+    sigaddset(&set, SIG_IPI);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+    /* read back the resulting thread mask and drop SIG_IPI from the copy */
+    pthread_sigmask(SIG_BLOCK, NULL, &set);
+    sigdelset(&set, SIG_IPI);
+    memset(&sigact, 0, sizeof(sigact));
+    sigact.sa_handler = dummy_signal;
+    sigaction(SIG_IPI, &sigact, NULL);
+    /* NOTE(review): r is expected to be -errno, so negate it for strerror() */
+    r = kvm_set_signal_mask(env, &set);
+    if (r) {
+        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
+        exit(1);
+    }
+}
 
 static void unblock_io_signals(void)
@@ -3495,7 +3576,7 @@
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
-    sigaddset(&set, SIGUSR1);
+    sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 }
 
@@ -3504,7 +3585,7 @@
     qemu_mutex_lock(&qemu_fair_mutex);
 
     while (qemu_mutex_trylock(&qemu_global_mutex)) {
-        qemu_thread_signal(tcg_cpu_thread, SIGUSR1);
+        qemu_thread_signal(tcg_cpu_thread, SIG_IPI);
         if (!qemu_mutex_timedlock(&qemu_global_mutex, msecs))
             break;
     }
@@ -3545,7 +3626,7 @@
 
     while (penv) {
         penv->stop = 1;
-        qemu_thread_signal(penv->thread, SIGUSR1);
+        qemu_thread_signal(penv->thread, SIG_IPI);
         qemu_cpu_kick(penv);
         penv = (CPUState *)penv->next_cpu;
     }
@@ -3554,7 +3635,7 @@
         qemu_cond_timedwait(&qemu_pause_cond, &qemu_global_mutex, 100);
         penv = first_cpu;
         while (penv) {
-            qemu_thread_signal(penv->thread, SIGUSR1);
+            qemu_thread_signal(penv->thread, SIG_IPI);
             penv = (CPUState *)penv->next_cpu;
         }
     }
@@ -3567,7 +3648,7 @@
     while (penv) {
         penv->stop = 0;
         penv->stopped = 0;
-        qemu_thread_signal(penv->thread, SIGUSR1);
+        qemu_thread_signal(penv->thread, SIG_IPI);
         qemu_cpu_kick(penv);
         penv = (CPUState *)penv->next_cpu;
     }