From a99761d3c85679da380c0f597468acd3dc1b53b3 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Wed, 13 Jun 2018 15:19:06 +0200
Subject: [PATCH 01/60] exec: Fix MAP_RAM for cached access

When an IOMMUMemoryRegion is in front of a virtio device,
address_space_cache_init does not set cache->ptr as the memory
region is not RAM. However when the device performs an access,
we end up in glue() which performs the translation and then uses
MAP_RAM. The latter uses the unset ptr and returns a wrong value,
which leads to a SIGSEGV in address_space_lduw_internal_cached_slow,
for instance.

In the slow path, cache->ptr is NULL and MAP_RAM must redirect to
qemu_map_ram_ptr((mr)->ram_block, ofs).

As MAP_RAM, IS_DIRECT and INVALIDATE are the same in _cached_slow
and non-cached mode, let's remove those macros.

This fixes the use cases featuring vIOMMU (Intel and ARM SMMU)
which lead to a SIGSEGV.

Fixes: 48564041a73a (exec: reintroduce MemoryRegion caching)
Signed-off-by: Eric Auger <eric.auger@redhat.com>

Message-Id: <1528895946-28677-1-git-send-email-eric.auger@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c            |  6 ------
 memory_ldst.inc.c | 47 ++++++++++++++++++++++-------------------------
 2 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/exec.c b/exec.c
index 1e37f7586b..9f35e34ad2 100644
--- a/exec.c
+++ b/exec.c
@@ -3702,9 +3702,6 @@ void cpu_physical_memory_unmap(void *buffer, hwaddr len,
 #define ARG1                     as
 #define SUFFIX
 #define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
-#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
-#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
-#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
 #define RCU_READ_LOCK(...)       rcu_read_lock()
 #define RCU_READ_UNLOCK(...)     rcu_read_unlock()
 #include "memory_ldst.inc.c"
@@ -3841,9 +3838,6 @@ address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
 #define ARG1                     cache
 #define SUFFIX                   _cached_slow
 #define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
-#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
-#define MAP_RAM(mr, ofs)         (cache->ptr + (ofs - cache->xlat))
-#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
 #define RCU_READ_LOCK()          ((void)0)
 #define RCU_READ_UNLOCK()        ((void)0)
 #include "memory_ldst.inc.c"
diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
index 15483987fe..acf865b900 100644
--- a/memory_ldst.inc.c
+++ b/memory_ldst.inc.c
@@ -34,7 +34,7 @@ static inline uint32_t glue(address_space_ldl_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, false, attrs);
-    if (l < 4 || !IS_DIRECT(mr, false)) {
+    if (l < 4 || !memory_access_is_direct(mr, false)) {
         release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
@@ -50,7 +50,7 @@ static inline uint32_t glue(address_space_ldl_internal, SUFFIX)(ARG1_DECL,
 #endif
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = ldl_le_p(ptr);
@@ -110,7 +110,7 @@ static inline uint64_t glue(address_space_ldq_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, false, attrs);
-    if (l < 8 || !IS_DIRECT(mr, false)) {
+    if (l < 8 || !memory_access_is_direct(mr, false)) {
         release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
@@ -126,7 +126,7 @@ static inline uint64_t glue(address_space_ldq_internal, SUFFIX)(ARG1_DECL,
 #endif
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = ldq_le_p(ptr);
@@ -184,14 +184,14 @@ uint32_t glue(address_space_ldub, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, false, attrs);
-    if (!IS_DIRECT(mr, false)) {
+    if (!memory_access_is_direct(mr, false)) {
         release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
         r = memory_region_dispatch_read(mr, addr1, &val, 1, attrs);
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         val = ldub_p(ptr);
         r = MEMTX_OK;
     }
@@ -220,7 +220,7 @@ static inline uint32_t glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, false, attrs);
-    if (l < 2 || !IS_DIRECT(mr, false)) {
+    if (l < 2 || !memory_access_is_direct(mr, false)) {
         release_lock |= prepare_mmio_access(mr);
 
         /* I/O case */
@@ -236,7 +236,7 @@ static inline uint32_t glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL,
 #endif
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = lduw_le_p(ptr);
@@ -297,12 +297,12 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, true, attrs);
-    if (l < 4 || !IS_DIRECT(mr, true)) {
+    if (l < 4 || !memory_access_is_direct(mr, true)) {
         release_lock |= prepare_mmio_access(mr);
 
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         stl_p(ptr, val);
 
         dirty_log_mask = memory_region_get_dirty_log_mask(mr);
@@ -334,7 +334,7 @@ static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, true, attrs);
-    if (l < 4 || !IS_DIRECT(mr, true)) {
+    if (l < 4 || !memory_access_is_direct(mr, true)) {
         release_lock |= prepare_mmio_access(mr);
 
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -349,7 +349,7 @@ static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             stl_le_p(ptr, val);
@@ -361,7 +361,7 @@ static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
             stl_p(ptr, val);
             break;
         }
-        INVALIDATE(mr, addr1, 4);
+        invalidate_and_set_dirty(mr, addr1, 4);
         r = MEMTX_OK;
     }
     if (result) {
@@ -406,14 +406,14 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, true, attrs);
-    if (!IS_DIRECT(mr, true)) {
+    if (!memory_access_is_direct(mr, true)) {
         release_lock |= prepare_mmio_access(mr);
         r = memory_region_dispatch_write(mr, addr1, val, 1, attrs);
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         stb_p(ptr, val);
-        INVALIDATE(mr, addr1, 1);
+        invalidate_and_set_dirty(mr, addr1, 1);
         r = MEMTX_OK;
     }
     if (result) {
@@ -439,7 +439,7 @@ static inline void glue(address_space_stw_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, true, attrs);
-    if (l < 2 || !IS_DIRECT(mr, true)) {
+    if (l < 2 || !memory_access_is_direct(mr, true)) {
         release_lock |= prepare_mmio_access(mr);
 
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -454,7 +454,7 @@ static inline void glue(address_space_stw_internal, SUFFIX)(ARG1_DECL,
         r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             stw_le_p(ptr, val);
@@ -466,7 +466,7 @@ static inline void glue(address_space_stw_internal, SUFFIX)(ARG1_DECL,
             stw_p(ptr, val);
             break;
         }
-        INVALIDATE(mr, addr1, 2);
+        invalidate_and_set_dirty(mr, addr1, 2);
         r = MEMTX_OK;
     }
     if (result) {
@@ -512,7 +512,7 @@ static void glue(address_space_stq_internal, SUFFIX)(ARG1_DECL,
 
     RCU_READ_LOCK();
     mr = TRANSLATE(addr, &addr1, &l, true, attrs);
-    if (l < 8 || !IS_DIRECT(mr, true)) {
+    if (l < 8 || !memory_access_is_direct(mr, true)) {
         release_lock |= prepare_mmio_access(mr);
 
 #if defined(TARGET_WORDS_BIGENDIAN)
@@ -527,7 +527,7 @@ static void glue(address_space_stq_internal, SUFFIX)(ARG1_DECL,
         r = memory_region_dispatch_write(mr, addr1, val, 8, attrs);
     } else {
         /* RAM case */
-        ptr = MAP_RAM(mr, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             stq_le_p(ptr, val);
@@ -539,7 +539,7 @@ static void glue(address_space_stq_internal, SUFFIX)(ARG1_DECL,
             stq_p(ptr, val);
             break;
         }
-        INVALIDATE(mr, addr1, 8);
+        invalidate_and_set_dirty(mr, addr1, 8);
         r = MEMTX_OK;
     }
     if (result) {
@@ -576,8 +576,5 @@ void glue(address_space_stq_be, SUFFIX)(ARG1_DECL,
 #undef ARG1
 #undef SUFFIX
 #undef TRANSLATE
-#undef IS_DIRECT
-#undef MAP_RAM
-#undef INVALIDATE
 #undef RCU_READ_LOCK
 #undef RCU_READ_UNLOCK

From 8bca9a03ec60d63b2ee6a959fe85dda4206811e0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 30 May 2018 11:58:36 +0200
Subject: [PATCH 02/60] move public invalidate APIs out of translate-all.{c,h},
 clean up

Place them in exec.c, exec-all.h and ram_addr.h.  This removes
knowledge of translate-all.h (which is an internal header) from
several files outside accel/tcg and removes knowledge of
AddressSpace from translate-all.c (as it only operates on ram_addr_t).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/tcg/translate-all.c | 28 ++++++----------------------
 accel/tcg/translate-all.h |  1 -
 exec.c                    | 29 +++++++++++++++++++++++++----
 include/exec/exec-all.h   |  8 ++++----
 include/exec/ram_addr.h   |  2 ++
 linux-user/mmap.c         |  1 -
 target/xtensa/op_helper.c |  9 +--------
 trace/control-target.c    |  1 -
 8 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index f0c3fd4d03..4b601bd48e 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -46,7 +46,7 @@
 #endif
 #endif
 #else
-#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
 #endif
 
 #include "exec/cputlb.h"
@@ -1934,7 +1934,11 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
  *
  * Called with mmap_lock held for user-mode emulation.
  */
-void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+#ifdef CONFIG_SOFTMMU
+void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end)
+#else
+void tb_invalidate_phys_range(target_ulong start, target_ulong end)
+#endif
 {
     struct page_collection *pages;
     tb_page_addr_t next;
@@ -2073,26 +2077,6 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
 }
 #endif
 
-#if !defined(CONFIG_USER_ONLY)
-void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
-{
-    ram_addr_t ram_addr;
-    MemoryRegion *mr;
-    hwaddr l = 1;
-
-    rcu_read_lock();
-    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
-    if (!(memory_region_is_ram(mr)
-          || memory_region_is_romd(mr))) {
-        rcu_read_unlock();
-        return;
-    }
-    ram_addr = memory_region_get_ram_addr(mr) + addr;
-    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
-    rcu_read_unlock();
-}
-#endif /* !defined(CONFIG_USER_ONLY) */
-
 /* user-mode: call with mmap_lock held */
 void tb_check_watchpoint(CPUState *cpu)
 {
diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index e6cb963d7e..08e2f23a46 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -30,7 +30,6 @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
                                   tb_page_addr_t start, int len);
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access);
-void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end);
 void tb_check_watchpoint(CPUState *cpu);
 
 #ifdef CONFIG_USER_ONLY
diff --git a/exec.c b/exec.c
index 9f35e34ad2..610d0c0746 100644
--- a/exec.c
+++ b/exec.c
@@ -1028,13 +1028,36 @@ const char *parse_cpu_model(const char *cpu_model)
 }
 
 #if defined(CONFIG_USER_ONLY)
-static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
+void tb_invalidate_phys_addr(target_ulong addr)
 {
     mmap_lock();
-    tb_invalidate_phys_page_range(pc, pc + 1, 0);
+    tb_invalidate_phys_page_range(addr, addr + 1, 0);
     mmap_unlock();
 }
+
+static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
+{
+    tb_invalidate_phys_addr(pc);
+}
 #else
+void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
+{
+    ram_addr_t ram_addr;
+    MemoryRegion *mr;
+    hwaddr l = 1;
+
+    rcu_read_lock();
+    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
+    if (!(memory_region_is_ram(mr)
+          || memory_region_is_romd(mr))) {
+        rcu_read_unlock();
+        return;
+    }
+    ram_addr = memory_region_get_ram_addr(mr) + addr;
+    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
+    rcu_read_unlock();
+}
+
 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
     MemTxAttrs attrs;
@@ -3146,9 +3169,7 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
     }
     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
         assert(tcg_enabled());
-        mmap_lock();
         tb_invalidate_phys_range(addr, addr + length);
-        mmap_unlock();
         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
     }
     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 25a6f28ab8..6a7e7a866e 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -299,14 +299,14 @@ static inline void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *cpu,
 static inline void tlb_flush_by_mmuidx_all_cpus(CPUState *cpu, uint16_t idxmap)
 {
 }
+
 static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu,
                                                        uint16_t idxmap)
 {
 }
-static inline void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr,
-                                           MemTxAttrs attrs)
-{
-}
+
+void tb_invalidate_phys_addr(target_ulong addr);
+void tb_invalidate_phys_range(target_ulong start, target_ulong end);
 #endif
 
 #define CODE_GEN_ALIGN           16 /* must be >= of the size of a icache line */
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 33c361cad5..cf4ce06248 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -93,6 +93,8 @@ int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp);
 #define DIRTY_CLIENTS_ALL     ((1 << DIRTY_MEMORY_NUM) - 1)
 #define DIRTY_CLIENTS_NOCODE  (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE))
 
+void tb_invalidate_phys_range(ram_addr_t start, ram_addr_t end);
+
 static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
                                                  ram_addr_t length,
                                                  unsigned client)
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 9168a2051c..d0c50e4888 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -20,7 +20,6 @@
 
 #include "qemu.h"
 #include "qemu-common.h"
-#include "translate-all.h"
 
 //#define DEBUG_MMAP
 
diff --git a/target/xtensa/op_helper.c b/target/xtensa/op_helper.c
index 8a8c763c63..bbbbb33f3c 100644
--- a/target/xtensa/op_helper.c
+++ b/target/xtensa/op_helper.c
@@ -36,11 +36,6 @@
 #include "qemu/timer.h"
 #include "fpu/softfloat.h"
 
-#ifdef CONFIG_USER_ONLY
-/* tb_invalidate_phys_range */
-#include "accel/tcg/translate-all.h"
-#endif
-
 #ifndef CONFIG_USER_ONLY
 
 void xtensa_cpu_do_unaligned_access(CPUState *cs,
@@ -114,9 +109,7 @@ static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
 
 static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
 {
-    mmap_lock();
-    tb_invalidate_phys_range(vaddr, vaddr + 1);
-    mmap_unlock();
+    tb_invalidate_phys_addr(vaddr);
 }
 
 #endif
diff --git a/trace/control-target.c b/trace/control-target.c
index 706b2cee9d..ceb55c70ce 100644
--- a/trace/control-target.c
+++ b/trace/control-target.c
@@ -11,7 +11,6 @@
 #include "cpu.h"
 #include "trace-root.h"
 #include "trace/control.h"
-#include "translate-all.h"
 
 
 void trace_event_set_state_dynamic_init(TraceEvent *ev, bool state)

From 1507bd136fd9a516226fce8738d361a64f45b699 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Mon, 4 Jun 2018 13:30:43 +0100
Subject: [PATCH 03/60] chardev: don't splatter terminal settings on exit if
 not previously set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The stdio chardev finalize method calls term_exit() to restore the
original terminal settings that were saved in the "oldtty" global. If
the qemu_chr_open_stdio() method exited with an error, we might not have
any original terminal settings saved in "oldtty" yet.

eg

  $ qemu-system-x86_64 -monitor stdio -daemonize
  qemu-system-x86_64: -monitor stdio: cannot use stdio with -daemonize

will cause QEMU to splatter the terminal settings with an all-zeros
"struct termios", with predictably unpleasant results. Fortunately the
existing "stdio_in_use" flag is a suitable witness for whether "oldtty"
contains settings that need restoring.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Message-Id: <20180604123043.13985-1-berrange@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 chardev/char-stdio.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c
index 96375f2ab8..9624220e6d 100644
--- a/chardev/char-stdio.c
+++ b/chardev/char-stdio.c
@@ -46,8 +46,10 @@ static bool stdio_echo_state;
 
 static void term_exit(void)
 {
-    tcsetattr(0, TCSANOW, &oldtty);
-    fcntl(0, F_SETFL, old_fd0_flags);
+    if (stdio_in_use) {
+        tcsetattr(0, TCSANOW, &oldtty);
+        fcntl(0, F_SETFL, old_fd0_flags);
+    }
 }
 
 static void qemu_chr_set_echo_stdio(Chardev *chr, bool echo)

From d29a8a1b0758a905b148929dd14b79bfeb297a80 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Sat, 2 Jun 2018 09:52:59 +0100
Subject: [PATCH 04/60] main-loop: document IOCanReadHandler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20180602085259.17853-1-stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 include/qemu/main-loop.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 6b4b60bf6d..721aa2416a 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -168,6 +168,20 @@ void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
 /* async I/O support */
 
 typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
+
+/**
+ * IOCanReadHandler: Return the number of bytes that #IOReadHandler can accept
+ *
+ * This function reports how many bytes #IOReadHandler is prepared to accept.
+ * #IOReadHandler may be invoked with up to this number of bytes.  If this
+ * function returns 0 then #IOReadHandler is not invoked.
+ *
+ * This function is typically called from an event loop.  If the number of
+ * bytes changes outside the event loop (e.g. because a vcpu thread drained the
+ * buffer), then it is necessary to kick the event loop so that this function
+ * is called again.  aio_notify() or qemu_notify_event() can be used to kick
+ * the event loop.
+ */
 typedef int IOCanReadHandler(void *opaque);
 
 /**

From 019288bf137183bf3407c9824655b753bfafc99f Mon Sep 17 00:00:00 2001
From: Sergio Lopez <slp@redhat.com>
Date: Tue, 5 Jun 2018 03:54:55 -0400
Subject: [PATCH 05/60] hw/char/serial: Only retry if qemu_chr_fe_write returns
 0

Only retry on serial_xmit if qemu_chr_fe_write returns 0, as this is the
only recoverable error.

Retrying in any other scenario, in addition to being a waste of CPU
cycles, can compromise guest stability if the vCPU issuing the
write and the main loop thread are, by chance or explicit pinning,
running on the same pCPU.

Previous discussion:

https://lists.nongnu.org/archive/html/qemu-devel/2018-05/msg06998.html

Signed-off-by: Sergio Lopez <slp@redhat.com>
Message-Id: <1528185295-14199-1-git-send-email-slp@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/char/serial.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index 605b0d02f9..6de6c29779 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -260,7 +260,7 @@ static void serial_xmit(SerialState *s)
         if (s->mcr & UART_MCR_LOOP) {
             /* in loopback mode, say that we just received a char */
             serial_receive1(s, &s->tsr, 1);
-        } else if (qemu_chr_fe_write(&s->chr, &s->tsr, 1) != 1 &&
+        } else if (qemu_chr_fe_write(&s->chr, &s->tsr, 1) == 0 &&
                    s->tsr_retry < MAX_XMIT_RETRY) {
             assert(s->watch_tag == 0);
             s->watch_tag =

From 13672386a93fef64cfd33bd72fbf3d80f2c00e94 Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@twiddle.net>
Date: Wed, 12 Jul 2017 09:29:02 -1000
Subject: [PATCH 06/60] target/i386: Fix BLSR and BLSI

The implementation of these two instructions was swapped.
At the same time, unify the setup of eflags for the insn group.

Reported-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Message-Id: <20170712192902.15493-1-rth@twiddle.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/translate.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/target/i386/translate.c b/target/i386/translate.c
index 697a918c11..c91849417b 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -4059,34 +4059,26 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 ot = mo_64_32(s->dflag);
                 gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
 
+                tcg_gen_mov_tl(cpu_cc_src, cpu_T0);
                 switch (reg & 7) {
                 case 1: /* blsr By,Ey */
+                    tcg_gen_subi_tl(cpu_T1, cpu_T0, 1);
+                    tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1);
+                    break;
+                case 2: /* blsmsk By,Ey */
+                    tcg_gen_subi_tl(cpu_T1, cpu_T0, 1);
+                    tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_T1);
+                    break;
+                case 3: /* blsi By, Ey */
                     tcg_gen_neg_tl(cpu_T1, cpu_T0);
                     tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_T1);
-                    gen_op_mov_reg_v(ot, s->vex_v, cpu_T0);
-                    gen_op_update2_cc();
-                    set_cc_op(s, CC_OP_BMILGB + ot);
                     break;
-
-                case 2: /* blsmsk By,Ey */
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T0);
-                    tcg_gen_subi_tl(cpu_T0, cpu_T0, 1);
-                    tcg_gen_xor_tl(cpu_T0, cpu_T0, cpu_cc_src);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T0);
-                    set_cc_op(s, CC_OP_BMILGB + ot);
-                    break;
-
-                case 3: /* blsi By, Ey */
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T0);
-                    tcg_gen_subi_tl(cpu_T0, cpu_T0, 1);
-                    tcg_gen_and_tl(cpu_T0, cpu_T0, cpu_cc_src);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T0);
-                    set_cc_op(s, CC_OP_BMILGB + ot);
-                    break;
-
                 default:
                     goto unknown_op;
                 }
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_T0);
+                gen_op_mov_reg_v(ot, s->vex_v, cpu_T0);
+                set_cc_op(s, CC_OP_BMILGB + ot);
                 break;
 
             default:

From 93d1499c8119989e3eb9a6936c5a18aaaaca6330 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 6 Jun 2018 15:41:58 +0200
Subject: [PATCH 07/60] whpx: commit missing file

Not included by mistake in commit 327fccb288976f95808efa968082fc9d4a9ced84.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/whp-dispatch.h | 56 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 target/i386/whp-dispatch.h

diff --git a/target/i386/whp-dispatch.h b/target/i386/whp-dispatch.h
new file mode 100644
index 0000000000..d8d3485976
--- /dev/null
+++ b/target/i386/whp-dispatch.h
@@ -0,0 +1,56 @@
+#include "windows.h"
+#include <stdbool.h>
+
+#include <WinHvPlatform.h>
+#include <WinHvEmulation.h>
+
+#ifndef WHP_DISPATCH_H
+#define WHP_DISPATCH_H
+
+
+#define LIST_WINHVPLATFORM_FUNCTIONS(X) \
+  X(HRESULT, WHvGetCapability, (WHV_CAPABILITY_CODE CapabilityCode, VOID* CapabilityBuffer, UINT32 CapabilityBufferSizeInBytes, UINT32* WrittenSizeInBytes)) \
+  X(HRESULT, WHvCreatePartition, (WHV_PARTITION_HANDLE* Partition)) \
+  X(HRESULT, WHvSetupPartition, (WHV_PARTITION_HANDLE Partition)) \
+  X(HRESULT, WHvDeletePartition, (WHV_PARTITION_HANDLE Partition)) \
+  X(HRESULT, WHvGetPartitionProperty, (WHV_PARTITION_HANDLE Partition, WHV_PARTITION_PROPERTY_CODE PropertyCode, VOID* PropertyBuffer, UINT32 PropertyBufferSizeInBytes, UINT32* WrittenSizeInBytes)) \
+  X(HRESULT, WHvSetPartitionProperty, (WHV_PARTITION_HANDLE Partition, WHV_PARTITION_PROPERTY_CODE PropertyCode, const VOID* PropertyBuffer, UINT32 PropertyBufferSizeInBytes)) \
+  X(HRESULT, WHvMapGpaRange, (WHV_PARTITION_HANDLE Partition, VOID* SourceAddress, WHV_GUEST_PHYSICAL_ADDRESS GuestAddress, UINT64 SizeInBytes, WHV_MAP_GPA_RANGE_FLAGS Flags)) \
+  X(HRESULT, WHvUnmapGpaRange, (WHV_PARTITION_HANDLE Partition, WHV_GUEST_PHYSICAL_ADDRESS GuestAddress, UINT64 SizeInBytes)) \
+  X(HRESULT, WHvTranslateGva, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, WHV_GUEST_VIRTUAL_ADDRESS Gva, WHV_TRANSLATE_GVA_FLAGS TranslateFlags, WHV_TRANSLATE_GVA_RESULT* TranslationResult, WHV_GUEST_PHYSICAL_ADDRESS* Gpa)) \
+  X(HRESULT, WHvCreateVirtualProcessor, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, UINT32 Flags)) \
+  X(HRESULT, WHvDeleteVirtualProcessor, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex)) \
+  X(HRESULT, WHvRunVirtualProcessor, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, VOID* ExitContext, UINT32 ExitContextSizeInBytes)) \
+  X(HRESULT, WHvCancelRunVirtualProcessor, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, UINT32 Flags)) \
+  X(HRESULT, WHvGetVirtualProcessorRegisters, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, const WHV_REGISTER_NAME* RegisterNames, UINT32 RegisterCount, WHV_REGISTER_VALUE* RegisterValues)) \
+  X(HRESULT, WHvSetVirtualProcessorRegisters, (WHV_PARTITION_HANDLE Partition, UINT32 VpIndex, const WHV_REGISTER_NAME* RegisterNames, UINT32 RegisterCount, const WHV_REGISTER_VALUE* RegisterValues)) \
+
+
+#define LIST_WINHVEMULATION_FUNCTIONS(X) \
+  X(HRESULT, WHvEmulatorCreateEmulator, (const WHV_EMULATOR_CALLBACKS* Callbacks, WHV_EMULATOR_HANDLE* Emulator)) \
+  X(HRESULT, WHvEmulatorDestroyEmulator, (WHV_EMULATOR_HANDLE Emulator)) \
+  X(HRESULT, WHvEmulatorTryIoEmulation, (WHV_EMULATOR_HANDLE Emulator, VOID* Context, const WHV_VP_EXIT_CONTEXT* VpContext, const WHV_X64_IO_PORT_ACCESS_CONTEXT* IoInstructionContext, WHV_EMULATOR_STATUS* EmulatorReturnStatus)) \
+  X(HRESULT, WHvEmulatorTryMmioEmulation, (WHV_EMULATOR_HANDLE Emulator, VOID* Context, const WHV_VP_EXIT_CONTEXT* VpContext, const WHV_MEMORY_ACCESS_CONTEXT* MmioInstructionContext, WHV_EMULATOR_STATUS* EmulatorReturnStatus)) \
+
+
+#define WHP_DEFINE_TYPE(return_type, function_name, signature) \
+    typedef return_type (WINAPI *function_name ## _t) signature;
+
+#define WHP_DECLARE_MEMBER(return_type, function_name, signature) \
+    function_name ## _t function_name;
+
+/* Define function typedef */
+LIST_WINHVPLATFORM_FUNCTIONS(WHP_DEFINE_TYPE)
+LIST_WINHVEMULATION_FUNCTIONS(WHP_DEFINE_TYPE)
+
+struct WHPDispatch {
+    LIST_WINHVPLATFORM_FUNCTIONS(WHP_DECLARE_MEMBER)
+    LIST_WINHVEMULATION_FUNCTIONS(WHP_DECLARE_MEMBER)
+};
+
+extern struct WHPDispatch whp_dispatch;
+
+bool init_whp_dispatch(void);
+
+
+#endif /* WHP_DISPATCH_H */

From 4d8938a05db15dea2c86c4ab9c5f872f160d2188 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 7 Jun 2018 17:47:04 +0200
Subject: [PATCH 08/60] memory-device: turn alignment assert into check

The start of the address space indicates which maximum alignment is
supported by our machine (e.g. ppc, x86 1GB). This is helpful to
catch fragmenting guest physical memory in strange fashions.

Right now we can crash QEMU by e.g. (there might be easier examples)

qemu-system-x86_64 -m 256M,maxmem=20G,slots=2 \
 -object memory-backend-file,id=mem0,size=8192M,mem-path=/dev/zero,align=8192M \
 -device pc-dimm,id=dimm1,memdev=mem0

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180607154705.6316-2-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/memory-device.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c
index 3e04f3954e..6de4f70bb4 100644
--- a/hw/mem/memory-device.c
+++ b/hw/mem/memory-device.c
@@ -116,9 +116,15 @@ uint64_t memory_device_get_free_addr(MachineState *ms, const uint64_t *hint,
     address_space_start = ms->device_memory->base;
     address_space_end = address_space_start +
                         memory_region_size(&ms->device_memory->mr);
-    g_assert(QEMU_ALIGN_UP(address_space_start, align) == address_space_start);
     g_assert(address_space_end >= address_space_start);
 
+    /* address_space_start indicates the maximum alignment we expect */
+    if (QEMU_ALIGN_UP(address_space_start, align) != address_space_start) {
+        error_setg(errp, "the alignment (0x%" PRIx64 ") is not supported",
+                   align);
+        return 0;
+    }
+
     memory_device_check_addable(ms, size, errp);
     if (*errp) {
         return 0;

From 61362b71c105ccb151ca16897a7d56534423f390 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 7 Jun 2018 17:47:05 +0200
Subject: [PATCH 09/60] exec: check that alignment is a power of two

Right now we can crash QEMU using e.g.

qemu-system-x86_64 -m 256M,maxmem=20G,slots=2 \
 -object memory-backend-file,id=mem0,size=12288,mem-path=/dev/zero,align=12288 \
 -device pc-dimm,id=dimm1,memdev=mem0

qemu-system-x86_64: util/mmap-alloc.c:115:
 qemu_ram_mmap: Assertion `is_power_of_2(align)' failed

Fix this by adding a proper check.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180607154705.6316-3-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/exec.c b/exec.c
index 610d0c0746..cdcf769daa 100644
--- a/exec.c
+++ b/exec.c
@@ -1841,6 +1841,10 @@ static void *file_ram_alloc(RAMBlock *block,
                    " must be multiples of page size 0x%zx",
                    block->mr->align, block->page_size);
         return NULL;
+    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
+        error_setg(errp, "alignment 0x%" PRIx64
+                   " must be a power of two", block->mr->align);
+        return NULL;
     }
     block->mr->align = MAX(block->page_size, block->mr->align);
 #if defined(__s390x__)

From 6c090d4a755bb6245461450869130a517e18a3dc Mon Sep 17 00:00:00 2001
From: Shannon Zhao <zhaoshenglong@huawei.com>
Date: Wed, 16 May 2018 17:18:34 +0800
Subject: [PATCH 10/60] kvm: Delete the slot if and only if the
 KVM_MEM_READONLY flag is changed

According to KVM commit 75d61fbc, the slot needs to be deleted before
changing the KVM_MEM_READONLY flag. But QEMU commit 235e8982 only checks
whether the KVM_MEM_READONLY flag is set, not whether it is changing.
There is no need to delete the slot if the KVM_MEM_READONLY flag is not
changed.

This fixes an issue seen when migrating a VM at the OVMF startup stage,
while the VM is executing code in ROM. Between deleting and re-adding
the slot in kvm_set_user_memory_region, there is a chance that the guest
accesses the ROM and traps to KVM, and KVM then can't find the
corresponding memslot. KVM (on ARM) then injects an abort into the guest
due to the broken hva, and the guest gets stuck.

Signed-off-by: Shannon Zhao <zhaoshenglong@huawei.com>
Message-Id: <1526462314-19720-1-git-send-email-zhaoshenglong@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/kvm/kvm-all.c      | 17 ++++++++---------
 include/sysemu/kvm_int.h |  1 +
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ffee68e603..eb7db92a5e 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -256,7 +256,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
     return 0;
 }
 
-static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
+static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
 {
     KVMState *s = kvm_state;
     struct kvm_userspace_memory_region mem;
@@ -267,7 +267,7 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
     mem.userspace_addr = (unsigned long)slot->ram;
     mem.flags = slot->flags;
 
-    if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
+    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
         /* Set the slot size to 0 before setting the slot to the desired
          * value. This is needed based on KVM commit 75d61fbc. */
         mem.memory_size = 0;
@@ -275,6 +275,7 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
     }
     mem.memory_size = slot->memory_size;
     ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
+    slot->old_flags = mem.flags;
     trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
                               mem.memory_size, mem.userspace_addr, ret);
     return ret;
@@ -391,17 +392,14 @@ static int kvm_mem_flags(MemoryRegion *mr)
 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                  MemoryRegion *mr)
 {
-    int old_flags;
-
-    old_flags = mem->flags;
     mem->flags = kvm_mem_flags(mr);
 
     /* If nothing changed effectively, no need to issue ioctl */
-    if (mem->flags == old_flags) {
+    if (mem->flags == mem->old_flags) {
         return 0;
     }
 
-    return kvm_set_user_memory_region(kml, mem);
+    return kvm_set_user_memory_region(kml, mem, false);
 }
 
 static int kvm_section_update_flags(KVMMemoryListener *kml,
@@ -755,7 +753,8 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
 
         /* unregister the slot */
         mem->memory_size = 0;
-        err = kvm_set_user_memory_region(kml, mem);
+        mem->flags = 0;
+        err = kvm_set_user_memory_region(kml, mem, false);
         if (err) {
             fprintf(stderr, "%s: error unregistering slot: %s\n",
                     __func__, strerror(-err));
@@ -771,7 +770,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
     mem->ram = ram;
     mem->flags = kvm_mem_flags(mr);
 
-    err = kvm_set_user_memory_region(kml, mem);
+    err = kvm_set_user_memory_region(kml, mem, true);
     if (err) {
         fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                 strerror(-err));
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 888557a1ca..f838412491 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -20,6 +20,7 @@ typedef struct KVMSlot
     void *ram;
     int slot;
     int flags;
+    int old_flags;
 } KVMSlot;
 
 typedef struct KVMMemoryListener {

From 70c31264afd1f50c3b93a9007d97215ed5485e32 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota" <cota@braap.org>
Date: Wed, 25 Apr 2018 10:54:56 +0800
Subject: [PATCH 11/60] tests/atomic_add-bench: add -m option to use mutexes

This allows us to use atomic_add-bench as a microbenchmark
for evaluating qemu_mutex_lock's performance.

Signed-off-by: Emilio G. Cota <cota@braap.org>
[cherry picked from https://github.com/cota/qemu/commit/f04f34df]
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20180425025459.5258-2-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tests/atomic_add-bench.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tests/atomic_add-bench.c b/tests/atomic_add-bench.c
index caa1e8e689..f96d448f77 100644
--- a/tests/atomic_add-bench.c
+++ b/tests/atomic_add-bench.c
@@ -8,6 +8,7 @@ struct thread_info {
 } QEMU_ALIGNED(64);
 
 struct count {
+    QemuMutex lock;
     unsigned long val;
 } QEMU_ALIGNED(64);
 
@@ -18,11 +19,13 @@ static unsigned int n_ready_threads;
 static struct count *counts;
 static unsigned int duration = 1;
 static unsigned int range = 1024;
+static bool use_mutex;
 static bool test_start;
 static bool test_stop;
 
 static const char commands_string[] =
     " -n = number of threads\n"
+    " -m = use mutexes instead of atomic increments\n"
     " -d = duration in seconds\n"
     " -r = range (will be rounded up to pow2)";
 
@@ -59,7 +62,13 @@ static void *thread_func(void *arg)
 
         info->r = xorshift64star(info->r);
         index = info->r & (range - 1);
-        atomic_inc(&counts[index].val);
+        if (use_mutex) {
+            qemu_mutex_lock(&counts[index].lock);
+            counts[index].val += 1;
+            qemu_mutex_unlock(&counts[index].lock);
+        } else {
+            atomic_inc(&counts[index].val);
+        }
     }
     return NULL;
 }
@@ -91,6 +100,9 @@ static void create_threads(void)
     th_info = g_new(struct thread_info, n_threads);
     counts = qemu_memalign(64, sizeof(*counts) * range);
     memset(counts, 0, sizeof(*counts) * range);
+    for (i = 0; i < range; i++) {
+        qemu_mutex_init(&counts[i].lock);
+    }
 
     for (i = 0; i < n_threads; i++) {
         struct thread_info *info = &th_info[i];
@@ -131,7 +143,7 @@ static void parse_args(int argc, char *argv[])
     int c;
 
     for (;;) {
-        c = getopt(argc, argv, "hd:n:r:");
+        c = getopt(argc, argv, "hd:n:mr:");
         if (c < 0) {
             break;
         }
@@ -145,6 +157,9 @@ static void parse_args(int argc, char *argv[])
         case 'n':
             n_threads = atoi(optarg);
             break;
+        case 'm':
+            use_mutex = true;
+            break;
         case 'r':
             range = pow2ceil(atoi(optarg));
             break;

From f1aff7aa8e6f238909bd0b0e7a1fe235802843f2 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 25 Apr 2018 10:54:57 +0800
Subject: [PATCH 12/60] qemu-thread: introduce qemu-thread-common.h

Introduce some hooks for the shared part of qemu thread between POSIX
and Windows implementations.  Note that in qemu_mutex_unlock_impl() we
moved the trace call before the unlock operation, which should make
more sense, so we don't need a qemu_mutex_post_unlock() hook.

Put all these shared hooks into a header file.  It should be internal
to qemu-thread and not visible to qemu-thread users, hence it is put
into the util/ directory.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20180425025459.5258-3-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/qemu-thread-common.h | 43 +++++++++++++++++++++++++++++++++++++++
 util/qemu-thread-posix.c  | 19 ++++++++---------
 util/qemu-thread-win32.c  | 17 ++++++++--------
 3 files changed, 59 insertions(+), 20 deletions(-)
 create mode 100644 util/qemu-thread-common.h

diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h
new file mode 100644
index 0000000000..d3292084d6
--- /dev/null
+++ b/util/qemu-thread-common.h
@@ -0,0 +1,43 @@
+/*
+ * Common qemu-thread implementation header file.
+ *
+ * Copyright Red Hat, Inc. 2018
+ *
+ * Authors:
+ *  Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_THREAD_COMMON_H
+#define QEMU_THREAD_COMMON_H
+
+#include "qemu/typedefs.h"
+#include "qemu/thread.h"
+#include "trace.h"
+
+static inline void qemu_mutex_post_init(QemuMutex *mutex)
+{
+    mutex->initialized = true;
+}
+
+static inline void qemu_mutex_pre_lock(QemuMutex *mutex,
+                                       const char *file, int line)
+{
+    trace_qemu_mutex_lock(mutex, file, line);
+}
+
+static inline void qemu_mutex_post_lock(QemuMutex *mutex,
+                                        const char *file, int line)
+{
+    trace_qemu_mutex_locked(mutex, file, line);
+}
+
+static inline void qemu_mutex_pre_unlock(QemuMutex *mutex,
+                                         const char *file, int line)
+{
+    trace_qemu_mutex_unlock(mutex, file, line);
+}
+
+#endif
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index a1c34ba6f2..dfa66ff2fb 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -14,7 +14,7 @@
 #include "qemu/thread.h"
 #include "qemu/atomic.h"
 #include "qemu/notify.h"
-#include "trace.h"
+#include "qemu-thread-common.h"
 
 static bool name_threads;
 
@@ -43,7 +43,7 @@ void qemu_mutex_init(QemuMutex *mutex)
     err = pthread_mutex_init(&mutex->lock, NULL);
     if (err)
         error_exit(err, __func__);
-    mutex->initialized = true;
+    qemu_mutex_post_init(mutex);
 }
 
 void qemu_mutex_destroy(QemuMutex *mutex)
@@ -62,13 +62,11 @@ void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
     int err;
 
     assert(mutex->initialized);
-    trace_qemu_mutex_lock(mutex, file, line);
-
+    qemu_mutex_pre_lock(mutex, file, line);
     err = pthread_mutex_lock(&mutex->lock);
     if (err)
         error_exit(err, __func__);
-
-    trace_qemu_mutex_locked(mutex, file, line);
+    qemu_mutex_post_lock(mutex, file, line);
 }
 
 int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
@@ -78,7 +76,7 @@ int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
     assert(mutex->initialized);
     err = pthread_mutex_trylock(&mutex->lock);
     if (err == 0) {
-        trace_qemu_mutex_locked(mutex, file, line);
+        qemu_mutex_post_lock(mutex, file, line);
         return 0;
     }
     if (err != EBUSY) {
@@ -92,11 +90,10 @@ void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
     int err;
 
     assert(mutex->initialized);
+    qemu_mutex_pre_unlock(mutex, file, line);
     err = pthread_mutex_unlock(&mutex->lock);
     if (err)
         error_exit(err, __func__);
-
-    trace_qemu_mutex_unlock(mutex, file, line);
 }
 
 void qemu_rec_mutex_init(QemuRecMutex *mutex)
@@ -160,9 +157,9 @@ void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, con
     int err;
 
     assert(cond->initialized);
-    trace_qemu_mutex_unlock(mutex, file, line);
+    qemu_mutex_pre_unlock(mutex, file, line);
     err = pthread_cond_wait(&cond->cond, &mutex->lock);
-    trace_qemu_mutex_locked(mutex, file, line);
+    qemu_mutex_post_lock(mutex, file, line);
     if (err)
         error_exit(err, __func__);
 }
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index ab60c0d557..b303188a36 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -19,7 +19,7 @@
 #include "qemu-common.h"
 #include "qemu/thread.h"
 #include "qemu/notify.h"
-#include "trace.h"
+#include "qemu-thread-common.h"
 #include <process.h>
 
 static bool name_threads;
@@ -46,7 +46,7 @@ static void error_exit(int err, const char *msg)
 void qemu_mutex_init(QemuMutex *mutex)
 {
     InitializeSRWLock(&mutex->lock);
-    mutex->initialized = true;
+    qemu_mutex_post_init(mutex);
 }
 
 void qemu_mutex_destroy(QemuMutex *mutex)
@@ -59,10 +59,9 @@ void qemu_mutex_destroy(QemuMutex *mutex)
 void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
 {
     assert(mutex->initialized);
-    trace_qemu_mutex_lock(mutex, file, line);
-
+    qemu_mutex_pre_lock(mutex, file, line);
     AcquireSRWLockExclusive(&mutex->lock);
-    trace_qemu_mutex_locked(mutex, file, line);
+    qemu_mutex_post_lock(mutex, file, line);
 }
 
 int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
@@ -72,7 +71,7 @@ int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
     assert(mutex->initialized);
     owned = TryAcquireSRWLockExclusive(&mutex->lock);
     if (owned) {
-        trace_qemu_mutex_locked(mutex, file, line);
+        qemu_mutex_post_lock(mutex, file, line);
         return 0;
     }
     return -EBUSY;
@@ -81,7 +80,7 @@ int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
 void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
 {
     assert(mutex->initialized);
-    trace_qemu_mutex_unlock(mutex, file, line);
+    qemu_mutex_pre_unlock(mutex, file, line);
     ReleaseSRWLockExclusive(&mutex->lock);
 }
 
@@ -145,9 +144,9 @@ void qemu_cond_broadcast(QemuCond *cond)
 void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
 {
     assert(cond->initialized);
-    trace_qemu_mutex_unlock(mutex, file, line);
+    qemu_mutex_pre_unlock(mutex, file, line);
     SleepConditionVariableSRW(&cond->var, &mutex->lock, INFINITE, 0);
-    trace_qemu_mutex_locked(mutex, file, line);
+    qemu_mutex_post_lock(mutex, file, line);
 }
 
 void qemu_sem_init(QemuSemaphore *sem, int init)

From ba59fb778ec68b072196cff9af11c7612a6e52f2 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 Jun 2018 14:23:08 +0200
Subject: [PATCH 13/60] QemuMutex: support --enable-debug-mutex

We have had some tracing tools for mutexes, but they are not easy to use
for debugging e.g. deadlocks.  Let's provide an "--enable-debug-mutex"
configure option that makes QemuMutex store the last owner that took a
specific lock.  This makes it easy to debug deadlocks, since we can
directly see who took the lock as long as we can attach a debugger to
the process.

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20180425025459.5258-4-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 configure                   | 10 ++++++++++
 include/qemu/thread-posix.h |  4 ++++
 include/qemu/thread-win32.h |  4 ++++
 util/qemu-thread-common.h   | 12 ++++++++++++
 4 files changed, 30 insertions(+)

diff --git a/configure b/configure
index 4d12cfbe3f..3f53aa76fd 100755
--- a/configure
+++ b/configure
@@ -456,6 +456,7 @@ replication="yes"
 vxhs=""
 libxml2=""
 docker="no"
+debug_mutex="no"
 
 # cross compilers defaults, can be overridden with --cross-cc-ARCH
 cross_cc_aarch64="aarch64-linux-gnu-gcc"
@@ -1411,6 +1412,10 @@ for opt do
   ;;
   --disable-git-update) git_update=no
   ;;
+  --enable-debug-mutex) debug_mutex=yes
+  ;;
+  --disable-debug-mutex) debug_mutex=no
+  ;;
   *)
       echo "ERROR: unknown option $opt"
       echo "Try '$0 --help' for more information"
@@ -1685,6 +1690,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   crypto-afalg    Linux AF_ALG crypto backend driver
   vhost-user      vhost-user support
   capstone        capstone disassembler support
+  debug-mutex     mutex debugging support
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -5951,6 +5957,7 @@ echo "seccomp support   $seccomp"
 echo "coroutine backend $coroutine"
 echo "coroutine pool    $coroutine_pool"
 echo "debug stack usage $debug_stack_usage"
+echo "mutex debugging   $debug_mutex"
 echo "crypto afalg      $crypto_afalg"
 echo "GlusterFS support $glusterfs"
 echo "gcov              $gcov_tool"
@@ -6704,6 +6711,9 @@ fi
 if test "$capstone" != "no" ; then
   echo "CONFIG_CAPSTONE=y" >> $config_host_mak
 fi
+if test "$debug_mutex" = "yes" ; then
+  echo "CONFIG_DEBUG_MUTEX=y" >> $config_host_mak
+fi
 
 # Hold two types of flag:
 #   CONFIG_THREAD_SETNAME_BYTHREAD  - we've got a way of setting the name on
diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index f3f47e426f..fd27b34128 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -12,6 +12,10 @@ typedef QemuMutex QemuRecMutex;
 
 struct QemuMutex {
     pthread_mutex_t lock;
+#ifdef CONFIG_DEBUG_MUTEX
+    const char *file;
+    int line;
+#endif
     bool initialized;
 };
 
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index 3a05e3b3aa..d668d789b4 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -5,6 +5,10 @@
 
 struct QemuMutex {
     SRWLOCK lock;
+#ifdef CONFIG_DEBUG_MUTEX
+    const char *file;
+    int line;
+#endif
     bool initialized;
 };
 
diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h
index d3292084d6..a0ea7c0d92 100644
--- a/util/qemu-thread-common.h
+++ b/util/qemu-thread-common.h
@@ -19,6 +19,10 @@
 
 static inline void qemu_mutex_post_init(QemuMutex *mutex)
 {
+#ifdef CONFIG_DEBUG_MUTEX
+    mutex->file = NULL;
+    mutex->line = 0;
+#endif
     mutex->initialized = true;
 }
 
@@ -31,12 +35,20 @@ static inline void qemu_mutex_pre_lock(QemuMutex *mutex,
 static inline void qemu_mutex_post_lock(QemuMutex *mutex,
                                         const char *file, int line)
 {
+#ifdef CONFIG_DEBUG_MUTEX
+    mutex->file = file;
+    mutex->line = line;
+#endif
     trace_qemu_mutex_locked(mutex, file, line);
 }
 
 static inline void qemu_mutex_pre_unlock(QemuMutex *mutex,
                                          const char *file, int line)
 {
+#ifdef CONFIG_DEBUG_MUTEX
+    mutex->file = NULL;
+    mutex->line = 0;
+#endif
     trace_qemu_mutex_unlock(mutex, file, line);
 }
 

From 1fcc6d42e78c5fbccef63f47a380361ee81d344a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 25 Apr 2018 10:54:59 +0800
Subject: [PATCH 14/60] configure: enable debug-mutex if debug enabled

Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20180425025459.5258-5-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 configure | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure b/configure
index 3f53aa76fd..ca7de4f55f 100755
--- a/configure
+++ b/configure
@@ -1042,6 +1042,7 @@ for opt do
   --enable-debug)
       # Enable debugging options that aren't excessively noisy
       debug_tcg="yes"
+      debug_mutex="yes"
       debug="yes"
       strip_opt="no"
       fortify_source="no"

From a1d30f285ebc0ba89d8dcba0b10a6b2516c2e470 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 13 Jun 2018 07:05:19 +0200
Subject: [PATCH 15/60] Replace '-enable-kvm' with '-accel kvm' in docs and
 help texts

The preferred way to select the KVM accelerator is to use "-accel kvm"
these days, so let's be consistent in our documentation and help texts.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1528866321-23886-3-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 docs/COLO-FT.txt                  | 8 ++++----
 docs/can.txt                      | 4 ++--
 docs/multi-thread-compression.txt | 2 +-
 docs/multiseat.txt                | 2 +-
 docs/specs/tpm.txt                | 8 ++++----
 hw/block/dataplane/virtio-blk.c   | 4 ++--
 hw/scsi/virtio-scsi-dataplane.c   | 4 ++--
 7 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
index e289be2f41..d7c7dcda8f 100644
--- a/docs/COLO-FT.txt
+++ b/docs/COLO-FT.txt
@@ -113,16 +113,16 @@ by using 'x-colo-lost-heartbeat' command.
 == Test procedure ==
 1. Startup qemu
 Primary:
-# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary \
-  -device piix3-usb-uhci \
+# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name primary \
+  -device piix3-usb-uhci -vnc :7 \
   -device usb-tablet -netdev tap,id=hn0,vhost=off \
   -device virtio-net-pci,id=net-pci0,netdev=hn0 \
   -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
          children.0.file.filename=1.raw,\
          children.0.driver=raw -S
 Secondary:
-# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name secondary \
-  -device piix3-usb-uhci \
+# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name secondary \
+  -device piix3-usb-uhci -vnc :7 \
   -device usb-tablet -netdev tap,id=hn0,vhost=off \
   -device virtio-net-pci,id=net-pci0,netdev=hn0 \
   -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
diff --git a/docs/can.txt b/docs/can.txt
index a357105762..7ba23b259a 100644
--- a/docs/can.txt
+++ b/docs/can.txt
@@ -52,7 +52,7 @@ The ''kvaser_pci'' board/device model is compatible with and has been tested wit
 The tested setup was Linux 4.9 kernel on the host and guest side.
 Example for qemu-system-x86_64:
 
-    qemu-system-x86_64 -enable-kvm -kernel /boot/vmlinuz-4.9.0-4-amd64 \
+    qemu-system-x86_64 -accel kvm -kernel /boot/vmlinuz-4.9.0-4-amd64 \
       -initrd ramdisk.cpio \
       -virtfs local,path=shareddir,security_model=none,mount_tag=shareddir \
       -object can-bus,id=canbus0 \
@@ -104,4 +104,4 @@ Links to other resources
      Slides
      http://rtime.felk.cvut.cz/publications/public/rtlws2015-qemu-can-slides.pdf
  (5) Linux SocketCAN utilities
-     https://github.com/linux-can/can-utils/
\ No newline at end of file
+     https://github.com/linux-can/can-utils/
diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt
index d0caaf7b3b..bb88c6bdf1 100644
--- a/docs/multi-thread-compression.txt
+++ b/docs/multi-thread-compression.txt
@@ -62,7 +62,7 @@ RAM: 128G
 NIC: Intel I350 (10/100/1000Mbps)
 Host OS: CentOS 7 64-bit
 Guest OS: RHEL 6.5 64-bit
-Parameter: qemu-system-x86_64 -enable-kvm -smp 4 -m 4096
+Parameter: qemu-system-x86_64 -accel kvm -smp 4 -m 4096
  /share/ia32e_rhel6u5.qcow -monitor stdio
 
 There is no additional application is running on the guest when doing
diff --git a/docs/multiseat.txt b/docs/multiseat.txt
index 807518c8af..dc28cdb613 100644
--- a/docs/multiseat.txt
+++ b/docs/multiseat.txt
@@ -18,7 +18,7 @@ or
 
 Next put together the qemu command line (sdk/gtk):
 
-qemu	-enable-kvm -usb $memory $disk $whatever \
+qemu	-accel kvm -usb $memory $disk $whatever \
 	-display [ sdl | gtk ] \
 	-vga std \
 	-device usb-tablet
diff --git a/docs/specs/tpm.txt b/docs/specs/tpm.txt
index c230c4c93e..70ad4a0cba 100644
--- a/docs/specs/tpm.txt
+++ b/docs/specs/tpm.txt
@@ -98,7 +98,7 @@ QEMU files related to the TPM passthrough device:
 Command line to start QEMU with the TPM passthrough device using the host's
 hardware TPM /dev/tpm0:
 
-qemu-system-x86_64 -display sdl -enable-kvm \
+qemu-system-x86_64 -display sdl -accel kvm \
   -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
   -tpmdev passthrough,id=tpm0,path=/dev/tpm0 \
   -device tpm-tis,tpmdev=tpm0 test.img
@@ -164,7 +164,7 @@ swtpm socket --tpmstate dir=/tmp/mytpm1 \
 Command line to start QEMU with the TPM emulator device communicating with
 the swtpm:
 
-qemu-system-x86_64 -display sdl -enable-kvm \
+qemu-system-x86_64 -display sdl -accel kvm \
   -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
   -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
   -tpmdev emulator,id=tpm0,chardev=chrtpm \
@@ -222,7 +222,7 @@ swtpm socket --tpmstate dir=/tmp/mytpm1 \
 
 In a 2nd terminal start the VM:
 
-qemu-system-x86_64 -display sdl -enable-kvm \
+qemu-system-x86_64 -display sdl -accel kvm \
   -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
   -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
   -tpmdev emulator,id=tpm0,chardev=chrtpm \
@@ -255,7 +255,7 @@ swtpm socket --tpmstate dir=/tmp/mytpm1 \
 In the 2nd terminal restore the state of the VM using the additonal
 '-incoming' option.
 
-qemu-system-x86_64 -display sdl -enable-kvm \
+qemu-system-x86_64 -display sdl -accel kvm \
   -m 1024 -boot d -bios bios-256k.bin -boot menu=on \
   -chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
   -tpmdev emulator,id=tpm0,chardev=chrtpm \
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index d648aeb73b..8c37bd314a 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -190,8 +190,8 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
     /* Set up guest notifier (irq) */
     r = k->set_guest_notifiers(qbus->parent, nvqs, true);
     if (r != 0) {
-        fprintf(stderr, "virtio-blk failed to set guest notifier (%d), "
-                "ensure -enable-kvm is set\n", r);
+        error_report("virtio-blk failed to set guest notifier (%d), "
+                     "ensure -accel kvm is set.", r);
         goto fail_guest_notifiers;
     }
 
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 912e5005d8..b995bab3a2 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -142,8 +142,8 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
     /* Set up guest notifier (irq) */
     rc = k->set_guest_notifiers(qbus->parent, vs->conf.num_queues + 2, true);
     if (rc != 0) {
-        fprintf(stderr, "virtio-scsi: Failed to set guest notifiers (%d), "
-                "ensure -enable-kvm is set\n", rc);
+        error_report("virtio-scsi: Failed to set guest notifiers (%d), "
+                     "ensure -accel kvm is set.", rc);
         goto fail_guest_notifiers;
     }
 

From 0b3c5c81bf0a9e32fd08c532acde3caa446b3712 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 11 Jun 2018 16:56:07 -0300
Subject: [PATCH 16/60] qemu-options: Add missing newline to -accel help text

The newline was removed by commit c97d6d2c, and broke -help output:

Before this patch:

  $ qemu-system-x86_64 -help | grep smp
                  thread=single|multi (enable multi-threaded TCG)-smp [...]

After this patch:

  $ qemu-system-x86_64 -help  | grep smp
  -smp [cpus=]n[,maxcpus=cpus][,cores=cores][,threads=threads][,sockets=sockets]

Fixes: c97d6d2cdf97edb4aebe832fdba65d701ad7bcb6
Cc: Sergio Andres Gomez Del Real <sergio.g.delreal@gmail.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20180611195607.3015-1-ehabkost@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 qemu-options.hx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index d5b0c26e8e..270772817a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -130,7 +130,7 @@ ETEXI
 DEF("accel", HAS_ARG, QEMU_OPTION_accel,
     "-accel [accel=]accelerator[,thread=single|multi]\n"
     "                select accelerator (kvm, xen, hax, hvf, whpx or tcg; use 'help' for a list)\n"
-    "                thread=single|multi (enable multi-threaded TCG)", QEMU_ARCH_ALL)
+    "                thread=single|multi (enable multi-threaded TCG)\n", QEMU_ARCH_ALL)
 STEXI
 @item -accel @var{name}[,prop=@var{value}[,...]]
 @findex -accel

From 1e695fd7c3147ed2fde3225f5c534bfc4774d5f2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:30 +0200
Subject: [PATCH 17/60] pc-dimm: remove leftover "struct pc_dimms_capacity"

Not needed anymore, let's drop it.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-2-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/pc-dimm.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 12da89d562..62b34a992e 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -27,11 +27,6 @@
 #include "sysemu/numa.h"
 #include "trace.h"
 
-typedef struct pc_dimms_capacity {
-     uint64_t size;
-     Error    **errp;
-} pc_dimms_capacity;
-
 void pc_dimm_memory_plug(DeviceState *dev, MachineState *machine,
                          uint64_t align, Error **errp)
 {

From bb6e2f7a54dfa791510f64bc3a551e5a152ea5f7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:31 +0200
Subject: [PATCH 18/60] pc: rename pc_dimm_(plug|unplug|...)* into
 pc_memory_(plug|unplug|...)*

Use a similar naming scheme as spapr. This way, we can go ahead and
rename e.g. pc_dimm_memory_plug to pc_dimm_plug, which avoids
confusion.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-3-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 622e49d6bc..f9250ffae7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1674,8 +1674,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
     }
 }
 
-static void pc_dimm_plug(HotplugHandler *hotplug_dev,
-                         DeviceState *dev, Error **errp)
+static void pc_memory_plug(HotplugHandler *hotplug_dev,
+                           DeviceState *dev, Error **errp)
 {
     HotplugHandlerClass *hhc;
     Error *local_err = NULL;
@@ -1728,8 +1728,8 @@ out:
     error_propagate(errp, local_err);
 }
 
-static void pc_dimm_unplug_request(HotplugHandler *hotplug_dev,
-                                   DeviceState *dev, Error **errp)
+static void pc_memory_unplug_request(HotplugHandler *hotplug_dev,
+                                     DeviceState *dev, Error **errp)
 {
     HotplugHandlerClass *hhc;
     Error *local_err = NULL;
@@ -1759,8 +1759,8 @@ out:
     error_propagate(errp, local_err);
 }
 
-static void pc_dimm_unplug(HotplugHandler *hotplug_dev,
-                           DeviceState *dev, Error **errp)
+static void pc_memory_unplug(HotplugHandler *hotplug_dev,
+                             DeviceState *dev, Error **errp)
 {
     PCMachineState *pcms = PC_MACHINE(hotplug_dev);
     HotplugHandlerClass *hhc;
@@ -2015,7 +2015,7 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev,
                                       DeviceState *dev, Error **errp)
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-        pc_dimm_plug(hotplug_dev, dev, errp);
+        pc_memory_plug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_plug(hotplug_dev, dev, errp);
     }
@@ -2025,7 +2025,7 @@ static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
                                                 DeviceState *dev, Error **errp)
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-        pc_dimm_unplug_request(hotplug_dev, dev, errp);
+        pc_memory_unplug_request(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_unplug_request_cb(hotplug_dev, dev, errp);
     } else {
@@ -2038,7 +2038,7 @@ static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
                                         DeviceState *dev, Error **errp)
 {
     if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-        pc_dimm_unplug(hotplug_dev, dev, errp);
+        pc_memory_unplug(hotplug_dev, dev, errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_unplug_cb(hotplug_dev, dev, errp);
     } else {

From 284878ee98d682b1d4c859dd0e6334df421d3a50 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:32 +0200
Subject: [PATCH 19/60] pc-dimm: rename pc_dimm_memory_* to pc_dimm_*

Let's rename it to make it look more consistent.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-4-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c             | 4 ++--
 hw/mem/pc-dimm.c         | 6 +++---
 hw/ppc/spapr.c           | 6 +++---
 include/hw/mem/pc-dimm.h | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f9250ffae7..f23133facc 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1713,7 +1713,7 @@ static void pc_memory_plug(HotplugHandler *hotplug_dev,
         goto out;
     }
 
-    pc_dimm_memory_plug(dev, MACHINE(pcms), align, &local_err);
+    pc_dimm_plug(dev, MACHINE(pcms), align, &local_err);
     if (local_err) {
         goto out;
     }
@@ -1773,7 +1773,7 @@ static void pc_memory_unplug(HotplugHandler *hotplug_dev,
         goto out;
     }
 
-    pc_dimm_memory_unplug(dev, MACHINE(pcms));
+    pc_dimm_unplug(dev, MACHINE(pcms));
     object_unparent(OBJECT(dev));
 
  out:
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 62b34a992e..9e0c83e415 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -27,8 +27,8 @@
 #include "sysemu/numa.h"
 #include "trace.h"
 
-void pc_dimm_memory_plug(DeviceState *dev, MachineState *machine,
-                         uint64_t align, Error **errp)
+void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
+                  Error **errp)
 {
     int slot;
     PCDIMMDevice *dimm = PC_DIMM(dev);
@@ -84,7 +84,7 @@ out:
     error_propagate(errp, local_err);
 }
 
-void pc_dimm_memory_unplug(DeviceState *dev, MachineState *machine)
+void pc_dimm_unplug(DeviceState *dev, MachineState *machine)
 {
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0d032a1ad0..3a1bd733be 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3160,7 +3160,7 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     align = memory_region_get_alignment(mr);
     size = memory_region_size(mr);
 
-    pc_dimm_memory_plug(dev, MACHINE(ms), align, &local_err);
+    pc_dimm_plug(dev, MACHINE(ms), align, &local_err);
     if (local_err) {
         goto out;
     }
@@ -3183,7 +3183,7 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     return;
 
 out_unplug:
-    pc_dimm_memory_unplug(dev, MACHINE(ms));
+    pc_dimm_unplug(dev, MACHINE(ms));
 out:
     error_propagate(errp, local_err);
 }
@@ -3332,7 +3332,7 @@ static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
     sPAPRMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
     sPAPRDIMMState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
 
-    pc_dimm_memory_unplug(dev, MACHINE(hotplug_dev));
+    pc_dimm_unplug(dev, MACHINE(hotplug_dev));
     object_unparent(OBJECT(dev));
     spapr_pending_dimm_unplugs_remove(spapr, ds);
 }
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 627c8601d9..860343d64f 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -78,7 +78,7 @@ typedef struct PCDIMMDeviceClass {
 
 int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
 
-void pc_dimm_memory_plug(DeviceState *dev, MachineState *machine,
-                         uint64_t align, Error **errp);
-void pc_dimm_memory_unplug(DeviceState *dev, MachineState *machine);
+void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
+                  Error **errp);
+void pc_dimm_unplug(DeviceState *dev, MachineState *machine);
 #endif

From 9995c759510391ad9d3f7997c93c1ecdc6ed08b8 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:33 +0200
Subject: [PATCH 20/60] pc-dimm: remove pc_dimm_get_free_slot() from header

Not used outside of pc-dimm.c and there shouldn't be other users. If
other devices (e.g. memory devices) ever have to also use slots, then we
will have to factor this out.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-5-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/pc-dimm.c         | 4 +++-
 include/hw/mem/pc-dimm.h | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 9e0c83e415..7387963cf1 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -27,6 +27,8 @@
 #include "sysemu/numa.h"
 #include "trace.h"
 
+static int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
+
 void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
                   Error **errp)
 {
@@ -111,7 +113,7 @@ static int pc_dimm_slot2bitmap(Object *obj, void *opaque)
     return 0;
 }
 
-int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp)
+static int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp)
 {
     unsigned long *bitmap;
     int slot = 0;
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 860343d64f..cf71247630 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -76,8 +76,6 @@ typedef struct PCDIMMDeviceClass {
     MemoryRegion *(*get_vmstate_memory_region)(PCDIMMDevice *dimm);
 } PCDIMMDeviceClass;
 
-int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
-
 void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
                   Error **errp);
 void pc_dimm_unplug(DeviceState *dev, MachineState *machine);

From d468115b1c7a4d0843f18bc9da41f2c44f93877e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:34 +0200
Subject: [PATCH 21/60] pc: factor out pc specific dimm checks into
 pc_memory_pre_plug()

We can perform these checks before the device is actually realized.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-6-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c | 44 ++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f23133facc..2db032a6df 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1674,6 +1674,29 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
     }
 }
 
+static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
+                               Error **errp)
+{
+    const PCMachineState *pcms = PC_MACHINE(hotplug_dev);
+    const bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
+
+    /*
+     * When -no-acpi is used with Q35 machine type, no ACPI is built,
+     * but pcms->acpi_dev is still created. Check !acpi_enabled in
+     * addition to cover this case.
+     */
+    if (!pcms->acpi_dev || !acpi_enabled) {
+        error_setg(errp,
+                   "memory hotplug is not enabled: missing acpi device or acpi disabled");
+        return;
+    }
+
+    if (is_nvdimm && !pcms->acpi_nvdimm_state.is_enabled) {
+        error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
+        return;
+    }
+}
+
 static void pc_memory_plug(HotplugHandler *hotplug_dev,
                            DeviceState *dev, Error **errp)
 {
@@ -1696,23 +1719,6 @@ static void pc_memory_plug(HotplugHandler *hotplug_dev,
         align = memory_region_get_alignment(mr);
     }
 
-    /*
-     * When -no-acpi is used with Q35 machine type, no ACPI is built,
-     * but pcms->acpi_dev is still created. Check !acpi_enabled in
-     * addition to cover this case.
-     */
-    if (!pcms->acpi_dev || !acpi_enabled) {
-        error_setg(&local_err,
-                   "memory hotplug is not enabled: missing acpi device or acpi disabled");
-        goto out;
-    }
-
-    if (is_nvdimm && !pcms->acpi_nvdimm_state.is_enabled) {
-        error_setg(&local_err,
-                   "nvdimm is not enabled: missing 'nvdimm' in '-M'");
-        goto out;
-    }
-
     pc_dimm_plug(dev, MACHINE(pcms), align, &local_err);
     if (local_err) {
         goto out;
@@ -2006,7 +2012,9 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev,
 static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
                                           DeviceState *dev, Error **errp)
 {
-    if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+        pc_memory_pre_plug(hotplug_dev, dev, errp);
+    } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
         pc_cpu_pre_plug(hotplug_dev, dev, errp);
     }
 }

From 4ab56d04ede6e0f979fc8e4a54b381e99cf0a255 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:35 +0200
Subject: [PATCH 22/60] nvdimm: no need to overwrite
 get_vmstate_memory_region()

Our parent class (PC_DIMM) provides exactly the same function.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-7-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/nvdimm.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 4087aca25e..f974accbdd 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -166,11 +166,6 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, const void *buf,
     memory_region_set_dirty(mr, backend_offset, size);
 }
 
-static MemoryRegion *nvdimm_get_vmstate_memory_region(PCDIMMDevice *dimm)
-{
-    return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
-}
-
 static void nvdimm_class_init(ObjectClass *oc, void *data)
 {
     PCDIMMDeviceClass *ddc = PC_DIMM_CLASS(oc);
@@ -178,7 +173,6 @@ static void nvdimm_class_init(ObjectClass *oc, void *data)
 
     ddc->realize = nvdimm_realize;
     ddc->get_memory_region = nvdimm_get_memory_region;
-    ddc->get_vmstate_memory_region = nvdimm_get_vmstate_memory_region;
 
     nvc->read_label_data = nvdimm_read_label_data;
     nvc->write_label_data = nvdimm_write_label_data;

From 7943e97b858e64eddf0f3395427e58c5cc00a7d9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:36 +0200
Subject: [PATCH 23/60] hostmem: drop error variable from
 host_memory_backend_get_memory()

Unused, so let's remove it.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-8-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 backends/hostmem.c       | 3 +--
 hw/mem/nvdimm.c          | 4 ++--
 hw/mem/pc-dimm.c         | 4 ++--
 hw/misc/ivshmem.c        | 3 +--
 include/sysemu/hostmem.h | 3 +--
 numa.c                   | 3 +--
 6 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 3627e61584..4908946cd3 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -247,8 +247,7 @@ bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
     return memory_region_size(&backend->mr) != 0;
 }
 
-MemoryRegion *
-host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp)
+MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
 {
     return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
 }
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index f974accbdd..df9716231f 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -105,7 +105,7 @@ static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
 
 static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
 {
-    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem, errp);
+    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
     NVDIMMDevice *nvdimm = NVDIMM(dimm);
     uint64_t align, pmem_size, size = memory_region_size(mr);
 
@@ -161,7 +161,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, const void *buf,
 
     memcpy(nvdimm->label_data + offset, buf, size);
 
-    mr = host_memory_backend_get_memory(dimm->hostmem, &error_abort);
+    mr = host_memory_backend_get_memory(dimm->hostmem);
     backend_offset = memory_region_size(mr) - nvdimm->label_size + offset;
     memory_region_set_dirty(mr, backend_offset, size);
 }
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 7387963cf1..73f0eee4c7 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -226,12 +226,12 @@ static MemoryRegion *pc_dimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
         return NULL;
     }
 
-    return host_memory_backend_get_memory(dimm->hostmem, errp);
+    return host_memory_backend_get_memory(dimm->hostmem);
 }
 
 static MemoryRegion *pc_dimm_get_vmstate_memory_region(PCDIMMDevice *dimm)
 {
-    return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
+    return host_memory_backend_get_memory(dimm->hostmem);
 }
 
 static uint64_t pc_dimm_md_get_addr(const MemoryDeviceState *md)
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index 16f03701b7..ee01c5e66b 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -909,8 +909,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error **errp)
     if (s->hostmem != NULL) {
         IVSHMEM_DPRINTF("using hostmem\n");
 
-        s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem,
-                                                         &error_abort);
+        s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem);
     } else {
         Chardev *chr = qemu_chr_fe_get_driver(&s->server_chr);
         assert(chr);
diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
index 5beb0ef8ab..6e6bd2c1cb 100644
--- a/include/sysemu/hostmem.h
+++ b/include/sysemu/hostmem.h
@@ -62,8 +62,7 @@ struct HostMemoryBackend {
 };
 
 bool host_memory_backend_mr_inited(HostMemoryBackend *backend);
-MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend,
-                                             Error **errp);
+MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend);
 
 void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped);
 bool host_memory_backend_is_mapped(HostMemoryBackend *backend);
diff --git a/numa.c b/numa.c
index 33572bfa74..94f758c757 100644
--- a/numa.c
+++ b/numa.c
@@ -523,8 +523,7 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
         if (!backend) {
             continue;
         }
-        MemoryRegion *seg = host_memory_backend_get_memory(backend,
-                                                           &error_fatal);
+        MemoryRegion *seg = host_memory_backend_get_memory(backend);
 
         if (memory_region_is_mapped(seg)) {
             char *path = object_get_canonical_path_component(OBJECT(backend));

From a57d1911222bba79bda543568e925635461ead82 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:37 +0200
Subject: [PATCH 24/60] pc-dimm: merge get_(vmstate_)memory_region()

Importantly, get_vmstate_memory_region() should also fail with a proper
error if called before the device is realized. For a PCDIMM, both functions
are to return the same thing, so share the implementation.

All current users are called after the device has been realized, so we
can expect the calls to succeed.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-9-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/pc-dimm.c         | 13 +++++--------
 include/hw/mem/pc-dimm.h |  3 ++-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 73f0eee4c7..4ff39b59ef 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -35,7 +35,8 @@ void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
     int slot;
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
-    MemoryRegion *vmstate_mr = ddc->get_vmstate_memory_region(dimm);
+    MemoryRegion *vmstate_mr = ddc->get_vmstate_memory_region(dimm,
+                                                              &error_abort);
     Error *local_err = NULL;
     MemoryRegion *mr;
     uint64_t addr;
@@ -90,7 +91,8 @@ void pc_dimm_unplug(DeviceState *dev, MachineState *machine)
 {
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
-    MemoryRegion *vmstate_mr = ddc->get_vmstate_memory_region(dimm);
+    MemoryRegion *vmstate_mr = ddc->get_vmstate_memory_region(dimm,
+                                                              &error_abort);
     MemoryRegion *mr = ddc->get_memory_region(dimm, &error_abort);
 
     memory_device_unplug_region(machine, mr);
@@ -229,11 +231,6 @@ static MemoryRegion *pc_dimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
     return host_memory_backend_get_memory(dimm->hostmem);
 }
 
-static MemoryRegion *pc_dimm_get_vmstate_memory_region(PCDIMMDevice *dimm)
-{
-    return host_memory_backend_get_memory(dimm->hostmem);
-}
-
 static uint64_t pc_dimm_md_get_addr(const MemoryDeviceState *md)
 {
     const PCDIMMDevice *dimm = PC_DIMM(md);
@@ -298,7 +295,7 @@ static void pc_dimm_class_init(ObjectClass *oc, void *data)
     dc->desc = "DIMM memory module";
 
     ddc->get_memory_region = pc_dimm_get_memory_region;
-    ddc->get_vmstate_memory_region = pc_dimm_get_vmstate_memory_region;
+    ddc->get_vmstate_memory_region = pc_dimm_get_memory_region;
 
     mdc->get_addr = pc_dimm_md_get_addr;
     /* for a dimm plugged_size == region_size */
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index cf71247630..5679a80465 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -73,7 +73,8 @@ typedef struct PCDIMMDeviceClass {
     /* public */
     void (*realize)(PCDIMMDevice *dimm, Error **errp);
     MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm, Error **errp);
-    MemoryRegion *(*get_vmstate_memory_region)(PCDIMMDevice *dimm);
+    MemoryRegion *(*get_vmstate_memory_region)(PCDIMMDevice *dimm,
+                                               Error **errp);
 } PCDIMMDeviceClass;
 
 void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,

From 5d10a0e12bf3d00958fee73c1b795cfab921873b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:38 +0200
Subject: [PATCH 25/60] nvdimm: convert "unarmed" into a static property

We don't allow modifying it after realization, so we can simply turn
it into a static property.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-10-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/nvdimm.c | 32 +++++++-------------------------
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index df9716231f..7260c9c6b1 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -64,36 +64,11 @@ out:
     error_propagate(errp, local_err);
 }
 
-static bool nvdimm_get_unarmed(Object *obj, Error **errp)
-{
-    NVDIMMDevice *nvdimm = NVDIMM(obj);
-
-    return nvdimm->unarmed;
-}
-
-static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp)
-{
-    NVDIMMDevice *nvdimm = NVDIMM(obj);
-    Error *local_err = NULL;
-
-    if (memory_region_size(&nvdimm->nvdimm_mr)) {
-        error_setg(&local_err, "cannot change property value");
-        goto out;
-    }
-
-    nvdimm->unarmed = value;
-
- out:
-    error_propagate(errp, local_err);
-}
-
 static void nvdimm_init(Object *obj)
 {
     object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
                         nvdimm_get_label_size, nvdimm_set_label_size, NULL,
                         NULL, NULL);
-    object_property_add_bool(obj, NVDIMM_UNARMED_PROP,
-                             nvdimm_get_unarmed, nvdimm_set_unarmed, NULL);
 }
 
 static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
@@ -166,13 +141,20 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, const void *buf,
     memory_region_set_dirty(mr, backend_offset, size);
 }
 
+static Property nvdimm_properties[] = {
+    DEFINE_PROP_BOOL(NVDIMM_UNARMED_PROP, NVDIMMDevice, unarmed, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void nvdimm_class_init(ObjectClass *oc, void *data)
 {
     PCDIMMDeviceClass *ddc = PC_DIMM_CLASS(oc);
     NVDIMMClass *nvc = NVDIMM_CLASS(oc);
+    DeviceClass *dc = DEVICE_CLASS(oc);
 
     ddc->realize = nvdimm_realize;
     ddc->get_memory_region = nvdimm_get_memory_region;
+    dc->props = nvdimm_properties;
 
     nvc->read_label_data = nvdimm_read_label_data;
     nvc->write_label_data = nvdimm_write_label_data;

From eb7fd4d0f64fcab2da9ae454a1f214174e881372 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:39 +0200
Subject: [PATCH 26/60] nvdimm: convert nvdimm_mr into a pointer

This way we can easily check if the region has already been initialized
without having to rely on the size of an uninitialized region being 0.

Free the region in nvdimm_finalize() and not in unrealize() as we will
allow creating the region before realization in the following patches.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-11-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/nvdimm.c         | 17 +++++++++++++----
 include/hw/mem/nvdimm.h |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7260c9c6b1..afd3912d6b 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -43,7 +43,7 @@ static void nvdimm_set_label_size(Object *obj, Visitor *v, const char *name,
     Error *local_err = NULL;
     uint64_t value;
 
-    if (memory_region_size(&nvdimm->nvdimm_mr)) {
+    if (nvdimm->nvdimm_mr) {
         error_setg(&local_err, "cannot change property value");
         goto out;
     }
@@ -71,11 +71,18 @@ static void nvdimm_init(Object *obj)
                         NULL, NULL);
 }
 
+static void nvdimm_finalize(Object *obj)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+    g_free(nvdimm->nvdimm_mr);
+}
+
 static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
 {
     NVDIMMDevice *nvdimm = NVDIMM(dimm);
 
-    return &nvdimm->nvdimm_mr;
+    return nvdimm->nvdimm_mr;
 }
 
 static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
@@ -102,9 +109,10 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
         return;
     }
 
-    memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dimm),
+    nvdimm->nvdimm_mr = g_new(MemoryRegion, 1);
+    memory_region_init_alias(nvdimm->nvdimm_mr, OBJECT(dimm),
                              "nvdimm-memory", mr, 0, pmem_size);
-    nvdimm->nvdimm_mr.align = align;
+    nvdimm->nvdimm_mr->align = align;
 }
 
 /*
@@ -167,6 +175,7 @@ static TypeInfo nvdimm_info = {
     .class_init    = nvdimm_class_init,
     .instance_size = sizeof(NVDIMMDevice),
     .instance_init = nvdimm_init,
+    .instance_finalize = nvdimm_finalize,
 };
 
 static void nvdimm_register_types(void)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 9340631cfc..c5c9b3c7f8 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -74,7 +74,7 @@ struct NVDIMMDevice {
      * it's the PMEM region in NVDIMM device, which is presented to
      * guest via ACPI NFIT and _FIT method if NVDIMM hotplug is supported.
      */
-    MemoryRegion nvdimm_mr;
+    MemoryRegion *nvdimm_mr;
 
     /*
      * The 'on' value results in the unarmed flag set in ACPI NFIT,

From a4659a8ef424928f654707ca637ba133cbe22396 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:40 +0200
Subject: [PATCH 27/60] nvdimm: make get_memory_region() perform checks and
 initialization

We might get a call to get_memory_region() before the device has been
realized. We should return a consistent value, as the return value
will e.g. later on be used in the pre_plug handler.

To avoid duplicating too much code, factor the initialization and checks
out into a helper function.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-12-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/nvdimm.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index afd3912d6b..021d1c3997 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -78,20 +78,22 @@ static void nvdimm_finalize(Object *obj)
     g_free(nvdimm->nvdimm_mr);
 }
 
-static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
+static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
 {
-    NVDIMMDevice *nvdimm = NVDIMM(dimm);
+    PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+    uint64_t align, pmem_size, size;
+    MemoryRegion *mr;
 
-    return nvdimm->nvdimm_mr;
-}
+    g_assert(!nvdimm->nvdimm_mr);
 
-static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
-{
-    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
-    NVDIMMDevice *nvdimm = NVDIMM(dimm);
-    uint64_t align, pmem_size, size = memory_region_size(mr);
+    if (!dimm->hostmem) {
+        error_setg(errp, "'" PC_DIMM_MEMDEV_PROP "' property must be set");
+        return;
+    }
 
+    mr = host_memory_backend_get_memory(dimm->hostmem);
     align = memory_region_get_alignment(mr);
+    size = memory_region_size(mr);
 
     pmem_size = size - nvdimm->label_size;
     nvdimm->label_data = memory_region_get_ram_ptr(mr) + pmem_size;
@@ -115,6 +117,30 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
     nvdimm->nvdimm_mr->align = align;
 }
 
+static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(dimm);
+    Error *local_err = NULL;
+
+    if (!nvdimm->nvdimm_mr) {
+        nvdimm_prepare_memory_region(nvdimm, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return NULL;
+        }
+    }
+    return nvdimm->nvdimm_mr;
+}
+
+static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(dimm);
+
+    if (!nvdimm->nvdimm_mr) {
+        nvdimm_prepare_memory_region(nvdimm, errp);
+    }
+}
+
 /*
  * the caller should check the input parameters before calling
  * label read/write functions.

From f0b7bca64dbe8a15c1f4285c6061ce3c81a4a5c7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 19 Jun 2018 15:41:41 +0200
Subject: [PATCH 28/60] pc-dimm: get_memory_region() will not fail after
 realize

Let's try to reduce error handling a bit. In the plug/unplug case, the
device was realized and therefore we can assume that getting access to
the memory region will not fail.

For get_vmstate_memory_region() this is already handled that way.
Document both cases.

Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180619134141.29478-13-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c             |  7 +------
 hw/mem/pc-dimm.c         |  7 +------
 hw/ppc/spapr.c           | 12 ++----------
 include/hw/mem/pc-dimm.h |  6 ++++--
 4 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 2db032a6df..f310040351 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1706,15 +1706,10 @@ static void pc_memory_plug(HotplugHandler *hotplug_dev,
     PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
-    MemoryRegion *mr;
+    MemoryRegion *mr = ddc->get_memory_region(dimm, &error_abort);
     uint64_t align = TARGET_PAGE_SIZE;
     bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
 
-    mr = ddc->get_memory_region(dimm, &local_err);
-    if (local_err) {
-        goto out;
-    }
-
     if (memory_region_get_alignment(mr) && pcmc->enforce_aligned_dimm) {
         align = memory_region_get_alignment(mr);
     }
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 4ff39b59ef..65843bc52a 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -37,15 +37,10 @@ void pc_dimm_plug(DeviceState *dev, MachineState *machine, uint64_t align,
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
     MemoryRegion *vmstate_mr = ddc->get_vmstate_memory_region(dimm,
                                                               &error_abort);
+    MemoryRegion *mr = ddc->get_memory_region(dimm, &error_abort);
     Error *local_err = NULL;
-    MemoryRegion *mr;
     uint64_t addr;
 
-    mr = ddc->get_memory_region(dimm, &local_err);
-    if (local_err) {
-        goto out;
-    }
-
     addr = object_property_get_uint(OBJECT(dimm),
                                     PC_DIMM_ADDR_PROP, &local_err);
     if (local_err) {
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3a1bd733be..b32b971a14 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3149,14 +3149,10 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     sPAPRMachineState *ms = SPAPR_MACHINE(hotplug_dev);
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
-    MemoryRegion *mr;
+    MemoryRegion *mr = ddc->get_memory_region(dimm, &error_abort);
     uint64_t align, size, addr;
     uint32_t node;
 
-    mr = ddc->get_memory_region(dimm, &local_err);
-    if (local_err) {
-        goto out;
-    }
     align = memory_region_get_alignment(mr);
     size = memory_region_size(mr);
 
@@ -3344,16 +3340,12 @@ static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
     Error *local_err = NULL;
     PCDIMMDevice *dimm = PC_DIMM(dev);
     PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
-    MemoryRegion *mr;
+    MemoryRegion *mr = ddc->get_memory_region(dimm, &error_abort);
     uint32_t nr_lmbs;
     uint64_t size, addr_start, addr;
     int i;
     sPAPRDRConnector *drc;
 
-    mr = ddc->get_memory_region(dimm, &local_err);
-    if (local_err) {
-        goto out;
-    }
     size = memory_region_size(mr);
     nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
 
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 5679a80465..26ebb7d5e9 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -62,9 +62,11 @@ typedef struct PCDIMMDevice {
  * @realize: called after common dimm is realized so that the dimm based
  * devices get the chance to do specified operations.
  * @get_memory_region: returns #MemoryRegion associated with @dimm which
- * is directly mapped into the physical address space of guest.
+ * is directly mapped into the physical address space of guest. Will not
+ * fail after the device was realized.
  * @get_vmstate_memory_region: returns #MemoryRegion which indicates the
- * memory of @dimm should be kept during live migration.
+ * memory of @dimm should be kept during live migration. Will not fail
+ * after the device was realized.
  */
 typedef struct PCDIMMDeviceClass {
     /* private */

From 178003ea49aef4273d94c3c002b8f15858070f68 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 22 Jun 2018 16:40:45 +0200
Subject: [PATCH 29/60] numa: report all DIMM/NVDIMMs as plugged memory

Right now, there is some inconsistency between hotplugged and
coldplugged memory. DIMMs added via "-device" result in different stats
than DIMMs added using "device_add".

E.g.
    [...]
    -numa node,nodeid=0,cpus=0-1 -numa node,nodeid=1,cpus=2-3 \
    -m 4G,maxmem=20G,slots=2 \
    -object memory-backend-ram,id=mem0,size=8G \
    -device pc-dimm,id=dimm0,memdev=mem0 \
    -object memory-backend-ram,id=mem1,size=8G \
    -device nvdimm,id=dimm1,memdev=mem1,node=1

Results in NUMA info
    (qemu) info numa
    info numa
    2 nodes
    node 0 cpus: 0 1
    node 0 size: 10240 MB
    node 0 plugged: 0 MB
    node 1 cpus: 2 3
    node 1 size: 10240 MB
    node 1 plugged: 0 MB

But in memory size summary:
    (qemu) info memory_size_summary
    info memory_size_summary
    base memory: 4294967296
    plugged memory: 17179869184

Make this consistent by reporting all hot and coldplugged
memory a.k.a. DIMM and NVDIMM as "plugged".

Fixes: 31959e82fb0 ("hmp: extend "info numa" with hotplugged memory information")
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180622144045.737-1-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 numa.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/numa.c b/numa.c
index 94f758c757..5f6367b989 100644
--- a/numa.c
+++ b/numa.c
@@ -566,10 +566,8 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
 
             if (pcdimm_info) {
                 node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
-                if (pcdimm_info->hotpluggable && pcdimm_info->hotplugged) {
-                    node_mem[pcdimm_info->node].node_plugged_mem +=
-                        pcdimm_info->size;
-                }
+                node_mem[pcdimm_info->node].node_plugged_mem +=
+                    pcdimm_info->size;
             }
         }
     }

From a1a98357e3fdfce92b5ed0c6728489b9992fecb5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 25 Jun 2018 16:51:39 +0200
Subject: [PATCH 30/60] osdep: work around Coverity parsing errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Coverity does not like the new _Float* types that are used by
recent glibc, and croaks on every single file that includes
stdlib.h.  Add dummy typedefs to please it.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/osdep.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 9ed62423c0..a91068df0e 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -33,6 +33,21 @@
 #else
 #include "exec/poison.h"
 #endif
+#ifdef __COVERITY__
+/* Coverity does not like the new _Float* types that are used by
+ * recent glibc, and croaks on every single file that includes
+ * stdlib.h.  These typedefs are enough to please it.
+ *
+ * Note that these fix parse errors so they cannot be placed in
+ * scripts/coverity-model.c.
+ */
+typedef float _Float32;
+typedef double _Float32x;
+typedef double _Float64;
+typedef __float80 _Float64x;
+typedef __float128 _Float128;
+#endif
+
 #include "qemu/compiler.h"
 
 /* Older versions of C++ don't get definitions of various macros from

From c44df2ff9be326b218f6655ee17ddd914ece8d5a Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Mon, 25 Jun 2018 20:22:13 +0200
Subject: [PATCH 31/60] Deprecate the -enable-hax option

We currently have got three ways of turning on the HAX accelerator:
"-machine accel=hax", "-accel hax" and "-enable-hax". That's really
confusing and overloaded. Since "-accel" is our preferred way to enable
an accelerator nowadays, and "-accel hax" is even less to type than
"-enable-hax", let's deprecate the "-enable-hax" option now.

Note: While "-enable-kvm" has been available for a long time and can hardly be
removed since it is used in a lot of upper layer tools and scripts, the
"-enable-hax" option is still rather new and not very widespread yet, so
I think that it should be OK if we remove this in a couple of releases again
(we'll see whether someone complains after seeing the deprecation message -
then we could still reconsider keeping it if there are well-founded reasons).

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1529950933-28347-1-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 qemu-doc.texi   | 5 +++++
 qemu-options.hx | 2 +-
 vl.c            | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index 16fcb47901..1cb3ba4341 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -2912,6 +2912,11 @@ Option @option{-virtioconsole} has been replaced by
 The @code{-clock} option is ignored since QEMU version 1.7.0. There is no
 replacement since it is not needed anymore.
 
+@subsection -enable-hax (since 3.0.0)
+
+The @option{-enable-hax} option has been replaced by @option{-accel hax}.
+Both options have been introduced in QEMU version 2.9.0.
+
 @section QEMU Machine Protocol (QMP) commands
 
 @subsection block-dirty-bitmap-add "autoload" parameter (since 2.12.0)
diff --git a/qemu-options.hx b/qemu-options.hx
index 270772817a..3e45483834 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3421,7 +3421,7 @@ STEXI
 Enable HAX (Hardware-based Acceleration eXecution) support. This option
 is only available if HAX support is enabled when compiling. HAX is only
 applicable to MAC and Windows platform, and thus does not conflict with
-KVM.
+KVM. This option is deprecated, use @option{-accel hax} instead.
 ETEXI
 
 DEF("xen-domid", HAS_ARG, QEMU_OPTION_xen_domid,
diff --git a/vl.c b/vl.c
index d26f19b06d..7c9f19aa31 100644
--- a/vl.c
+++ b/vl.c
@@ -3581,6 +3581,7 @@ int main(int argc, char **argv, char **envp)
                 qemu_opts_parse_noisily(olist, "accel=kvm", false);
                 break;
             case QEMU_OPTION_enable_hax:
+                warn_report("Option is deprecated, use '-accel hax' instead");
                 olist = qemu_find_opts("machine");
                 qemu_opts_parse_noisily(olist, "accel=hax", false);
                 break;

From 50fa332516d5e42695811f43396b749185e21b9c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 26 Jun 2018 13:55:04 +0200
Subject: [PATCH 32/60] pr-helper: fix --socket-path default in help
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently --help shows "(default '(null)')" for the -k/--socket-path
option.  Fix it by getting the default path in /var/run.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 scsi/qemu-pr-helper.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index d0f83176e1..4057cf355c 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -74,8 +74,16 @@ static int uid = -1;
 static int gid = -1;
 #endif
 
+static void compute_default_paths(void)
+{
+    if (!socket_path) {
+        socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
+    }
+}
+
 static void usage(const char *name)
 {
+    compute_default_paths();
     (printf) (
 "Usage: %s [OPTIONS] FILE\n"
 "Persistent Reservation helper program for QEMU\n"
@@ -845,13 +853,6 @@ static const char *socket_activation_validate_opts(void)
     return NULL;
 }
 
-static void compute_default_paths(void)
-{
-    if (!socket_path) {
-        socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
-    }
-}
-
 static void termsig_handler(int signum)
 {
     atomic_cmpxchg(&state, RUNNING, TERMINATE);

From 86933b4e7879e427e03365bf352c0964640cb37b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Jun 2018 19:36:15 +0200
Subject: [PATCH 33/60] pr-helper: fix assertion failure on failed multipath
 PERSISTENT RESERVE IN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The response size is expected to be zero if the SCSI status is not
"GOOD", but nothing was resetting it.

This can be reproduced simply by "sg_persist -s /dev/sdb" where /dev/sdb
in the guest is a scsi-block device corresponding to a multipath device
on the host.

Before:

  PR in (Read full status): Aborted command

and on the host:

  prh_write_response: Assertion `resp->sz == 0' failed.

After:

  PR in (Read full status): bad field in cdb or parameter list
  (perhaps unsupported service action)

Reported-by: Jiri Belka <jbelka@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 scsi/qemu-pr-helper.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index 4057cf355c..0218d65bbf 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -558,7 +558,11 @@ static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense,
 #ifdef CONFIG_MPATH
     if (is_mpath(fd)) {
         /* multipath_pr_in fills the whole input buffer.  */
-        return multipath_pr_in(fd, cdb, sense, data, *resp_sz);
+        int r = multipath_pr_in(fd, cdb, sense, data, *resp_sz);
+        if (r != GOOD) {
+            *resp_sz = 0;
+        }
+        return r;
     }
 #endif
 

From aad10040d411d21542dc9ae58a2854c89ccedd78 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 26 Jun 2018 15:39:18 +0200
Subject: [PATCH 34/60] pr-manager-helper: avoid SIGSEGV when writing to the
 socket fails

When writing to the qemu-pr-helper socket failed, the persistent
reservation manager was correctly disconnecting the socket, but it
did not clear pr_mgr->ioc.  So the rest of the code did not know
that the socket had been disconnected, accessed pr_mgr->ioc and
happily caused a crash.

To reproduce, it is enough to stop qemu-pr-helper between QEMU
startup and executing e.g. sg_persist -k /dev/sdb.

Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 scsi/pr-manager-helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c
index 82ff6b6123..0c0fe389b7 100644
--- a/scsi/pr-manager-helper.c
+++ b/scsi/pr-manager-helper.c
@@ -71,6 +71,7 @@ static int pr_manager_helper_write(PRManagerHelper *pr_mgr,
         if (n_written <= 0) {
             assert(n_written != QIO_CHANNEL_ERR_BLOCK);
             object_unref(OBJECT(pr_mgr->ioc));
+            pr_mgr->ioc = NULL;
             return n_written < 0 ? -EINVAL : 0;
         }
 

From 58b3017f7fba15e8c440115dfd5d380f490d0b61 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 28 Jun 2018 18:01:42 +0200
Subject: [PATCH 35/60] pr-manager: put stubs in .c file

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/scsi/pr-manager.h |  9 ---------
 scsi/Makefile.objs        |  1 +
 scsi/pr-manager-stub.c    | 24 ++++++++++++++++++++++++
 3 files changed, 25 insertions(+), 9 deletions(-)
 create mode 100644 scsi/pr-manager-stub.c

diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h
index 5d2f13a5e4..71971ae34a 100644
--- a/include/scsi/pr-manager.h
+++ b/include/scsi/pr-manager.h
@@ -41,15 +41,6 @@ BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
                                BlockCompletionFunc *complete,
                                void *opaque);
 
-#ifdef CONFIG_LINUX
 PRManager *pr_manager_lookup(const char *id, Error **errp);
-#else
-static inline PRManager *pr_manager_lookup(const char *id, Error **errp)
-{
-    /* The classes do not exist at all!  */
-    error_setg(errp, "No persistent reservation manager with id '%s'", id);
-    return NULL;
-}
-#endif
 
 #endif
diff --git a/scsi/Makefile.objs b/scsi/Makefile.objs
index 4d25e476cf..bb8789cd8b 100644
--- a/scsi/Makefile.objs
+++ b/scsi/Makefile.objs
@@ -1,3 +1,4 @@
 block-obj-y += utils.o
 
 block-obj-$(CONFIG_LINUX) += pr-manager.o pr-manager-helper.o
+block-obj-$(call lnot,$(CONFIG_LINUX)) += pr-manager-stub.o
diff --git a/scsi/pr-manager-stub.c b/scsi/pr-manager-stub.c
new file mode 100644
index 0000000000..632f17c7f9
--- /dev/null
+++ b/scsi/pr-manager-stub.c
@@ -0,0 +1,24 @@
+/*
+ * Persistent reservation manager - stub for non-Linux platforms
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This code is licensed under the LGPL.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "scsi/pr-manager.h"
+#include "trace.h"
+#include "qapi/qapi-types-block.h"
+#include "qapi/qapi-commands-block.h"
+
+PRManager *pr_manager_lookup(const char *id, Error **errp)
+{
+    /* The classes do not exist at all!  */
+    error_setg(errp, "No persistent reservation manager with id '%s'", id);
+        return NULL;
+}

From 5f64089416f0d77c87683401838f064c51a292ed Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 28 Feb 2018 18:47:57 +0100
Subject: [PATCH 36/60] pr-manager: add query-pr-managers QMP command

This command lets you query the connection status of each pr-manager-helper
object.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/scsi/pr-manager.h |  2 ++
 qapi/block.json           | 28 ++++++++++++++++++++++++
 scsi/pr-manager-helper.c  | 13 +++++++++++
 scsi/pr-manager-stub.c    |  6 ++++++
 scsi/pr-manager.c         | 45 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+)

diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h
index 71971ae34a..50a77b08fc 100644
--- a/include/scsi/pr-manager.h
+++ b/include/scsi/pr-manager.h
@@ -33,8 +33,10 @@ typedef struct PRManagerClass {
 
     /* <public> */
     int (*run)(PRManager *pr_mgr, int fd, struct sg_io_hdr *hdr);
+    bool (*is_connected)(PRManager *pr_mgr);
 } PRManagerClass;
 
+bool pr_manager_is_connected(PRManager *pr_mgr);
 BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
                                AioContext *ctx, int fd,
                                struct sg_io_hdr *hdr,
diff --git a/qapi/block.json b/qapi/block.json
index ca807f176a..8765c29a06 100644
--- a/qapi/block.json
+++ b/qapi/block.json
@@ -77,6 +77,34 @@
 { 'struct': 'BlockdevSnapshotInternal',
   'data': { 'device': 'str', 'name': 'str' } }
 
+##
+# @PRManagerInfo:
+#
+# Information about a persistent reservation manager
+#
+# @id: the identifier of the persistent reservation manager
+#
+# @connected: true if the persistent reservation manager is connected to
+#             the underlying storage or helper
+#
+# Since: 3.0
+##
+{ 'struct': 'PRManagerInfo',
+  'data': {'id': 'str', 'connected': 'bool'} }
+
+##
+# @query-pr-managers:
+#
+# Returns a list of information about each persistent reservation manager.
+#
+# Returns: a list of @PRManagerInfo for each persistent reservation manager
+#
+# Since: 3.0
+##
+{ 'command': 'query-pr-managers', 'returns': ['PRManagerInfo'],
+  'allow-preconfig': true }
+
+
 ##
 # @blockdev-snapshot-internal-sync:
 #
diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c
index 0c0fe389b7..b11481be9e 100644
--- a/scsi/pr-manager-helper.c
+++ b/scsi/pr-manager-helper.c
@@ -235,6 +235,18 @@ out:
     return ret;
 }
 
+static bool pr_manager_helper_is_connected(PRManager *p)
+{
+    PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(p);
+    bool result;
+
+    qemu_mutex_lock(&pr_mgr->lock);
+    result = (pr_mgr->ioc != NULL);
+    qemu_mutex_unlock(&pr_mgr->lock);
+
+    return result;
+}
+
 static void pr_manager_helper_complete(UserCreatable *uc, Error **errp)
 {
     PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(uc);
@@ -284,6 +296,7 @@ static void pr_manager_helper_class_init(ObjectClass *klass,
                                   &error_abort);
     uc_klass->complete = pr_manager_helper_complete;
     prmgr_klass->run = pr_manager_helper_run;
+    prmgr_klass->is_connected = pr_manager_helper_is_connected;
 }
 
 static const TypeInfo pr_manager_helper_info = {
diff --git a/scsi/pr-manager-stub.c b/scsi/pr-manager-stub.c
index 632f17c7f9..738b6d7425 100644
--- a/scsi/pr-manager-stub.c
+++ b/scsi/pr-manager-stub.c
@@ -22,3 +22,9 @@ PRManager *pr_manager_lookup(const char *id, Error **errp)
     error_setg(errp, "No persistent reservation manager with id '%s'", id);
         return NULL;
 }
+
+
+PRManagerInfoList *qmp_query_pr_managers(Error **errp)
+{
+    return NULL;
+}
diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c
index 87c45db5d4..2a8f300dde 100644
--- a/scsi/pr-manager.c
+++ b/scsi/pr-manager.c
@@ -17,6 +17,10 @@
 #include "block/thread-pool.h"
 #include "scsi/pr-manager.h"
 #include "trace.h"
+#include "qapi/qapi-types-block.h"
+#include "qapi/qapi-commands-block.h"
+
+#define PR_MANAGER_PATH     "/objects"
 
 typedef struct PRManagerData {
     PRManager *pr_mgr;
@@ -64,6 +68,14 @@ BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
                                   data, complete, opaque);
 }
 
+bool pr_manager_is_connected(PRManager *pr_mgr)
+{
+    PRManagerClass *pr_mgr_class =
+        PR_MANAGER_GET_CLASS(pr_mgr);
+
+    return !pr_mgr_class->is_connected || pr_mgr_class->is_connected(pr_mgr);
+}
+
 static const TypeInfo pr_manager_info = {
     .parent = TYPE_OBJECT,
     .name = TYPE_PR_MANAGER,
@@ -105,5 +117,38 @@ pr_manager_register_types(void)
     type_register_static(&pr_manager_info);
 }
 
+static int query_one_pr_manager(Object *object, void *opaque)
+{
+    PRManagerInfoList ***prev = opaque;
+    PRManagerInfoList *elem;
+    PRManagerInfo *info;
+    PRManager *pr_mgr;
+
+    pr_mgr = (PRManager *)object_dynamic_cast(object, TYPE_PR_MANAGER);
+    if (!pr_mgr) {
+        return 0;
+    }
+
+    elem = g_new0(PRManagerInfoList, 1);
+    info = g_new0(PRManagerInfo, 1);
+    info->id = object_get_canonical_path_component(object);
+    info->connected = pr_manager_is_connected(pr_mgr);
+    elem->value = info;
+    elem->next = NULL;
+
+    **prev = elem;
+    *prev = &elem->next;
+    return 0;
+}
+
+PRManagerInfoList *qmp_query_pr_managers(Error **errp)
+{
+    PRManagerInfoList *head = NULL;
+    PRManagerInfoList **prev = &head;
+    Object *container = container_get(object_get_root(), PR_MANAGER_PATH);
+
+    object_child_foreach(container, query_one_pr_manager, &prev);
+    return head;
+}
 
 type_init(pr_manager_register_types);

From e2c81a45101fdddfd47477a1805806f2c76639bf Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 28 Feb 2018 19:01:40 +0100
Subject: [PATCH 37/60] pr-manager-helper: report event on
 connection/disconnection

Let management know if there were any problems communicating with
qemu-pr-helper.  The event is edge-triggered, and is sent every time
the connection status of the pr-manager-helper object changes.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 qapi/block.json          | 24 ++++++++++++++++++++++++
 scsi/pr-manager-helper.c | 14 ++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/qapi/block.json b/qapi/block.json
index 8765c29a06..11f01f28ef 100644
--- a/qapi/block.json
+++ b/qapi/block.json
@@ -358,6 +358,30 @@
 { 'event': 'DEVICE_TRAY_MOVED',
   'data': { 'device': 'str', 'id': 'str', 'tray-open': 'bool' } }
 
+##
+# @PR_MANAGER_STATUS_CHANGED:
+#
+# Emitted whenever the connected status of a persistent reservation
+# manager changes.
+#
+# @id: The id of the PR manager object
+#
+# @connected: true if the PR manager is connected to a backend
+#
+# Since: 3.0
+#
+# Example:
+#
+# <- { "event": "PR_MANAGER_STATUS_CHANGED",
+#      "data": { "id": "pr-helper0",
+#                "connected": true
+#      },
+#      "timestamp": { "seconds": 1519840375, "microseconds": 450486 } }
+#
+##
+{ 'event': 'PR_MANAGER_STATUS_CHANGED',
+  'data': { 'id': 'str', 'connected': 'bool' } }
+
 ##
 # @QuorumOpType:
 #
diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c
index b11481be9e..519a296905 100644
--- a/scsi/pr-manager-helper.c
+++ b/scsi/pr-manager-helper.c
@@ -17,6 +17,7 @@
 #include "io/channel.h"
 #include "io/channel-socket.h"
 #include "pr-helper.h"
+#include "qapi/qapi-events-block.h"
 
 #include <scsi/sg.h>
 
@@ -38,6 +39,16 @@ typedef struct PRManagerHelper {
     QIOChannel *ioc;
 } PRManagerHelper;
 
+static void pr_manager_send_status_changed_event(PRManagerHelper *pr_mgr)
+{
+    char *id = object_get_canonical_path_component(OBJECT(pr_mgr));
+
+    if (id) {
+        qapi_event_send_pr_manager_status_changed(id, !!pr_mgr->ioc,
+                                                  &error_abort);
+    }
+}
+
 /* Called with lock held.  */
 static int pr_manager_helper_read(PRManagerHelper *pr_mgr,
                                   void *buf, int sz, Error **errp)
@@ -47,6 +58,7 @@ static int pr_manager_helper_read(PRManagerHelper *pr_mgr,
     if (r < 0) {
         object_unref(OBJECT(pr_mgr->ioc));
         pr_mgr->ioc = NULL;
+        pr_manager_send_status_changed_event(pr_mgr);
         return -EINVAL;
     }
 
@@ -72,6 +84,7 @@ static int pr_manager_helper_write(PRManagerHelper *pr_mgr,
             assert(n_written != QIO_CHANNEL_ERR_BLOCK);
             object_unref(OBJECT(pr_mgr->ioc));
             pr_mgr->ioc = NULL;
+            pr_manager_send_status_changed_event(pr_mgr);
             return n_written < 0 ? -EINVAL : 0;
         }
 
@@ -127,6 +140,7 @@ static int pr_manager_helper_initialize(PRManagerHelper *pr_mgr,
         goto out_close;
     }
 
+    pr_manager_send_status_changed_event(pr_mgr);
     return 0;
 
 out_close:

From 09eb69a573521b90cfa5b2c1c02e01adceb5405f Mon Sep 17 00:00:00 2001
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Date: Wed, 13 Jun 2018 10:47:26 +0100
Subject: [PATCH 38/60] hw/mips/jazz: create ESP device directly via qdev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MIPS jazz is the last user of the legacy esp_init() function so move creation
of the ESP device over to use qdev.

Note that the esp_reset and dma_enable qemu_irqs are currently unused and so
we do not wire these up and instead remove the variables to prevent the
compiler emitting unused variable warnings.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Message-Id: <20180613094727.11326-2-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Hervé Poussineau <hpoussin@reactos.org>
---
 hw/mips/mips_jazz.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/hw/mips/mips_jazz.c b/hw/mips/mips_jazz.c
index 90cb306f53..1afbe3ce6a 100644
--- a/hw/mips/mips_jazz.c
+++ b/hw/mips/mips_jazz.c
@@ -145,10 +145,10 @@ static void mips_jazz_init(MachineState *machine,
     ISABus *isa_bus;
     ISADevice *pit;
     DriveInfo *fds[MAX_FD];
-    qemu_irq esp_reset, dma_enable;
     MemoryRegion *ram = g_new(MemoryRegion, 1);
     MemoryRegion *bios = g_new(MemoryRegion, 1);
     MemoryRegion *bios2 = g_new(MemoryRegion, 1);
+    SysBusESPState *sysbus_esp;
     ESPState *esp;
 
     /* init CPUs */
@@ -281,8 +281,21 @@ static void mips_jazz_init(MachineState *machine,
     }
 
     /* SCSI adapter */
-    esp = esp_init(0x80002000, 0, rc4030_dma_read, rc4030_dma_write, dmas[0],
-                   qdev_get_gpio_in(rc4030, 5), &esp_reset, &dma_enable);
+    dev = qdev_create(NULL, TYPE_ESP);
+    sysbus_esp = ESP_STATE(dev);
+    esp = &sysbus_esp->esp;
+    esp->dma_memory_read = rc4030_dma_read;
+    esp->dma_memory_write = rc4030_dma_write;
+    esp->dma_opaque = dmas[0];
+    sysbus_esp->it_shift = 0;
+    /* XXX for now until rc4030 has been changed to use DMA enable signal */
+    esp->dma_enabled = 1;
+    qdev_init_nofail(dev);
+
+    sysbus = SYS_BUS_DEVICE(dev);
+    sysbus_connect_irq(sysbus, 0, qdev_get_gpio_in(rc4030, 5));
+    sysbus_mmio_map(sysbus, 0, 0x80002000);
+
     scsi_bus_legacy_handle_cmdline(&esp->bus);
 
     /* Floppy */

From e7d99825f018cf4e658c3eb10c0163e75e653a23 Mon Sep 17 00:00:00 2001
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Date: Wed, 13 Jun 2018 10:47:27 +0100
Subject: [PATCH 39/60] esp: remove legacy esp_init() function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the legacy esp_init() function now that there are no more remaining
users.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Message-Id: <20180613094727.11326-3-mark.cave-ayland@ilande.co.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Hervé Poussineau <hpoussin@reactos.org>
---
 hw/scsi/esp.c         | 30 ------------------------------
 include/hw/scsi/esp.h |  5 -----
 2 files changed, 35 deletions(-)

diff --git a/hw/scsi/esp.c b/hw/scsi/esp.c
index 9ed9727744..630d923623 100644
--- a/hw/scsi/esp.c
+++ b/hw/scsi/esp.c
@@ -619,36 +619,6 @@ static const MemoryRegionOps sysbus_esp_mem_ops = {
     .valid.accepts = esp_mem_accepts,
 };
 
-ESPState *esp_init(hwaddr espaddr, int it_shift,
-                   ESPDMAMemoryReadWriteFunc dma_memory_read,
-                   ESPDMAMemoryReadWriteFunc dma_memory_write,
-                   void *dma_opaque, qemu_irq irq, qemu_irq *reset,
-                   qemu_irq *dma_enable)
-{
-    DeviceState *dev;
-    SysBusDevice *s;
-    SysBusESPState *sysbus;
-    ESPState *esp;
-
-    dev = qdev_create(NULL, TYPE_ESP);
-    sysbus = ESP_STATE(dev);
-    esp = &sysbus->esp;
-    esp->dma_memory_read = dma_memory_read;
-    esp->dma_memory_write = dma_memory_write;
-    esp->dma_opaque = dma_opaque;
-    sysbus->it_shift = it_shift;
-    /* XXX for now until rc4030 has been changed to use DMA enable signal */
-    esp->dma_enabled = 1;
-    qdev_init_nofail(dev);
-    s = SYS_BUS_DEVICE(dev);
-    sysbus_connect_irq(s, 0, irq);
-    sysbus_mmio_map(s, 0, espaddr);
-    *reset = qdev_get_gpio_in(dev, 0);
-    *dma_enable = qdev_get_gpio_in(dev, 1);
-
-    return esp;
-}
-
 static const struct SCSIBusInfo esp_scsi_info = {
     .tcq = false,
     .max_target = ESP_MAX_DEVS,
diff --git a/include/hw/scsi/esp.h b/include/hw/scsi/esp.h
index 93fdaced67..682a0d2de0 100644
--- a/include/hw/scsi/esp.h
+++ b/include/hw/scsi/esp.h
@@ -131,11 +131,6 @@ typedef struct {
 #define TCHI_FAS100A 0x4
 #define TCHI_AM53C974 0x12
 
-ESPState *esp_init(hwaddr espaddr, int it_shift,
-                   ESPDMAMemoryReadWriteFunc dma_memory_read,
-                   ESPDMAMemoryReadWriteFunc dma_memory_write,
-                   void *dma_opaque, qemu_irq irq, qemu_irq *reset,
-                   qemu_irq *dma_enable);
 void esp_dma_enable(ESPState *s, int irq, int level);
 void esp_request_cancelled(SCSIRequest *req);
 void esp_command_complete(SCSIRequest *req, uint32_t status, size_t resid);

From e1753a7e1d8174f5861367504c5cea5fbcd85953 Mon Sep 17 00:00:00 2001
From: "Justin Terry (VM)" <juterry@microsoft.com>
Date: Tue, 5 Jun 2018 22:15:27 +0000
Subject: [PATCH 40/60] WHPX workaround bug in OSVW handling

Adds a workaround for the hypervisor incorrectly setting
CPUID Fn8000_0001_ECX[bit 9 OSVW] = 1. This can cause a guest Linux kernel
to panic when a rdmsr of C001_0140h returns 0. Disabling this feature
correctly allows the guest to boot without accessing the OSVW workarounds.

Signed-off-by: Justin Terry (VM) <juterry@microsoft.com>
Message-Id: <20180605221500.21674-1-juterry@microsoft.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/whpx-all.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/target/i386/whpx-all.c b/target/i386/whpx-all.c
index 6b42096698..99501bac57 100644
--- a/target/i386/whpx-all.c
+++ b/target/i386/whpx-all.c
@@ -961,6 +961,16 @@ static int whpx_vcpu_run(CPUState *cpu)
                     vcpu->exit_ctx.CpuidAccess.DefaultResultRcx |
                     CPUID_EXT_HYPERVISOR;
 
+                rdx = vcpu->exit_ctx.CpuidAccess.DefaultResultRdx;
+                rbx = vcpu->exit_ctx.CpuidAccess.DefaultResultRbx;
+                break;
+            case 0x80000001:
+                rax = vcpu->exit_ctx.CpuidAccess.DefaultResultRax;
+                /* Remove any support of OSVW */
+                rcx =
+                    vcpu->exit_ctx.CpuidAccess.DefaultResultRcx &
+                    ~CPUID_EXT3_OSVW;
+
                 rdx = vcpu->exit_ctx.CpuidAccess.DefaultResultRdx;
                 rbx = vcpu->exit_ctx.CpuidAccess.DefaultResultRbx;
                 break;
@@ -1382,12 +1392,13 @@ static int whpx_accel_init(MachineState *ms)
         goto error;
     }
 
-    UINT32 cpuidExitList[] = {1};
+    UINT32 cpuidExitList[] = {1, 0x80000001};
     hr = whp_dispatch.WHvSetPartitionProperty(
         whpx->partition,
         WHvPartitionPropertyCodeCpuidExitList,
         cpuidExitList,
         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
+
     if (FAILED(hr)) {
         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                      hr);

From e7ca549fc8caf9b1c79814f3854622448815f2bf Mon Sep 17 00:00:00 2001
From: "Justin Terry (VM)" <juterry@microsoft.com>
Date: Tue, 5 Jun 2018 22:15:28 +0000
Subject: [PATCH 41/60] WHPX: register for unrecognized MSR exits

Some variations of Linux kernels end up accessing MSRs that the Windows
Hypervisor doesn't implement, which causes a GP to be returned to the guest.
This fix registers QEMU for unimplemented MSR access and globally returns 0 on
reads and ignores writes. This behavior allows the Linux kernel to probe the
MSR with the write/read/check sequence it often performs without failing the
access.

Signed-off-by: Justin Terry (VM) <juterry@microsoft.com>
Message-Id: <20180605221500.21674-2-juterry@microsoft.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/whpx-all.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/target/i386/whpx-all.c b/target/i386/whpx-all.c
index 99501bac57..57e53e1f1f 100644
--- a/target/i386/whpx-all.c
+++ b/target/i386/whpx-all.c
@@ -932,6 +932,7 @@ static int whpx_vcpu_run(CPUState *cpu)
 
         case WHvRunVpExitReasonX64InterruptWindow:
             vcpu->window_registered = 0;
+            ret = 0;
             break;
 
         case WHvRunVpExitReasonX64Halt:
@@ -943,6 +944,40 @@ static int whpx_vcpu_run(CPUState *cpu)
             ret = 1;
             break;
 
+        case WHvRunVpExitReasonX64MsrAccess: {
+            WHV_REGISTER_VALUE reg_values[3] = {0};
+            WHV_REGISTER_NAME reg_names[3];
+            UINT32 reg_count;
+
+            reg_names[0] = WHvX64RegisterRip;
+            reg_names[1] = WHvX64RegisterRax;
+            reg_names[2] = WHvX64RegisterRdx;
+
+            reg_values[0].Reg64 =
+                vcpu->exit_ctx.VpContext.Rip +
+                vcpu->exit_ctx.VpContext.InstructionLength;
+
+            /*
+             * For all unsupported MSR access we:
+             *     ignore writes
+             *     return 0 on read.
+             */
+            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
+                        1 : 3;
+
+            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
+                whpx->partition,
+                cpu->cpu_index,
+                reg_names, reg_count,
+                reg_values);
+
+            if (FAILED(hr)) {
+                error_report("WHPX: Failed to set MsrAccess state "
+                             " registers, hr=%08lx", hr);
+            }
+            ret = 0;
+            break;
+        }
         case WHvRunVpExitReasonX64Cpuid: {
             WHV_REGISTER_VALUE reg_values[5];
             WHV_REGISTER_NAME reg_names[5];
@@ -1010,7 +1045,6 @@ static int whpx_vcpu_run(CPUState *cpu)
         case WHvRunVpExitReasonUnrecoverableException:
         case WHvRunVpExitReasonInvalidVpRegisterValue:
         case WHvRunVpExitReasonUnsupportedFeature:
-        case WHvRunVpExitReasonX64MsrAccess:
         case WHvRunVpExitReasonException:
         default:
             error_report("WHPX: Unexpected VP exit code %d",
@@ -1378,6 +1412,7 @@ static int whpx_accel_init(MachineState *ms)
     }
 
     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
+    prop.ExtendedVmExits.X64MsrExit = 1;
     prop.ExtendedVmExits.X64CpuidExit = 1;
     hr = whp_dispatch.WHvSetPartitionProperty(
         whpx->partition,
@@ -1386,8 +1421,8 @@ static int whpx_accel_init(MachineState *ms)
         sizeof(WHV_PARTITION_PROPERTY));
 
     if (FAILED(hr)) {
-        error_report("WHPX: Failed to enable partition extended X64CpuidExit"
-                     " hr=%08lx", hr);
+        error_report("WHPX: Failed to enable partition extended X64MsrExit and"
+                     " X64CpuidExit hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

From fc051ae6c42216ca87145106b509fa3bdfa98e00 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Mon, 4 Jun 2018 13:25:11 +1000
Subject: [PATCH 42/60] memory/hmp: Print owners/parents in "info mtree"

This adds printing of owners/parents (which are the same, except that
occasionally owner==NULL) for memory regions; a new '-o' flag
enables the new output.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Message-Id: <20180604032511.6980-1-aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hmp-commands-info.hx  |  7 +++--
 include/exec/memory.h |  2 +-
 memory.c              | 72 +++++++++++++++++++++++++++++++++++++------
 monitor.c             |  4 ++-
 4 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 6db3457a78..59bdd8f713 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -253,10 +253,11 @@ ETEXI
 
     {
         .name       = "mtree",
-        .args_type  = "flatview:-f,dispatch_tree:-d",
-        .params     = "[-f][-d]",
+        .args_type  = "flatview:-f,dispatch_tree:-d,owner:-o",
+        .params     = "[-f][-d][-o]",
         .help       = "show memory tree (-f: dump flat view for address spaces;"
-                      "-d: dump dispatch tree, valid with -f only)",
+                      "-d: dump dispatch tree, valid with -f only);"
+                      "-o: dump region owners/parents",
         .cmd        = hmp_info_mtree,
     },
 
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 050323f532..448d41a752 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1665,7 +1665,7 @@ void memory_global_dirty_log_start(void);
 void memory_global_dirty_log_stop(void);
 
 void mtree_info(fprintf_function mon_printf, void *f, bool flatview,
-                bool dispatch_tree);
+                bool dispatch_tree, bool owner);
 
 /**
  * memory_region_request_mmio_ptr: request a pointer to an mmio
diff --git a/memory.c b/memory.c
index 21aa57d24c..e9cd446968 100644
--- a/memory.c
+++ b/memory.c
@@ -2858,10 +2858,49 @@ typedef QTAILQ_HEAD(mrqueue, MemoryRegionList) MemoryRegionListHead;
                            int128_sub((size), int128_one())) : 0)
 #define MTREE_INDENT "  "
 
+static void mtree_expand_owner(fprintf_function mon_printf, void *f,
+                               const char *label, Object *obj)
+{
+    DeviceState *dev = (DeviceState *) object_dynamic_cast(obj, TYPE_DEVICE);
+
+    mon_printf(f, " %s:{%s", label, dev ? "dev" : "obj");
+    if (dev && dev->id) {
+        mon_printf(f, " id=%s", dev->id);
+    } else {
+        gchar *canonical_path = object_get_canonical_path(obj);
+        if (canonical_path) {
+            mon_printf(f, " path=%s", canonical_path);
+            g_free(canonical_path);
+        } else {
+            mon_printf(f, " type=%s", object_get_typename(obj));
+        }
+    }
+    mon_printf(f, "}");
+}
+
+static void mtree_print_mr_owner(fprintf_function mon_printf, void *f,
+                                 const MemoryRegion *mr)
+{
+    Object *owner = mr->owner;
+    Object *parent = memory_region_owner((MemoryRegion *)mr);
+
+    if (!owner && !parent) {
+        mon_printf(f, " orphan");
+        return;
+    }
+    if (owner) {
+        mtree_expand_owner(mon_printf, f, "owner", owner);
+    }
+    if (parent && parent != owner) {
+        mtree_expand_owner(mon_printf, f, "parent", parent);
+    }
+}
+
 static void mtree_print_mr(fprintf_function mon_printf, void *f,
                            const MemoryRegion *mr, unsigned int level,
                            hwaddr base,
-                           MemoryRegionListHead *alias_print_queue)
+                           MemoryRegionListHead *alias_print_queue,
+                           bool owner)
 {
     MemoryRegionList *new_ml, *ml, *next_ml;
     MemoryRegionListHead submr_print_queue;
@@ -2907,7 +2946,7 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
         }
         mon_printf(f, TARGET_FMT_plx "-" TARGET_FMT_plx
                    " (prio %d, %s): alias %s @%s " TARGET_FMT_plx
-                   "-" TARGET_FMT_plx "%s\n",
+                   "-" TARGET_FMT_plx "%s",
                    cur_start, cur_end,
                    mr->priority,
                    memory_region_type((MemoryRegion *)mr),
@@ -2916,15 +2955,22 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
                    mr->alias_offset,
                    mr->alias_offset + MR_SIZE(mr->size),
                    mr->enabled ? "" : " [disabled]");
+        if (owner) {
+            mtree_print_mr_owner(mon_printf, f, mr);
+        }
     } else {
         mon_printf(f,
-                   TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %s): %s%s\n",
+                   TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %s): %s%s",
                    cur_start, cur_end,
                    mr->priority,
                    memory_region_type((MemoryRegion *)mr),
                    memory_region_name(mr),
                    mr->enabled ? "" : " [disabled]");
+        if (owner) {
+            mtree_print_mr_owner(mon_printf, f, mr);
+        }
     }
+    mon_printf(f, "\n");
 
     QTAILQ_INIT(&submr_print_queue);
 
@@ -2947,7 +2993,7 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
 
     QTAILQ_FOREACH(ml, &submr_print_queue, mrqueue) {
         mtree_print_mr(mon_printf, f, ml->mr, level + 1, cur_start,
-                       alias_print_queue);
+                       alias_print_queue, owner);
     }
 
     QTAILQ_FOREACH_SAFE(ml, &submr_print_queue, mrqueue, next_ml) {
@@ -2960,6 +3006,7 @@ struct FlatViewInfo {
     void *f;
     int counter;
     bool dispatch_tree;
+    bool owner;
 };
 
 static void mtree_print_flatview(gpointer key, gpointer value,
@@ -3000,7 +3047,7 @@ static void mtree_print_flatview(gpointer key, gpointer value,
         mr = range->mr;
         if (range->offset_in_region) {
             p(f, MTREE_INDENT TARGET_FMT_plx "-"
-              TARGET_FMT_plx " (prio %d, %s): %s @" TARGET_FMT_plx "\n",
+              TARGET_FMT_plx " (prio %d, %s): %s @" TARGET_FMT_plx,
               int128_get64(range->addr.start),
               int128_get64(range->addr.start) + MR_SIZE(range->addr.size),
               mr->priority,
@@ -3009,13 +3056,17 @@ static void mtree_print_flatview(gpointer key, gpointer value,
               range->offset_in_region);
         } else {
             p(f, MTREE_INDENT TARGET_FMT_plx "-"
-              TARGET_FMT_plx " (prio %d, %s): %s\n",
+              TARGET_FMT_plx " (prio %d, %s): %s",
               int128_get64(range->addr.start),
               int128_get64(range->addr.start) + MR_SIZE(range->addr.size),
               mr->priority,
               range->readonly ? "rom" : memory_region_type(mr),
               memory_region_name(mr));
         }
+        if (fvi->owner) {
+            mtree_print_mr_owner(p, f, mr);
+        }
+        p(f, "\n");
         range++;
     }
 
@@ -3041,7 +3092,7 @@ static gboolean mtree_info_flatview_free(gpointer key, gpointer value,
 }
 
 void mtree_info(fprintf_function mon_printf, void *f, bool flatview,
-                bool dispatch_tree)
+                bool dispatch_tree, bool owner)
 {
     MemoryRegionListHead ml_head;
     MemoryRegionList *ml, *ml2;
@@ -3053,7 +3104,8 @@ void mtree_info(fprintf_function mon_printf, void *f, bool flatview,
             .mon_printf = mon_printf,
             .f = f,
             .counter = 0,
-            .dispatch_tree = dispatch_tree
+            .dispatch_tree = dispatch_tree,
+            .owner = owner,
         };
         GArray *fv_address_spaces;
         GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal);
@@ -3085,14 +3137,14 @@ void mtree_info(fprintf_function mon_printf, void *f, bool flatview,
 
     QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
         mon_printf(f, "address-space: %s\n", as->name);
-        mtree_print_mr(mon_printf, f, as->root, 1, 0, &ml_head);
+        mtree_print_mr(mon_printf, f, as->root, 1, 0, &ml_head, owner);
         mon_printf(f, "\n");
     }
 
     /* print aliased regions */
     QTAILQ_FOREACH(ml, &ml_head, mrqueue) {
         mon_printf(f, "memory-region: %s\n", memory_region_name(ml->mr));
-        mtree_print_mr(mon_printf, f, ml->mr, 1, 0, &ml_head);
+        mtree_print_mr(mon_printf, f, ml->mr, 1, 0, &ml_head, owner);
         mon_printf(f, "\n");
     }
 
diff --git a/monitor.c b/monitor.c
index 0730a27172..0988eb4788 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2007,8 +2007,10 @@ static void hmp_info_mtree(Monitor *mon, const QDict *qdict)
 {
     bool flatview = qdict_get_try_bool(qdict, "flatview", false);
     bool dispatch_tree = qdict_get_try_bool(qdict, "dispatch_tree", false);
+    bool owner = qdict_get_try_bool(qdict, "owner", false);
 
-    mtree_info((fprintf_function)monitor_printf, mon, flatview, dispatch_tree);
+    mtree_info((fprintf_function)monitor_printf, mon, flatview, dispatch_tree,
+               owner);
 }
 
 static void hmp_info_numa(Monitor *mon, const QDict *qdict)

From 02f7fd25a446a220905c2e5cb0fc3655d7f63b29 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 3 Apr 2018 17:36:11 +0200
Subject: [PATCH 43/60] target-i386: Add NMI interception to SVM

Check for SVM interception prior to injecting an NMI. Tested via the
Jailhouse hypervisor.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <c65877e9a011ee4962931287e59f502c482b8d0b.1522769774.git.jan.kiszka@web.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/seg_helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
index 600a4d7586..00301a0c04 100644
--- a/target/i386/seg_helper.c
+++ b/target/i386/seg_helper.c
@@ -1337,6 +1337,7 @@ bool x86_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
             ret = true;
         } else if ((interrupt_request & CPU_INTERRUPT_NMI) &&
                    !(env->hflags2 & HF2_NMI_MASK)) {
+            cpu_svm_check_intercept_param(env, SVM_EXIT_NMI, 0, 0);
             cs->interrupt_request &= ~CPU_INTERRUPT_NMI;
             env->hflags2 |= HF2_NMI_MASK;
             do_interrupt_x86_hardirq(env, EXCP02_NMI, 1);

From df2518aa587a0157bbfbc635fe47295629d9914a Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 3 Apr 2018 17:36:12 +0200
Subject: [PATCH 44/60] target-i386: Allow interrupt injection after STGI

We need to terminate the translation block after STGI so that pending
interrupts can be injected.

This fixes pending NMI injection for Jailhouse which uses "stgi; clgi"
to open a brief injection window.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <37939b244dda0e9cccf96ce50f2b15df1e48315d.1522769774.git.jan.kiszka@web.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/translate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/i386/translate.c b/target/i386/translate.c
index c91849417b..07d185e7b6 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -7444,8 +7444,9 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
                 break;
             }
             gen_update_cc_op(s);
-            gen_jmp_im(pc_start - s->cs_base);
             gen_helper_stgi(cpu_env);
+            gen_jmp_im(s->pc - s->cs_base);
+            gen_eob(s);
             break;
 
         case 0xdd: /* CLGI */

From 50b3de6e5cd464dcc20e3a48f5a09e0299a184ac Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 3 Apr 2018 17:36:13 +0200
Subject: [PATCH 45/60] target-i386: Mark cpu_vmexit noreturn

It calls cpu_loop_exit in system emulation mode (and should never be
called in user emulation mode).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <6f4d44ffde55d074cbceb48309c1678600abad2f.1522769774.git.jan.kiszka@web.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h        | 4 ++--
 target/i386/svm_helper.c | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 89c82be8d2..16c59b7099 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1840,8 +1840,8 @@ void helper_lock_init(void);
 /* svm_helper.c */
 void cpu_svm_check_intercept_param(CPUX86State *env1, uint32_t type,
                                    uint64_t param, uintptr_t retaddr);
-void cpu_vmexit(CPUX86State *nenv, uint32_t exit_code, uint64_t exit_info_1,
-                uintptr_t retaddr);
+void QEMU_NORETURN cpu_vmexit(CPUX86State *nenv, uint32_t exit_code,
+                              uint64_t exit_info_1, uintptr_t retaddr);
 void do_vmexit(CPUX86State *env, uint32_t exit_code, uint64_t exit_info_1);
 
 /* seg_helper.c */
diff --git a/target/i386/svm_helper.c b/target/i386/svm_helper.c
index 350492359c..f245aec310 100644
--- a/target/i386/svm_helper.c
+++ b/target/i386/svm_helper.c
@@ -62,6 +62,7 @@ void helper_invlpga(CPUX86State *env, int aflag)
 void cpu_vmexit(CPUX86State *nenv, uint32_t exit_code, uint64_t exit_info_1,
                 uintptr_t retaddr)
 {
+    assert(0);
 }
 
 void helper_svm_check_intercept_param(CPUX86State *env, uint32_t type,

From 0cda9d876c7d4b05cac164020e8cbafa4adb3728 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 29 Dec 2017 15:31:00 +0800
Subject: [PATCH 46/60] doc: another fix to "info pic"

Something that commit 254316fa1f ("intc: make HMP 'info irq' and 'info
pic' commands available on all targets", 2016-10-04) forgot to touch up.

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20171229073104.3810-2-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hmp-commands-info.hx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 59bdd8f713..a482b6e56b 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -201,7 +201,7 @@ ETEXI
 STEXI
 @item info pic
 @findex info pic
-Show i8259 (PIC) state.
+Show PIC state.
 ETEXI
 
     {

From 4a499ad295e007891d2a27ad21269aee8e698e58 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 29 Dec 2017 15:31:01 +0800
Subject: [PATCH 47/60] ioapic: support "info pic"

People start to use "info pic" for all kinds of irqchip dumps.  Let x86
ioapic join the family.  It dumps the same thing as "info ioapic".

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20171229073104.3810-3-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/intc/ioapic_common.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c
index 3b3d0a7680..c62ba27018 100644
--- a/hw/intc/ioapic_common.c
+++ b/hw/intc/ioapic_common.c
@@ -24,6 +24,7 @@
 #include "monitor/monitor.h"
 #include "hw/i386/ioapic.h"
 #include "hw/i386/ioapic_internal.h"
+#include "hw/intc/intc.h"
 #include "hw/sysbus.h"
 
 /* ioapic_no count start from 0 to MAX_IOAPICS,
@@ -142,6 +143,15 @@ static void ioapic_common_realize(DeviceState *dev, Error **errp)
     ioapic_no++;
 }
 
+static void ioapic_print_info(InterruptStatsProvider *obj,
+                              Monitor *mon)
+{
+    IOAPICCommonState *s = IOAPIC_COMMON(obj);
+
+    ioapic_dispatch_pre_save(s);
+    ioapic_print_redtbl(mon, s);
+}
+
 static const VMStateDescription vmstate_ioapic_common = {
     .name = "ioapic",
     .version_id = 3,
@@ -161,9 +171,11 @@ static const VMStateDescription vmstate_ioapic_common = {
 static void ioapic_common_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
+    InterruptStatsProviderClass *ic = INTERRUPT_STATS_PROVIDER_CLASS(klass);
 
     dc->realize = ioapic_common_realize;
     dc->vmsd = &vmstate_ioapic_common;
+    ic->print_info = ioapic_print_info;
 }
 
 static const TypeInfo ioapic_common_type = {
@@ -173,6 +185,10 @@ static const TypeInfo ioapic_common_type = {
     .class_size = sizeof(IOAPICCommonClass),
     .class_init = ioapic_common_class_init,
     .abstract = true,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_INTERRUPT_STATS_PROVIDER },
+        { }
+    },
 };
 
 static void ioapic_common_register_types(void)

From 6a218b032b2d62b3c13e9553593b75e445ce5f1a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 29 Dec 2017 15:31:02 +0800
Subject: [PATCH 48/60] ioapic: some proper indents when dump info

So that it now looks better when shown together with other irqchips.

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20171229073104.3810-4-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/intc/ioapic_common.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c
index c62ba27018..a02c135b24 100644
--- a/hw/intc/ioapic_common.c
+++ b/hw/intc/ioapic_common.c
@@ -59,7 +59,7 @@ void ioapic_print_redtbl(Monitor *mon, IOAPICCommonState *s)
     uint32_t remote_irr = 0;
     int i;
 
-    monitor_printf(mon, "ioapic ver=0x%x id=0x%02x sel=0x%02x",
+    monitor_printf(mon, "ioapic0: ver=0x%x id=0x%02x sel=0x%02x",
                    s->version, s->id, s->ioregsel);
     if (s->ioregsel) {
         monitor_printf(mon, " (redir[%u])\n",
@@ -71,7 +71,7 @@ void ioapic_print_redtbl(Monitor *mon, IOAPICCommonState *s)
         uint64_t entry = s->ioredtbl[i];
         uint32_t delm = (uint32_t)((entry & IOAPIC_LVT_DELIV_MODE) >>
                                    IOAPIC_LVT_DELIV_MODE_SHIFT);
-        monitor_printf(mon, "pin %-2u 0x%016"PRIx64" dest=%"PRIx64
+        monitor_printf(mon, "  pin %-2u 0x%016"PRIx64" dest=%"PRIx64
                        " vec=%-3"PRIu64" %s %-5s %-6s %-6s %s\n",
                        i, entry,
                        (entry >> IOAPIC_LVT_DEST_SHIFT) &
@@ -86,8 +86,8 @@ void ioapic_print_redtbl(Monitor *mon, IOAPICCommonState *s)
         remote_irr |= entry & IOAPIC_LVT_TRIGGER_MODE ?
                         (entry & IOAPIC_LVT_REMOTE_IRR ? (1 << i) : 0) : 0;
     }
-    ioapic_irr_dump(mon, "IRR", s->irr);
-    ioapic_irr_dump(mon, "Remote IRR", remote_irr);
+    ioapic_irr_dump(mon, "  IRR", s->irr);
+    ioapic_irr_dump(mon, "  Remote IRR", remote_irr);
 }
 
 void ioapic_reset_common(DeviceState *dev)

From cce5405e0ebce0cd400cfd3d3d218a776ac6b333 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 29 Dec 2017 15:31:03 +0800
Subject: [PATCH 49/60] ioapic: support "info irq"

This includes both userspace and in-kernel ioapic.  Note that the numbers
can be inaccurate for kvm-ioapic.  One reason is the same as for
kvm-i8259: when irqfd is used, irqs can be delivered entirely inside the
kernel without our notice.  Meanwhile, kvm-ioapic is specially treated
when irq numbers are <ISA_NUM_IRQS: those irqs will be delivered in the
kernel too, via kvm-i8259 (please refer to kvm_pc_gsi_handler).

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20171229073104.3810-5-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/kvm/ioapic.c              |  2 ++
 hw/intc/ioapic.c                  |  1 +
 hw/intc/ioapic_common.c           | 23 +++++++++++++++++++++++
 include/hw/i386/ioapic_internal.h |  3 +++
 4 files changed, 29 insertions(+)

diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c
index 646f6245ee..5274dc709b 100644
--- a/hw/i386/kvm/ioapic.c
+++ b/hw/i386/kvm/ioapic.c
@@ -132,8 +132,10 @@ static void kvm_ioapic_reset(DeviceState *dev)
 static void kvm_ioapic_set_irq(void *opaque, int irq, int level)
 {
     KVMIOAPICState *s = opaque;
+    IOAPICCommonState *common = IOAPIC_COMMON(s);
     int delivered;
 
+    ioapic_stat_update_irq(common, irq, level);
     delivered = kvm_set_irq(kvm_state, s->kvm_gsi_base + irq, level);
     apic_report_irq_delivered(delivered);
 }
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index c45f073271..222f3f7d47 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -148,6 +148,7 @@ static void ioapic_set_irq(void *opaque, int vector, int level)
      * the cleanest way of doing it but it should work. */
 
     trace_ioapic_set_irq(vector, level);
+    ioapic_stat_update_irq(s, vector, level);
     if (vector == 0) {
         vector = 2;
     }
diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c
index a02c135b24..692dc37bb6 100644
--- a/hw/intc/ioapic_common.c
+++ b/hw/intc/ioapic_common.c
@@ -35,6 +35,28 @@
  */
 int ioapic_no;
 
+void ioapic_stat_update_irq(IOAPICCommonState *s, int irq, int level)
+{
+    if (level != s->irq_level[irq]) {
+        s->irq_level[irq] = level;
+        if (level == 1) {
+            s->irq_count[irq]++;
+        }
+    }
+}
+
+static bool ioapic_get_statistics(InterruptStatsProvider *obj,
+                                  uint64_t **irq_counts,
+                                  unsigned int *nb_irqs)
+{
+    IOAPICCommonState *s = IOAPIC_COMMON(obj);
+
+    *irq_counts = s->irq_count;
+    *nb_irqs = IOAPIC_NUM_PINS;
+
+    return true;
+}
+
 static void ioapic_irr_dump(Monitor *mon, const char *name, uint32_t bitmap)
 {
     int i;
@@ -176,6 +198,7 @@ static void ioapic_common_class_init(ObjectClass *klass, void *data)
     dc->realize = ioapic_common_realize;
     dc->vmsd = &vmstate_ioapic_common;
     ic->print_info = ioapic_print_info;
+    ic->get_statistics = ioapic_get_statistics;
 }
 
 static const TypeInfo ioapic_common_type = {
diff --git a/include/hw/i386/ioapic_internal.h b/include/hw/i386/ioapic_internal.h
index a11d86de46..9848f391bb 100644
--- a/include/hw/i386/ioapic_internal.h
+++ b/include/hw/i386/ioapic_internal.h
@@ -109,10 +109,13 @@ struct IOAPICCommonState {
     uint64_t ioredtbl[IOAPIC_NUM_PINS];
     Notifier machine_done;
     uint8_t version;
+    uint64_t irq_count[IOAPIC_NUM_PINS];
+    int irq_level[IOAPIC_NUM_PINS];
 };
 
 void ioapic_reset_common(DeviceState *dev);
 
 void ioapic_print_redtbl(Monitor *mon, IOAPICCommonState *s);
+void ioapic_stat_update_irq(IOAPICCommonState *s, int irq, int level);
 
 #endif /* QEMU_IOAPIC_INTERNAL_H */

From 0c8465440d50c18a7bb13d0a866748f0593e193a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 29 Dec 2017 15:31:04 +0800
Subject: [PATCH 50/60] hmp: obsolete "info ioapic"

Let's start to use "info pic" just like other platforms.  For now we
keep the command for a while so that old users can know what is the new
command to use.

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20171229073104.3810-6-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/kvm/ioapic.c  |  9 ---------
 hw/intc/ioapic.c      | 11 -----------
 include/hw/i386/pc.h  |  3 ---
 target/i386/monitor.c |  8 ++------
 4 files changed, 2 insertions(+), 29 deletions(-)

diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c
index 5274dc709b..5b40d75439 100644
--- a/hw/i386/kvm/ioapic.c
+++ b/hw/i386/kvm/ioapic.c
@@ -112,15 +112,6 @@ static void kvm_ioapic_put(IOAPICCommonState *s)
     }
 }
 
-void kvm_ioapic_dump_state(Monitor *mon, const QDict *qdict)
-{
-    IOAPICCommonState *s = IOAPIC_COMMON(object_resolve_path("ioapic", NULL));
-
-    assert(s);
-    kvm_ioapic_get(s);
-    ioapic_print_redtbl(mon, s);
-}
-
 static void kvm_ioapic_reset(DeviceState *dev)
 {
     IOAPICCommonState *s = IOAPIC_COMMON(dev);
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 222f3f7d47..b3937807c2 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -234,17 +234,6 @@ void ioapic_eoi_broadcast(int vector)
     }
 }
 
-void ioapic_dump_state(Monitor *mon, const QDict *qdict)
-{
-    int i;
-
-    for (i = 0; i < MAX_IOAPICS; i++) {
-        if (ioapics[i] != 0) {
-            ioapic_print_redtbl(mon, ioapics[i]);
-        }
-    }
-}
-
 static uint64_t
 ioapic_mem_read(void *opaque, hwaddr addr, unsigned int size)
 {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 316230e570..4d99d69681 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -154,9 +154,6 @@ int pic_get_output(DeviceState *d);
 
 /* ioapic.c */
 
-void kvm_ioapic_dump_state(Monitor *mon, const QDict *qdict);
-void ioapic_dump_state(Monitor *mon, const QDict *qdict);
-
 /* Global System Interrupts */
 
 #define GSI_NUM_PINS IOAPIC_NUM_PINS
diff --git a/target/i386/monitor.c b/target/i386/monitor.c
index a890b3c2ab..6bbb3a96cd 100644
--- a/target/i386/monitor.c
+++ b/target/i386/monitor.c
@@ -658,12 +658,8 @@ void hmp_info_local_apic(Monitor *mon, const QDict *qdict)
 
 void hmp_info_io_apic(Monitor *mon, const QDict *qdict)
 {
-    if (kvm_irqchip_in_kernel() &&
-        !kvm_irqchip_is_split()) {
-        kvm_ioapic_dump_state(mon, qdict);
-    } else {
-        ioapic_dump_state(mon, qdict);
-    }
+    monitor_printf(mon, "This command is obsolete and will be "
+                   "removed soon. Please use 'info pic' instead.\n");
 }
 
 SevInfo *qmp_query_sev(Error **errp)

From 6f131f13e68d648a8e4f083c667ab1acd88ce4cd Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 22 Jun 2018 22:22:05 +0300
Subject: [PATCH 51/60] kvm: support -overcommit cpu-pm=on|off

With this flag, kvm allows the guest to control the host CPU power state.
This increases latency for other processes using the same host CPU in an
unpredictable way, but it decreases idle entry/exit times for the
running VCPU, so to use it QEMU needs a hint about whether the host CPU is
overcommitted, hence the flag name.

Follow-up patches will expose this capability to guest
(using mwait leaf).

Based on a patch by Wanpeng Li <kernellwp@gmail.com> .

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20180622192148.178309-2-mst@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/sysemu/sysemu.h |  1 +
 qemu-options.hx         | 24 ++++++++++++++++++++++++
 target/i386/kvm.c       | 23 +++++++++++++++++++++++
 vl.c                    | 32 +++++++++++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index e893f72f3b..b921c6f3b7 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -128,6 +128,7 @@ extern bool boot_strict;
 extern uint8_t *boot_splash_filedata;
 extern size_t boot_splash_filedata_size;
 extern bool enable_mlock;
+extern bool enable_cpu_pm;
 extern uint8_t qemu_extra_params_fw[2];
 extern QEMUClockType rtc_clock;
 extern const char *mem_path;
diff --git a/qemu-options.hx b/qemu-options.hx
index 3e45483834..81b1e99d58 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3325,6 +3325,30 @@ mlocking qemu and guest memory can be enabled via @option{mlock=on}
 (enabled by default).
 ETEXI
 
+DEF("overcommit", HAS_ARG, QEMU_OPTION_overcommit,
+    "--overcommit [mem-lock=on|off][cpu-pm=on|off]\n"
+    "                run qemu with overcommit hints\n"
+    "                mem-lock=on|off controls memory lock support (default: off)\n"
+    "                cpu-pm=on|off controls cpu power management (default: off)\n",
+    QEMU_ARCH_ALL)
+STEXI
+@item -overcommit mem-lock=on|off
+@item -overcommit cpu-pm=on|off
+@findex -overcommit
+Run qemu with hints about host resource overcommit. The default is
+to assume that host overcommits all resources.
+
+Locking qemu and guest memory can be enabled via @option{mem-lock=on} (disabled
+by default).  This works when host memory is not overcommitted and reduces the
+worst-case latency for guest.  This is equivalent to @option{realtime}.
+
+Guest ability to manage power state of host cpus (increasing latency for other
+processes on the same host cpu, but decreasing latency for guest) can be
+enabled via @option{cpu-pm=on} (disabled by default).  This works best when
+host CPU is not overcommitted. When used, host estimates of CPU cycle and power
+utilization will be incorrect, not taking into account guest idle time.
+ETEXI
+
 DEF("gdb", HAS_ARG, QEMU_OPTION_gdb, \
     "-gdb dev        wait for gdb connection on 'dev'\n", QEMU_ARCH_ALL)
 STEXI
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 2d174f3a91..dc991f6aca 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1387,6 +1387,29 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
         smram_machine_done.notify = register_smram_listener;
         qemu_add_machine_init_done_notifier(&smram_machine_done);
     }
+
+    if (enable_cpu_pm) {
+        int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
+        int ret;
+
+/* Work around for kernel header with a typo. TODO: fix header and drop. */
+#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
+#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
+#endif
+        if (disable_exits) {
+            disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
+                              KVM_X86_DISABLE_EXITS_HLT |
+                              KVM_X86_DISABLE_EXITS_PAUSE);
+        }
+
+        ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
+                                disable_exits);
+        if (ret < 0) {
+            error_report("kvm: guest stopping CPU not supported: %s",
+                         strerror(-ret));
+        }
+    }
+
     return 0;
 }
 
diff --git a/vl.c b/vl.c
index 7c9f19aa31..ef6cfcec40 100644
--- a/vl.c
+++ b/vl.c
@@ -142,6 +142,7 @@ ram_addr_t ram_size;
 const char *mem_path = NULL;
 int mem_prealloc = 0; /* force preallocation of physical target memory */
 bool enable_mlock = false;
+bool enable_cpu_pm = false;
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart;
@@ -390,6 +391,22 @@ static QemuOptsList qemu_realtime_opts = {
     },
 };
 
+static QemuOptsList qemu_overcommit_opts = {
+    .name = "overcommit",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_overcommit_opts.head),
+    .desc = {
+        {
+            .name = "mem-lock",
+            .type = QEMU_OPT_BOOL,
+        },
+        {
+            .name = "cpu-pm",
+            .type = QEMU_OPT_BOOL,
+        },
+        { /* end of list */ }
+    },
+};
+
 static QemuOptsList qemu_msg_opts = {
     .name = "msg",
     .head = QTAILQ_HEAD_INITIALIZER(qemu_msg_opts.head),
@@ -3906,7 +3923,20 @@ int main(int argc, char **argv, char **envp)
                 if (!opts) {
                     exit(1);
                 }
-                enable_mlock = qemu_opt_get_bool(opts, "mlock", true);
+                /* Don't override the -overcommit option if set */
+                enable_mlock = enable_mlock ||
+                    qemu_opt_get_bool(opts, "mlock", true);
+                break;
+            case QEMU_OPTION_overcommit:
+                opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"),
+                                               optarg, false);
+                if (!opts) {
+                    exit(1);
+                }
+                /* Don't override the -realtime option if set */
+                enable_mlock = enable_mlock ||
+                    qemu_opt_get_bool(opts, "mem-lock", false);
+                enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", false);
                 break;
             case QEMU_OPTION_msg:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("msg"), optarg,

From 2266d44311321a833d569cd4deb46cca6021d0e7 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 22 Jun 2018 22:22:05 +0300
Subject: [PATCH 52/60] i386/cpu: make -cpu host support monitor/mwait

When guest CPU PM is enabled, and with -cpu host, expose the host CPU
MWAIT leaf in the CPUID so guest can make good PM decisions.

Note: the result is 100% CPU utilization reported by host as host
no longer knows that the CPU is halted.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Message-Id: <20180622192148.178309-3-mst@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/tcg/user-exec-stub.c |  3 +++
 target/i386/cpu.c          | 32 ++++++++++++++++++++++----------
 target/i386/cpu.h          |  9 +++++++++
 target/i386/kvm.c          |  9 +++++++++
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/user-exec-stub.c b/accel/tcg/user-exec-stub.c
index dbcf1ade9c..a32b4496af 100644
--- a/accel/tcg/user-exec-stub.c
+++ b/accel/tcg/user-exec-stub.c
@@ -2,6 +2,9 @@
 #include "qemu-common.h"
 #include "qom/cpu.h"
 #include "sysemu/replay.h"
+#include "sysemu/sysemu.h"
+
+bool enable_cpu_pm = false;
 
 void cpu_resume(CPUState *cpu)
 {
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e6c2f8a22a..1e6a7d0a75 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3959,11 +3959,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
         }
         break;
     case 5:
-        /* mwait info: needed for Core compatibility */
-        *eax = 0; /* Smallest monitor-line size in bytes */
-        *ebx = 0; /* Largest monitor-line size in bytes */
-        *ecx = CPUID_MWAIT_EMX | CPUID_MWAIT_IBE;
-        *edx = 0;
+        /* MONITOR/MWAIT Leaf */
+        *eax = cpu->mwait.eax; /* Smallest monitor-line size in bytes */
+        *ebx = cpu->mwait.ebx; /* Largest monitor-line size in bytes */
+        *ecx = cpu->mwait.ecx; /* flags */
+        *edx = cpu->mwait.edx; /* mwait substates */
         break;
     case 6:
         /* Thermal and Power Leaf */
@@ -4804,13 +4804,25 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
     Error *local_err = NULL;
     static bool ht_warned;
 
-    if (xcc->host_cpuid_required && !accel_uses_host_cpuid()) {
-        char *name = x86_cpu_class_get_model_name(xcc);
-        error_setg(&local_err, "CPU model '%s' requires KVM", name);
-        g_free(name);
-        goto out;
+    if (xcc->host_cpuid_required) {
+        if (!accel_uses_host_cpuid()) {
+            char *name = x86_cpu_class_get_model_name(xcc);
+            error_setg(&local_err, "CPU model '%s' requires KVM", name);
+            g_free(name);
+            goto out;
+        }
+
+        if (enable_cpu_pm) {
+            host_cpuid(5, 0, &cpu->mwait.eax, &cpu->mwait.ebx,
+                       &cpu->mwait.ecx, &cpu->mwait.edx);
+            env->features[FEAT_1_ECX] |= CPUID_EXT_MONITOR;
+        }
     }
 
+    /* mwait extended info: needed for Core compatibility */
+    /* We always wake on interrupt even if host does not have the capability */
+    cpu->mwait.ecx |= CPUID_MWAIT_EMX | CPUID_MWAIT_IBE;
+
     if (cpu->apic_id == UNASSIGNED_APIC_ID) {
         error_setg(errp, "apic-id property was not initialized properly");
         return;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 16c59b7099..8eaefeee3e 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1382,6 +1382,15 @@ struct X86CPU {
     /* if true the CPUID code directly forward host cache leaves to the guest */
     bool cache_info_passthrough;
 
+    /* Register values reported for the MONITOR/MWAIT CPUID leaf (leaf 5);
+     * host values are used with -cpu host when guest CPU PM is enabled */
+    struct {
+        uint32_t eax;
+        uint32_t ebx;
+        uint32_t ecx;
+        uint32_t edx;
+    } mwait;
+
     /* Features that were filtered out because of missing host capabilities */
     uint32_t filtered_features[FEATURE_WORDS];
 
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index dc991f6aca..c5f72d645b 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -366,6 +366,15 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
         if (!kvm_irqchip_in_kernel()) {
             ret &= ~CPUID_EXT_X2APIC;
         }
+
+        if (enable_cpu_pm) {
+            int disable_exits = kvm_check_extension(s,
+                                                    KVM_CAP_X86_DISABLE_EXITS);
+
+            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
+                ret |= CPUID_EXT_MONITOR;
+            }
+        }
     } else if (function == 6 && reg == R_EAX) {
         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
     } else if (function == 7 && index == 0 && reg == R_EBX) {

From 2da91b54fe98faa8676264ac6e5a3aac5b69bec2 Mon Sep 17 00:00:00 2001
From: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Date: Thu, 17 May 2018 19:23:39 +0300
Subject: [PATCH 53/60] dump: add Windows dump format to dump-guest-memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a Windows crashdump feature. QEMU can now produce an ELF dump
containing a Windows crashdump header, which can later be converted to a valid
WinDbg-readable crashdump file, or it can create such a file directly.
The crashdump is obtained by joining the physical memory dump with the 8K
header exposed through the vmcoreinfo/fw_cfg device by the guest driver at BSOD
time. The '-w' option is added to the dump-guest-memory command. At the moment,
only the x64 configuration is supported.
A suitable driver can be found at
https://github.com/virtio-win/kvm-guest-drivers-windows/tree/master/fwcfg64

Signed-off-by: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20180517162342.4330-2-viktor.prutyanov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Makefile.target |   1 +
 dump.c          |  24 +++++-
 hmp-commands.hx |  13 +--
 hmp.c           |   9 ++-
 qapi/misc.json  |   5 +-
 win_dump.c      | 209 ++++++++++++++++++++++++++++++++++++++++++++++++
 win_dump.h      |  87 ++++++++++++++++++++
 7 files changed, 339 insertions(+), 9 deletions(-)
 create mode 100644 win_dump.c
 create mode 100644 win_dump.h

diff --git a/Makefile.target b/Makefile.target
index a9d8928f96..4d56298bbf 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -143,6 +143,7 @@ obj-y += hw/
 obj-y += memory.o
 obj-y += memory_mapping.o
 obj-y += dump.o
+obj-$(TARGET_X86_64) += win_dump.o
 obj-y += migration/ram.o
 LIBS := $(libs_softmmu) $(LIBS)
 
diff --git a/dump.c b/dump.c
index b54cd42b21..04467b353e 100644
--- a/dump.c
+++ b/dump.c
@@ -29,6 +29,10 @@
 #include "qemu/error-report.h"
 #include "hw/misc/vmcoreinfo.h"
 
+#ifdef TARGET_X86_64
+#include "win_dump.h"
+#endif
+
 #include <zlib.h>
 #ifdef CONFIG_LZO
 #include <lzo/lzo1x.h>
@@ -1866,7 +1870,11 @@ static void dump_process(DumpState *s, Error **errp)
     Error *local_err = NULL;
     DumpQueryResult *result = NULL;
 
-    if (s->has_format && s->format != DUMP_GUEST_MEMORY_FORMAT_ELF) {
+    if (s->has_format && s->format == DUMP_GUEST_MEMORY_FORMAT_WIN_DMP) {
+#ifdef TARGET_X86_64
+        create_win_dump(s, &local_err);
+#endif
+    } else if (s->has_format && s->format != DUMP_GUEST_MEMORY_FORMAT_ELF) {
         create_kdump_vmcore(s, &local_err);
     } else {
         create_vmcore(s, &local_err);
@@ -1970,6 +1978,13 @@ void qmp_dump_guest_memory(bool paging, const char *file,
     }
 #endif
 
+#ifndef TARGET_X86_64
+    if (has_format && format == DUMP_GUEST_MEMORY_FORMAT_WIN_DMP) {
+        error_setg(errp, "Windows dump is only available for x86-64");
+        return;
+    }
+#endif
+
 #if !defined(WIN32)
     if (strstart(file, "fd:", &p)) {
         fd = monitor_get_fd(cur_mon, p, errp);
@@ -2044,5 +2059,12 @@ DumpGuestMemoryCapability *qmp_query_dump_guest_memory_capability(Error **errp)
     item->value = DUMP_GUEST_MEMORY_FORMAT_KDUMP_SNAPPY;
 #endif
 
+    /* Windows dump is available only if target is x86_64 */
+#ifdef TARGET_X86_64
+    item->next = g_malloc0(sizeof(DumpGuestMemoryFormatList));
+    item = item->next;
+    item->value = DUMP_GUEST_MEMORY_FORMAT_WIN_DMP;
+#endif
+
     return cap;
 }
diff --git a/hmp-commands.hx b/hmp-commands.hx
index ba9cdb8800..c1fc747403 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1136,30 +1136,33 @@ ETEXI
 
     {
         .name       = "dump-guest-memory",
-        .args_type  = "paging:-p,detach:-d,zlib:-z,lzo:-l,snappy:-s,filename:F,begin:l?,length:l?",
-        .params     = "[-p] [-d] [-z|-l|-s] filename [begin length]",
+        .args_type  = "paging:-p,detach:-d,windmp:-w,zlib:-z,lzo:-l,snappy:-s,filename:F,begin:l?,length:l?",
+        .params     = "[-p] [-d] [-z|-l|-s|-w] filename [begin length]",
         .help       = "dump guest memory into file 'filename'.\n\t\t\t"
                       "-p: do paging to get guest's memory mapping.\n\t\t\t"
                       "-d: return immediately (do not wait for completion).\n\t\t\t"
                       "-z: dump in kdump-compressed format, with zlib compression.\n\t\t\t"
                       "-l: dump in kdump-compressed format, with lzo compression.\n\t\t\t"
                       "-s: dump in kdump-compressed format, with snappy compression.\n\t\t\t"
+                      "-w: dump in Windows crashdump format (can be used instead of ELF-dump converting),\n\t\t\t"
+                      "    for Windows x64 guests with vmcoreinfo driver only.\n\t\t\t"
                       "begin: the starting physical address.\n\t\t\t"
                       "length: the memory size, in bytes.",
         .cmd        = hmp_dump_guest_memory,
     },
 
-
 STEXI
 @item dump-guest-memory [-p] @var{filename} @var{begin} @var{length}
-@item dump-guest-memory [-z|-l|-s] @var{filename}
+@item dump-guest-memory [-z|-l|-s|-w] @var{filename}
 @findex dump-guest-memory
 Dump guest memory to @var{protocol}. The file can be processed with crash or
-gdb. Without -z|-l|-s, the dump format is ELF.
+gdb. Without -z|-l|-s|-w, the dump format is ELF.
         -p: do paging to get guest's memory mapping.
         -z: dump in kdump-compressed format, with zlib compression.
         -l: dump in kdump-compressed format, with lzo compression.
         -s: dump in kdump-compressed format, with snappy compression.
+        -w: dump in Windows crashdump format (can be used instead of ELF-dump converting),
+            for Windows x64 guests with vmcoreinfo driver only
   filename: dump file name.
      begin: the starting physical address. It's optional, and should be
             specified together with length.
diff --git a/hmp.c b/hmp.c
index 0da0b0ac33..41f5e39b72 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2014,6 +2014,7 @@ void hmp_device_del(Monitor *mon, const QDict *qdict)
 void hmp_dump_guest_memory(Monitor *mon, const QDict *qdict)
 {
     Error *err = NULL;
+    bool win_dmp = qdict_get_try_bool(qdict, "windmp", false);
     bool paging = qdict_get_try_bool(qdict, "paging", false);
     bool zlib = qdict_get_try_bool(qdict, "zlib", false);
     bool lzo = qdict_get_try_bool(qdict, "lzo", false);
@@ -2028,12 +2029,16 @@ void hmp_dump_guest_memory(Monitor *mon, const QDict *qdict)
     enum DumpGuestMemoryFormat dump_format = DUMP_GUEST_MEMORY_FORMAT_ELF;
     char *prot;
 
-    if (zlib + lzo + snappy > 1) {
-        error_setg(&err, "only one of '-z|-l|-s' can be set");
+    if (zlib + lzo + snappy + win_dmp > 1) {
+        error_setg(&err, "only one of '-z|-l|-s|-w' can be set");
         hmp_handle_error(mon, &err);
         return;
     }
 
+    if (win_dmp) {
+        dump_format = DUMP_GUEST_MEMORY_FORMAT_WIN_DMP;
+    }
+
     if (zlib) {
         dump_format = DUMP_GUEST_MEMORY_FORMAT_KDUMP_ZLIB;
     }
diff --git a/qapi/misc.json b/qapi/misc.json
index c6bc18a859..29da7856e3 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -1677,10 +1677,13 @@
 #
 # @kdump-snappy: kdump-compressed format with snappy-compressed
 #
+# @win-dmp: Windows full crashdump format,
+#           can be used instead of converting an ELF dump (since 2.13)
+#
 # Since: 2.0
 ##
 { 'enum': 'DumpGuestMemoryFormat',
-  'data': [ 'elf', 'kdump-zlib', 'kdump-lzo', 'kdump-snappy' ] }
+  'data': [ 'elf', 'kdump-zlib', 'kdump-lzo', 'kdump-snappy', 'win-dmp' ] }
 
 ##
 # @dump-guest-memory:
diff --git a/win_dump.c b/win_dump.c
new file mode 100644
index 0000000000..58255c12ee
--- /dev/null
+++ b/win_dump.c
@@ -0,0 +1,209 @@
+/*
+ * Windows crashdump
+ *
+ * Copyright (c) 2018 Virtuozzo International GmbH
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "elf.h"
+#include "cpu.h"
+#include "exec/hwaddr.h"
+#include "monitor/monitor.h"
+#include "sysemu/kvm.h"
+#include "sysemu/dump.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/memory_mapping.h"
+#include "sysemu/cpus.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "hw/misc/vmcoreinfo.h"
+#include "win_dump.h"
+
+static size_t write_run(WinDumpPhyMemRun64 *run, int fd, Error **errp)
+{
+    void *buf;
+    uint64_t addr = run->BasePage << TARGET_PAGE_BITS;
+    uint64_t size = run->PageCount << TARGET_PAGE_BITS;
+    uint64_t len = size;
+
+    buf = cpu_physical_memory_map(addr, &len, false);
+    if (!buf) {
+        error_setg(errp, "win-dump: failed to map run");
+        return 0;
+    }
+    if (len != size) {
+        error_setg(errp, "win-dump: failed to map entire run");
+        len = 0;
+        goto out_unmap;
+    }
+
+    len = qemu_write_full(fd, buf, len);
+    if (len != size) {
+        error_setg(errp, QERR_IO_ERROR);
+    }
+
+out_unmap:
+    cpu_physical_memory_unmap(buf, addr, false, len);
+
+    return len;
+}
+
+static void write_runs(DumpState *s, WinDumpHeader64 *h, Error **errp)
+{
+    WinDumpPhyMemDesc64 *desc = &h->PhysicalMemoryBlock;
+    WinDumpPhyMemRun64 *run = desc->Run;
+    Error *local_err = NULL;
+    int i;
+
+    for (i = 0; i < desc->NumberOfRuns; i++) {
+        s->written_size += write_run(run + i, s->fd, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+static void patch_mm_pfn_database(WinDumpHeader64 *h, Error **errp)
+{
+    if (cpu_memory_rw_debug(first_cpu,
+            h->KdDebuggerDataBlock + KDBG_MM_PFN_DATABASE_OFFSET64,
+            (uint8_t *)&h->PfnDatabase, sizeof(h->PfnDatabase), 0)) {
+        error_setg(errp, "win-dump: failed to read MmPfnDatabase");
+        return;
+    }
+}
+
+static void patch_bugcheck_data(WinDumpHeader64 *h, Error **errp)
+{
+    uint64_t KiBugcheckData;
+
+    if (cpu_memory_rw_debug(first_cpu,
+            h->KdDebuggerDataBlock + KDBG_KI_BUGCHECK_DATA_OFFSET64,
+            (uint8_t *)&KiBugcheckData, sizeof(KiBugcheckData), 0)) {
+        error_setg(errp, "win-dump: failed to read KiBugcheckData");
+        return;
+    }
+
+    if (cpu_memory_rw_debug(first_cpu,
+            KiBugcheckData,
+            h->BugcheckData, sizeof(h->BugcheckData), 0)) {
+        error_setg(errp, "win-dump: failed to read bugcheck data");
+        return;
+    }
+}
+
+/*
+ * This routine tries to correct mistakes in crashdump header.
+ */
+static void patch_header(WinDumpHeader64 *h)
+{
+    Error *local_err = NULL;
+
+    h->RequiredDumpSpace = sizeof(WinDumpHeader64) +
+            (h->PhysicalMemoryBlock.NumberOfPages << TARGET_PAGE_BITS);
+    h->PhysicalMemoryBlock.unused = 0;
+    h->unused1 = 0;
+
+    /*
+     * We assume h->DirectoryBase and current CR3 are the same when we access
+     * memory by virtual address. In other words, we suppose current context
+     * is system context. It is definetely true in case of BSOD.
+     */
+
+    patch_mm_pfn_database(h, &local_err);
+    if (local_err) {
+        warn_report_err(local_err);
+        local_err = NULL;
+    }
+    patch_bugcheck_data(h, &local_err);
+    if (local_err) {
+        warn_report_err(local_err);
+    }
+}
+
+static void check_header(WinDumpHeader64 *h, Error **errp)
+{
+    const char Signature[] = "PAGE";
+    const char ValidDump[] = "DU64";
+
+    if (memcmp(h->Signature, Signature, sizeof(h->Signature))) {
+        error_setg(errp, "win-dump: invalid header, expected '%.4s',"
+                         " got '%.4s'", Signature, h->Signature);
+        return;
+    }
+
+    if (memcmp(h->ValidDump, ValidDump, sizeof(h->ValidDump))) {
+        error_setg(errp, "win-dump: invalid header, expected '%.4s',"
+                         " got '%.4s'", ValidDump, h->ValidDump);
+        return;
+    }
+}
+
+static void check_kdbg(WinDumpHeader64 *h, Error **errp)
+{
+    const char OwnerTag[] = "KDBG";
+    char read_OwnerTag[4];
+
+    if (cpu_memory_rw_debug(first_cpu,
+            h->KdDebuggerDataBlock + KDBG_OWNER_TAG_OFFSET64,
+            (uint8_t *)&read_OwnerTag, sizeof(read_OwnerTag), 0)) {
+        error_setg(errp, "win-dump: failed to read OwnerTag");
+        return;
+    }
+
+    if (memcmp(read_OwnerTag, OwnerTag, sizeof(read_OwnerTag))) {
+        error_setg(errp, "win-dump: invalid KDBG OwnerTag,"
+                         " expected '%.4s', got '%.4s',"
+                         " KdDebuggerDataBlock seems to be encrypted",
+                         OwnerTag, read_OwnerTag);
+        return;
+    }
+}
+
+void create_win_dump(DumpState *s, Error **errp)
+{
+    WinDumpHeader64 *h = (WinDumpHeader64 *)(s->guest_note +
+            VMCOREINFO_ELF_NOTE_HDR_SIZE);
+    Error *local_err = NULL;
+
+    if (s->guest_note_size != sizeof(WinDumpHeader64) +
+            VMCOREINFO_ELF_NOTE_HDR_SIZE) {
+        error_setg(errp, "win-dump: invalid vmcoreinfo note size");
+        return;
+    }
+
+    check_header(h, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    check_kdbg(h, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    patch_header(h);
+
+    s->total_size = h->RequiredDumpSpace;
+
+    s->written_size = qemu_write_full(s->fd, h, sizeof(*h));
+    if (s->written_size != sizeof(*h)) {
+        error_setg(errp, QERR_IO_ERROR);
+        return;
+    }
+
+    write_runs(s, h, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+}
diff --git a/win_dump.h b/win_dump.h
new file mode 100644
index 0000000000..281241881e
--- /dev/null
+++ b/win_dump.h
@@ -0,0 +1,87 @@
+/*
+ * Windows crashdump
+ *
+ * Copyright (c) 2018 Virtuozzo International GmbH
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+typedef struct WinDumpPhyMemRun64 {
+    uint64_t BasePage;
+    uint64_t PageCount;
+} QEMU_PACKED WinDumpPhyMemRun64;
+
+typedef struct WinDumpPhyMemDesc64 {
+    uint32_t NumberOfRuns;
+    uint32_t unused;
+    uint64_t NumberOfPages;
+    WinDumpPhyMemRun64 Run[43];
+} QEMU_PACKED WinDumpPhyMemDesc64;
+
+typedef struct WinDumpExceptionRecord {
+    uint32_t ExceptionCode;
+    uint32_t ExceptionFlags;
+    uint64_t ExceptionRecord;
+    uint64_t ExceptionAddress;
+    uint32_t NumberParameters;
+    uint32_t unused;
+    uint64_t ExceptionInformation[15];
+} QEMU_PACKED WinDumpExceptionRecord;
+
+typedef struct WinDumpHeader64 {
+    char Signature[4];
+    char ValidDump[4];
+    uint32_t MajorVersion;
+    uint32_t MinorVersion;
+    uint64_t DirectoryTableBase;
+    uint64_t PfnDatabase;
+    uint64_t PsLoadedModuleList;
+    uint64_t PsActiveProcessHead;
+    uint32_t MachineImageType;
+    uint32_t NumberProcessors;
+    union {
+        struct {
+            uint32_t BugcheckCode;
+            uint32_t unused0;
+            uint64_t BugcheckParameter1;
+            uint64_t BugcheckParameter2;
+            uint64_t BugcheckParameter3;
+            uint64_t BugcheckParameter4;
+        };
+        uint8_t BugcheckData[40];
+    };
+    uint8_t VersionUser[32];
+    uint64_t KdDebuggerDataBlock;
+    union {
+        WinDumpPhyMemDesc64 PhysicalMemoryBlock;
+        uint8_t PhysicalMemoryBlockBuffer[704];
+    };
+    union {
+        uint8_t ContextBuffer[3000];
+    };
+    WinDumpExceptionRecord Exception;
+    uint32_t DumpType;
+    uint32_t unused1;
+    uint64_t RequiredDumpSpace;
+    uint64_t SystemTime;
+    char Comment[128];
+    uint64_t SystemUpTime;
+    uint32_t MiniDumpFields;
+    uint32_t SecondaryDataState;
+    uint32_t ProductType;
+    uint32_t SuiteMask;
+    uint32_t WriterStatus;
+    uint8_t unused2;
+    uint8_t KdSecondaryVersion;
+    uint8_t reserved[4018];
+} QEMU_PACKED WinDumpHeader64;
+
+void create_win_dump(DumpState *s, Error **errp);
+
+#define KDBG_OWNER_TAG_OFFSET64         0x10
+#define KDBG_KI_BUGCHECK_DATA_OFFSET64  0x88
+#define KDBG_MM_PFN_DATABASE_OFFSET64   0xC0
+
+#define VMCOREINFO_ELF_NOTE_HDR_SIZE    24

From 92d1b3d5086c182bab66fd1814c4a04ba1e59337 Mon Sep 17 00:00:00 2001
From: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Date: Thu, 17 May 2018 19:23:40 +0300
Subject: [PATCH 54/60] dump: use system context in Windows dump

We use CPU #0 to access guest virtual memory, but it may be executing a user
thread at that moment. So, switch CR3 to the DirectoryTableBase value from the
header and restore the original value at the end.

Signed-off-by: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Message-Id: <20180517162342.4330-3-viktor.prutyanov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 win_dump.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/win_dump.c b/win_dump.c
index 58255c12ee..7d956ca996 100644
--- a/win_dump.c
+++ b/win_dump.c
@@ -111,12 +111,6 @@ static void patch_header(WinDumpHeader64 *h)
     h->PhysicalMemoryBlock.unused = 0;
     h->unused1 = 0;
 
-    /*
-     * We assume h->DirectoryBase and current CR3 are the same when we access
-     * memory by virtual address. In other words, we suppose current context
-     * is system context. It is definetely true in case of BSOD.
-     */
-
     patch_mm_pfn_database(h, &local_err);
     if (local_err) {
         warn_report_err(local_err);
@@ -171,6 +165,8 @@ void create_win_dump(DumpState *s, Error **errp)
 {
     WinDumpHeader64 *h = (WinDumpHeader64 *)(s->guest_note +
             VMCOREINFO_ELF_NOTE_HDR_SIZE);
+    X86CPU *first_x86_cpu = X86_CPU(first_cpu);
+    uint64_t saved_cr3 = first_x86_cpu->env.cr[3];
     Error *local_err = NULL;
 
     if (s->guest_note_size != sizeof(WinDumpHeader64) +
@@ -185,10 +181,17 @@ void create_win_dump(DumpState *s, Error **errp)
         return;
     }
 
+    /*
+     * Further access to kernel structures by virtual addresses
+     * should be made from system context.
+     */
+
+    first_x86_cpu->env.cr[3] = h->DirectoryTableBase;
+
     check_kdbg(h, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
-        return;
+        goto out_cr3;
     }
 
     patch_header(h);
@@ -198,12 +201,17 @@ void create_win_dump(DumpState *s, Error **errp)
     s->written_size = qemu_write_full(s->fd, h, sizeof(*h));
     if (s->written_size != sizeof(*h)) {
         error_setg(errp, QERR_IO_ERROR);
-        return;
+        goto out_cr3;
     }
 
     write_runs(s, h, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
-        return;
+        goto out_cr3;
     }
+
+out_cr3:
+    first_x86_cpu->env.cr[3] = saved_cr3;
+
+    return;
 }

From 2ababfcc0e5e778c9005abb57f4bf6a036145a57 Mon Sep 17 00:00:00 2001
From: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Date: Thu, 17 May 2018 19:23:41 +0300
Subject: [PATCH 55/60] dump: add fallback KDBG using in Windows dump

KdDebuggerDataBlock may be encrypted in guest memory, in which case the dump
would be useless. However, the guest driver can obtain a decrypted KDBG and
expose its address through the BugcheckParameter1 field of the raw header.
After this patch, QEMU is able to use this fallback KdDebuggerDataBlock.

Signed-off-by: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Message-Id: <20180517162342.4330-4-viktor.prutyanov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 win_dump.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/win_dump.c b/win_dump.c
index 7d956ca996..2d9afb514e 100644
--- a/win_dump.c
+++ b/win_dump.c
@@ -144,21 +144,37 @@ static void check_kdbg(WinDumpHeader64 *h, Error **errp)
 {
     const char OwnerTag[] = "KDBG";
     char read_OwnerTag[4];
+    uint64_t KdDebuggerDataBlock = h->KdDebuggerDataBlock;
+    bool try_fallback = true;
 
+try_again:
     if (cpu_memory_rw_debug(first_cpu,
-            h->KdDebuggerDataBlock + KDBG_OWNER_TAG_OFFSET64,
+            KdDebuggerDataBlock + KDBG_OWNER_TAG_OFFSET64,
             (uint8_t *)&read_OwnerTag, sizeof(read_OwnerTag), 0)) {
         error_setg(errp, "win-dump: failed to read OwnerTag");
         return;
     }
 
     if (memcmp(read_OwnerTag, OwnerTag, sizeof(read_OwnerTag))) {
-        error_setg(errp, "win-dump: invalid KDBG OwnerTag,"
-                         " expected '%.4s', got '%.4s',"
-                         " KdDebuggerDataBlock seems to be encrypted",
-                         OwnerTag, read_OwnerTag);
-        return;
+        if (try_fallback) {
+            /*
+             * If attempt to use original KDBG failed
+             * (most likely because of its encryption),
+             * we try to use KDBG obtained by guest driver.
+             */
+
+            KdDebuggerDataBlock = h->BugcheckParameter1;
+            try_fallback = false;
+            goto try_again;
+        } else {
+            error_setg(errp, "win-dump: invalid KDBG OwnerTag,"
+                             " expected '%.4s', got '%.4s'",
+                             OwnerTag, read_OwnerTag);
+            return;
+        }
     }
+
+    h->KdDebuggerDataBlock = KdDebuggerDataBlock;
 }
 
 void create_win_dump(DumpState *s, Error **errp)

From 2ad9b50f713053dcd6c44b2b5e3bbdb0cfe8a52d Mon Sep 17 00:00:00 2001
From: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Date: Thu, 17 May 2018 19:23:42 +0300
Subject: [PATCH 56/60] dump: add Windows live system dump

Unlike the memory of a dying Windows system, live system memory doesn't
contain correct register contexts. However, they can be populated with QEMU
register values.
After this patch, QEMU is able to produce a live system dump of a Windows
guest.

Signed-off-by: Viktor Prutyanov <viktor.prutyanov@virtuozzo.com>
Message-Id: <20180517162342.4330-5-viktor.prutyanov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 win_dump.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 win_dump.h |  95 ++++++++++++++++++++++++++++++--
 2 files changed, 246 insertions(+), 5 deletions(-)

diff --git a/win_dump.c b/win_dump.c
index 2d9afb514e..b15c191ad7 100644
--- a/win_dump.c
+++ b/win_dump.c
@@ -97,6 +97,14 @@ static void patch_bugcheck_data(WinDumpHeader64 *h, Error **errp)
         error_setg(errp, "win-dump: failed to read bugcheck data");
         return;
     }
+
+    /*
+     * If BugcheckCode wasn't saved, we consider guest OS as alive.
+     */
+
+    if (!h->BugcheckCode) {
+        h->BugcheckCode = LIVE_SYSTEM_DUMP;
+    }
 }
 
 /*
@@ -177,12 +185,139 @@ try_again:
     h->KdDebuggerDataBlock = KdDebuggerDataBlock;
 }
 
+struct saved_context {
+    WinContext ctx;
+    uint64_t addr;
+};
+
+static void patch_and_save_context(WinDumpHeader64 *h,
+                                   struct saved_context *saved_ctx,
+                                   Error **errp)
+{
+    uint64_t KiProcessorBlock;
+    uint16_t OffsetPrcbContext;
+    CPUState *cpu;
+    int i = 0;
+
+    if (cpu_memory_rw_debug(first_cpu,
+            h->KdDebuggerDataBlock + KDBG_KI_PROCESSOR_BLOCK_OFFSET64,
+            (uint8_t *)&KiProcessorBlock, sizeof(KiProcessorBlock), 0)) {
+        error_setg(errp, "win-dump: failed to read KiProcessorBlock");
+        return;
+    }
+
+    if (cpu_memory_rw_debug(first_cpu,
+            h->KdDebuggerDataBlock + KDBG_OFFSET_PRCB_CONTEXT_OFFSET64,
+            (uint8_t *)&OffsetPrcbContext, sizeof(OffsetPrcbContext), 0)) {
+        error_setg(errp, "win-dump: failed to read OffsetPrcbContext");
+        return;
+    }
+
+    CPU_FOREACH(cpu) {
+        X86CPU *x86_cpu = X86_CPU(cpu);
+        CPUX86State *env = &x86_cpu->env;
+        uint64_t Prcb;
+        uint64_t Context;
+        WinContext ctx;
+
+        if (cpu_memory_rw_debug(first_cpu,
+                KiProcessorBlock + i * sizeof(uint64_t),
+                (uint8_t *)&Prcb, sizeof(Prcb), 0)) {
+            error_setg(errp, "win-dump: failed to read"
+                             " CPU #%d PRCB location", i);
+            return;
+        }
+
+        if (cpu_memory_rw_debug(first_cpu,
+                Prcb + OffsetPrcbContext,
+                (uint8_t *)&Context, sizeof(Context), 0)) {
+            error_setg(errp, "win-dump: failed to read"
+                             " CPU #%d ContextFrame location", i);
+            return;
+        }
+
+        saved_ctx[i].addr = Context;
+
+        ctx = (WinContext){
+            .ContextFlags = WIN_CTX_ALL,
+            .MxCsr = env->mxcsr,
+
+            .SegEs = env->segs[0].selector,
+            .SegCs = env->segs[1].selector,
+            .SegSs = env->segs[2].selector,
+            .SegDs = env->segs[3].selector,
+            .SegFs = env->segs[4].selector,
+            .SegGs = env->segs[5].selector,
+            .EFlags = cpu_compute_eflags(env),
+
+            .Dr0 = env->dr[0],
+            .Dr1 = env->dr[1],
+            .Dr2 = env->dr[2],
+            .Dr3 = env->dr[3],
+            .Dr6 = env->dr[6],
+            .Dr7 = env->dr[7],
+
+            .Rax = env->regs[R_EAX],
+            .Rbx = env->regs[R_EBX],
+            .Rcx = env->regs[R_ECX],
+            .Rdx = env->regs[R_EDX],
+            .Rsp = env->regs[R_ESP],
+            .Rbp = env->regs[R_EBP],
+            .Rsi = env->regs[R_ESI],
+            .Rdi = env->regs[R_EDI],
+            .R8  = env->regs[8],
+            .R9  = env->regs[9],
+            .R10 = env->regs[10],
+            .R11 = env->regs[11],
+            .R12 = env->regs[12],
+            .R13 = env->regs[13],
+            .R14 = env->regs[14],
+            .R15 = env->regs[15],
+
+            .Rip = env->eip,
+            .FltSave = {
+                .MxCsr = env->mxcsr,
+            },
+        };
+
+        if (cpu_memory_rw_debug(first_cpu, Context,
+                (uint8_t *)&saved_ctx[i].ctx, sizeof(WinContext), 0)) {
+            error_setg(errp, "win-dump: failed to save CPU #%d context", i);
+            return;
+        }
+
+        if (cpu_memory_rw_debug(first_cpu, Context,
+                (uint8_t *)&ctx, sizeof(WinContext), 1)) {
+            error_setg(errp, "win-dump: failed to write CPU #%d context", i);
+            return;
+        }
+
+        i++;
+    }
+}
+
+static void restore_context(WinDumpHeader64 *h,
+                            struct saved_context *saved_ctx)
+{
+    int i;
+    Error *err = NULL;
+
+    for (i = 0; i < h->NumberProcessors; i++) {
+        if (cpu_memory_rw_debug(first_cpu, saved_ctx[i].addr,
+                (uint8_t *)&saved_ctx[i].ctx, sizeof(WinContext), 1)) {
+            error_setg(&err, "win-dump: failed to restore CPU #%d context", i);
+            warn_report_err(err);
+        }
+    }
+}
+
 void create_win_dump(DumpState *s, Error **errp)
 {
     WinDumpHeader64 *h = (WinDumpHeader64 *)(s->guest_note +
             VMCOREINFO_ELF_NOTE_HDR_SIZE);
     X86CPU *first_x86_cpu = X86_CPU(first_cpu);
     uint64_t saved_cr3 = first_x86_cpu->env.cr[3];
+    struct saved_context *saved_ctx = NULL;
     Error *local_err = NULL;
 
     if (s->guest_note_size != sizeof(WinDumpHeader64) +
@@ -212,20 +347,37 @@ void create_win_dump(DumpState *s, Error **errp)
 
     patch_header(h);
 
+    saved_ctx = g_new(struct saved_context, h->NumberProcessors);
+
+    /*
+     * Always patch context because there is no way
+     * to determine if the system-saved context is valid
+     */
+
+    patch_and_save_context(h, saved_ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto out_free;
+    }
+
     s->total_size = h->RequiredDumpSpace;
 
     s->written_size = qemu_write_full(s->fd, h, sizeof(*h));
     if (s->written_size != sizeof(*h)) {
         error_setg(errp, QERR_IO_ERROR);
-        goto out_cr3;
+        goto out_restore;
     }
 
     write_runs(s, h, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
-        goto out_cr3;
+        goto out_restore;
     }
 
+out_restore:
+    restore_context(h, saved_ctx);
+out_free:
+    g_free(saved_ctx);
 out_cr3:
     first_x86_cpu->env.cr[3] = saved_cr3;
 
diff --git a/win_dump.h b/win_dump.h
index 281241881e..f9e1faf8eb 100644
--- a/win_dump.h
+++ b/win_dump.h
@@ -80,8 +80,97 @@ typedef struct WinDumpHeader64 {
 
 void create_win_dump(DumpState *s, Error **errp);
 
-#define KDBG_OWNER_TAG_OFFSET64         0x10
-#define KDBG_KI_BUGCHECK_DATA_OFFSET64  0x88
-#define KDBG_MM_PFN_DATABASE_OFFSET64   0xC0
+#define KDBG_OWNER_TAG_OFFSET64             0x10
+#define KDBG_MM_PFN_DATABASE_OFFSET64       0xC0
+#define KDBG_KI_BUGCHECK_DATA_OFFSET64      0x88
+#define KDBG_KI_PROCESSOR_BLOCK_OFFSET64    0x218
+#define KDBG_OFFSET_PRCB_CONTEXT_OFFSET64   0x338
 
 #define VMCOREINFO_ELF_NOTE_HDR_SIZE    24
+
+#define WIN_CTX_X64 0x00100000L
+
+#define WIN_CTX_CTL 0x00000001L
+#define WIN_CTX_INT 0x00000002L
+#define WIN_CTX_SEG 0x00000004L
+#define WIN_CTX_FP  0x00000008L
+#define WIN_CTX_DBG 0x00000010L
+
+#define WIN_CTX_FULL    (WIN_CTX_X64 | WIN_CTX_CTL | WIN_CTX_INT | WIN_CTX_FP)
+#define WIN_CTX_ALL     (WIN_CTX_FULL | WIN_CTX_SEG | WIN_CTX_DBG)
+
+#define LIVE_SYSTEM_DUMP    0x00000161
+
+typedef struct WinM128A {
+    uint64_t low;
+    int64_t high;
+} QEMU_ALIGNED(16) WinM128A;
+
+typedef struct WinContext {
+    uint64_t PHome[6];
+
+    uint32_t ContextFlags;
+    uint32_t MxCsr;
+
+    uint16_t SegCs;
+    uint16_t SegDs;
+    uint16_t SegEs;
+    uint16_t SegFs;
+    uint16_t SegGs;
+    uint16_t SegSs;
+    uint32_t EFlags;
+
+    uint64_t Dr0;
+    uint64_t Dr1;
+    uint64_t Dr2;
+    uint64_t Dr3;
+    uint64_t Dr6;
+    uint64_t Dr7;
+
+    uint64_t Rax;
+    uint64_t Rcx;
+    uint64_t Rdx;
+    uint64_t Rbx;
+    uint64_t Rsp;
+    uint64_t Rbp;
+    uint64_t Rsi;
+    uint64_t Rdi;
+    uint64_t R8;
+    uint64_t R9;
+    uint64_t R10;
+    uint64_t R11;
+    uint64_t R12;
+    uint64_t R13;
+    uint64_t R14;
+    uint64_t R15;
+
+    uint64_t Rip;
+
+    struct {
+        uint16_t ControlWord;
+        uint16_t StatusWord;
+        uint8_t TagWord;
+        uint8_t Reserved1;
+        uint16_t ErrorOpcode;
+        uint32_t ErrorOffset;
+        uint16_t ErrorSelector;
+        uint16_t Reserved2;
+        uint32_t DataOffset;
+        uint16_t DataSelector;
+        uint16_t Reserved3;
+        uint32_t MxCsr;
+        uint32_t MxCsr_Mask;
+        WinM128A FloatRegisters[8];
+        WinM128A XmmRegisters[16];
+        uint8_t Reserved4[96];
+    } FltSave;
+
+    WinM128A VectorRegister[26];
+    uint64_t VectorControl;
+
+    uint64_t DebugControl;
+    uint64_t LastBranchToRip;
+    uint64_t LastBranchFromRip;
+    uint64_t LastExceptionToRip;
+    uint64_t LastExceptionFromRip;
+} QEMU_ALIGNED(16) WinContext;

From 0a96ca2437646bad197b0108c5f4a93e7ead05a9 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza <danielhb413@gmail.com>
Date: Wed, 27 Jun 2018 14:24:30 -0300
Subject: [PATCH 57/60] hw/scsi: cleanups before VPD BL emulation

To add support for the emulation of the Block Limits VPD page
for passthrough devices, a few adjustments to the current code
base are required to avoid repetition and improve clarity.

In scsi-generic.c, detach the Inquiry handling from
scsi_read_complete and put it into a new function called
scsi_handle_inquiry_reply. This change aims to avoid
cluttering scsi_read_complete when more logic is added to
the Inquiry response handling in the next patches,
centralizing the changes in the new function.

In scsi-disk.c, move the construction of all emulated VPD
pages out of scsi_disk_emulate_inquiry and make it available
to other files as a non-static function called
scsi_disk_emulate_vpd_page. Making it public will allow
the future VPD BL emulation code for passthrough devices
to use it from scsi-generic.c, avoiding copy/pasting this
code solely for that purpose. It also has the advantage of
providing emulation of all VPD pages in case we need to
emulate other pages in other scenarios. As a bonus,
scsi_disk_emulate_inquiry got tidier.

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20180627172432.11120-2-danielhb413@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-disk.c    | 425 +++++++++++++++++++++--------------------
 hw/scsi/scsi-generic.c |  71 +++----
 include/hw/scsi/scsi.h |   1 +
 3 files changed, 258 insertions(+), 239 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index aeaf611854..664d634c98 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -585,219 +585,228 @@ static uint8_t *scsi_get_buf(SCSIRequest *req)
     return (uint8_t *)r->iov.iov_base;
 }
 
+int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf)
+{
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+    uint8_t page_code = req->cmd.buf[2];
+    int start, buflen = 0;
+
+    outbuf[buflen++] = s->qdev.type & 0x1f;
+    outbuf[buflen++] = page_code;
+    outbuf[buflen++] = 0x00;
+    outbuf[buflen++] = 0x00;
+    start = buflen;
+
+    switch (page_code) {
+    case 0x00: /* Supported page codes, mandatory */
+    {
+        DPRINTF("Inquiry EVPD[Supported pages] "
+                "buffer size %zd\n", req->cmd.xfer);
+        outbuf[buflen++] = 0x00; /* list of supported pages (this page) */
+        if (s->serial) {
+            outbuf[buflen++] = 0x80; /* unit serial number */
+        }
+        outbuf[buflen++] = 0x83; /* device identification */
+        if (s->qdev.type == TYPE_DISK) {
+            outbuf[buflen++] = 0xb0; /* block limits */
+            outbuf[buflen++] = 0xb1; /* block device characteristics */
+            outbuf[buflen++] = 0xb2; /* thin provisioning */
+        }
+        break;
+    }
+    case 0x80: /* Device serial number, optional */
+    {
+        int l;
+
+        if (!s->serial) {
+            DPRINTF("Inquiry (EVPD[Serial number] not supported\n");
+            return -1;
+        }
+
+        l = strlen(s->serial);
+        if (l > 36) {
+            l = 36;
+        }
+
+        DPRINTF("Inquiry EVPD[Serial number] "
+                "buffer size %zd\n", req->cmd.xfer);
+        memcpy(outbuf + buflen, s->serial, l);
+        buflen += l;
+        break;
+    }
+
+    case 0x83: /* Device identification page, mandatory */
+    {
+        const char *str = s->serial ?: blk_name(s->qdev.conf.blk);
+        int max_len = s->serial ? 20 : 255 - 8;
+        int id_len = strlen(str);
+
+        if (id_len > max_len) {
+            id_len = max_len;
+        }
+        DPRINTF("Inquiry EVPD[Device identification] "
+                "buffer size %zd\n", req->cmd.xfer);
+
+        outbuf[buflen++] = 0x2; /* ASCII */
+        outbuf[buflen++] = 0;   /* not officially assigned */
+        outbuf[buflen++] = 0;   /* reserved */
+        outbuf[buflen++] = id_len; /* length of data following */
+        memcpy(outbuf + buflen, str, id_len);
+        buflen += id_len;
+
+        if (s->qdev.wwn) {
+            outbuf[buflen++] = 0x1; /* Binary */
+            outbuf[buflen++] = 0x3; /* NAA */
+            outbuf[buflen++] = 0;   /* reserved */
+            outbuf[buflen++] = 8;
+            stq_be_p(&outbuf[buflen], s->qdev.wwn);
+            buflen += 8;
+        }
+
+        if (s->qdev.port_wwn) {
+            outbuf[buflen++] = 0x61; /* SAS / Binary */
+            outbuf[buflen++] = 0x93; /* PIV / Target port / NAA */
+            outbuf[buflen++] = 0;    /* reserved */
+            outbuf[buflen++] = 8;
+            stq_be_p(&outbuf[buflen], s->qdev.port_wwn);
+            buflen += 8;
+        }
+
+        if (s->port_index) {
+            outbuf[buflen++] = 0x61; /* SAS / Binary */
+
+            /* PIV/Target port/relative target port */
+            outbuf[buflen++] = 0x94;
+
+            outbuf[buflen++] = 0;    /* reserved */
+            outbuf[buflen++] = 4;
+            stw_be_p(&outbuf[buflen + 2], s->port_index);
+            buflen += 4;
+        }
+        break;
+    }
+    case 0xb0: /* block limits */
+    {
+        unsigned int unmap_sectors =
+            s->qdev.conf.discard_granularity / s->qdev.blocksize;
+        unsigned int min_io_size =
+            s->qdev.conf.min_io_size / s->qdev.blocksize;
+        unsigned int opt_io_size =
+            s->qdev.conf.opt_io_size / s->qdev.blocksize;
+        unsigned int max_unmap_sectors =
+            s->max_unmap_size / s->qdev.blocksize;
+        unsigned int max_io_sectors =
+            s->max_io_size / s->qdev.blocksize;
+
+        if (s->qdev.type == TYPE_ROM) {
+            DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n",
+                    page_code);
+            return -1;
+        }
+        if (s->qdev.type == TYPE_DISK) {
+            int max_transfer_blk = blk_get_max_transfer(s->qdev.conf.blk);
+            int max_io_sectors_blk =
+                max_transfer_blk / s->qdev.blocksize;
+
+            max_io_sectors =
+                MIN_NON_ZERO(max_io_sectors_blk, max_io_sectors);
+
+            /* min_io_size and opt_io_size can't be greater than
+             * max_io_sectors */
+            if (min_io_size) {
+                min_io_size = MIN(min_io_size, max_io_sectors);
+            }
+            if (opt_io_size) {
+                opt_io_size = MIN(opt_io_size, max_io_sectors);
+            }
+        }
+        /* required VPD size with unmap support */
+        buflen = 0x40;
+        memset(outbuf + 4, 0, buflen - 4);
+
+        outbuf[4] = 0x1; /* wsnz */
+
+        /* optimal transfer length granularity */
+        outbuf[6] = (min_io_size >> 8) & 0xff;
+        outbuf[7] = min_io_size & 0xff;
+
+        /* maximum transfer length */
+        outbuf[8] = (max_io_sectors >> 24) & 0xff;
+        outbuf[9] = (max_io_sectors >> 16) & 0xff;
+        outbuf[10] = (max_io_sectors >> 8) & 0xff;
+        outbuf[11] = max_io_sectors & 0xff;
+
+        /* optimal transfer length */
+        outbuf[12] = (opt_io_size >> 24) & 0xff;
+        outbuf[13] = (opt_io_size >> 16) & 0xff;
+        outbuf[14] = (opt_io_size >> 8) & 0xff;
+        outbuf[15] = opt_io_size & 0xff;
+
+        /* max unmap LBA count, default is 1GB */
+        outbuf[20] = (max_unmap_sectors >> 24) & 0xff;
+        outbuf[21] = (max_unmap_sectors >> 16) & 0xff;
+        outbuf[22] = (max_unmap_sectors >> 8) & 0xff;
+        outbuf[23] = max_unmap_sectors & 0xff;
+
+        /* max unmap descriptors, 255 fit in 4 kb with an 8-byte header */
+        outbuf[24] = 0;
+        outbuf[25] = 0;
+        outbuf[26] = 0;
+        outbuf[27] = 255;
+
+        /* optimal unmap granularity */
+        outbuf[28] = (unmap_sectors >> 24) & 0xff;
+        outbuf[29] = (unmap_sectors >> 16) & 0xff;
+        outbuf[30] = (unmap_sectors >> 8) & 0xff;
+        outbuf[31] = unmap_sectors & 0xff;
+
+        /* max write same size */
+        outbuf[36] = 0;
+        outbuf[37] = 0;
+        outbuf[38] = 0;
+        outbuf[39] = 0;
+
+        outbuf[40] = (max_io_sectors >> 24) & 0xff;
+        outbuf[41] = (max_io_sectors >> 16) & 0xff;
+        outbuf[42] = (max_io_sectors >> 8) & 0xff;
+        outbuf[43] = max_io_sectors & 0xff;
+        break;
+    }
+    case 0xb1: /* block device characteristics */
+    {
+        buflen = 8;
+        outbuf[4] = (s->rotation_rate >> 8) & 0xff;
+        outbuf[5] = s->rotation_rate & 0xff;
+        outbuf[6] = 0;
+        outbuf[7] = 0;
+        break;
+    }
+    case 0xb2: /* thin provisioning */
+    {
+        buflen = 8;
+        outbuf[4] = 0;
+        outbuf[5] = 0xe0; /* unmap & write_same 10/16 all supported */
+        outbuf[6] = s->qdev.conf.discard_granularity ? 2 : 1;
+        outbuf[7] = 0;
+        break;
+    }
+    default:
+        return -1;
+    }
+    /* done with EVPD */
+    assert(buflen - start <= 255);
+    outbuf[start - 1] = buflen - start;
+    return buflen;
+}
+
 static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     int buflen = 0;
-    int start;
 
     if (req->cmd.buf[1] & 0x1) {
         /* Vital product data */
-        uint8_t page_code = req->cmd.buf[2];
-
-        outbuf[buflen++] = s->qdev.type & 0x1f;
-        outbuf[buflen++] = page_code ; // this page
-        outbuf[buflen++] = 0x00;
-        outbuf[buflen++] = 0x00;
-        start = buflen;
-
-        switch (page_code) {
-        case 0x00: /* Supported page codes, mandatory */
-        {
-            DPRINTF("Inquiry EVPD[Supported pages] "
-                    "buffer size %zd\n", req->cmd.xfer);
-            outbuf[buflen++] = 0x00; // list of supported pages (this page)
-            if (s->serial) {
-                outbuf[buflen++] = 0x80; // unit serial number
-            }
-            outbuf[buflen++] = 0x83; // device identification
-            if (s->qdev.type == TYPE_DISK) {
-                outbuf[buflen++] = 0xb0; // block limits
-                outbuf[buflen++] = 0xb1; /* block device characteristics */
-                outbuf[buflen++] = 0xb2; // thin provisioning
-            }
-            break;
-        }
-        case 0x80: /* Device serial number, optional */
-        {
-            int l;
-
-            if (!s->serial) {
-                DPRINTF("Inquiry (EVPD[Serial number] not supported\n");
-                return -1;
-            }
-
-            l = strlen(s->serial);
-            if (l > 36) {
-                l = 36;
-            }
-
-            DPRINTF("Inquiry EVPD[Serial number] "
-                    "buffer size %zd\n", req->cmd.xfer);
-            memcpy(outbuf+buflen, s->serial, l);
-            buflen += l;
-            break;
-        }
-
-        case 0x83: /* Device identification page, mandatory */
-        {
-            const char *str = s->serial ?: blk_name(s->qdev.conf.blk);
-            int max_len = s->serial ? 20 : 255 - 8;
-            int id_len = strlen(str);
-
-            if (id_len > max_len) {
-                id_len = max_len;
-            }
-            DPRINTF("Inquiry EVPD[Device identification] "
-                    "buffer size %zd\n", req->cmd.xfer);
-
-            outbuf[buflen++] = 0x2; // ASCII
-            outbuf[buflen++] = 0;   // not officially assigned
-            outbuf[buflen++] = 0;   // reserved
-            outbuf[buflen++] = id_len; // length of data following
-            memcpy(outbuf+buflen, str, id_len);
-            buflen += id_len;
-
-            if (s->qdev.wwn) {
-                outbuf[buflen++] = 0x1; // Binary
-                outbuf[buflen++] = 0x3; // NAA
-                outbuf[buflen++] = 0;   // reserved
-                outbuf[buflen++] = 8;
-                stq_be_p(&outbuf[buflen], s->qdev.wwn);
-                buflen += 8;
-            }
-
-            if (s->qdev.port_wwn) {
-                outbuf[buflen++] = 0x61; // SAS / Binary
-                outbuf[buflen++] = 0x93; // PIV / Target port / NAA
-                outbuf[buflen++] = 0;    // reserved
-                outbuf[buflen++] = 8;
-                stq_be_p(&outbuf[buflen], s->qdev.port_wwn);
-                buflen += 8;
-            }
-
-            if (s->port_index) {
-                outbuf[buflen++] = 0x61; // SAS / Binary
-                outbuf[buflen++] = 0x94; // PIV / Target port / relative target port
-                outbuf[buflen++] = 0;    // reserved
-                outbuf[buflen++] = 4;
-                stw_be_p(&outbuf[buflen + 2], s->port_index);
-                buflen += 4;
-            }
-            break;
-        }
-        case 0xb0: /* block limits */
-        {
-            unsigned int unmap_sectors =
-                    s->qdev.conf.discard_granularity / s->qdev.blocksize;
-            unsigned int min_io_size =
-                    s->qdev.conf.min_io_size / s->qdev.blocksize;
-            unsigned int opt_io_size =
-                    s->qdev.conf.opt_io_size / s->qdev.blocksize;
-            unsigned int max_unmap_sectors =
-                    s->max_unmap_size / s->qdev.blocksize;
-            unsigned int max_io_sectors =
-                    s->max_io_size / s->qdev.blocksize;
-
-            if (s->qdev.type == TYPE_ROM) {
-                DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n",
-                        page_code);
-                return -1;
-            }
-            if (s->qdev.type == TYPE_DISK) {
-                int max_transfer_blk = blk_get_max_transfer(s->qdev.conf.blk);
-                int max_io_sectors_blk =
-                    max_transfer_blk / s->qdev.blocksize;
-
-                max_io_sectors =
-                    MIN_NON_ZERO(max_io_sectors_blk, max_io_sectors);
-
-                /* min_io_size and opt_io_size can't be greater than
-                 * max_io_sectors */
-                if (min_io_size) {
-                    min_io_size = MIN(min_io_size, max_io_sectors);
-                }
-                if (opt_io_size) {
-                    opt_io_size = MIN(opt_io_size, max_io_sectors);
-                }
-            }
-            /* required VPD size with unmap support */
-            buflen = 0x40;
-            memset(outbuf + 4, 0, buflen - 4);
-
-            outbuf[4] = 0x1; /* wsnz */
-
-            /* optimal transfer length granularity */
-            outbuf[6] = (min_io_size >> 8) & 0xff;
-            outbuf[7] = min_io_size & 0xff;
-
-            /* maximum transfer length */
-            outbuf[8] = (max_io_sectors >> 24) & 0xff;
-            outbuf[9] = (max_io_sectors >> 16) & 0xff;
-            outbuf[10] = (max_io_sectors >> 8) & 0xff;
-            outbuf[11] = max_io_sectors & 0xff;
-
-            /* optimal transfer length */
-            outbuf[12] = (opt_io_size >> 24) & 0xff;
-            outbuf[13] = (opt_io_size >> 16) & 0xff;
-            outbuf[14] = (opt_io_size >> 8) & 0xff;
-            outbuf[15] = opt_io_size & 0xff;
-
-            /* max unmap LBA count, default is 1GB */
-            outbuf[20] = (max_unmap_sectors >> 24) & 0xff;
-            outbuf[21] = (max_unmap_sectors >> 16) & 0xff;
-            outbuf[22] = (max_unmap_sectors >> 8) & 0xff;
-            outbuf[23] = max_unmap_sectors & 0xff;
-
-            /* max unmap descriptors, 255 fit in 4 kb with an 8-byte header.  */
-            outbuf[24] = 0;
-            outbuf[25] = 0;
-            outbuf[26] = 0;
-            outbuf[27] = 255;
-
-            /* optimal unmap granularity */
-            outbuf[28] = (unmap_sectors >> 24) & 0xff;
-            outbuf[29] = (unmap_sectors >> 16) & 0xff;
-            outbuf[30] = (unmap_sectors >> 8) & 0xff;
-            outbuf[31] = unmap_sectors & 0xff;
-
-            /* max write same size */
-            outbuf[36] = 0;
-            outbuf[37] = 0;
-            outbuf[38] = 0;
-            outbuf[39] = 0;
-
-            outbuf[40] = (max_io_sectors >> 24) & 0xff;
-            outbuf[41] = (max_io_sectors >> 16) & 0xff;
-            outbuf[42] = (max_io_sectors >> 8) & 0xff;
-            outbuf[43] = max_io_sectors & 0xff;
-            break;
-        }
-        case 0xb1: /* block device characteristics */
-        {
-            buflen = 8;
-            outbuf[4] = (s->rotation_rate >> 8) & 0xff;
-            outbuf[5] = s->rotation_rate & 0xff;
-            outbuf[6] = 0;
-            outbuf[7] = 0;
-            break;
-        }
-        case 0xb2: /* thin provisioning */
-        {
-            buflen = 8;
-            outbuf[4] = 0;
-            outbuf[5] = 0xe0; /* unmap & write_same 10/16 all supported */
-            outbuf[6] = s->qdev.conf.discard_granularity ? 2 : 1;
-            outbuf[7] = 0;
-            break;
-        }
-        default:
-            return -1;
-        }
-        /* done with EVPD */
-        assert(buflen - start <= 255);
-        outbuf[start - 1] = buflen - start;
-        return buflen;
+        return scsi_disk_emulate_vpd_page(req, outbuf);
     }
 
     /* Standard INQUIRY data */
@@ -3039,6 +3048,10 @@ static Property scsi_block_properties[] = {
     DEFINE_PROP_DRIVE("drive", SCSIDiskState, qdev.conf.blk),
     DEFINE_PROP_BOOL("share-rw", SCSIDiskState, qdev.conf.share_rw, false),
     DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0),
+    DEFINE_PROP_UINT64("max_unmap_size", SCSIDiskState, max_unmap_size,
+                       DEFAULT_MAX_UNMAP_SIZE),
+    DEFINE_PROP_UINT64("max_io_size", SCSIDiskState, max_io_size,
+                       DEFAULT_MAX_IO_SIZE),
     DEFINE_PROP_INT32("scsi_version", SCSIDiskState, qdev.default_scsi_version,
                       -1),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 03bce8ff39..a04a704bbf 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -142,6 +142,43 @@ static int execute_command(BlockBackend *blk,
     return 0;
 }
 
+static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
+{
+    /*
+     *  EVPD set to zero returns the standard INQUIRY data.
+     *
+     *  Check if scsi_version is unset (-1) to avoid re-defining it
+     *  each time an INQUIRY with standard data is received.
+     *  scsi_version is initialized with -1 in scsi_generic_reset
+     *  and scsi_disk_reset, making sure that we'll set the
+     *  scsi_version after a reset. If the version field of the
+     *  INQUIRY response somehow changes after a guest reboot,
+     *  we'll be able to keep track of it.
+     *
+     *  On SCSI-2 and older, first 3 bits of byte 2 is the
+     *  ANSI-approved version, while on later versions the
+     *  whole byte 2 contains the version. Check if we're dealing
+     *  with a newer version and, in that case, assign the
+     *  whole byte.
+     */
+    if (s->scsi_version == -1 && !(r->req.cmd.buf[1] & 0x01)) {
+        s->scsi_version = r->buf[2] & 0x07;
+        if (s->scsi_version > 2) {
+            s->scsi_version = r->buf[2];
+        }
+    }
+    if (s->type == TYPE_DISK && r->req.cmd.buf[2] == 0xb0) {
+        uint32_t max_transfer =
+            blk_get_max_transfer(s->conf.blk) / s->blocksize;
+
+        assert(max_transfer);
+        stl_be_p(&r->buf[8], max_transfer);
+        /* Also take care of the opt xfer len. */
+        stl_be_p(&r->buf[12],
+                 MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12])));
+    }
+}
+
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
@@ -194,39 +231,7 @@ static void scsi_read_complete(void * opaque, int ret)
         }
     }
     if (r->req.cmd.buf[0] == INQUIRY) {
-        /*
-         *  EVPD set to zero returns the standard INQUIRY data.
-         *
-         *  Check if scsi_version is unset (-1) to avoid re-defining it
-         *  each time an INQUIRY with standard data is received.
-         *  scsi_version is initialized with -1 in scsi_generic_reset
-         *  and scsi_disk_reset, making sure that we'll set the
-         *  scsi_version after a reset. If the version field of the
-         *  INQUIRY response somehow changes after a guest reboot,
-         *  we'll be able to keep track of it.
-         *
-         *  On SCSI-2 and older, first 3 bits of byte 2 is the
-         *  ANSI-approved version, while on later versions the
-         *  whole byte 2 contains the version. Check if we're dealing
-         *  with a newer version and, in that case, assign the
-         *  whole byte.
-         */
-        if (s->scsi_version == -1 && !(r->req.cmd.buf[1] & 0x01)) {
-            s->scsi_version = r->buf[2] & 0x07;
-            if (s->scsi_version > 2) {
-                s->scsi_version = r->buf[2];
-            }
-        }
-        if (s->type == TYPE_DISK && r->req.cmd.buf[2] == 0xb0) {
-            uint32_t max_transfer =
-                blk_get_max_transfer(s->conf.blk) / s->blocksize;
-
-            assert(max_transfer);
-            stl_be_p(&r->buf[8], max_transfer);
-            /* Also take care of the opt xfer len. */
-            stl_be_p(&r->buf[12],
-                     MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12])));
-        }
+        scsi_handle_inquiry_reply(r, s);
     }
     scsi_req_data(&r->req, len);
     scsi_req_unref(&r->req);
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index e35137ea78..138eb79a5f 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -186,6 +186,7 @@ void scsi_device_report_change(SCSIDevice *dev, SCSISense sense);
 void scsi_device_unit_attention_reported(SCSIDevice *dev);
 void scsi_generic_read_device_identification(SCSIDevice *dev);
 int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed);
+int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf);
 SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int target, int lun);
 
 /* scsi-generic.c. */

From a0c7e35b17b3d2cade8a5fc8e57904e02fb91fe4 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza <danielhb413@gmail.com>
Date: Wed, 27 Jun 2018 14:24:31 -0300
Subject: [PATCH 58/60] hw/scsi: centralize SG_IO calls into single function

For the VPD Block Limits emulation with SCSI passthrough,
we'll issue an Inquiry request with EVPD set to retrieve
the available VPD pages of the device. This would be done in
a way similar to what scsi_generic_read_device_identification
does: create a SCSI command and a reply buffer, fill in the
sg_io_hdr_t structure, call blk_ioctl, check if an error
occurred, process the response.

This same process is done in 2 other functions, get_device_type
and get_stream_blocksize. They differ in the command/reply
buffer and post-processing, everything else is almost a
copy/paste.

Instead of adding a fourth copy/pasted-ish block of code when
adding the passthrough VPD BL emulation, this patch extracts
the code repeated in those 3 functions and puts it into
a new one called scsi_SG_IO_FROM_DEV. Any future code that
wants to execute an SG_DXFER_FROM_DEV transfer from the device
can use it, avoiding filling in sg_io_hdr_t again, et cetera.

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20180627172432.11120-3-danielhb413@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-disk.c    | 18 +++----------
 hw/scsi/scsi-generic.c | 61 +++++++++++++++++++++---------------------
 include/hw/scsi/scsi.h |  2 ++
 3 files changed, 36 insertions(+), 45 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 664d634c98..b0b39f1e92 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2578,8 +2578,6 @@ static int get_device_type(SCSIDiskState *s)
 {
     uint8_t cmd[16];
     uint8_t buf[36];
-    uint8_t sensebuf[8];
-    sg_io_hdr_t io_header;
     int ret;
 
     memset(cmd, 0, sizeof(cmd));
@@ -2587,19 +2585,9 @@ static int get_device_type(SCSIDiskState *s)
     cmd[0] = INQUIRY;
     cmd[4] = sizeof(buf);
 
-    memset(&io_header, 0, sizeof(io_header));
-    io_header.interface_id = 'S';
-    io_header.dxfer_direction = SG_DXFER_FROM_DEV;
-    io_header.dxfer_len = sizeof(buf);
-    io_header.dxferp = buf;
-    io_header.cmdp = cmd;
-    io_header.cmd_len = sizeof(cmd);
-    io_header.mx_sb_len = sizeof(sensebuf);
-    io_header.sbp = sensebuf;
-    io_header.timeout = 6000; /* XXX */
-
-    ret = blk_ioctl(s->qdev.conf.blk, SG_IO, &io_header);
-    if (ret < 0 || io_header.driver_status || io_header.host_status) {
+    ret = scsi_SG_IO_FROM_DEV(s->qdev.conf.blk, cmd, sizeof(cmd),
+                              buf, sizeof(buf));
+    if (ret < 0) {
         return -1;
     }
     s->qdev.type = buf[0];
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index a04a704bbf..61abc2763a 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -409,12 +409,35 @@ static int read_naa_id(const uint8_t *p, uint64_t *p_wwn)
     return -EINVAL;
 }
 
+int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size,
+                        uint8_t *buf, uint8_t buf_size)
+{
+    sg_io_hdr_t io_header;
+    uint8_t sensebuf[8]; /* sense data lands here; never read back */
+    int ret;
+    /* Build an SG_IO request for cmd with buf as data buffer; run on blk. */
+    memset(&io_header, 0, sizeof(io_header));
+    io_header.interface_id = 'S';
+    io_header.dxfer_direction = SG_DXFER_FROM_DEV; /* device -> buf */
+    io_header.dxfer_len = buf_size;
+    io_header.dxferp = buf;
+    io_header.cmdp = cmd;
+    io_header.cmd_len = cmd_size;
+    io_header.mx_sb_len = sizeof(sensebuf);
+    io_header.sbp = sensebuf;
+    io_header.timeout = 6000; /* XXX */
+
+    ret = blk_ioctl(blk, SG_IO, &io_header);
+    if (ret < 0 || io_header.driver_status || io_header.host_status) {
+        return -1; /* ioctl error, or driver/host status was non-zero */
+    }
+    return 0;
+}
+
 void scsi_generic_read_device_identification(SCSIDevice *s)
 {
     uint8_t cmd[6];
     uint8_t buf[250];
-    uint8_t sensebuf[8];
-    sg_io_hdr_t io_header;
     int ret;
     int i, len;
 
@@ -425,19 +448,9 @@ void scsi_generic_read_device_identification(SCSIDevice *s)
     cmd[2] = 0x83;
     cmd[4] = sizeof(buf);
 
-    memset(&io_header, 0, sizeof(io_header));
-    io_header.interface_id = 'S';
-    io_header.dxfer_direction = SG_DXFER_FROM_DEV;
-    io_header.dxfer_len = sizeof(buf);
-    io_header.dxferp = buf;
-    io_header.cmdp = cmd;
-    io_header.cmd_len = sizeof(cmd);
-    io_header.mx_sb_len = sizeof(sensebuf);
-    io_header.sbp = sensebuf;
-    io_header.timeout = 6000; /* XXX */
-
-    ret = blk_ioctl(s->conf.blk, SG_IO, &io_header);
-    if (ret < 0 || io_header.driver_status || io_header.host_status) {
+    ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd),
+                              buf, sizeof(buf));
+    if (ret < 0) {
         return;
     }
 
@@ -470,8 +483,6 @@ static int get_stream_blocksize(BlockBackend *blk)
 {
     uint8_t cmd[6];
     uint8_t buf[12];
-    uint8_t sensebuf[8];
-    sg_io_hdr_t io_header;
     int ret;
 
     memset(cmd, 0, sizeof(cmd));
@@ -479,21 +490,11 @@ static int get_stream_blocksize(BlockBackend *blk)
     cmd[0] = MODE_SENSE;
     cmd[4] = sizeof(buf);
 
-    memset(&io_header, 0, sizeof(io_header));
-    io_header.interface_id = 'S';
-    io_header.dxfer_direction = SG_DXFER_FROM_DEV;
-    io_header.dxfer_len = sizeof(buf);
-    io_header.dxferp = buf;
-    io_header.cmdp = cmd;
-    io_header.cmd_len = sizeof(cmd);
-    io_header.mx_sb_len = sizeof(sensebuf);
-    io_header.sbp = sensebuf;
-    io_header.timeout = 6000; /* XXX */
-
-    ret = blk_ioctl(blk, SG_IO, &io_header);
-    if (ret < 0 || io_header.driver_status || io_header.host_status) {
+    ret = scsi_SG_IO_FROM_DEV(blk, cmd, sizeof(cmd), buf, sizeof(buf));
+    if (ret < 0) {
         return -1;
     }
+
     return (buf[9] << 16) | (buf[10] << 8) | buf[11];
 }
 
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 138eb79a5f..75eced34d3 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -187,6 +187,8 @@ void scsi_device_unit_attention_reported(SCSIDevice *dev);
 void scsi_generic_read_device_identification(SCSIDevice *dev);
 int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed);
 int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf);
+int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size,
+                        uint8_t *buf, uint8_t buf_size);
 SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int target, int lun);
 
 /* scsi-generic.c. */

From a71c775b24ebc664129eb1d9b4c360590353efd5 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza <danielhb413@gmail.com>
Date: Wed, 27 Jun 2018 14:24:32 -0300
Subject: [PATCH 59/60] hw/scsi: add VPD Block Limits emulation

The VPD Block Limits Inquiry page is optional, allowing SCSI devices
to not implement it. This is the case for devices like the MegaRAID
SAS 9361-8i and Microsemi PM8069.

In case of SCSI passthrough, the response of this request is used by
the QEMU SCSI layer to set the max_io_sectors that the guest
device will support, based on the value of the max_sectors_kb that
the device has set in the host at that time. Without this response,
the guest kernel is free to assume any value of max_io_sectors
for the SCSI device. If this value is greater than the value from
the host, SCSI Sense errors will occur because the guest will send
read/write requests that are larger than the underlying host device
is configured to support. An example of this behavior can be seen
in [1].

A workaround is to set the max_sectors_kb host value back in the guest
kernel (a process that can be automated using rc.local startup scripts
and the like), but this has several drawbacks:

- it can be troublesome if the guest has many passthrough devices that
need this tuning;

- if a change in max_sectors_kb is made in the host side, manual change
in the guests will also be required;

- during an OS install it is difficult, and sometimes not possible, to
go to a terminal and change the max_sectors_kb prior to the installation.
This means that the disk can't be used during the install process. The
easiest alternative here is to roll back to scsi-hd, install the guest
and then go back to SCSI passthrough when the installation is done and
max_sectors_kb can be set.

An easier way would be for QEMU to handle the absence of the Block
Limits VPD device response, setting max_io_sectors accordingly and
allowing the guest to use the device without the hassle.

This patch adds emulation of the Block Limits VPD response for
SCSI passthrough devices of type TYPE_DISK that don't support
it. The following changes were made:

- scsi_handle_inquiry_reply will now check the available VPD
pages from the Inquiry EVPD reply. In case the device does not
support the Block Limits page (0xb0), that page is added to the list.

- a new function called scsi_generic_set_vpd_bl_emulation,
that is called during device realize, was created to set a
new flag 'needs_vpd_bl_emulation' of the device. This function
retrieves the Inquiry EVPD response of the device to check for
VPD BL support.

- scsi_handle_inquiry_reply will now check the available VPD
pages from the Inquiry EVPD reply in case the device needs
VPD BL emulation, adding the Block Limits page (0xb0) to
the list. This will make the guest kernel aware of the
support that we're now providing by emulation.

- a new function scsi_emulate_block_limits creates the
emulated Block Limits response. This function is called
inside scsi_read_complete in case the device requires
Block Limits VPD emulation and we detected a SCSI Sense
error in the VPD Block Limits reply that was issued
from the guest kernel to the device. This error is
expected: we're reporting support from our side, but
the device isn't aware of it.

With this patch, the guest now queries the Block Limits
page during the device configuration because it is being
advertised in the Supported Pages response. It will either
receive the Block Limits page from the hardware, if it supports
it, or will receive an emulated response from QEMU. At any rate,
the guest now has the information to set the max_sectors_kb
parameter accordingly, sparing the user the SCSI sense errors
that would happen without the emulated response and in the
absence of Block Limits support from the hardware.

[1] https://bugzilla.redhat.com/show_bug.cgi?id=1566195

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1566195
Reported-by: Dac Nguyen <dacng@us.ibm.com>
Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Message-Id: <20180627172432.11120-4-danielhb413@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-disk.c    |   2 +-
 hw/scsi/scsi-generic.c | 132 +++++++++++++++++++++++++++++++++++++----
 include/hw/scsi/scsi.h |   3 +-
 3 files changed, 125 insertions(+), 12 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index b0b39f1e92..55a34b3895 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2645,7 +2645,7 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
     s->features |= (1 << SCSI_DISK_F_NO_REMOVABLE_DEVOPS);
 
     scsi_realize(&s->qdev, errp);
-    scsi_generic_read_device_identification(&s->qdev);
+    scsi_generic_read_device_inquiry(&s->qdev);
 }
 
 typedef struct SCSIBlockReq {
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 61abc2763a..d60c4d0fcf 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -144,6 +144,8 @@ static int execute_command(BlockBackend *blk,
 
 static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
 {
+    uint8_t page, page_len;
+
     /*
      *  EVPD set to zero returns the standard INQUIRY data.
      *
@@ -167,22 +169,57 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
             s->scsi_version = r->buf[2];
         }
     }
-    if (s->type == TYPE_DISK && r->req.cmd.buf[2] == 0xb0) {
-        uint32_t max_transfer =
-            blk_get_max_transfer(s->conf.blk) / s->blocksize;
 
-        assert(max_transfer);
-        stl_be_p(&r->buf[8], max_transfer);
-        /* Also take care of the opt xfer len. */
-        stl_be_p(&r->buf[12],
-                 MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12])));
+    if (s->type == TYPE_DISK && (r->req.cmd.buf[1] & 0x01)) {
+        page = r->req.cmd.buf[2];
+        if (page == 0xb0) {
+            uint32_t max_transfer =
+                blk_get_max_transfer(s->conf.blk) / s->blocksize;
+
+            assert(max_transfer);
+            stl_be_p(&r->buf[8], max_transfer);
+            /* Also take care of the opt xfer len. */
+            stl_be_p(&r->buf[12],
+                    MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12])));
+        } else if (page == 0x00 && s->needs_vpd_bl_emulation) {
+            /*
+             * Now we're capable of supplying the VPD Block Limits
+             * response if the hardware can't. Add it in the INQUIRY
+             * Supported VPD pages response in case we are using the
+             * emulation for this device.
+             *
+             * This way, the guest kernel will be aware of the support
+             * and will use it to properly set up the SCSI device.
+             */
+            page_len = r->buf[3];
+            r->buf[page_len + 4] = 0xb0;
+            r->buf[3] = ++page_len;
+        }
     }
 }
 
+static int scsi_emulate_block_limits(SCSIGenericReq *r)
+{
+    r->buflen = scsi_disk_emulate_vpd_page(&r->req, r->buf);
+    r->io_header.sb_len_wr = 0;
+
+    /*
+    * We have valid contents in the reply buffer but the
+    * io_header can report a sense error coming from
+    * the hardware in scsi_command_complete_noio. Clean
+    * up the io_header to avoid reporting it.
+    */
+    r->io_header.driver_status = 0;
+    r->io_header.status = 0;
+
+    return r->buflen;
+}
+
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
     SCSIDevice *s = r->req.dev;
+    SCSISense sense;
     int len;
 
     assert(r->req.aiocb != NULL);
@@ -199,6 +236,27 @@ static void scsi_read_complete(void * opaque, int ret)
     DPRINTF("Data ready tag=0x%x len=%d\n", r->req.tag, len);
 
     r->len = -1;
+
+    /*
+     * Check if this is a VPD Block Limits request that
+     * resulted in sense error but would need emulation.
+     * In this case, emulate a valid VPD response.
+     */
+    if (s->needs_vpd_bl_emulation) {
+        int is_vpd_bl = r->req.cmd.buf[0] == INQUIRY &&
+                         r->req.cmd.buf[1] & 0x01 &&
+                         r->req.cmd.buf[2] == 0xb0;
+
+        if (is_vpd_bl && sg_io_sense_from_errno(-ret, &r->io_header, &sense)) {
+            len = scsi_emulate_block_limits(r);
+            /*
+             * No need to let scsi_read_complete go on and handle an
+             * INQUIRY VPD BL request we created manually.
+             */
+            goto req_complete;
+        }
+    }
+
     if (len == 0) {
         scsi_command_complete_noio(r, 0);
         goto done;
@@ -233,6 +291,8 @@ static void scsi_read_complete(void * opaque, int ret)
     if (r->req.cmd.buf[0] == INQUIRY) {
         scsi_handle_inquiry_reply(r, s);
     }
+
+req_complete:
     scsi_req_data(&r->req, len);
     scsi_req_unref(&r->req);
 
@@ -434,7 +494,49 @@ int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size,
     return 0;
 }
 
-void scsi_generic_read_device_identification(SCSIDevice *s)
+/*
+ * Executes an INQUIRY request with EVPD set to retrieve the
+ * available VPD pages of the device. If the device does
+ * not support the Block Limits page (page 0xb0), set
+ * the needs_vpd_bl_emulation flag for future use.
+ */
+static void scsi_generic_set_vpd_bl_emulation(SCSIDevice *s)
+{
+    uint8_t cmd[6];
+    uint8_t buf[250];
+    uint8_t page_len;
+    int ret, i;
+
+    memset(cmd, 0, sizeof(cmd));
+    memset(buf, 0, sizeof(buf));
+    cmd[0] = INQUIRY;
+    cmd[1] = 1;
+    cmd[2] = 0x00;
+    cmd[4] = sizeof(buf);
+
+    ret = scsi_SG_IO_FROM_DEV(s->conf.blk, cmd, sizeof(cmd),
+                              buf, sizeof(buf));
+    if (ret < 0) {
+        /*
+         * Do not assume anything if we can't retrieve the
+         * INQUIRY response to assert the VPD Block Limits
+         * support.
+         */
+        s->needs_vpd_bl_emulation = false;
+        return;
+    }
+
+    page_len = buf[3];
+    for (i = 4; i < page_len + 4; i++) {
+        if (buf[i] == 0xb0) {
+            s->needs_vpd_bl_emulation = false;
+            return;
+        }
+    }
+    s->needs_vpd_bl_emulation = true;
+}
+
+static void scsi_generic_read_device_identification(SCSIDevice *s)
 {
     uint8_t cmd[6];
     uint8_t buf[250];
@@ -479,6 +581,16 @@ void scsi_generic_read_device_identification(SCSIDevice *s)
     }
 }
 
+void scsi_generic_read_device_inquiry(SCSIDevice *s)
+{
+    scsi_generic_read_device_identification(s);
+    if (s->type == TYPE_DISK) {
+        scsi_generic_set_vpd_bl_emulation(s);
+    } else {
+        s->needs_vpd_bl_emulation = false;
+    }
+}
+
 static int get_stream_blocksize(BlockBackend *blk)
 {
     uint8_t cmd[6];
@@ -580,7 +692,7 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp)
 
     /* Only used by scsi-block, but initialize it nevertheless to be clean.  */
     s->default_scsi_version = -1;
-    scsi_generic_read_device_identification(s);
+    scsi_generic_read_device_inquiry(s);
 }
 
 const SCSIReqOps scsi_generic_req_ops = {
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 75eced34d3..21a3a6fec2 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -87,6 +87,7 @@ struct SCSIDevice
     uint64_t port_wwn;
     int scsi_version;
     int default_scsi_version;
+    bool needs_vpd_bl_emulation;
 };
 
 extern const VMStateDescription vmstate_scsi_device;
@@ -184,7 +185,7 @@ void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense);
 void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense);
 void scsi_device_report_change(SCSIDevice *dev, SCSISense sense);
 void scsi_device_unit_attention_reported(SCSIDevice *dev);
-void scsi_generic_read_device_identification(SCSIDevice *dev);
+void scsi_generic_read_device_inquiry(SCSIDevice *dev);
 int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed);
 int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf);
 int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size,

From 28a3cfc10b2e1a34985797357b4aa7558a63d08f Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Tue, 22 May 2018 10:30:31 +0200
Subject: [PATCH 60/60] tests/boot-serial: Do not delete the output file in
 case of errors

Peter reported that the boot-serial tester sometimes runs into timeouts
with SPARC guests. It's currently completely unclear whether this is due
to too much load on the host machine (so that the guest really just ran
too slow), or whether there is something wrong with the guest's firmware
boot. For further debugging, we need the serial output of the guest in
case of errors, so instead of unlinking the file immediately, this is
now only done in case of success. In case of error, print the name of the
file with the serial output via g_error() (which then also calls abort()
internally to mark the test as failed).

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <1526977831-31129-1-git-send-email-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tests/boot-serial-test.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/boot-serial-test.c b/tests/boot-serial-test.c
index 4d6815c3e0..952a2e7ead 100644
--- a/tests/boot-serial-test.c
+++ b/tests/boot-serial-test.c
@@ -111,9 +111,8 @@ static testdef_t tests[] = {
     { NULL }
 };
 
-static void check_guest_output(const testdef_t *test, int fd)
+static bool check_guest_output(const testdef_t *test, int fd)
 {
-    bool output_ok = false;
     int i, nbr = 0, pos = 0, ccnt;
     char ch;
 
@@ -125,8 +124,7 @@ static void check_guest_output(const testdef_t *test, int fd)
                 pos += 1;
                 if (test->expect[pos] == '\0') {
                     /* We've reached the end of the expected string! */
-                    output_ok = true;
-                    goto done;
+                    return true;
                 }
             } else {
                 pos = 0;
@@ -136,8 +134,7 @@ static void check_guest_output(const testdef_t *test, int fd)
         g_usleep(10000);
     }
 
-done:
-    g_assert(output_ok);
+    return false;
 }
 
 static void test_machine(const void *data)
@@ -180,12 +177,16 @@ static void test_machine(const void *data)
                                 "-no-shutdown -serial chardev:serial0 %s",
                                 codeparam, code ? codetmp : "",
                                 test->machine, serialtmp, test->extra);
-    unlink(serialtmp);
     if (code) {
         unlink(codetmp);
     }
 
-    check_guest_output(test, ser_fd);
+    if (!check_guest_output(test, ser_fd)) {
+        g_error("Failed to find expected string. Please check '%s'",
+                serialtmp);
+    }
+    unlink(serialtmp);
+
     qtest_quit(global_qtest);
 
     close(ser_fd);