From 013aabdc665e4256b38d8875a1a7b5e030ba98f1 Mon Sep 17 00:00:00 2001
From: Clement Deschamps <clement.deschamps@greensocs.com>
Date: Sun, 21 Oct 2018 16:21:03 +0200
Subject: [PATCH 01/17] icount: fix deadlock when all cpus are sleeping

When all cpus are sleeping (e.g in WFI), to avoid a deadlock
in the main_loop, wake it up in order to start the warp timer.

Signed-off-by: Clement Deschamps <clement.deschamps@greensocs.com>
Message-Id: <20181021142103.19014-1-clement.deschamps@greensocs.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cpus.c b/cpus.c
index 3978f63d8f..a2b33ccb29 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1554,6 +1554,14 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
             atomic_mb_set(&cpu->exit_request, 0);
         }
 
+        if (use_icount && all_cpu_threads_idle()) {
+            /*
+             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
+             * in the main_loop, wake it up in order to start the warp timer.
+             */
+            qemu_notify_event();
+        }
+
         qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
         deal_with_unplugged_cpus();
     }

From e204ac612cb2cc1a33f4205976386d237d676319 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 22 Oct 2018 18:55:06 +0200
Subject: [PATCH 02/17] x86: hv_evmcs CPU flag support

Adds a new CPU flag to enable the Enlightened VMCS KVM feature.
QEMU enables KVM_CAP_HYPERV_ENLIGHTENED_VMCS and gets back the
version to be advertised in lower 16 bits of CPUID.0x4000000A:EAX.

Suggested-by: Ladi Prosek <lprosek@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20181022165506.30332-3-vkuznets@redhat.com>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c          |  1 +
 target/i386/cpu.h          |  1 +
 target/i386/hyperv-proto.h |  2 ++
 target/i386/kvm.c          | 30 ++++++++++++++++++++++++++++--
 4 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index af7e9f09cc..f81d35e1f9 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5732,6 +5732,7 @@ static Property x86_cpu_properties[] = {
     DEFINE_PROP_BOOL("hv-frequencies", X86CPU, hyperv_frequencies, false),
     DEFINE_PROP_BOOL("hv-reenlightenment", X86CPU, hyperv_reenlightenment, false),
     DEFINE_PROP_BOOL("hv-tlbflush", X86CPU, hyperv_tlbflush, false),
+    DEFINE_PROP_BOOL("hv-evmcs", X86CPU, hyperv_evmcs, false),
     DEFINE_PROP_BOOL("hv-ipi", X86CPU, hyperv_ipi, false),
     DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
     DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index ad0e0b4534..9c52d0cbeb 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1391,6 +1391,7 @@ struct X86CPU {
     bool hyperv_frequencies;
     bool hyperv_reenlightenment;
     bool hyperv_tlbflush;
+    bool hyperv_evmcs;
     bool hyperv_ipi;
     bool check_cpuid;
     bool enforce_cpuid;
diff --git a/target/i386/hyperv-proto.h b/target/i386/hyperv-proto.h
index 8c572cd7c2..c0272b3a01 100644
--- a/target/i386/hyperv-proto.h
+++ b/target/i386/hyperv-proto.h
@@ -18,6 +18,7 @@
 #define HV_CPUID_FEATURES                     0x40000003
 #define HV_CPUID_ENLIGHTMENT_INFO             0x40000004
 #define HV_CPUID_IMPLEMENT_LIMITS             0x40000005
+#define HV_CPUID_NESTED_FEATURES              0x4000000A
 #define HV_CPUID_MIN                          0x40000005
 #define HV_CPUID_MAX                          0x4000ffff
 #define HV_HYPERVISOR_PRESENT_BIT             0x80000000
@@ -60,6 +61,7 @@
 #define HV_RELAXED_TIMING_RECOMMENDED       (1u << 5)
 #define HV_CLUSTER_IPI_RECOMMENDED          (1u << 10)
 #define HV_EX_PROCESSOR_MASKS_RECOMMENDED   (1u << 11)
+#define HV_ENLIGHTENED_VMCS_RECOMMENDED     (1u << 14)
 
 /*
  * Basic virtualized MSRs
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 796a049a0d..f524e7d929 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -869,6 +869,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
     uint32_t unused;
     struct kvm_cpuid_entry2 *c;
     uint32_t signature[3];
+    uint16_t evmcs_version;
     int kvm_base = KVM_CPUID_SIGNATURE;
     int r;
     Error *local_err = NULL;
@@ -912,7 +913,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
             memset(signature, 0, 12);
             memcpy(signature, cpu->hyperv_vendor_id, len);
         }
-        c->eax = HV_CPUID_MIN;
+        c->eax = cpu->hyperv_evmcs ?
+            HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
         c->ebx = signature[0];
         c->ecx = signature[1];
         c->edx = signature[2];
@@ -970,7 +972,16 @@ int kvm_arch_init_vcpu(CPUState *cs)
             c->eax |= HV_CLUSTER_IPI_RECOMMENDED;
             c->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
         }
-
+        if (cpu->hyperv_evmcs) {
+            if (kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
+                                    (uintptr_t)&evmcs_version)) {
+                fprintf(stderr, "Hyper-V Enlightened VMCS "
+                        "(requested by 'hv-evmcs' cpu flag) "
+                        "is not supported by kernel\n");
+                return -ENOSYS;
+            }
+            c->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
+        }
         c->ebx = cpu->hyperv_spinlock_attempts;
 
         c = &cpuid_data.entries[cpuid_i++];
@@ -981,6 +992,21 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
         has_msr_hv_hypercall = true;
+
+        if (cpu->hyperv_evmcs) {
+            __u32 function;
+
+            /* Create zeroed 0x40000006..0x40000009 leaves */
+            for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
+                 function < HV_CPUID_NESTED_FEATURES; function++) {
+                c = &cpuid_data.entries[cpuid_i++];
+                c->function = function;
+            }
+
+            c = &cpuid_data.entries[cpuid_i++];
+            c->function = HV_CPUID_NESTED_FEATURES;
+            c->eax = evmcs_version;
+        }
     }
 
     if (cpu->expose_kvm) {

From d4715481ded13231d9ff8ae17da648de78b925d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Thu, 30 Aug 2018 11:57:57 +0100
Subject: [PATCH 03/17] i386: clarify that the Q35 machine type implements a
 P35 chipset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'q35' machine type implements an Intel Series 3 chipset,
of which there are several variants:

  https://www.intel.com/Assets/PDF/datasheet/316966.pdf

The key difference between the 82P35 MCH ('p35', PCI device ID 0x29c0)
and 82Q35 GMCH ('q35', PCI device ID 0x29b0) variants is that the latter
has an integrated graphics adapter. QEMU does not implement integrated
graphics, so uses the PCI ID for the 82P35 chipset, despite calling the
machine type 'q35'. Thus we rename the PCI device ID constant to reflect
reality, to avoid confusing future developers. The new name more closely
matches what pci.ids reports it to be:

$ grep  P35 /usr/share/hwdata/pci.ids  | grep 29
	29c0  82G33/G31/P35/P31 Express DRAM Controller
	29c1  82G33/G31/P35/P31 Express PCI Express Root Port
	29c4  82G33/G31/P35/P31 Express MEI Controller
	29c5  82G33/G31/P35/P31 Express MEI Controller
	29c6  82G33/G31/P35/P31 Express PT IDER Controller
	29c7  82G33/G31/P35/P31 Express Serial KT Controller

$ grep  Q35 /usr/share/hwdata/pci.ids  | grep 29
	29b0  82Q35 Express DRAM Controller
	29b1  82Q35 Express PCI Express Root Port
	29b2  82Q35 Express Integrated Graphics Controller
	29b3  82Q35 Express Integrated Graphics Controller
	29b4  82Q35 Express MEI Controller
	29b5  82Q35 Express MEI Controller
	29b6  82Q35 Express PT IDER Controller
	29b7  82Q35 Express Serial KT Controller

Arguably the QEMU machine type should be named 'p35'. At this point in
time, however, it is not worth the churn for management applications &
documentation to worry about renaming it.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Message-Id: <20180830105757.10577-1-berrange@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/pci-host/q35.c        | 10 +++++++++-
 include/hw/pci/pci_ids.h |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 966a7cf92d..71e4ca5eec 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -622,7 +622,15 @@ static void mch_class_init(ObjectClass *klass, void *data)
     dc->desc = "Host bridge";
     dc->vmsd = &vmstate_mch;
     k->vendor_id = PCI_VENDOR_ID_INTEL;
-    k->device_id = PCI_DEVICE_ID_INTEL_Q35_MCH;
+    /*
+     * The 'q35' machine type implements an Intel Series 3 chipset,
+     * of which there are several variants. The key difference between
+     * the 82P35 MCH ('p35') and 82Q35 GMCH ('q35') variants is that
+     * the latter has an integrated graphics adapter. QEMU does not
+     * implement integrated graphics, so uses the PCI ID for the 82P35
+     * chipset.
+     */
+    k->device_id = PCI_DEVICE_ID_INTEL_P35_MCH;
     k->revision = MCH_HOST_BRIDGE_REVISION_DEFAULT;
     k->class_id = PCI_CLASS_BRIDGE_HOST;
     /*
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 63acc722a9..eeb33018ad 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -255,7 +255,7 @@
 #define PCI_DEVICE_ID_INTEL_82801I_EHCI2 0x293c
 #define PCI_DEVICE_ID_INTEL_82599_SFP_VF 0x10ed
 
-#define PCI_DEVICE_ID_INTEL_Q35_MCH      0x29c0
+#define PCI_DEVICE_ID_INTEL_P35_MCH      0x29c0
 
 #define PCI_VENDOR_ID_XEN                0x5853
 #define PCI_DEVICE_ID_XEN_PLATFORM       0x0001

From bce410a33b9ed51051eb6a1fb31f8d0c13a51d48 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Thu, 1 Nov 2018 11:44:46 +0100
Subject: [PATCH 04/17] ivshmem: fix memory backend leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

object_new() returns a new backend with refcount == 1 and
then later object_property_add_child() increases refcount to 2
So when ivshmem is destroyed, the backend it has created isn't
destroyed along with it as children cleanup will bring
backend's refcount only to 1, which leaks backend including
resources it is using.

Drop the original reference from object_new() once backend
is attached to its parent.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Message-Id: <1541069086-167036-1-git-send-email-imammedo@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Fixes: 5503e285041979dd29698ecb41729b3b22622e8d
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/misc/ivshmem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index f88910e55c..ecfd10a29a 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -1279,6 +1279,7 @@ static void desugar_shm(IVShmemState *s)
     object_property_set_bool(obj, true, "share", &error_abort);
     object_property_add_child(OBJECT(s), "internal-shm-backend", obj,
                               &error_abort);
+    object_unref(obj);
     user_creatable_complete(obj, &error_abort);
     s->hostmem = MEMORY_BACKEND(obj);
 }

From 2185fd67d2f277ebb1d2946cf5f7cdc773e04198 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 19 Oct 2018 14:25:42 +0200
Subject: [PATCH 05/17] MAINTAINERS: remove or downgrade myself to reviewer
 from some subsystems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Other people are doing a much better work than myself at handling some
subsystems.  For those files it is better if I downgrade myself to
reviewer or recognize that I am not actually doing any work there.

Cc: Daniel P. Berrange <berrange@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Eric Blake <eblake@redhat.com>
Cc: Thomas Huth <thuth@redhat.com>
Cc: Laurent Vivier <lvivier@redhat.com>
Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 MAINTAINERS | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8ec2281a7e..d411521d15 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -105,9 +105,9 @@ Guest CPU cores (TCG):
 ----------------------
 Overall
 L: qemu-devel@nongnu.org
-M: Paolo Bonzini <pbonzini@redhat.com>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 M: Richard Henderson <rth@twiddle.net>
+R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: cpus.c
 F: exec.c
@@ -1139,7 +1139,8 @@ F: hw/pci-host/ppce500.c
 F: hw/net/fsl_etsec/
 
 Character devices
-M: Paolo Bonzini <pbonzini@redhat.com>
+M: Marc-André Lureau <marcandre.lureau@redhat.com>
+R: Paolo Bonzini <pbonzini@redhat.com>
 S: Odd Fixes
 F: hw/char/
 
@@ -1526,8 +1527,8 @@ T: git git://github.com/famz/qemu.git bitmaps
 T: git git://github.com/jnsnow/qemu.git bitmaps
 
 Character device backends
-M: Paolo Bonzini <pbonzini@redhat.com>
 M: Marc-André Lureau <marcandre.lureau@redhat.com>
+R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: chardev/
 F: include/chardev/
@@ -1760,9 +1761,9 @@ F: tests/qmp-cmd-test.c
 T: git git://repo.or.cz/qemu/armbru.git qapi-next
 
 qtest
-M: Paolo Bonzini <pbonzini@redhat.com>
 M: Thomas Huth <thuth@redhat.com>
 M: Laurent Vivier <lvivier@redhat.com>
+R: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: qtest.c
 F: tests/libqtest.*
@@ -1869,7 +1870,6 @@ F: tests/test-io-*
 Sockets
 M: Daniel P. Berrange <berrange@redhat.com>
 M: Gerd Hoffmann <kraxel@redhat.com>
-M: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
 F: include/qemu/sockets.h
 F: util/qemu-sockets.c
@@ -2056,13 +2056,12 @@ M: Ronnie Sahlberg <ronniesahlberg@gmail.com>
 M: Paolo Bonzini <pbonzini@redhat.com>
 M: Peter Lieven <pl@kamp.de>
 L: qemu-block@nongnu.org
-S: Supported
+S: Odd Fixes
 F: block/iscsi.c
 F: block/iscsi-opts.c
 
 Network Block Device (NBD)
 M: Eric Blake <eblake@redhat.com>
-M: Paolo Bonzini <pbonzini@redhat.com>
 L: qemu-block@nongnu.org
 S: Maintained
 F: block/nbd*

From 1a1435dd61e28c1e3b70971107d72a7d05b28d03 Mon Sep 17 00:00:00 2001
From: Rudolf Marek <rudolf.marek@sysgo.com>
Date: Fri, 19 Oct 2018 14:24:49 +0200
Subject: [PATCH 06/17] target/i386: Clear RF on SYSCALL instruction

Fix the SYSCALL instruction in 64-bit (long mode). The RF flag
should be cleared in R11 as well as in the RFLAGS. Intel
and AMD CPUs behave same. AMD has this documented in the
APM vol 3.

Signed-off-by: Roman Kapl <rka@sysgo.com>
Signed-off-by: Rudolf Marek <rudolf.marek@sysgo.com>
Message-Id: <20181019122449.26387-1-rka@sysgo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/seg_helper.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
index 33714bc6e1..63e265cb38 100644
--- a/target/i386/seg_helper.c
+++ b/target/i386/seg_helper.c
@@ -991,11 +991,11 @@ void helper_syscall(CPUX86State *env, int next_eip_addend)
         int code64;
 
         env->regs[R_ECX] = env->eip + next_eip_addend;
-        env->regs[11] = cpu_compute_eflags(env);
+        env->regs[11] = cpu_compute_eflags(env) & ~RF_MASK;
 
         code64 = env->hflags & HF_CS64_MASK;
 
-        env->eflags &= ~env->fmask;
+        env->eflags &= ~(env->fmask | RF_MASK);
         cpu_load_eflags(env, env->eflags, 0);
         cpu_x86_load_seg_cache(env, R_CS, selector & 0xfffc,
                            0, 0xffffffff,

From c26763f8ec70b1011098cab0da9178666d8256a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Wed, 3 Oct 2018 15:44:52 +0400
Subject: [PATCH 07/17] memory: learn about non-volatile memory region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new flag to mark memory region that are used as non-volatile, by
NVDIMM for example. That bit is propagated down to the flat view, and
reflected in HMP info mtree with a "nv-" prefix on the memory type.

This way, guest_phys_blocks_region_add() can skip the NV memory
regions for dumps and TCG memory clear in a following patch.

Cc: dgilbert@redhat.com
Cc: imammedo@redhat.com
Cc: pbonzini@redhat.com
Cc: guangrong.xiao@linux.intel.com
Cc: mst@redhat.com
Cc: xiaoguangrong.eric@gmail.com
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20181003114454.5662-2-marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 docs/devel/migration.rst |  1 +
 include/exec/memory.h    | 25 ++++++++++++++++++++++
 memory.c                 | 45 +++++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst
index 687570754d..e7658ab050 100644
--- a/docs/devel/migration.rst
+++ b/docs/devel/migration.rst
@@ -435,6 +435,7 @@ Examples of such memory API functions are:
   - memory_region_add_subregion()
   - memory_region_del_subregion()
   - memory_region_set_readonly()
+  - memory_region_set_nonvolatile()
   - memory_region_set_enabled()
   - memory_region_set_address()
   - memory_region_set_alias_offset()
diff --git a/include/exec/memory.h b/include/exec/memory.h
index d0c7f0d9e9..8e61450de3 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -355,6 +355,7 @@ struct MemoryRegion {
     bool ram;
     bool subpage;
     bool readonly; /* For RAM regions */
+    bool nonvolatile;
     bool rom_device;
     bool flush_coalesced_mmio;
     bool global_locking;
@@ -480,6 +481,7 @@ static inline FlatView *address_space_to_flatview(AddressSpace *as)
  * @offset_within_address_space: the address of the first byte of the section
  *     relative to the region's address space
  * @readonly: writes to this section are ignored
+ * @nonvolatile: this section is non-volatile
  */
 struct MemoryRegionSection {
     MemoryRegion *mr;
@@ -488,6 +490,7 @@ struct MemoryRegionSection {
     Int128 size;
     hwaddr offset_within_address_space;
     bool readonly;
+    bool nonvolatile;
 };
 
 /**
@@ -1170,6 +1173,17 @@ static inline bool memory_region_is_rom(MemoryRegion *mr)
     return mr->ram && mr->readonly;
 }
 
+/**
+ * memory_region_is_nonvolatile: check whether a memory region is non-volatile
+ *
+ * Returns %true is a memory region is non-volatile memory.
+ *
+ * @mr: the memory region being queried
+ */
+static inline bool memory_region_is_nonvolatile(MemoryRegion *mr)
+{
+    return mr->nonvolatile;
+}
 
 /**
  * memory_region_get_fd: Get a file descriptor backing a RAM memory region.
@@ -1341,6 +1355,17 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr,
  */
 void memory_region_set_readonly(MemoryRegion *mr, bool readonly);
 
+/**
+ * memory_region_set_nonvolatile: Turn a memory region non-volatile
+ *
+ * Allows a memory region to be marked as non-volatile.
+ * only useful on RAM regions.
+ *
+ * @mr: the region being updated.
+ * @nonvolatile: whether rhe region is to be non-volatile.
+ */
+void memory_region_set_nonvolatile(MemoryRegion *mr, bool nonvolatile);
+
 /**
  * memory_region_rom_device_set_romd: enable/disable ROMD mode
  *
diff --git a/memory.c b/memory.c
index 51204aa079..d14c6dec1d 100644
--- a/memory.c
+++ b/memory.c
@@ -216,6 +216,7 @@ struct FlatRange {
     uint8_t dirty_log_mask;
     bool romd_mode;
     bool readonly;
+    bool nonvolatile;
 };
 
 #define FOR_EACH_FLAT_RANGE(var, view)          \
@@ -231,6 +232,7 @@ section_from_flat_range(FlatRange *fr, FlatView *fv)
         .size = fr->addr.size,
         .offset_within_address_space = int128_get64(fr->addr.start),
         .readonly = fr->readonly,
+        .nonvolatile = fr->nonvolatile,
     };
 }
 
@@ -240,7 +242,8 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
         && addrrange_equal(a->addr, b->addr)
         && a->offset_in_region == b->offset_in_region
         && a->romd_mode == b->romd_mode
-        && a->readonly == b->readonly;
+        && a->readonly == b->readonly
+        && a->nonvolatile == b->nonvolatile;
 }
 
 static FlatView *flatview_new(MemoryRegion *mr_root)
@@ -312,7 +315,8 @@ static bool can_merge(FlatRange *r1, FlatRange *r2)
                      int128_make64(r2->offset_in_region))
         && r1->dirty_log_mask == r2->dirty_log_mask
         && r1->romd_mode == r2->romd_mode
-        && r1->readonly == r2->readonly;
+        && r1->readonly == r2->readonly
+        && r1->nonvolatile == r2->nonvolatile;
 }
 
 /* Attempt to simplify a view by merging adjacent ranges */
@@ -592,7 +596,8 @@ static void render_memory_region(FlatView *view,
                                  MemoryRegion *mr,
                                  Int128 base,
                                  AddrRange clip,
-                                 bool readonly)
+                                 bool readonly,
+                                 bool nonvolatile)
 {
     MemoryRegion *subregion;
     unsigned i;
@@ -608,6 +613,7 @@ static void render_memory_region(FlatView *view,
 
     int128_addto(&base, int128_make64(mr->addr));
     readonly |= mr->readonly;
+    nonvolatile |= mr->nonvolatile;
 
     tmp = addrrange_make(base, mr->size);
 
@@ -620,13 +626,15 @@ static void render_memory_region(FlatView *view,
     if (mr->alias) {
         int128_subfrom(&base, int128_make64(mr->alias->addr));
         int128_subfrom(&base, int128_make64(mr->alias_offset));
-        render_memory_region(view, mr->alias, base, clip, readonly);
+        render_memory_region(view, mr->alias, base, clip,
+                             readonly, nonvolatile);
         return;
     }
 
     /* Render subregions in priority order. */
     QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
-        render_memory_region(view, subregion, base, clip, readonly);
+        render_memory_region(view, subregion, base, clip,
+                             readonly, nonvolatile);
     }
 
     if (!mr->terminates) {
@@ -641,6 +649,7 @@ static void render_memory_region(FlatView *view,
     fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
     fr.romd_mode = mr->romd_mode;
     fr.readonly = readonly;
+    fr.nonvolatile = nonvolatile;
 
     /* Render the region itself into any gaps left by the current view. */
     for (i = 0; i < view->nr && int128_nz(remain); ++i) {
@@ -726,7 +735,8 @@ static FlatView *generate_memory_topology(MemoryRegion *mr)
 
     if (mr) {
         render_memory_region(view, mr, int128_zero(),
-                             addrrange_make(int128_zero(), int128_2_64()), false);
+                             addrrange_make(int128_zero(), int128_2_64()),
+                             false, false);
     }
     flatview_simplify(view);
 
@@ -2039,6 +2049,16 @@ void memory_region_set_readonly(MemoryRegion *mr, bool readonly)
     }
 }
 
+void memory_region_set_nonvolatile(MemoryRegion *mr, bool nonvolatile)
+{
+    if (mr->nonvolatile != nonvolatile) {
+        memory_region_transaction_begin();
+        mr->nonvolatile = nonvolatile;
+        memory_region_update_pending |= mr->enabled;
+        memory_region_transaction_commit();
+    }
+}
+
 void memory_region_rom_device_set_romd(MemoryRegion *mr, bool romd_mode)
 {
     if (mr->romd_mode != romd_mode) {
@@ -2489,6 +2509,7 @@ static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr,
     ret.size = range.size;
     ret.offset_within_address_space = int128_get64(range.start);
     ret.readonly = fr->readonly;
+    ret.nonvolatile = fr->nonvolatile;
     return ret;
 }
 
@@ -2839,10 +2860,11 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
             QTAILQ_INSERT_TAIL(alias_print_queue, ml, mrqueue);
         }
         mon_printf(f, TARGET_FMT_plx "-" TARGET_FMT_plx
-                   " (prio %d, %s): alias %s @%s " TARGET_FMT_plx
+                   " (prio %d, %s%s): alias %s @%s " TARGET_FMT_plx
                    "-" TARGET_FMT_plx "%s",
                    cur_start, cur_end,
                    mr->priority,
+                   mr->nonvolatile ? "nv-" : "",
                    memory_region_type((MemoryRegion *)mr),
                    memory_region_name(mr),
                    memory_region_name(mr->alias),
@@ -2854,9 +2876,10 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
         }
     } else {
         mon_printf(f,
-                   TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %s): %s%s",
+                   TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %s%s): %s%s",
                    cur_start, cur_end,
                    mr->priority,
+                   mr->nonvolatile ? "nv-" : "",
                    memory_region_type((MemoryRegion *)mr),
                    memory_region_name(mr),
                    mr->enabled ? "" : " [disabled]");
@@ -2941,19 +2964,21 @@ static void mtree_print_flatview(gpointer key, gpointer value,
         mr = range->mr;
         if (range->offset_in_region) {
             p(f, MTREE_INDENT TARGET_FMT_plx "-"
-              TARGET_FMT_plx " (prio %d, %s): %s @" TARGET_FMT_plx,
+              TARGET_FMT_plx " (prio %d, %s%s): %s @" TARGET_FMT_plx,
               int128_get64(range->addr.start),
               int128_get64(range->addr.start) + MR_SIZE(range->addr.size),
               mr->priority,
+              range->nonvolatile ? "nv-" : "",
               range->readonly ? "rom" : memory_region_type(mr),
               memory_region_name(mr),
               range->offset_in_region);
         } else {
             p(f, MTREE_INDENT TARGET_FMT_plx "-"
-              TARGET_FMT_plx " (prio %d, %s): %s",
+              TARGET_FMT_plx " (prio %d, %s%s): %s",
               int128_get64(range->addr.start),
               int128_get64(range->addr.start) + MR_SIZE(range->addr.size),
               mr->priority,
+              range->nonvolatile ? "nv-" : "",
               range->readonly ? "rom" : memory_region_type(mr),
               memory_region_name(mr));
         }

From 640713d8a17107120ba29c4b2527b0b06951e33a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Wed, 3 Oct 2018 15:44:53 +0400
Subject: [PATCH 08/17] nvdimm: set non-volatile on the memory region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qemu-system-x86_64 -machine pc,nvdimm -m 2G,slots=4,maxmem=16G -enable-kvm -monitor stdio -object memory-backend-file,id=mem1,share=on,mem-path=/tmp/foo,size=1G -device nvdimm,id=nvdimm1,memdev=mem1

HMP info mtree command reflects the flag with "nv-" prefix on memory type:

(qemu) info mtree
0000000100000000-000000013fffffff (prio 0, nv-i/o): alias nvdimm-memory @/objects/mem1 0000000000000000-000000003fffffff

(qemu) info mtree -f
0000000100000000-000000013fffffff (prio 0, nv-ram): /objects/mem1

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20181003114454.5662-3-marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/mem/nvdimm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 49324f3fae..bf2adf5e16 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -116,6 +116,7 @@ static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
     nvdimm->nvdimm_mr = g_new(MemoryRegion, 1);
     memory_region_init_alias(nvdimm->nvdimm_mr, OBJECT(dimm),
                              "nvdimm-memory", mr, 0, pmem_size);
+    memory_region_set_nonvolatile(nvdimm->nvdimm_mr, true);
     nvdimm->nvdimm_mr->align = align;
 }
 

From 17a6ddb6fac51c1979dd5e35588cc82c19e8e75c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Wed, 3 Oct 2018 15:44:54 +0400
Subject: [PATCH 09/17] memory-mapping: skip non-volatile memory regions in
 GuestPhysBlockList
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GuestPhysBlockList is currently used to produce dumps. Given the size
and the typical usage of NVDIMM for storage, they are not a good idea
to have in the dumps. We may want to have an extra dump option to
include them. For now, skip non-volatile regions.

The TCG memory clear function is going to use the GuestPhysBlockList
as well, and will thus skip NVDIMM for similar reasons.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20181003114454.5662-4-marcandre.lureau@redhat.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 memory_mapping.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/memory_mapping.c b/memory_mapping.c
index 775466f3a8..724dd0b417 100644
--- a/memory_mapping.c
+++ b/memory_mapping.c
@@ -206,7 +206,8 @@ static void guest_phys_blocks_region_add(MemoryListener *listener,
 
     /* we only care about RAM */
     if (!memory_region_is_ram(section->mr) ||
-        memory_region_is_ram_device(section->mr)) {
+        memory_region_is_ram_device(section->mr) ||
+        memory_region_is_nonvolatile(section->mr)) {
         return;
     }
 

From 7f135356564bc776083b7ecee81096ab49e670e4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 29 Oct 2018 14:49:36 +0100
Subject: [PATCH 10/17] scripts/dump-guest-memory: Synchronize with
 guest_phys_blocks_region_add
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recent patches have removed ram_device and nonvolatile RAM
from dump-guest-memory's output.  Do the same for dumps
that are extracted from a QEMU core file.

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 scripts/dump-guest-memory.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py
index 5a857cebcf..198cd0fe40 100644
--- a/scripts/dump-guest-memory.py
+++ b/scripts/dump-guest-memory.py
@@ -417,7 +417,9 @@ def get_guest_phys_blocks():
         memory_region = flat_range["mr"].dereference()
 
         # we only care about RAM
-        if not memory_region["ram"]:
+        if (not memory_region["ram"] or
+            memory_region["ram_device"] or
+            memory_region["nonvolatile"]):
             continue
 
         section_size = int128_get64(flat_range["addr"]["size"])

From e58ccf039650065a9442de43c9816f81e88f27f6 Mon Sep 17 00:00:00 2001
From: Prasad J Pandit <pjp@fedoraproject.org>
Date: Sat, 27 Oct 2018 01:13:14 +0530
Subject: [PATCH 11/17] lsi53c895a: check message length value is valid

While writing a message in 'lsi_do_msgin', message length value
in 'msg_len' could be invalid due to an invalid migration stream.
Add an assertion to avoid an out of bounds access, and reject
the incoming migration data if it contains an invalid message
length.

Discovered by Deja vu Security. Reported by Oracle.

Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Message-Id: <20181026194314.18663-1-ppandit@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/lsi53c895a.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index d1e6534311..3f207f607c 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -861,10 +861,11 @@ static void lsi_do_status(LSIState *s)
 
 static void lsi_do_msgin(LSIState *s)
 {
-    int len;
+    uint8_t len;
     trace_lsi_do_msgin(s->dbc, s->msg_len);
     s->sfbr = s->msg[0];
     len = s->msg_len;
+    assert(len > 0 && len <= LSI_MAX_MSGIN_LEN);
     if (len > s->dbc)
         len = s->dbc;
     pci_dma_write(PCI_DEVICE(s), s->dnad, s->msg, len);
@@ -1705,8 +1706,10 @@ static uint8_t lsi_reg_readb(LSIState *s, int offset)
         break;
     case 0x58: /* SBDL */
         /* Some drivers peek at the data bus during the MSG IN phase.  */
-        if ((s->sstat1 & PHASE_MASK) == PHASE_MI)
+        if ((s->sstat1 & PHASE_MASK) == PHASE_MI) {
+            assert(s->msg_len > 0);
             return s->msg[0];
+        }
         ret = 0;
         break;
     case 0x59: /* SBDL high */
@@ -2103,11 +2106,23 @@ static int lsi_pre_save(void *opaque)
     return 0;
 }
 
+static int lsi_post_load(void *opaque, int version_id)
+{
+    LSIState *s = opaque;
+
+    if (s->msg_len < 0 || s->msg_len > LSI_MAX_MSGIN_LEN) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static const VMStateDescription vmstate_lsi_scsi = {
     .name = "lsiscsi",
     .version_id = 0,
     .minimum_version_id = 0,
     .pre_save = lsi_pre_save,
+    .post_load = lsi_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_PCI_DEVICE(parent_obj, LSIState),
 

From 6c219fc8a112fc69b29f59ea2c7865717ff6e3e0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 23 Oct 2018 00:46:21 +0200
Subject: [PATCH 12/17] scsi-generic: keep VPD page list sorted

Block limits emulation is just placing 0xb0 as the final byte of the
VPD pages list.  However, VPD page numbers must be sorted, so change
that to an in-place insert.  Since I couldn't find any disk that triggered
the loop more than once, this was tested by adding manually 0xb1
at the end of the list and checking that 0xb0 was added before.

Reported-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-generic.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index d60c4d0fcf..aebb7cdd82 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -144,7 +144,7 @@ static int execute_command(BlockBackend *blk,
 
 static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
 {
-    uint8_t page, page_len;
+    uint8_t page, page_idx;
 
     /*
      *  EVPD set to zero returns the standard INQUIRY data.
@@ -190,10 +190,21 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
              *
              * This way, the guest kernel will be aware of the support
              * and will use it to proper setup the SCSI device.
+             *
+             * VPD page numbers must be sorted, so insert 0xb0 at the
+             * right place with an in-place insert.  After the initialization
+             * part of the for loop is executed, the device response is
+             * at r[0] to r[page_idx - 1].
              */
-            page_len = r->buf[3];
-            r->buf[page_len + 4] = 0xb0;
-            r->buf[3] = ++page_len;
+            for (page_idx = lduw_be_p(r->buf + 2) + 4;
+                 page_idx > 4 && r->buf[page_idx - 1] >= 0xb0;
+                 page_idx--) {
+                if (page_idx < r->buflen) {
+                    r->buf[page_idx] = r->buf[page_idx - 1];
+                }
+            }
+            r->buf[page_idx] = 0xb0;
+            stw_be_p(r->buf + 2, lduw_be_p(r->buf + 2) + 1);
         }
     }
 }

From 57dbb58d800f62b9e56d946660dba4e8dbd20204 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 23 Oct 2018 00:58:43 +0200
Subject: [PATCH 13/17] scsi-generic: avoid out-of-bounds access to VPD page
 list

A device can report an excessive number of VPD pages when asked for a
list; this can cause an out-of-bounds access to buf in
scsi_generic_set_vpd_bl_emulation.  It should not happen, but
it is technically not incorrect so handle it: do not check any byte
past the allocation length that was sent to the INQUIRY command.

Reported-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index aebb7cdd82..c5497bbea8 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -538,7 +538,7 @@ static void scsi_generic_set_vpd_bl_emulation(SCSIDevice *s)
     }
 
     page_len = buf[3];
-    for (i = 4; i < page_len + 4; i++) {
+    for (i = 4; i < MIN(sizeof(buf), page_len + 4); i++) {
         if (buf[i] == 0xb0) {
             s->needs_vpd_bl_emulation = false;
             return;

From 3d4a8bf0eed68a781e06118e4d1df6e2f106a1f2 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 23 Oct 2018 00:43:51 +0200
Subject: [PATCH 14/17] scsi-generic: avoid invalid access to struct when
 emulating block limits

Emulation of the block limits VPD page called back into scsi-disk.c,
which however expected the request to be for a SCSIDiskState and
accessed a scsi-generic device outside the bounds of its struct
(namely to retrieve s->max_unmap_size and s->max_io_size).

To avoid this, move the emulation code to a separate function that
takes a new SCSIBlockLimits struct and marshals it into the VPD
response format.

Reported-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/Makefile.objs       |  2 +-
 hw/scsi/emulation.c         | 42 +++++++++++++++++
 hw/scsi/scsi-disk.c         | 92 ++++++++-----------------------------
 hw/scsi/scsi-generic.c      | 35 ++++++++++----
 include/hw/scsi/emulation.h | 16 +++++++
 include/hw/scsi/scsi.h      |  1 -
 6 files changed, 104 insertions(+), 84 deletions(-)
 create mode 100644 hw/scsi/emulation.c
 create mode 100644 include/hw/scsi/emulation.h

diff --git a/hw/scsi/Makefile.objs b/hw/scsi/Makefile.objs
index 718b4c2a68..45167baeaf 100644
--- a/hw/scsi/Makefile.objs
+++ b/hw/scsi/Makefile.objs
@@ -1,4 +1,4 @@
-common-obj-y += scsi-disk.o
+common-obj-y += scsi-disk.o emulation.o
 common-obj-y += scsi-generic.o scsi-bus.o
 common-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
 common-obj-$(CONFIG_MPTSAS_SCSI_PCI) += mptsas.o mptconfig.o mptendian.o
diff --git a/hw/scsi/emulation.c b/hw/scsi/emulation.c
new file mode 100644
index 0000000000..06d62f3c38
--- /dev/null
+++ b/hw/scsi/emulation.c
@@ -0,0 +1,42 @@
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qemu/bswap.h"
+#include "hw/scsi/emulation.h"
+
+int scsi_emulate_block_limits(uint8_t *outbuf, const SCSIBlockLimits *bl)
+{
+    /* required VPD size with unmap support */
+    memset(outbuf, 0, 0x3c);
+
+    outbuf[0] = bl->wsnz; /* wsnz */
+
+    if (bl->max_io_sectors) {
+        /* optimal transfer length granularity.  This field and the optimal
+         * transfer length can't be greater than maximum transfer length.
+         */
+        stw_be_p(outbuf + 2, MIN(bl->min_io_size, bl->max_io_sectors));
+
+        /* maximum transfer length */
+        stl_be_p(outbuf + 4, bl->max_io_sectors);
+
+        /* optimal transfer length */
+        stl_be_p(outbuf + 8, MIN(bl->opt_io_size, bl->max_io_sectors));
+    } else {
+        stw_be_p(outbuf + 2, bl->min_io_size);
+        stl_be_p(outbuf + 8, bl->opt_io_size);
+    }
+
+    /* max unmap LBA count */
+    stl_be_p(outbuf + 16, bl->max_unmap_sectors);
+
+    /* max unmap descriptors */
+    stl_be_p(outbuf + 20, bl->max_unmap_descr);
+
+    /* optimal unmap granularity; alignment is zero */
+    stl_be_p(outbuf + 24, bl->unmap_sectors);
+
+    /* max write same size, make it the same as maximum transfer length */
+    stl_be_p(outbuf + 36, bl->max_io_sectors);
+
+    return 0x3c;
+}
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index e2c5408aa2..6eb258d3f3 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -33,6 +33,7 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "hw/scsi/scsi.h"
+#include "hw/scsi/emulation.h"
 #include "scsi/constants.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
@@ -589,7 +590,7 @@ static uint8_t *scsi_get_buf(SCSIRequest *req)
     return (uint8_t *)r->iov.iov_base;
 }
 
-int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf)
+static int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     uint8_t page_code = req->cmd.buf[2];
@@ -691,89 +692,36 @@ int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf)
     }
     case 0xb0: /* block limits */
     {
-        unsigned int unmap_sectors =
-            s->qdev.conf.discard_granularity / s->qdev.blocksize;
-        unsigned int min_io_size =
-            s->qdev.conf.min_io_size / s->qdev.blocksize;
-        unsigned int opt_io_size =
-            s->qdev.conf.opt_io_size / s->qdev.blocksize;
-        unsigned int max_unmap_sectors =
-            s->max_unmap_size / s->qdev.blocksize;
-        unsigned int max_io_sectors =
-            s->max_io_size / s->qdev.blocksize;
+        SCSIBlockLimits bl = {};
 
         if (s->qdev.type == TYPE_ROM) {
             DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n",
                     page_code);
             return -1;
         }
+        bl.wsnz = 1;
+        bl.unmap_sectors =
+            s->qdev.conf.discard_granularity / s->qdev.blocksize;
+        bl.min_io_size =
+            s->qdev.conf.min_io_size / s->qdev.blocksize;
+        bl.opt_io_size =
+            s->qdev.conf.opt_io_size / s->qdev.blocksize;
+        bl.max_unmap_sectors =
+            s->max_unmap_size / s->qdev.blocksize;
+        bl.max_io_sectors =
+            s->max_io_size / s->qdev.blocksize;
+        /* 255 descriptors fit in 4 KiB with an 8-byte header */
+        bl.max_unmap_descr = 255;
+
         if (s->qdev.type == TYPE_DISK) {
             int max_transfer_blk = blk_get_max_transfer(s->qdev.conf.blk);
             int max_io_sectors_blk =
                 max_transfer_blk / s->qdev.blocksize;
 
-            max_io_sectors =
-                MIN_NON_ZERO(max_io_sectors_blk, max_io_sectors);
-
-            /* min_io_size and opt_io_size can't be greater than
-             * max_io_sectors */
-            if (min_io_size) {
-                min_io_size = MIN(min_io_size, max_io_sectors);
-            }
-            if (opt_io_size) {
-                opt_io_size = MIN(opt_io_size, max_io_sectors);
-            }
+            bl.max_io_sectors =
+                MIN_NON_ZERO(max_io_sectors_blk, bl.max_io_sectors);
         }
-        /* required VPD size with unmap support */
-        buflen = 0x40;
-        memset(outbuf + 4, 0, buflen - 4);
-
-        outbuf[4] = 0x1; /* wsnz */
-
-        /* optimal transfer length granularity */
-        outbuf[6] = (min_io_size >> 8) & 0xff;
-        outbuf[7] = min_io_size & 0xff;
-
-        /* maximum transfer length */
-        outbuf[8] = (max_io_sectors >> 24) & 0xff;
-        outbuf[9] = (max_io_sectors >> 16) & 0xff;
-        outbuf[10] = (max_io_sectors >> 8) & 0xff;
-        outbuf[11] = max_io_sectors & 0xff;
-
-        /* optimal transfer length */
-        outbuf[12] = (opt_io_size >> 24) & 0xff;
-        outbuf[13] = (opt_io_size >> 16) & 0xff;
-        outbuf[14] = (opt_io_size >> 8) & 0xff;
-        outbuf[15] = opt_io_size & 0xff;
-
-        /* max unmap LBA count, default is 1GB */
-        outbuf[20] = (max_unmap_sectors >> 24) & 0xff;
-        outbuf[21] = (max_unmap_sectors >> 16) & 0xff;
-        outbuf[22] = (max_unmap_sectors >> 8) & 0xff;
-        outbuf[23] = max_unmap_sectors & 0xff;
-
-        /* max unmap descriptors, 255 fit in 4 kb with an 8-byte header */
-        outbuf[24] = 0;
-        outbuf[25] = 0;
-        outbuf[26] = 0;
-        outbuf[27] = 255;
-
-        /* optimal unmap granularity */
-        outbuf[28] = (unmap_sectors >> 24) & 0xff;
-        outbuf[29] = (unmap_sectors >> 16) & 0xff;
-        outbuf[30] = (unmap_sectors >> 8) & 0xff;
-        outbuf[31] = unmap_sectors & 0xff;
-
-        /* max write same size */
-        outbuf[36] = 0;
-        outbuf[37] = 0;
-        outbuf[38] = 0;
-        outbuf[39] = 0;
-
-        outbuf[40] = (max_io_sectors >> 24) & 0xff;
-        outbuf[41] = (max_io_sectors >> 16) & 0xff;
-        outbuf[42] = (max_io_sectors >> 8) & 0xff;
-        outbuf[43] = max_io_sectors & 0xff;
+        buflen += scsi_emulate_block_limits(outbuf + buflen, &bl);
         break;
     }
     case 0xb1: /* block device characteristics */
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index c5497bbea8..b50ce642d7 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -16,6 +16,7 @@
 #include "qemu-common.h"
 #include "qemu/error-report.h"
 #include "hw/scsi/scsi.h"
+#include "hw/scsi/emulation.h"
 #include "sysemu/block-backend.h"
 
 #ifdef __linux__
@@ -181,7 +182,7 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
             /* Also take care of the opt xfer len. */
             stl_be_p(&r->buf[12],
                     MIN_NON_ZERO(max_transfer, ldl_be_p(&r->buf[12])));
-        } else if (page == 0x00 && s->needs_vpd_bl_emulation) {
+        } else if (s->needs_vpd_bl_emulation && page == 0x00) {
             /*
              * Now we're capable of supplying the VPD Block Limits
              * response if the hardware can't. Add it in the INQUIRY
@@ -209,9 +210,24 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s)
     }
 }
 
-static int scsi_emulate_block_limits(SCSIGenericReq *r)
+static int scsi_generic_emulate_block_limits(SCSIGenericReq *r, SCSIDevice *s)
 {
-    r->buflen = scsi_disk_emulate_vpd_page(&r->req, r->buf);
+    int len;
+    uint8_t buf[64];
+
+    SCSIBlockLimits bl = {
+        .max_io_sectors = blk_get_max_transfer(s->conf.blk) / s->blocksize
+    };
+
+    memset(r->buf, 0, r->buflen);
+    stb_p(buf, s->type);
+    stb_p(buf + 1, 0xb0);
+    len = scsi_emulate_block_limits(buf + 4, &bl);
+    assert(len <= sizeof(buf) - 4);
+    stw_be_p(buf + 2, len);
+
+    memcpy(r->buf, buf, MIN(r->buflen, len + 4));
+
     r->io_header.sb_len_wr = 0;
 
     /*
@@ -253,13 +269,12 @@ static void scsi_read_complete(void * opaque, int ret)
      * resulted in sense error but would need emulation.
      * In this case, emulate a valid VPD response.
      */
-    if (s->needs_vpd_bl_emulation) {
-        int is_vpd_bl = r->req.cmd.buf[0] == INQUIRY &&
-                         r->req.cmd.buf[1] & 0x01 &&
-                         r->req.cmd.buf[2] == 0xb0;
-
-        if (is_vpd_bl && sg_io_sense_from_errno(-ret, &r->io_header, &sense)) {
-            len = scsi_emulate_block_limits(r);
+    if (s->needs_vpd_bl_emulation &&
+        r->req.cmd.buf[0] == INQUIRY &&
+        (r->req.cmd.buf[1] & 0x01) &&
+        r->req.cmd.buf[2] == 0xb0) {
+        if (sg_io_sense_from_errno(-ret, &r->io_header, &sense)) {
+            len = scsi_generic_emulate_block_limits(r, s);
             /*
              * No need to let scsi_read_complete go on and handle an
              * INQUIRY VPD BL request we created manually.
diff --git a/include/hw/scsi/emulation.h b/include/hw/scsi/emulation.h
new file mode 100644
index 0000000000..09fba1ff39
--- /dev/null
+++ b/include/hw/scsi/emulation.h
@@ -0,0 +1,16 @@
+#ifndef HW_SCSI_EMULATION_H
+#define HW_SCSI_EMULATION_H 1
+
+typedef struct SCSIBlockLimits {
+    bool wsnz;
+    uint16_t min_io_size;
+    uint32_t max_unmap_descr;
+    uint32_t opt_io_size;
+    uint32_t max_unmap_sectors;
+    uint32_t unmap_sectors;
+    uint32_t max_io_sectors;
+} SCSIBlockLimits;
+
+int scsi_emulate_block_limits(uint8_t *outbuf, const SCSIBlockLimits *bl);
+
+#endif
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index ee3a4118fb..acef25faa4 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -189,7 +189,6 @@ void scsi_device_report_change(SCSIDevice *dev, SCSISense sense);
 void scsi_device_unit_attention_reported(SCSIDevice *dev);
 void scsi_generic_read_device_inquiry(SCSIDevice *dev);
 int scsi_device_get_sense(SCSIDevice *dev, uint8_t *buf, int len, bool fixed);
-int scsi_disk_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf);
 int scsi_SG_IO_FROM_DEV(BlockBackend *blk, uint8_t *cmd, uint8_t cmd_size,
                         uint8_t *buf, uint8_t buf_size);
 SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int target, int lun);

From 763c56872b08b98fde062a1feca003f200e7bd5c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 23 Oct 2018 00:58:21 +0200
Subject: [PATCH 15/17] scsi-generic: do not do VPD emulation for sense other
 than ILLEGAL_REQUEST

Pass other sense, such as UNIT_ATTENTION or BUSY, directly to the
guest.

Reported-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-generic.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index b50ce642d7..7237b4162e 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -246,7 +246,6 @@ static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIGenericReq *r = (SCSIGenericReq *)opaque;
     SCSIDevice *s = r->req.dev;
-    SCSISense sense;
     int len;
 
     assert(r->req.aiocb != NULL);
@@ -269,11 +268,14 @@ static void scsi_read_complete(void * opaque, int ret)
      * resulted in sense error but would need emulation.
      * In this case, emulate a valid VPD response.
      */
-    if (s->needs_vpd_bl_emulation &&
+    if (s->needs_vpd_bl_emulation && ret == 0 &&
+        (r->io_header.driver_status & SG_ERR_DRIVER_SENSE) &&
         r->req.cmd.buf[0] == INQUIRY &&
         (r->req.cmd.buf[1] & 0x01) &&
         r->req.cmd.buf[2] == 0xb0) {
-        if (sg_io_sense_from_errno(-ret, &r->io_header, &sense)) {
+        SCSISense sense =
+            scsi_parse_sense_buf(r->req.sense, r->io_header.sb_len_wr);
+        if (sense.key == ILLEGAL_REQUEST) {
             len = scsi_generic_emulate_block_limits(r, s);
             /*
              * No need to let scsi_read_complete go on and handle an

From ca95173c7fb64a1544b1f560766976425659e5e4 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 5 Nov 2018 13:55:37 +0000
Subject: [PATCH 16/17] include/qemu/thread.h: Document qemu_thread_atexit* API

Add documentation for the qemu_thread_atexit_add() and
qemu_thread_atexit_remove() functions.

We include a (previously undocumented) constraint that notifiers
may not be called if a thread is exiting because the entire
process is exiting. This is fine for our current use because
the callers use it only for cleaning up resources which go away
on process exit (memory, Win32 fibers), and we will need the
flexibility for the new posix implementation.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20181105135538.28025-2-peter.maydell@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/thread.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index b2661b6672..55d83a907c 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -162,7 +162,29 @@ void qemu_thread_exit(void *retval);
 void qemu_thread_naming(bool enable);
 
 struct Notifier;
+/**
+ * qemu_thread_atexit_add:
+ * @notifier: Notifier to add
+ *
+ * Add the specified notifier to a list which will be run via
+ * notifier_list_notify() when this thread exits (either by calling
+ * qemu_thread_exit() or by returning from its start_routine).
+ * The usual usage is that the caller passes a Notifier which is
+ * a per-thread variable; it can then use the callback to free
+ * other per-thread data.
+ *
+ * If the thread exits as part of the entire process exiting,
+ * it is unspecified whether notifiers are called or not.
+ */
 void qemu_thread_atexit_add(struct Notifier *notifier);
+/**
+ * qemu_thread_atexit_remove:
+ * @notifier: Notifier to remove
+ *
+ * Remove the specified notifier from the thread-exit notification
+ * list. It is not valid to try to remove a notifier which is not
+ * on the list.
+ */
 void qemu_thread_atexit_remove(struct Notifier *notifier);
 
 struct QemuSpin {

From a458774ad711bceabefbf01e8f0b91d86ec72e0c Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 5 Nov 2018 13:55:38 +0000
Subject: [PATCH 17/17] util/qemu-thread-posix: Fix qemu_thread_atexit* for OSX

Our current implementation of qemu_thread_atexit* is broken on OSX.
This is because it works by cerating a piece of thread-specific
data with pthread_key_create() and using the destructor function
for that data to run the notifier function passed to it by
the caller of qemu_thread_atexit_add(). The expected use case
is that the caller uses a __thread variable as the notifier,
and uses the callback to clean up information that it is
keeping per-thread in __thread variables.

Unfortunately, on OSX this does not work, because on OSX
a __thread variable may be destroyed (freed) before the
pthread_key_create() destructor runs. (POSIX imposes no
ordering constraint here; the OSX implementation happens
to implement __thread variables in terms of pthread_key_create((),
whereas Linux uses different mechanisms that mean the __thread
variables will still be present when the pthread_key_create()
destructor is run.)

Fix this by switching to a scheme similar to the one qemu-thread-win32
uses for qemu_thread_atexit: keep the thread's notifiers on a
__thread variable, and run the notifiers on calls to
qemu_thread_exit() and on return from the start routine passed
to qemu_thread_start(). We do this with the pthread_cleanup_push()
API.

We take advantage of the qemu_thread_atexit_add() API
permission not to run thread notifiers on process exit to
avoid having to special case the main thread.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20181105135538.28025-3-peter.maydell@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/qemu-thread-posix.c | 44 ++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index dfa66ff2fb..865e476df5 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -443,42 +443,34 @@ void qemu_event_wait(QemuEvent *ev)
     }
 }
 
-static pthread_key_t exit_key;
-
-union NotifierThreadData {
-    void *ptr;
-    NotifierList list;
-};
-QEMU_BUILD_BUG_ON(sizeof(union NotifierThreadData) != sizeof(void *));
+static __thread NotifierList thread_exit;
 
+/*
+ * Note that in this implementation you can register a thread-exit
+ * notifier for the main thread, but it will never be called.
+ * This is OK because main thread exit can only happen when the
+ * entire process is exiting, and the API allows notifiers to not
+ * be called on process exit.
+ */
 void qemu_thread_atexit_add(Notifier *notifier)
 {
-    union NotifierThreadData ntd;
-    ntd.ptr = pthread_getspecific(exit_key);
-    notifier_list_add(&ntd.list, notifier);
-    pthread_setspecific(exit_key, ntd.ptr);
+    notifier_list_add(&thread_exit, notifier);
 }
 
 void qemu_thread_atexit_remove(Notifier *notifier)
 {
-    union NotifierThreadData ntd;
-    ntd.ptr = pthread_getspecific(exit_key);
     notifier_remove(notifier);
-    pthread_setspecific(exit_key, ntd.ptr);
 }
 
-static void qemu_thread_atexit_run(void *arg)
+static void qemu_thread_atexit_notify(void *arg)
 {
-    union NotifierThreadData ntd = { .ptr = arg };
-    notifier_list_notify(&ntd.list, NULL);
+    /*
+     * Called when non-main thread exits (via qemu_thread_exit()
+     * or by returning from its start routine.)
+     */
+    notifier_list_notify(&thread_exit, NULL);
 }
 
-static void __attribute__((constructor)) qemu_thread_atexit_init(void)
-{
-    pthread_key_create(&exit_key, qemu_thread_atexit_run);
-}
-
-
 typedef struct {
     void *(*start_routine)(void *);
     void *arg;
@@ -490,6 +482,7 @@ static void *qemu_thread_start(void *args)
     QemuThreadArgs *qemu_thread_args = args;
     void *(*start_routine)(void *) = qemu_thread_args->start_routine;
     void *arg = qemu_thread_args->arg;
+    void *r;
 
 #ifdef CONFIG_PTHREAD_SETNAME_NP
     /* Attempt to set the threads name; note that this is for debug, so
@@ -501,7 +494,10 @@ static void *qemu_thread_start(void *args)
 #endif
     g_free(qemu_thread_args->name);
     g_free(qemu_thread_args);
-    return start_routine(arg);
+    pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
+    r = start_routine(arg);
+    pthread_cleanup_pop(1);
+    return r;
 }
 
 void qemu_thread_create(QemuThread *thread, const char *name,